# 数据操作

### 1. 创建data

In [10]:
import os

# Make sure the ../data directory exists (no error if it is already there).
os.makedirs(os.path.join('..', 'data'), exist_ok=True)
# Build the path of the CSV file; the file itself is created by open() below.
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    f.writelines([
        'NumRooms,RoofType,Price\n',  # header row with the column names
        'NA,NA,127500\n',  # each following row is one data sample
        '2,Conver,106000\n',
        '4,Slate,178100\n',
        'NA,NA,140000\n',
    ])

### 2. 读取data

In [11]:
import pandas as pd
# Load the CSV into a DataFrame; the literal "NA" fields show up as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0   Conver  106000
2       4.0    Slate  178100
3       NaN      NaN  140000


### 3. 处理缺失值——插值

- 对类别型特征进行处理——使用独热编码（one-hot）将每个类别（包括缺失值 NaN）转换成一列 0/1 指示变量。

In [12]:
# Split by position: the first two columns are the features, the third is the target.
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]
print(inputs)
# One-hot encode the categorical column; dummy_na=True adds an extra
# indicator column that marks the rows where the category is missing.
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)

   NumRooms RoofType
0       NaN      NaN
1       2.0   Conver
2       4.0    Slate
3       NaN      NaN
   NumRooms  RoofType_Conver  RoofType_Slate  RoofType_nan
0       NaN            False           False          True
1       2.0             True           False         False
2       4.0            False            True         False
3       NaN            False           False          True


- 对数值列进行处理——用该列的均值填充缺失值。

In [14]:
# Impute every remaining NaN with the mean of the column it belongs to.
inputs = inputs.fillna(value=inputs.mean())
print(inputs)

   NumRooms  RoofType_Conver  RoofType_Slate  RoofType_nan
0       3.0            False           False          True
1       2.0             True           False         False
2       4.0            False            True         False
3       3.0            False           False          True


### 4. 将data转换成tensor

In [16]:
import torch
# Convert the features and the targets to float NumPy arrays and wrap each
# one in a tensor (float64, as the printed dtype shows).
X, y = (
    torch.tensor(frame.to_numpy(dtype=float))
    for frame in (inputs, targets)
)
print(X, y)

tensor([[3., 0., 0., 1.],
        [2., 1., 0., 0.],
        [4., 0., 1., 0.],
        [3., 0., 0., 1.]], dtype=torch.float64) tensor([127500., 106000., 178100., 140000.], dtype=torch.float64)


### 5. 练习  
创建包含更多行和列的原始数据集，删除缺失值最多的列。

In [28]:

def drop_col(data):
    """Return a copy of ``data`` without the column that has the most missing values.

    The per-column missing-value counts are printed before the column is
    removed. When several columns tie for the maximum, the first one in
    column order is dropped. If there is nothing sensible to drop -- the
    frame has no columns, or no column contains a missing value -- an
    unmodified copy is returned instead.

    Args:
        data: pandas DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame.
    """
    missing_counts = data.isna().sum()  # number of missing values per column
    print(missing_counts)
    if not missing_counts.any():
        # Covers a frame without columns (idxmax would raise ValueError) and
        # a frame without missing values (idxmax would pick the first column
        # and a complete column would be thrown away).
        return data.copy()
    worst_column = missing_counts.idxmax()  # label of the column with the most NaNs
    return data.drop(columns=worst_column)  # drop by column label

# 上述过程可以简写为：data.drop(columns=data.isna().sum().idxmax())

# Raw string: the backslash of the Windows path must stay literal. In a normal
# string literal '\D' is an invalid escape sequence, which Python warns about.
# NOTE(review): the directory is not created here, so it presumably has to
# exist already -- confirm before running on another machine.
mydata_file = os.path.join(r'E:\Deep_Learning', 'exercise_data.csv')
with open(mydata_file, 'w') as f:
    f.write('NumRooms,RoofType,Price\n')  # header row with the column names
    f.write('NA,NA,127500\n')  # each following row is one data sample
    f.write('2,NA,106000\n')
    f.write('4,Slate,178100\n')
    f.write('NA,NA,140000\n')
    f.write('NA,NA,127500\n')
    f.write('2,NA,106000\n')
    f.write('4,Slate,178100\n')
    f.write('NA,NA,140000\n')
# Read the exercise data back and show it before and after removing the
# column with the most missing values.
mydata = pd.read_csv(mydata_file)
print(mydata)
mydata = drop_col(mydata)
print(mydata)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000
4       NaN      NaN  127500
5       2.0      NaN  106000
6       4.0    Slate  178100
7       NaN      NaN  140000
NumRooms    4
RoofType    6
Price       0
dtype: int64
   NumRooms   Price
0       NaN  127500
1       2.0  106000
2       4.0  178100
3       NaN  140000
4       NaN  127500
5       2.0  106000
6       4.0  178100
7       NaN  140000
