# 2.2 数据预处理
pandas

## 2.2.1 创建并读取数据集

In [1]:
import os

In [2]:
# Make sure ../data exists, then assemble the path of the toy CSV file.
data_dir=os.path.join("..","data")
os.makedirs(data_dir,exist_ok=True)
data_file=os.path.join(data_dir,"house_tiny.csv")
# Header line first, then one line per data sample; "NA" marks a missing entry.
csv_lines=[
    'NumRooms,Alley,Price\n',
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
]
# 'w+' creates (or truncates) the file and opens it for both writing and reading.
with open(data_file,'w+') as f:
    f.writelines(csv_lines)

In [3]:
import pandas as pd

In [4]:
# Load the CSV file into a DataFrame and display it as the cell result.
data=pd.read_csv(filepath_or_buffer=data_file)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [5]:
# Print the summary statistics of the numeric columns, then the dtype of every column.
for report in (data.describe(),data.dtypes):
    print(report)

       NumRooms         Price
count  2.000000       4.00000
mean   3.000000  137900.00000
std    1.414214   30255.68817
min    2.000000  106000.00000
25%    2.500000  122125.00000
50%    3.000000  133750.00000
75%    3.500000  149525.00000
max    4.000000  178100.00000
NumRooms    float64
Alley        object
Price         int64
dtype: object


## 2.2.2 处理缺失值

处理缺失值的典型方法有两种：插值法（用一个替代值填补缺失值）和删除法（直接丢弃含有缺失值的行或列）。

In [6]:
# Every column except the last is a feature; the last column is the target.
# Slicing with -1: (rather than indexing with -1) keeps the target as a DataFrame.
inputs=data.iloc[:,:-1]
outputs=data.iloc[:,-1:]
print(inputs)
print(outputs)

   NumRooms Alley
0       NaN  Pave
1       2.0   NaN
2       4.0   NaN
3       NaN   NaN
    Price
0  127500
1  106000
2  178100
3  140000


In [7]:
# Count the missing entries in each input column.
inputs.isna().sum()

NumRooms    2
Alley       3
dtype: int64

对于数值列，我们使用平均值填充

In [8]:
# Replace each missing numeric entry with the mean of its column.
# numeric_only=True restricts the mean to numeric columns: without it, pandas >= 2.0
# raises TypeError when it reaches the object-dtype column "Alley".
# Non-numeric columns get no fill value here, so their NaNs are left in place.
inputs=inputs.fillna(inputs.mean(numeric_only=True))

对于文本对象，我们采用独热编码

In [9]:
# One-hot encode the non-numeric column(s) of `inputs`.
# dummy_na=True adds an extra indicator column for NaN, so a missing value is
# treated as its own category instead of being ignored.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 defaults to
# bool, and a mixed float/bool frame converts to an object array that
# torch.tensor cannot consume.
inputs=pd.get_dummies(inputs,dummy_na=True,dtype=int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


In [10]:
import torch

In [11]:
# (rows, columns) of the encoded feature frame.
inputs.shape

(4, 3)

In [12]:
# Demonstration: torch.tensor() cannot be given a DataFrame directly.
# The error is caught and printed so that "Restart & Run All" does not stop here,
# and `inputs` is deliberately not rebound, so it is still the DataFrame afterwards.
try:
    torch.tensor(inputs)
except (ValueError,TypeError) as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not determine the shape of object type 'DataFrame'

torch.tensor() 无法直接处理 Pandas DataFrame 或 Series，需要先通过 `.values` 或 `.to_numpy()` 将其转换为 NumPy 数组。

In [13]:
# Convert the DataFrame to a NumPy array first, then build the tensor from it.
# to_numpy(dtype=float) forces a single float64 array even when the frame mixes
# float and bool/int columns; plain .values can yield an object array in that
# case, which torch.tensor rejects.
# NOTE: this rebinds `inputs` from a DataFrame to a tensor, so the cell cannot be
# re-run without first re-running the cells that build the DataFrame.
inputs=torch.tensor(inputs.to_numpy(dtype=float))
inputs

tensor([[3., 1., 0.],
        [2., 0., 1.],
        [4., 0., 1.],
        [3., 0., 1.]], dtype=torch.float64)

inputs = torch.from_numpy(inputs.to_numpy())  # 共享内存，更高效
这里的“共享内存”是指张量与原 NumPy 数组指向同一块底层内存：不会复制数据，修改其中一个，另一个也会随之改变。

```
import torch

# Convert the feature frame and the target frame to float64 tensors in one pass.
X,y=(torch.tensor(frame.to_numpy(dtype=float)) for frame in (inputs,outputs))
X,y
```