### 读取和获取数据集

In [5]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
sys.path.append("..")

print(torch.__version__)
# Use float32 as the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type is deprecated in recent PyTorch releases;
# torch.set_default_dtype is the supported way to set the default dtype.
torch.set_default_dtype(torch.float32)

1.6.0


训练数据集包括1460个样本、80个特征（含 `Id` 列，后面拼接特征时会去掉）和1个标签（`SalePrice`）。  
测试数据集包括1459个样本和80个特征。

In [13]:
# Load the training and test splits from the data directory one level up.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
# Show the (rows, columns) shape of each split, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(1460, 81)
(1459, 80)


In [14]:
# Peek at the first four rows: the first four columns, the last two
# features, and the label (SalePrice, the last column).
train_data.iloc[:4, [*range(4), -3, -2, -1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


In [20]:
# Stack the feature columns of the training and test frames row-wise.
# The first column is dropped from both frames, and the last column
# (the label) is additionally dropped from the training frame.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]]
)
all_features.shape

(2919, 79)

### 预处理数据

#### 对连续数值的特征做标准化
* 将该特征的每个值先减去均值，再除以标准差。
* 对于缺失的特征值，将其替换成该特征的均值。

In [23]:
# String-valued columns in a DataFrame are stored with dtype=object.

# DataFrame.dtypes returns a Series holding the dtype of every column;
# the index of that Series is the DataFrame's column names.

# numeric_features holds the names of the columns with a numeric dtype.
# A missing value does NOT turn a numeric column into object: pandas stores
# it as a floating-point NaN, so the column stays numeric (float64). Only a
# non-numeric entry, such as the *string* 'NaN', forces dtype=object.
# select_dtypes(include="number") is used instead of `dtypes != 'object'`
# so that non-object, non-numeric dtypes (bool, category, datetime, dedicated
# string dtypes) are never mistaken for numeric features.
numeric_features = all_features.select_dtypes(include="number").columns
print(numeric_features)

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [30]:
# Standardize every numeric column (subtract its mean, divide by its
# standard deviation), then replace the remaining NaN entries with 0.
# Series.mean() and Series.std() skip NaN by default, so the statistics
# come from the observed values only. After standardization each column has
# mean 0, so filling with 0 is the same as imputing the column mean.
# Filling with 0 *before* standardizing would instead distort mean and std.
all_features[numeric_features] = (
    all_features[numeric_features]
    .apply(lambda col: (col - col.mean()) / col.std())
    .fillna(0)
)

#### 将离散数值转成指示特征
* 相当于一个onehot操作

In [32]:
# One-hot encode the non-numeric columns.
# dummy_na=True treats a missing value as a category of its own and adds an
# indicator column for it.
# In the recorded run this grows the frame from 79 to 331 columns.
all_features = pd.get_dummies(data=all_features, dummy_na=True)
all_features.shape

(2919, 331)

* 通过values属性得到numpy格式的数据,并转成Tensor方便后面的训练.

In [None]:
# Number of training rows; train_data.shape was (1460, 81) in the recorded run.
n_train = train_data.shape[0]
# Take the first n_train rows by position and cast them to float32 before
# reading .values. Recent pandas versions emit bool indicator columns from
# get_dummies; a frame mixing float and bool columns yields an object-dtype
# array from .values, which torch.tensor cannot convert. The explicit cast
# guarantees a homogeneous float32 array on both old and new pandas.
train_features = torch.tensor(
    all_features.iloc[:n_train].astype("float32").values,
    dtype=torch.float,
)

In [27]:
# dtype experiment: here 'NaN' is an ordinary string, not a missing value.
# Column A therefore mixes ints with a str, and pandas stores it as object.
a = [1, 2, 3, 'NaN']
b = ["a", "b", "c", "NaN"]
df = pd.DataFrame(dict(A=a, B=b))
df.dtypes

A    object
B    object
dtype: object

In [33]:
# Re-display the (rows, columns) shape of the raw training frame.
train_data.shape

(1460, 81)