# 前置准备
## 下载数据
手工从 https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz 下载到本地的 datasets/housing 目录，并解压得到 housing.csv

# 数据洞察

In [5]:
import pandas as pd

# Path of the housing CSV, relative to the current working directory.
HOUSING_DATA_SET = "./datasets/housing/housing.csv"

# Load the full data set into a DataFrame.
housing = pd.read_csv(HOUSING_DATA_SET)
# Print the column names, non-null counts and dtypes (reveals missing values).
housing.info()
# Summary statistics of the numeric columns; kept as the last expression so the
# notebook renders the resulting table.
housing.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


# 划分数据集
需要提前划出数据的验证集(测试集)，防止出现数据窥探偏误(data snooping bias)
划分数据集是抽样过程，需要尽可能反映总体样本的分布情况。

## 随机抽样
适用于数据集足够大(相较于属性的数量)的情况；数据量不足时，纯随机抽样可能引入明显的抽样偏差。
**随机产生测试集**

    简单可行，但由于完全随机，每次运行产生的测试集都不同；理论上多次运行之后，测试集中的所有数据都会在某次运行中出现在训练集里，测试集也就失去了意义(可通过固定随机种子缓解)
**采用唯一标识分割测试集**

    需要数据有唯一标识，慎用行索引(需要考虑中间插入/删除数据情况)
## 分层抽样
按照一个或多个属性的分布情况进行抽样，确保产生的测试集和完整集合在这些属性的分布上接近。


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Split the data set: hold out 20% of the rows as the test set. The fixed seed
# makes the split identical on every run.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Prepare the training data.

# Separate the target column (labels) from the features.
# Open question from the original notes: why split them, and how do the two
# stay associated? Both objects keep train_set's row index, so a label can
# always be matched back to its feature row through that index.
train_set_label = train_set["median_house_value"].copy()
train_set_data = train_set.drop(columns="median_house_value")

# Handle missing values: the median imputer only works on numeric columns, so
# the text column is removed first.
train_set_data_num = train_set_data.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy="median")
# Learn each column's median, then fill the gaps; x is a plain NumPy array.
x = imputer.fit(train_set_data_num).transform(train_set_data_num)
# Wrap the array back into a DataFrame, restoring the original column names
# and row index.
train_set_data_tr = pd.DataFrame(
    data=x,
    index=train_set_data_num.index,
    columns=train_set_data_num.columns,
)
train_set_data_tr.describe()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.58229,35.643149,28.608285,2642.004784,538.496851,1426.453004,499.986919,3.880754
std,2.005654,2.136665,12.602499,2174.646744,419.007096,1137.05638,380.967964,1.904294
min,-124.35,32.55,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.81,33.93,18.0,1454.0,296.75,789.0,280.0,2.5667
50%,-118.51,34.26,29.0,2129.0,437.0,1167.0,410.0,3.5458
75%,-118.01,37.72,37.0,3160.0,647.0,1726.0,606.0,4.773175
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001
