# 获取数据

In [None]:
import os
import tarfile
from six.moves import urllib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Location of the housing archive in the handson-ml repository and the
# local directory that mirrors it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into a local directory.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the downloaded archive and its
            extracted contents. It is created when it does not exist.
    """
    # The arguments are used as given; they must not be reset to the module
    # constants, otherwise a caller-supplied URL or directory is ignored.
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Uncomment on the first run to download and extract the dataset locally.
#fetch_housing_data()
# Load the CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
housing.info()

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
# Show summary statistics for the DataFrame's columns.
housing.describe()

In [None]:
# Draw a 50-bin histogram per column on a 20x15 figure, then render it.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of the rows assigned to the test part; the test
            size is ``int(len(data) * test_ratio)``.

    Returns:
        A ``(train, test)`` tuple of DataFrames selected by position.

    The permutation comes from NumPy's global random state, so the split
    differs between runs unless that state is seeded beforehand.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(n_rows * test_ratio)
    # The first `cut` shuffled positions form the test part, the rest train.
    test_part = data.iloc[order[:cut]]
    train_part = data.iloc[order[cut:]]
    return train_part, test_part

# Hold out 20% of the rows as a test set and report both sizes.
train_set,test_set=split_train_test(housing,0.2)
print (len(train_set),"train+",len(test_set),"test")

In [None]:
import hashlib
def test_set_check(identifier,test_ratio,hash):
    """Decide from a hash of ``identifier`` whether a row goes to the test set.

    The identifier is converted with ``np.int64`` and hashed; the row is
    assigned to the test set when the last byte of the digest is below
    ``256 * test_ratio``.

    Args:
        identifier: Value identifying the row; converted with ``np.int64``
            before hashing.
        test_ratio: Share of the 256 possible last-byte values that map to
            the test set.
        hash: Hash constructor whose result provides ``digest()``. The name
            shadows the builtin ``hash`` inside this function.

    Returns:
        The result of the comparison: truthy when the row belongs to the
        test set.
    """
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio

def split_train_test_by_id(data, test_tatio, id_column, hash=hashlib.md5):
    """Split ``data`` into train and test parts based on a hash of each row's id.

    Args:
        data: DataFrame to split.
        test_tatio: Test ratio forwarded to ``test_set_check`` (the parameter
            name is kept as spelled for existing callers).
        id_column: Name of the column holding the row identifiers.
        hash: Hash constructor forwarded to ``test_set_check``.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    def goes_to_test(id_):
        return test_set_check(id_, test_tatio, hash)

    test_mask = data[id_column].apply(goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

# Method 1: use the row index as the unique identifier. This requires that new
# data is only ever appended to the end of the dataset and no row is deleted.
housing_with_id = housing.reset_index()
# Method 2: build the identifier from longitude and latitude, the most stable
# features.
housing_with_id["id"] = housing["longitude"]*1000+housing["latitude"]
# Split on the "id" column so a row's assignment depends only on its identifier.
train_set,test_set = split_train_test_by_id(housing_with_id,0.2,"id")

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


# Bucket median income into categories for stratification: ceil(income / 1.5),
# with every category of 5 or more merged into 5.0.
housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: that chained in-place update acts on a temporary object
# and leaves `housing` unchanged under pandas Copy-on-Write.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)

# Purely random split, kept as a baseline for the proportion comparison below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Stratified split that preserves the income_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    # split() yields positional indices, so select by position (iloc) rather
    # than by label (loc); the two only coincide for a default RangeIndex.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Compare category proportions: full data, stratified test set, random test set.
print(housing['income_cat'].value_counts() / len(housing))
print(strat_test_set['income_cat'].value_counts() / len(strat_test_set))
# Normalise the random split by its own length.
print(test_set['income_cat'].value_counts() / len(test_set))

# Remove the helper column again. Rebinding to the result of drop() avoids
# in-place mutation of frames derived from `housing`, and spelling out both
# frames avoids a loop variable named `set` that would shadow the builtin.
strat_train_set = strat_train_set.drop(["income_cat"], axis=1)
strat_test_set = strat_test_set.drop(["income_cat"], axis=1)

# 从数据探索和可视化获得洞见

In [None]:
# Visualize the geographic data.
# Work on a copy of the stratified training set so exploration does not
# modify strat_train_set itself.
housing = strat_train_set.copy()
# Marker size is population / 100 and marker color is median_house_value
# mapped through the "jet" colormap.
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1,
            s=housing['population']/100,label='population',
            c='median_house_value',cmap=plt.get_cmap('jet'),colorbar=True)
plt.legend()

In [None]:
# Look for correlations.
# Restrict to numeric columns first: `housing` also holds the non-numeric
# ocean_proximity column, and DataFrame.corr() in pandas >= 2.0 raises on
# non-numeric data instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Rank every attribute by its correlation with the median house value.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Plot every pairwise scatter plot among the selected attributes.
attributes =['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
# Scatter median_income against median_house_value; the low alpha makes
# overlapping points visible as darker regions.
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)

In [None]:
# Experiment with combinations of attributes.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']  # average rooms per household
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']  # share of rooms that are bedrooms
housing['population_per_household'] = housing['population'] / housing['households']  # people per household
# Restrict to numeric columns first: `housing` also holds the non-numeric
# ocean_proximity column, and DataFrame.corr() in pandas >= 2.0 raises on
# non-numeric data instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Rank every attribute, including the new ratios, by correlation with the target.
corr_matrix['median_house_value'].sort_values(ascending=False)