In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import tarfile
from six.moves import urllib

# Download and Read data

In [None]:
# Local directory that holds the housing dataset, and the remote archive it comes from.
HOUSING_PATH = r"E:\jupyter_code\datasets\housing"
HOUSING_URL = r"https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"


def fetch(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted members.
        It is created when it does not exist yet.

    Raises
    ------
    ValueError
        If an archive member would be written outside ``housing_path``.
    """
    # Use the arguments rather than the module constants so callers can
    # point the helper at another URL or directory.
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    archive_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, archive_path)

    extract_root = os.path.realpath(housing_path)
    with tarfile.open(archive_path, "r") as tar:
        # The archive is downloaded content: refuse members whose path would
        # escape the target directory before extracting anything.
        # NOTE(review): link members are not inspected here.
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(extract_root, member.name))
            if target != extract_root and not target.startswith(extract_root + os.sep):
                raise ValueError("unsafe path in archive: %r" % (member.name,))
        tar.extractall(housing_path)


# Uncomment to (re)download the dataset; this needs network access.
# fetch()

In [None]:
from pandas.plotting import scatter_matrix

# HOUSING_FILE is never assigned by any live cell, so build the CSV path here.
# Assumes the archive unpacks to "housing.csv" inside HOUSING_PATH - TODO confirm.
housing_csv_path = os.path.join(HOUSING_PATH, "housing.csv")
housing = pd.read_csv(housing_csv_path)
# Quick look at each attribute's values and at pairwise relationships.
# housing.info()
# housing.describe()
# housing["ocean_proximity"].value_counts()
housing.hist(bins=50, figsize=(20, 15))
# Scatter plot: marker size follows population, colour follows house value.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, c="median_house_value", cmap=plt.cm.jet, s=housing["population"]/100)
# Correlate numeric columns only; the text column "ocean_proximity" cannot be
# converted to float and makes DataFrame.corr() raise on recent pandas.
corr = housing.select_dtypes(include=[np.number]).corr()
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(corr["median_house_value"].sort_values(ascending=False))
scatter_matrix(housing[["median_house_value", "median_income", "total_rooms"]], figsize=(12, 8))
plt.show()

# Split Data: Test And Train; Train: Num And Cat

In [None]:
# Plain random sampling of the training set (kept for reference):
# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(housing, test_size=0.2)
# Stratified sampling of the training set, stratified on an income category.
from sklearn.model_selection import StratifiedShuffleSplit
# Bucket median income in steps of 1.5 and cap the category at 5.
income_cat = np.ceil(housing["median_income"] / 1.5)
income_cat = income_cat.where(income_cat < 5, 5.0)
# income_cat.hist(bins=20)
# Fixed seed so the same rows land in train/test on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so select with .iloc rather than .loc;
# .loc only gives the same rows while the frame keeps its default RangeIndex.
train, test = next(split.split(housing, income_cat))
strat_train_set = housing.iloc[train]
strat_test_set = housing.iloc[test]

# Separate the label column from the predictors.
train_housing = strat_train_set.drop("median_house_value", axis=1)
train_housing_labels = strat_train_set["median_house_value"].copy()
# Separate the numeric attributes from the categorical one.
num_housing = train_housing.drop("ocean_proximity", axis=1)
cat_housing = train_housing["ocean_proximity"].copy()

# Preprocessing: Num And Cat

In [None]:
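# Fill missing values in the numeric attributes.
# Option 1 - pandas: fill with the column median.
# median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median, inplace=True)

# Option 2 - scikit-learn (`Imputer` has been replaced by `sklearn.impute.SimpleImputer`).
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")
# X = imputer.fit_transform(num_housing)

# Equivalent two-step form of option 2.
# imputer.fit(num_housing)
# X = imputer.transform(num_housing)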

In [None]:
# Binary (one-hot) encoding of the categorical attribute.
# Option 1 - integer labels first, then one-hot encode them.
# from sklearn.preprocessing import LabelEncoder
# encoder_label = LabelEncoder()
# cat_housing_encoded  = encoder_label.fit_transform(cat_housing)

# from sklearn.preprocessing import OneHotEncoder
# encoder_one = OneHotEncoder(categories="auto")
# X = encoder_one.fit_transform(cat_housing_encoded.reshape(-1, 1)).toarray()

# Option 2 - LabelBinarizer does both steps at once.
# from sklearn.preprocessing import LabelBinarizer
# encoder = LabelBinarizer()
# X = encoder.fit_transform(cat_housing)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# CategoricalEncoder is deliberately not imported: it only existed in the
# scikit-learn 0.20 development line and was replaced by OneHotEncoder and
# OrdinalEncoder, so importing it fails on current releases.
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder, OrdinalEncoder
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of the numeric
# housing frame (total_rooms, total_bedrooms, population, households) - confirm.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    Two ratios are always added: rooms / households and
    population / households. A third, bedrooms / rooms, is added when
    ``add_bedrooms_per_room`` is true. Source columns are located through
    the module-level ``*_ix`` index constants.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_ accepts a tuple of pieces and stacks 1-D pieces as columns.
        return np.c_[tuple([X] + extra_columns)]
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the columns named in ``attrs`` and return their ``.values``."""

    def __init__(self, attrs):
        self.attrs = attrs

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Index ``X`` with ``self.attrs`` and return the underlying array."""
        selected = X[self.attrs]
        return selected.values
class MyLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter that lets ``LabelBinarizer`` be used as a pipeline step.

    The wrapped encoder is stored on ``self.encode`` once ``fit`` has run,
    so its learned classes are reachable as ``self.encode.classes_``.
    """

    def fit(self, X, y=None):
        """Fit a fresh ``LabelBinarizer`` on ``X`` and return ``self``.

        Fitting happens here, not lazily inside ``transform``, so the
        classes always come from the data passed to ``fit`` and never from
        whatever data is transformed first.
        """
        self.encode = LabelBinarizer().fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the encoder learned in ``fit``."""
        return self.encode.transform(X)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
num_attrs = num_housing.columns.tolist()
cat_attrs = ["ocean_proximity"]
# Numeric branch: select columns -> fill gaps with the median -> add ratio
# features -> standardise.
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attrs)),
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std", StandardScaler())
])
# Categorical branch: select the text column -> binarize it.
cat_pipeline = Pipeline([
    # The list defined above is named cat_attrs (cat_attribs does not exist).
    ("selector", DataFrameSelector(cat_attrs)),
#     ("label_binarizer", LabelBinarizer())  # LabelBinarizer is meant for labels: it takes only y, not X
#     ("label_binarizer", CategoricalEncoder())  # CategoricalEncoder was split into OneHotEncoder and OrdinalEncoder
    ("label_binarizer", MyLabelBinarizer())
])
# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
], n_jobs=-1)
train_housing_prepared = full_pipeline.fit_transform(train_housing)

# With a OneHotEncoder step, the categories would be read from `categories_`:
# full_pipeline.transformer_list[1][1].named_steps["label_binarizer"].categories_
# With the LabelBinarizer wrapper used here, read the classes from `encode.classes_`:
# full_pipeline.transformer_list[1][1].named_steps["label_binarizer"].encode.classes_

# LinearRegression AND DecisionTreeRegressor

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
# Linear regression model.
lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(train_housing_prepared, train_housing_labels)
# Root-mean-squared error on the training set.
# mean_squared_error expects (y_true, y_pred) in that order.
lin_predict = lin_reg.predict(train_housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(train_housing_labels, lin_predict))
# Decision tree model.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_housing_prepared, train_housing_labels)
# Root-mean-squared error on the training set.
tree_predict = tree_reg.predict(train_housing_prepared)
tree_rmse = np.sqrt(mean_squared_error(train_housing_labels, tree_predict))
# Show both scores; they were previously computed but never displayed.
print("linear regression train RMSE:", lin_rmse)
print("decision tree train RMSE:", tree_rmse)

In [None]:
# cross_val_score: evaluate each model with 10-fold cross-validation.
from sklearn.model_selection import cross_val_score, cross_val_predict
estimators = [LinearRegression(n_jobs=-1), DecisionTreeRegressor()]
scores = [cross_val_score(estimator, train_housing_prepared, train_housing_labels,
                         scoring="neg_mean_squared_error", cv=10, n_jobs=-1) for estimator in estimators]


def display_scores(score):
    """Print the per-fold scores followed by their mean and standard deviation.

    Deliberately not called ``display``: that name would shadow IPython's
    built-in rich-display function for the rest of the kernel session.
    """
    print("score:", score)
    print("mean:", score.mean())
    print("std:", score.std())


# The scorer returns negated MSE, so negate again before taking the root.
for score in scores:
    display_scores(np.sqrt(-score))

# GRID: Change Params

In [None]:
# GridSearchCV: exhaustive search over the listed hyper-parameter grids.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
rand_forest_reg = RandomForestRegressor(n_estimators=10)
# Two grids: the first keeps the estimator's default bootstrap setting,
# the second repeats the same values with bootstrap switched off.
param_grid = [
    {"max_depth":[20,30,40], "max_features":[4,6,8]},
    {"bootstrap":[False], "max_depth":[20,30,40], "max_features":[4,6,8]}
]
grid = GridSearchCV(rand_forest_reg, param_grid, scoring="neg_mean_squared_error",
                    n_jobs=-1, cv=5, return_train_score=True)
grid.fit(train_housing_prepared, train_housing_labels)

# print(grid.best_estimator_, grid.best_index_, grid.best_params_, grid.best_score_)
# The fitted search object is `grid`; no variable named `grid_search` exists.
cv_res = grid.cv_results_
# Scores are negated MSE, so negate before taking the root to get RMSE.
for mean_score, params in zip(cv_res["mean_test_score"], cv_res["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# RandomizedSearchCV: try 20 randomly sampled hyper-parameter combinations.
param_distributions = {
    "bootstrap": [True, False],
    "max_depth": range(20, 40),
    "max_features": range(4, 8),
}
rand_grid = RandomizedSearchCV(
    rand_forest_reg,
    param_distributions,
    n_iter=20,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    return_train_score=True,
)
rand_grid.fit(train_housing_prepared, train_housing_labels)
# To list the training RMSE of every sampled combination:
# for params, rmse in zip(rand_grid.cv_results_["params"], np.sqrt(-rand_grid.cv_results_["mean_train_score"])):
#     print(rmse, params)

In [None]:
# Rank the features by the importance the best random forest assigned to them.
feature_importances = rand_grid.best_estimator_.feature_importances_
# Names for the columns appended by CombinedAttributesAdder, in the order it adds them.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# The categorical step is registered as "label_binarizer" in cat_pipeline;
# looking it up as "label" raises KeyError.
cat_one_hot_attribs = full_pipeline.transformer_list[1][1].named_steps["label_binarizer"].encode.classes_.tolist()
attributes = num_attrs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

# Test Check

In [None]:
# Final check of the tuned model on the held-out test set.
final_estimator = rand_grid.best_estimator_
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop("median_house_value", axis=1)
# Only transform() is called here, so imputation medians, scaling means and
# the like are the ones learned on the training set, not recomputed on the
# test set. Both sets must be put on the same scale.
X_prepared = full_pipeline.transform(X_test)
test_predictions = final_estimator.predict(X_prepared)
# Root-mean-squared error of the final model on the test set.
final_prediction = np.sqrt(mean_squared_error(test_predictions, y_test))
final_prediction

# SAVE AND READ

In [None]:
# `sklearn.externals.joblib` has been removed from scikit-learn; use the standalone joblib package.
# import joblib
# joblib.dump(anything, file_path)  # anything: a fitted model, predictions, ...
# joblib.load(file_path)