In [61]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [51]:
california_housing = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\fangxiang\scikit_learn_data


In [55]:
x_data = california_housing.data
y_data = california_housing.target

In [56]:
california_housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [57]:
y_data

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [67]:
x_data

df = pd.DataFrame(data=x_data,columns=california_housing.feature_names)
df["fangjia"]=y_data
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,fangjia
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


## 1.数据集拆分
+ ### 1)找到相关性最高的那个特征，按照那个特征进行分层抽样
## 2.数据预处理
+ ### 1)缺失值处理：无需
+ ### 2)类别转one-hot编码：无需
+ ### 3)特征放缩:标准化
## 3.

In [109]:
from sklearn.model_selection import StratifiedShuffleSplit

#1.找到与房价最相关的特征名称
featurename_impotances = np.absolute(df.corr()["fangjia"]).sort_values(ascending=False).index[1]
#2.将该特征值离散化
df["temp"] = np.ceil(df[featurename_impotances])

sss = StratifiedShuffleSplit(n_splits=1,train_size=0.8,random_state=42)

for train_index, test_index in sss.split(df, df["temp"]):
    train_set, test_set = df.loc[train_index], df.loc[test_index]

train_set.drop("temp",axis=1,inplace=True)
test_set.drop("temp",axis=1,inplace=True)

In [130]:
from sklearn.preprocessing import StandardScaler

X_train_set = train_set.iloc[:,:-1]
Y_train_set = train_set["fangjia"]

scaler = StandardScaler()
X_train_set = scaler.fit_transform(X_train_set)

In [146]:
from sklearn.svm import SVR

In [170]:
#训练模型
estimator_SVR = SVR()
estimator_SVR.fit(X_train_set,Y_train_set)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [171]:
#评估模型
from sklearn.metrics import mean_squared_error

Y_predict = estimator_SVR.predict(X_train_set)
mse = mean_squared_error(Y_train_set,Y_predict)
rmse = np.sqrt(mse)
print(rmse)

0.5798517940307067


In [147]:
from sklearn.ensemble import RandomForestRegressor

In [172]:
estimator_RFR = RandomForestRegressor()
estimator_RFR.fit(X_train_set,Y_train_set)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [173]:
Y_predict = estimator_RFR.predict(X_train_set)
mse = mean_squared_error(Y_train_set,Y_predict)
rmse = np.sqrt(mse)
print(rmse)

0.1881949199728859


In [None]:
from sklearn.model_selection import cross_val_score

In [175]:
scores = cross_val_score(estimator_RFR,Y_predict.reshape(-1,1),Y_train_set,scoring ="neg_mean_squared_error",cv=5)
rmse = np.sqrt(-scores)
print(rmse)

[0.21206016 0.20530853 0.21280539 0.20752853 0.20528484]


In [176]:
from sklearn.model_selection import GridSearchCV

In [185]:
from sklearn.model_selection import RandomizedSearchCV

In [184]:
params = [{'n_estimators':[80,100,120],'max_features':[4,6,8]}]
gcv = GridSearchCV(estimator_RFR,param_grid=params)
gcv.fit(X_train_set,Y_train_set)
RandomizedSearchCV()

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [179]:
myestimator = gcv.best_estimator_

In [182]:
y_my_predict = myestimator.predict(X_train_set)

In [183]:
mse = mean_squared_error(Y_train_set,y_my_predict)
rmse = np.sqrt(mse)
print(rmse)

0.1987257990638445
