In [28]:
import joblib
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [29]:
housing = fetch_california_housing()

In [30]:
X, y = housing['data'], housing['target']
df_house = pd.DataFrame(X, columns=housing["feature_names"])

In [31]:
df_house['AvgBedsPerRoom'] = df_house['AveBedrms']/df_house['AveRooms']

In [32]:
df_house.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,AvgBedsPerRoom
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,0.213075
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,0.058023
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.1
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,0.175426
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,0.203181
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,0.239834
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,1.0


In [33]:
X_train, X_test, y_train, y_test = train_test_split(df_house, y, test_size=0.3, random_state=42)

In [63]:
features_to_transform = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population','AveOccup', 'Latitude', 'Longitude', 'AvgBedsPerRoom']
transformer = Pipeline(
    steps=[("standard_scaler", StandardScaler())]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("std", transformer, features_to_transform),
    ]
)

regressor = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor())]
)

In [64]:
regressor.fit(X_train, y_train)
print("model score: %.3f" % regressor.score(X_test, y_test))

model score: 0.806


In [68]:
joblib.dump(regressor, "../model/regressor.gzip", compress="gzip")

['../model/regressor.gzip']

In [45]:
X_test.iloc[0]

MedInc               1.681200
HouseAge            25.000000
AveRooms             4.192201
AveBedrms            1.022284
Population        1392.000000
AveOccup             3.877437
Latitude            36.060000
Longitude         -119.010000
AvgBedsPerRoom       0.243854
Name: 20046, dtype: float64

In [59]:
X_test.iloc[0:1].to_dict(orient='list')

{'MedInc': [1.6812],
 'HouseAge': [25.0],
 'AveRooms': [4.192200557103064],
 'AveBedrms': [1.0222841225626742],
 'Population': [1392.0],
 'AveOccup': [3.8774373259052926],
 'Latitude': [36.06],
 'Longitude': [-119.01],
 'AvgBedsPerRoom': [0.24385382059800667]}

In [53]:
regressor.predict(X_test.iloc[0:1,:])

array([0.49805])

In [69]:
joblib.load("../model/regressor.gzip").predict(X_test.iloc[0:1,:])

array([0.49292])