In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Prepare Dataset

In [28]:
import pandas as pd

df = pd.read_csv("berlin-houses.txt")

columns = ["balcony", "builtin_kitchen",
           "energy_certificate", "has_new_flag", "living_space",
           "number_rooms", "private_offer",
           "quarter", "garden", "warm_price"]

In [30]:
df = df[columns].dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1018 entries, 1 to 1037
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   balcony             1018 non-null   bool   
 1   builtin_kitchen     1018 non-null   bool   
 2   energy_certificate  1018 non-null   bool   
 3   has_new_flag        1018 non-null   bool   
 4   living_space        1018 non-null   float64
 5   number_rooms        1018 non-null   float64
 6   private_offer       1018 non-null   bool   
 7   quarter             1018 non-null   object 
 8   garden              1018 non-null   bool   
 9   warm_price          1018 non-null   float64
dtypes: bool(6), float64(3), object(1)
memory usage: 45.7+ KB


In [31]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("warm_price", axis=1), df["warm_price"],
                                                    test_size=0.2, random_state=0)

## Pipeline

In [32]:
# create separate transformer pipelines for numerical and categorical data
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

num_columns = ["living_space"]
cat_columns = ["quarter"]


full_pipe = ColumnTransformer([
    ("num", StandardScaler(), num_columns),
    ("cat", OneHotEncoder(), cat_columns)
])

housing_prepared = full_pipe.fit_transform(X_train)
housing_prepared

<814x12 sparse matrix of type '<class 'numpy.float64'>'
	with 1628 stored elements in Compressed Sparse Row format>

## Modeling

In [33]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 814 entries, 804 to 704
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   balcony             814 non-null    bool   
 1   builtin_kitchen     814 non-null    bool   
 2   energy_certificate  814 non-null    bool   
 3   has_new_flag        814 non-null    bool   
 4   living_space        814 non-null    float64
 5   number_rooms        814 non-null    float64
 6   private_offer       814 non-null    bool   
 7   quarter             814 non-null    object 
 8   garden              814 non-null    bool   
dtypes: bool(6), float64(2), object(1)
memory usage: 30.2+ KB


In [34]:
X_train.number_rooms.unique()

array([2., 3., 5., 4.])

In [35]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, y_train)

LinearRegression()

In [36]:
some_data = df.iloc[:10].drop("warm_price", axis=1)
some_label = df.iloc[:10]["warm_price"]

some_data_prepared = full_pipe.transform(some_data)

In [37]:
lin_reg.predict(some_data_prepared)

array([ 916.05160666,  643.23665409, 1013.71957452,  982.75822215,
       1090.41239916,  868.33613389,  976.59303424,  989.44034524,
        978.52048379,  990.6469578 ])

In [38]:
some_label

1      630.98
2      952.47
3     1139.00
4      735.51
5     1364.62
6      915.00
7      975.00
8      784.17
9      985.00
10     982.99
Name: warm_price, dtype: float64

## Evaluation

In [39]:
import numpy as np
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_train, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mse

42230.47882459177

In [40]:
# try decision tree regressor
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, y_train)

DecisionTreeRegressor()

In [41]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_train, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_mse

4948.438061909242

In [42]:
# try decision tree regressor
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()
rf_reg.fit(housing_prepared, y_train)

RandomForestRegressor()

In [43]:
housing_predictions = rf_reg.predict(housing_prepared)
rf_mse = mean_squared_error(y_train, housing_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_mse

9740.924266327864

## Fine Tunning


In [44]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
]

rf_reg = RandomForestRegressor()

grid_search = GridSearchCV(rf_reg, param_grid, cv=5,
                           scoring="neg_mean_squared_error",
                           return_train_score=True)

grid_search.fit(housing_prepared, y_train)


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [67]:
import joblib

joblib.dump(grid_search.best_estimator_, "housing_model.pkl")

['housing_model.pkl']

In [62]:
cvres = grid_search.cv_results_
print("Best score:", np.sqrt(-grid_search.best_score_))

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

Best score: 218.05050933341406
239.25498419471933 {'max_features': 2, 'n_estimators': 3}
231.65764728239168 {'max_features': 2, 'n_estimators': 10}
225.55886839897906 {'max_features': 2, 'n_estimators': 30}
236.4503846481286 {'max_features': 4, 'n_estimators': 3}
226.1620567545255 {'max_features': 4, 'n_estimators': 10}
222.65499535371524 {'max_features': 4, 'n_estimators': 30}
233.61039097606184 {'max_features': 6, 'n_estimators': 3}
226.2819801949895 {'max_features': 6, 'n_estimators': 10}
222.32191003197593 {'max_features': 6, 'n_estimators': 30}
240.54722243986834 {'max_features': 8, 'n_estimators': 3}
221.6311204670002 {'max_features': 8, 'n_estimators': 10}
218.05050933341406 {'max_features': 8, 'n_estimators': 30}
253.04568082688598 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
251.96432764085574 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
250.38725802570136 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
251.0653788460082 {'bootstrap': Fa

In [47]:
final_model = grid_search.best_estimator_

some_data = df.iloc[:10].drop("warm_price", axis=1)
some_label = df.iloc[:10]["warm_price"]

some_data_prepared = full_pipe.transform(some_data)

final_predictions = final_model.predict(some_data_prepared)


print(grid_search.best_score_)
print(final_predictions)
print(list(some_label))


-47546.024620561286
[ 903.29304312  560.95866667 1163.08103175  811.711      1347.875
  870.32466667  930.41066667  800.87033333  887.92833333  967.83173889]
[630.98, 952.47, 1139.0, 735.51, 1364.62, 915.0, 975.0, 784.17, 985.0, 982.99]


In [63]:
some_data.iloc[0]

balcony                              True
builtin_kitchen                     False
energy_certificate                  False
has_new_flag                        False
living_space                         59.0
number_rooms                          2.0
private_offer                       False
quarter               Neukölln (Neukölln)
garden                              False
Name: 1, dtype: object

In [71]:
clf = joblib.load("housing_model.pkl")
clf.predict(
    full_pipe.transform(some_data)
    )

array([ 903.29304312,  560.95866667, 1163.08103175,  811.711     ,
       1347.875     ,  870.32466667,  930.41066667,  800.87033333,
        887.92833333,  967.83173889])

In [48]:
results_df = pd.DataFrame()
results_df["prediction"] = final_predictions
results_df["ground_truth"] = some_label
results_df["error"] = abs(results_df.prediction - results_df.ground_truth)
results_df

Unnamed: 0,prediction,ground_truth,error
0,903.293043,,
1,560.958667,630.98,70.021333
2,1163.081032,952.47,210.611032
3,811.711,1139.0,327.289
4,1347.875,735.51,612.365
5,870.324667,1364.62,494.295333
6,930.410667,915.0,15.410667
7,800.870333,975.0,174.129667
8,887.928333,784.17,103.758333
9,967.831739,985.0,17.168261


In [49]:
results_df.describe()

Unnamed: 0,prediction,ground_truth,error
count,10.0,9.0,9.0
mean,924.428448,942.416667,225.005403
std,211.599718,219.508995,212.623241
min,560.958667,630.98,15.410667
25%,826.364417,784.17,70.021333
50%,895.610688,952.47,174.129667
75%,958.476471,985.0,327.289
max,1347.875,1364.62,612.365
