In [1]:
#import matplotlib.pyplot as plt
#import numpy as np
#import seaborn as sns
#import math
#from scipy import stats
#print("imported " + str(len(raw)) + " rows")
#raw.columns.tolist()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from IPython.display import HTML, display
import tabulate
import numpy as np
from matplotlib import cm
raw = pd.pandas.read_csv('../setup_data/pred_1x0_20181130.csv',sep=",",low_memory=False)

## Functions

def print_list(list_in):
    display(HTML(tabulate.tabulate(list_in, tablefmt='html')))

# Setup Data Set

In [2]:
### CLEAN DATA FUNC

def clean_func(data,predictors):
    
    ## DO IMPUTATION 
    # living_area
    floor = Imputer(missing_values="NaN", strategy="mean")
    floor.fit(data[["floor"]])
    data['floor'] = floor.transform(data[["floor"]]).ravel()
    
    ## REMOVE ROWS w. NULL
    data = data[pd.notnull(data[predictors]).all(axis=1)]
    return data


In [3]:
## set predictors
predictors = ['min_distance_to_metro','living_area'
              ,'floor','sqm_list_price','sqm_rent_price'
              ,'area_from_geo']

## set numeric predictors
num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64','uint8']
train_data_numerics = raw[predictors].select_dtypes(include=num_types).columns

In [4]:
## Set target and clean data (se function above)
target = 'sqm_sold_price'
train_data = clean_func(raw,predictors)

train_data_predictors = train_data[predictors]
train_data_target = train_data[target].values

# Try different Models

### Split Dataset

In [82]:
## Split Train/test
X_train, X_test, y_train, y_test = train_test_split(train_data_predictors, train_data_target, test_size=0.33, random_state=42)

### Linear Regression

In [269]:
## TRAIN MODEL
from sklearn import linear_model
lin_reg = linear_model.LinearRegression()

lin_reg.fit(X_train[train_data_numerics],y_train)

print("Coefficients")
print_list(list(zip(train_data_numerics,lin_reg.coef_)))

Coefficients


0,1
min_distance_to_metro,-1.18663
living_area,-141.889
floor,59.1866
sqm_list_price,1.07014
sqm_rent_price,-56.2497


In [270]:
## PREDICT
lin_reg_pred_y_test = lin_reg.predict(X_test[train_data_numerics])
lin_reg_pred_y_train = lin_reg.predict(X_train[train_data_numerics])


In [281]:
## METRICS
import seaborn as sns
errors_tets = (((y_test - lin_reg_pred_y_test)**2)**0.5)
errors_train = (((y_train - lin_reg_pred_y_train)**2)**0.5)
print("Mean error train:")
print(errors_train.mean())
print("Mean error test:")
print(errors_tets.mean())

%matplotlib notebook
sns.residplot(lin_reg_pred_y_train, y_train, label="train_error");
sns.residplot(lin_reg_pred_y_test, y_test, label="test_error");
plt.show()

Mean error train:
5422.63486872
Mean error test:
5126.32507062


<IPython.core.display.Javascript object>

### Ridge Regression

In [105]:
## TRAIN MODEL
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

lin_ridge_reg = linear_model.Lasso(alpha=0.5)
lin_ridge_reg.fit(X_train[train_data_numerics],y_train)
print("Coefficients")
print_list(list(zip(train_data_numerics,lin_ridge_reg.coef_)))
## PREDICT
lin_ridge_reg_pred_y_test = lin_ridge_reg.predict(X_test[train_data_numerics])
lin_ridge_reg_pred_y_train = lin_ridge_reg.predict(X_train[train_data_numerics])

## METRICS
import seaborn as sns
errors_tets = (((y_test - lin_ridge_reg_pred_y_test)**2)**0.5)
errors_train = (((y_train - lin_ridge_reg_pred_y_train)**2)**0.5)
explained_variance_score = 1-np.var(y_test - lin_ridge_reg_pred_y_test)/np.var(y_test)
print("Mean error train:")
print(errors_train.mean())
print("Mean error test:")
print(errors_tets.mean())
print("Explained variance score")
print(explained_variance_score)

%matplotlib notebook
sns.residplot(lin_ridge_reg_pred_y_train, y_train, label="train_error");
sns.residplot(lin_ridge_reg_pred_y_test, y_test, label="test_error");
plt.show()
lin_ridge_reg.coef_

Coefficients


0,1
min_distance_to_metro,-1.18662
living_area,-141.886
floor,59.0547
sqm_list_price,1.07015
sqm_rent_price,-56.2435


Mean error train:
5422.63390266
Mean error test:
5126.3252223
Explained variance score
0.908114248145


<IPython.core.display.Javascript object>

array([  -1.186618  , -141.88564972,   59.05467884,    1.07014754,
        -56.24347214])

### Random Forest Regressor

In [348]:
## TRAIN MODEL
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(max_depth=7, random_state=0, n_estimators=80)
rf_reg.fit(X_train[train_data_numerics],y_train)

print("Feature Importance")
print_list(list(zip(train_data_numerics,rf_reg.feature_importances_)))

Feature Importance


0,1
min_distance_to_metro,0.00637774
living_area,0.0409433
floor,0.00277084
sqm_list_price,0.944223
sqm_rent_price,0.00568534


In [349]:
## PREDICT
rf_reg_pred_y_test = rf_reg.predict(X_test[train_data_numerics])
rf_reg_pred_y_train = rf_reg.predict(X_train[train_data_numerics])


In [350]:
## METRICS
import seaborn as sns
errors_tets = (((y_test - rf_reg_pred_y_test)**2)**0.5)
errors_train = (((y_train - rf_reg_pred_y_train)**2)**0.5)
explained_variance_score = 1-np.var(y_test - rf_reg_pred_y_test)/np.var(y_test)
print("Mean error train:")
print(errors_train.mean())
print("Mean error test:")
print(errors_tets.mean())
print("Explained variance score")
print(explained_variance_score)

%matplotlib notebook
sns.residplot(rf_reg_pred_y_train, y_train, label="train_error");
sns.residplot(rf_reg_pred_y_test, y_test, label="test_error");
plt.show()

Mean error train:
4281.59226528
Mean error test:
4781.85924671
Explained variance score
0.920648275164


<IPython.core.display.Javascript object>

### GradientBoostingRegressor

In [251]:
## TRAIN MODEL
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor(n_estimators=30,learning_rate=0.1)#loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
gb_reg.fit(X_train[train_data_numerics],y_train)

print("Feature Importance")
print_list(list(zip(train_data_numerics,gb_reg.feature_importances_)))


Feature Importance


0,1
min_distance_to_metro,0.0192185
living_area,0.135505
floor,0.000959025
sqm_list_price,0.824037
sqm_rent_price,0.0202805


In [252]:
## PREDICT
gb_reg_pred_y_test = gb_reg.predict(X_test[train_data_numerics])
gb_reg_pred_y_train = gb_reg.predict(X_train[train_data_numerics])


In [283]:
## METRICS
import seaborn as sns
errors_tets = (((y_test - gb_reg_pred_y_test)**2)**0.5) 
errors_train = (((y_train - gb_reg_pred_y_train)**2)**0.5) ## mean_squared_error(y_train,gb_reg_pred_y_train)
print("Mean error train:")
print(errors_train.mean())
print(np.median(errors_train))
print("Mean error test:")
print(errors_tets.mean())
print(np.median(errors_tets))

%matplotlib notebook
sns.residplot(gb_reg_pred_y_train, y_train, label="train_error");
sns.residplot(gb_reg_pred_y_test, y_test, label="test_error");
plt.show()

Mean error train:
5049.61671055
4105.40521557
Mean error test:
4989.94851688
3926.22758707


<IPython.core.display.Javascript object>

# Cross Validation

In [293]:
from sklearn.model_selection import cross_val_score
rf_reg = RandomForestRegressor(max_depth=7, random_state=0, n_estimators=70)
# scores sätts till explained_variance_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html

scores = cross_val_score(rf_reg, X_train[train_data_numerics], y_train, cv=5)#,scoring='neg_mean_squared_error')

In [294]:
print(np.sqrt(abs(scores)))

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.95670453  0.95536718  0.95959129  0.95273771  0.94756229]
Accuracy: 0.91 (+/- 0.02)


# Exhaustive Grid Search own dev

In [129]:
# create function that can make grid search for metaparameters
# Expected structure
# param_gs_gb = {
#     'model':GradientBoostingRegressor,
#     'hyper_param':{
#         'learning_rate': [0.12,0.1,0.08,0.06],
#         'n_estimators': [90, 100,110,120,130]
#     }
# }
from sklearn.ensemble import GradientBoostingRegressor
from mpl_toolkits.mplot3d import Axes3D


def grid_search(param_grid):
    hyper_param_names = list(param_grid['hyper_param'].keys())
    
    ## define empty 2d array to fill with eval metrics
    model_measures_test =  np.full((len(param_grid['hyper_param'][hyper_param_names[0]]), len(param_grid['hyper_param'][hyper_param_names[1]])), 0)
    model_measures_train =  np.full((len(param_grid['hyper_param'][hyper_param_names[0]]), len(param_grid['hyper_param'][hyper_param_names[1]])), 0)
    
    for i,param_1 in enumerate(param_grid['hyper_param'][hyper_param_names[0]]):
        for j,param_2 in enumerate(param_grid['hyper_param'][hyper_param_names[1]]):
            #create dict of parameters to call modeling function with
            parameters = {}
            parameters[hyper_param_names[0]] = param_1
            parameters[hyper_param_names[1]] = param_2
            
            # call the model given as input paramter, unpack parameters with **
            model = param_grid['model'](**parameters)
            model.fit(X_train[train_data_numerics],y_train)
            model_pred_y_test = model.predict(X_test[train_data_numerics])
            model_pred_y_train = model.predict(X_train[train_data_numerics])

            ### save errors to matrix
            model_output_test_res = (((y_test - model_pred_y_test)**2)**0.5).mean()
            model_output_train_res = (((y_train - model_pred_y_train)**2)**0.5).mean()
            model_measures_test[i,j] = model_output_test_res
            model_measures_train[i,j] = model_output_train_res
            
    return [model_measures_test,model_measures_train]

def plot_gs(param_gs,result_gs):
    from mpl_toolkits.mplot3d import Axes3D

    %matplotlib notebook
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    # Make data.
    xx, yy = np.meshgrid(np.asarray(param_gs['hyper_param'][list(param_gs['hyper_param'].keys())[1]]),np.asarray(param_gs['hyper_param'][list(param_gs['hyper_param'].keys())[0]]), sparse=True)
    
    # Plot the surface.
    ax.set_xlabel(list(param_gs['hyper_param'].keys())[1])
    ax.set_ylabel(list(param_gs['hyper_param'].keys())[0])
    ax.set_zlabel('mean pricing error')
    surf = ax.plot_surface(xx, yy, result_gs, cmap=cm.coolwarm,
                           linewidth=0, antialiased=False)
    



In [118]:
param_gs_rf = {
    'model':RandomForestRegressor,
    'hyper_param':{
        'max_depth': [6,7,8,9,10],
        'n_estimators': [40, 50, 60, 70]#, 80,90, 100]
    }
}
gs_rf = grid_search(param_gs_rf)

In [130]:
param_gs_gb = {
    'model':GradientBoostingRegressor,
    'hyper_param':{
        'learning_rate': [0.12,0.1,0.08,0.06],
        'n_estimators': [90, 100,110,120,130]
    }
}
gs_gb = grid_search(param_gs_gb)

In [122]:
## Plot Grid Search for Random Forest Regressor
plot_gs(param_gs_rf,gs_rf)

<IPython.core.display.Javascript object>

In [134]:
## Plot Grid Search for Gradient Boosting Regressor
plot_gs(param_gs_gb,gs_gb[0])


<IPython.core.display.Javascript object>

# Exhaustive Grid Search (GridSearchCV)

In [66]:
## Plot function for 2 params
def plot_gscv(clf):
    from mpl_toolkits.mplot3d import Axes3D
    %matplotlib notebook
    fig = plt.figure()
    ax = fig.gca(projection='3d')
        
    var1_name = list(clf.get_params()['param_grid'].keys())[0]
    var2_name = list(clf.get_params()['param_grid'].keys())[1]
    var1_params = np.asarray(clf.get_params()['param_grid'][var1_name])
    var2_params = np.asarray(clf.get_params()['param_grid'][var2_name])
    xx, yy = np.meshgrid(var1_params,var2_params, sparse=True)
    
     # Plot the surface.
    ax.set_xlabel(var1_name)
    ax.set_ylabel(var2_name)
    ax.set_zlabel('explained_variance_score')
    surf = ax.plot_surface(xx, yy, clf.cv_results_['mean_test_score'].reshape(var2_params.shape[0],var1_params.shape[0]), cmap=cm.coolwarm,
                           linewidth=0, antialiased=False)




### RandomForestRegressor

In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

parameters = {
    'max_depth':[6,7,10,12,15,20],
    'n_estimators':[50,70, 80, 90]
}
rf_reg = RandomForestRegressor()
clf_rf = GridSearchCV(rf_reg, parameters, cv=2,error_score='raise')
clf_rf.fit(train_data_predictors[train_data_numerics], train_data_target)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [6, 7, 10, 12, 15, 20], 'n_estimators': [50, 70, 80, 90]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
#clf_rf.cv_results_['mean_test_score']
plot_gscv(clf_rf)

<IPython.core.display.Javascript object>

### GradientBoostingRegressor

In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

parameters = {
    'max_depth':[2,3,4,5,6,7],
    'n_estimators':[20,30,40, 50, 60, 70, 80, 90, 100]
}
gb_reg = GradientBoostingRegressor()
clf_gb = GridSearchCV(gb_reg, parameters, cv=2,error_score='raise')
clf_gb.fit(train_data_predictors[train_data_numerics], train_data_target)

GridSearchCV(cv=2, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 3, 4, 5, 6, 7], 'n_estimators': [20, 30, 40, 50, 60, 70, 80, 90, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [73]:
plot_gscv(clf_gb)

<IPython.core.display.Javascript object>

In [79]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

parameters = {
    'alpha':[0.1,0.5,0.99]
}
lin_ridge_reg = linear_model.Ridge(tol=0.001)
clf_lin_ridge = GridSearchCV(lin_ridge_reg, parameters, cv=2,error_score='raise')
clf_lin_ridge.fit(train_data_predictors[train_data_numerics], train_data_target)



GridSearchCV(cv=2, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.5, 0.99]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [80]:
clf_lin_ridge.cv_results_['mean_test_score']

array([ 0.9016738,  0.9016738,  0.9016738])