In [4]:
import pandas as pd
import sklearn
import sys
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [12]:
sys.path.append('src/')

import plot as p
import utils as u
import model_experiments as exp

pd.set_option('display.max_columns', 500)

In [13]:
# Load data 
df = pd.read_csv('training_data/kc_house_data.csv') 

In [14]:
# dropping duplicate rows keeping latest ones
df.sort_values(['id', 'price']).drop_duplicates(subset = 'id', keep = 'last', inplace = True)

## Modelling experiments

In [15]:
# get feature and target dataframes
feat_df = df[df.loc[ :,df.columns != 'price'].columns]
target_df = df['price']

# get train_test_split dfs
feat_train, feat_test, target_train, target_test = u.get_train_test_split(
                                                            feat_df,
                                                            target_df,
                                                            random_state = 1
                                                  )

In [16]:
feature_sets_dict = { 
                      'all_preprocessed_features': feat_df.columns,
                      'high_correlated_features': [
                                                    'date', 'id', 'sqft_basement', 'sqft_above', 'lat', 
                                                    'long', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_lot',
                                                    'sqft_living15', 'sqft_lot15', 'floors',
                                                    'bedrooms', 'sqft_living', 'grade',
                                                    'bathrooms', 'view', 'waterfront'
                                                   ]
                    }

# list to store cross validation metrics from all experiments
metrics = []

In [7]:
exp.run_model_experiments?

### Linear Regression

In [17]:
lm = LinearRegression()

metrics = exp.run_model_experiments(
                    lm,
                    'linear_regression',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

[31m[TRAINING] linear_regression ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] linear_regression ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m


### Ridge regression

In [18]:
# run model experiments on ridge regression model
RidgeModel = Ridge(alpha = 0.1)
metrics = exp.run_model_experiments(
                    RidgeModel,
                    'ridge_regression',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

# get fitted grid search object
grid = u.get_fitted_grid_object(
                        RidgeModel,
                        [{'model__alpha': [0.001,0.1,1, 10, 100, 1000, 10000]}],
                        feat_train,
                        target_train
                )
            
best_params_dict = {
                     k.split('__')[1] : v
                         for k,v in grid.best_params_.items()
                   }

# run model experiments with grid search best parameters on ridge model
RidgeModel_grid = Ridge(**best_params_dict)
metrics = exp.run_model_experiments(
                    RidgeModel_grid,
                    'ridge_regression_grid',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

[31m[TRAINING] ridge_regression ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] ridge_regression ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m
[34m[Running Gridsearch][0m
[31m[TRAINING] ridge_regression_grid ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] ridge_regression_grid ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m


In [9]:
best_params_dict

{'alpha': 100}

### Random Forest Regressior

In [19]:
# run model experiments for random forest
RF = RandomForestRegressor(n_estimators=150, n_jobs=2, max_features=3, max_depth=3, random_state=1)
metrics = exp.run_model_experiments(
                    RF,
                    'random_forest_regressor',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

params = [
            {
                'model__n_estimators': [150, 200],
                'model__max_features': [8, 10, 20],
                'model__max_depth': [10, 15, 20]
            }
         ]

# fit grid search object
grid = u.get_fitted_grid_object(
                    RF,
                    params,
                    feat_train,
                    target_train
                    )
          
best_params_dict = {
                     k.split('__')[1] : v
                         for k,v in grid.best_params_.items()
                   }

# run model experiments for random forest with grid search best params
RF_grid = RandomForestRegressor(**best_params_dict, random_state=1)
metrics = exp.run_model_experiments(
                    RF_grid,
                    'random_forest_regressor_grid',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

[31m[TRAINING] random_forest_regressor ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] random_forest_regressor ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m
[34m[Running Gridsearch][0m
[31m[TRAINING] random_forest_regressor_grid ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] random_forest_regressor_grid ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m


In [22]:
best_params_dict

{'max_depth': 15, 'max_features': 10, 'n_estimators': 200}

### Gradient Boosting Regressor

In [21]:
# run model experiments for gradient boosting 
gb = GradientBoostingRegressor(random_state=1)
metrics = exp.run_model_experiments(
                    gb,
                    'gradient_boosting_regressor',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

params = [
            {
                'model__n_estimators': [100, 150],
                'model__max_features': [12, 20],
                'model__max_depth': [12, 20]
            }
         ]

# fit grid search object
grid = u.get_fitted_grid_object(
                        gb,
                        params,
                        feat_train,
                        target_train
                )
            
best_params_dict = {
                     k.split('__')[1] : v
                         for k,v in grid.best_params_.items()
                   }

# run model experiments for random forest with grid search best params
gb_grid = GradientBoostingRegressor(**best_params_dict, random_state=1)
metrics = exp.run_model_experiments(
                    RF_grid,
                    'gradient_boosting_regressor_grid',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics
)

[31m[TRAINING] gradient_boosting_regressor ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] gradient_boosting_regressor ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m
[34m[Running Gridsearch][0m
[31m[TRAINING] gradient_boosting_regressor_grid ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] gradient_boosting_regressor_grid ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m


In [22]:
best_params_dict

{'max_depth': 12, 'max_features': 12, 'n_estimators': 150}

### Polynomial transformation

In [23]:
# Fit and transform the variables with 2nd order polynomial and run exp on linear regression
poly_lm = LinearRegression()
metrics = exp.run_model_experiments(
                    poly_lm,
                    'poly_linear_regression',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics,
                    is_poly=True
)

# Fit and transform the variables with 2nd order polynomial and run exp on ridge regression
poly_ridge = Ridge(alpha = 0.01)
metrics = exp.run_model_experiments(
                    poly_ridge,
                    'poly_ridge_regression',
                    feat_train,
                    target_train,
                    feature_sets_dict,
                    metrics,
                    is_poly=True
)

[31m[TRAINING] poly_linear_regression ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] poly_linear_regression ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m
[31m[TRAINING] poly_ridge_regression ------[0m
[32m[EXP 0] all_preprocessed_features [DONE][0m
[31m[TRAINING] poly_ridge_regression ------[0m
[32m[EXP 1] high_correlated_features [DONE][0m


In [24]:
metrics_df = pd.DataFrame(metrics).sort_values(by='test_mae_mean', ascending = True)
metrics_df

Unnamed: 0,model_name,feature_set,train_mse_mean,test_mse_mean,train_mse_std,test_mse_std,train_mae_mean,test_mae_mean,train_mae_std,test_mae_std,train_r2_mean,test_r2_mean,train_r2_std,test_r2_std
8,random_forest_regressor_grid,all_preprocessed_features,61559.385207,145046.2,9432.659907,33062.99,36832.465863,88247.07,437.576104,2355.828,0.969755,0.831955,0.000544,0.005391982
12,gradient_boosting_regressor_grid,all_preprocessed_features,61559.385207,145046.2,9432.659907,33062.99,36832.465863,88247.07,437.576104,2355.828,0.969755,0.831955,0.000544,0.005391982
13,gradient_boosting_regressor_grid,high_correlated_features,61416.146621,145543.5,8820.48938,32100.81,36972.112762,88839.86,339.298819,1887.229,0.969893,0.8307903,0.000553,0.004887752
9,random_forest_regressor_grid,high_correlated_features,61416.146621,145543.5,8820.48938,32100.81,36972.112762,88839.86,339.298819,1887.229,0.969893,0.8307903,0.000553,0.004887752
10,gradient_boosting_regressor,all_preprocessed_features,118893.106552,139853.1,21243.697036,28744.15,71765.909782,89268.45,916.912488,2438.414,0.887175,0.8436929,0.003417,0.005362001
11,gradient_boosting_regressor,high_correlated_features,118812.405336,141718.8,20630.498047,28537.17,72001.241436,91162.46,848.704054,2648.621,0.887329,0.8394764,0.003161,0.005601768
4,ridge_regression_grid,all_preprocessed_features,163251.008832,168662.0,37995.881416,81848.02,94474.472579,97190.14,326.394665,1020.057,0.787267,0.7732251,0.011708,0.04965921
2,ridge_regression,all_preprocessed_features,164238.267539,169681.7,38684.157776,84199.62,94532.531831,97241.25,326.281419,1145.849,0.784686,0.7705483,0.01214,0.05252925
0,linear_regression,all_preprocessed_features,164677.560694,171431.2,37319.196303,90548.77,94604.897623,97369.4,263.628717,1316.984,0.783516,0.7659447,0.011637,0.06095818
5,ridge_regression_grid,high_correlated_features,166267.340683,172151.4,40559.016816,87690.53,95565.82766,98471.64,387.77242,1122.012,0.779339,0.7638568,0.013229,0.05717036


## Plot evaluation metrics of experiments

In [31]:
x = metrics_df[~(metrics_df['model_name']=='poly_linear_regression')]
fig = px.line(x, x="model_name", y="test_mae_mean", title='MAE vs Model experiment', color = 'feature_set')
fig.show()

## Results --------

     ------- We looked at three evaluation metrics - RMSE - Root Mean Squared Error, MAE - Mean Absolute Error & R^2
             from 5-fold cross-validation. We intend to optimize for `MAE` & `R^2` as the metric considering the target
             variable has outliers and MAE is not sensitive to outliers.
     ------- The best model so far - `Gradient boosting regressor` with gridsearch parameters on all preprocessed 
             features leading to MAE on test set - 99872 & R^2 on test set - 79%.