## Model Tuning

In [2]:
%config IPCompleter.greedy=True

In [3]:
import numpy as np
import pandas as pd

In [4]:
import sys
import inspect

#Add the scripts directory to the sys path
sys.path.append("../src/data")
sys.path.append("../src/features")

from make_dataset import get_data
from data_processor import DataProcessor

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [6]:
# Show all rows and columns in the display
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
#get the train data
X_train, y_train = get_data(data_string="train")

In [9]:
#get the test data
X_test, y_test = get_data(data_string="test")

### Baseline Model 

Lets create a baseline model. In this case, let's say our predicted value is median of y_train

In [10]:
y_pred = pd.Series(np.zeros(len(y_test)))
y_pred[:] = y_train.median()

In [11]:
mean_absolute_error(y_test, y_pred)

0.06970475324840691

##### Baseline model MAE is 0.0697

### Other Models

There are more than 500 columns and hence trying different models such as Random Forest, Gradient Boosting, XGBoost will take a lot of time. Hence, lets fix with XGBoost model and hypertune the parameters.

#### 1. Linear Regression

In [12]:
dp = DataProcessor(cols_to_remove=["parcelid", "propertyzoningdesc", "rawcensustractandblock", "regionidneighborhood", "regionidzip", "censustractandblock"], 
                  datecol="transactiondate")

lin_reg = LinearRegression(n_jobs=-1)

pipeline = Pipeline([
    ("dataprocessor", dp),
    ("lin_reg", lin_reg)
])
    
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("mean absolute error: {0:.4f}".format(mean_absolute_error(y_test, y_pred)))

mean absolute error: 0.2805


##### Linear Regression MAE is worse than our baseline model. 

#### 2. XGBoost

In [15]:
dp = DataProcessor(cols_to_remove=["parcelid", "propertyzoningdesc", "rawcensustractandblock", "regionidneighborhood", "regionidzip", "censustractandblock"], 
                  datecol="transactiondate")

xgb_reg = xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators = 100, objective='reg:squarederror', 
                                    eval_metric="mae", random_state = 42, verbosity=1, n_thread=-1)

pipeline = Pipeline([
    ("dataprocessor", dp),
    ("xgb_reg", xgb_reg)
])
    
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("mean absolute error: {0:.4f}".format(mean_absolute_error(y_test, y_pred)))

mean absolute error: 0.0895


Gradient Boosting has the best score among all three models.. 

XGBoost is generally better over Gradient Boosting Model as XGBoost is a regualarized model that controls over fitting and in addition XGBoost is also better in terms of speed and memory utilization

Lets find the best XGBoost model using hyperparameter tuning

In [9]:
def xgb_gridsearch(param_test, xgb_reg):
    
    dp = DataProcessor(cols_to_remove=["parcelid", "propertyzoningdesc", "rawcensustractandblock", "regionidneighborhood", "regionidzip", "censustractandblock"], 
                      datecol="transactiondate")

    pipeline = Pipeline([
        ("dataprocessor", dp),
        ("xgb_reg", xgb_reg)
    ])

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_test, n_jobs=-1, cv=3, verbose=1)

    grid_search.fit(X_train, y_train)

    print("----- Grid Search cv results ----- \n")
    for mean_score, params in zip(grid_search.cv_results_["mean_test_score"], grid_search.cv_results_["params"]):
        print(-(mean_score), params)

    print("\n----- Grid Search best parameters ------ \n", grid_search.best_params_)
    print("\n")
    print("----- Grid Search best score ------ \n", -(grid_search.best_score_))

   

#### Step 1: Fix learning_rate and n_estimators

In [10]:
param_test = {
    'xgb_reg__learning_rate': [0.1, 0.2, 0.3]
}

xgb_reg = xgb.sklearn.XGBRegressor(learning_rate=0.1, n_estimators=1000, max_depth=5, 
                                    min_child_weight=1, gamma=0, max_delta_step=0, 
                                    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1, 
                                    colsample_bynode=1, reg_lambda=1, reg_alpha=0, 
                                    scale_pos_weight=1, missing=None, objective='reg:squarederror', 
                                    eval_metric='mae', seed=0, booster='gbtree')# , silent=0, nthread=-1)

xgb_gridsearch(param_test, xgb_reg)


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  6.2min finished


Grid Search cv results ----- 

0.059001987296740964 {'xgb_reg__learning_rate': 0.1}
0.12138837303615395 {'xgb_reg__learning_rate': 0.2}
0.1974494930753917 {'xgb_reg__learning_rate': 0.3}

Grid Search best parameters ------ 
 {'xgb_reg__learning_rate': 0.1}


Grid Search best score ------ 
 -0.059001987296740964


#### Step 2: Tune max_depth and min_child_weight

#### Step 3: Tune gamma

#### Step 4: Tune subsample and colsample_bytree

#### Step 5: Tune regularization parameters