# Final model with just the selected parameters

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# CSV part file that holds the modelling dataset (features plus the target `y`).
csv_path = 'subset_data.csv/part-00000-03fb537b-9382-4a5b-bac8-26d245ab88ab-c000.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to sanity-check the loaded columns and the target `y`.
df.head()

Unnamed: 0,x005,x276,x056,x193,x171,x075,x275,x281,x236,x057,...,x059,x055,x076,x274,x239,x245,x287,x014,x003,y
0,8.0,0,0,0,0,0,,0,0,,...,0,0,0,0,,1,,0,,706
1,4.0,0,0,0,0,22375,,5206,0,,...,8,0,0,0,,0,1.0,0,3.0,558
2,96.0,0,2,0,0,3287,0.4814,0,0,,...,3,0,1,1613,,1,,0,,577
3,258.0,1,2,8,0,684,1.1153,0,-156,1.52,...,5,0,1,4796,14.0,1,2.0,1,14.0,526
4,34.0,1,0,1,1,2810,0.143,0,710,0.62,...,7,0,0,186,29.5,1,,0,25.0,496


In [4]:
# Every column except the target `y` is used as a model input.
# NOTE(review): the preview above shows an `Unnamed: 0` header; if that is a real
# column (e.g. a saved row index) it ends up in `features` — confirm and drop it if so.
feature_names = df.columns.difference(['y'])
features = df[feature_names]
target = df['y']

# Train test split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=12345,
)

In [6]:
# Remember the feature column labels of the training frame; they are pickled
# together with the model at the end of the notebook.
selected_columns = X_train.columns

In [7]:
# Rebuild each target as a Series of raw values labelled with the row index
# of its matching feature frame.
y_train, y_test = (
    pd.Series(np.array(labels), index=frame.index)
    for labels, frame in ((y_train, X_train), (y_test, X_test))
)

# Median imputation

In [8]:
# Per-column medians computed from the training split only; the same values are
# used below to fill missing entries in both the training and the test split.
median_values = X_train.median()

In [9]:
# Fill missing entries in both splits with the training-set medians.
imputed_X_train, imputed_X_test = (
    split.fillna(median_values) for split in (X_train, X_test)
)

# Min Max scaler

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the imputed training split only, then apply the
# same fitted transform to both splits.
scaler = MinMaxScaler().fit(imputed_X_train)

# The transformed data is wrapped back into DataFrames. Pass the original row
# index as well as the column names: without `index=` the frames get a fresh
# 0..n-1 index that no longer matches the index of y_train / y_test.
X_train = pd.DataFrame(
    scaler.transform(imputed_X_train),
    columns=imputed_X_train.columns,
    index=imputed_X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(imputed_X_test),
    columns=imputed_X_test.columns,
    index=imputed_X_test.index,
)

In [11]:
# Width of the 300..850 target range; subtracting 300 and dividing by this
# width maps 300 -> 0 and 850 -> 1.
range_size = 850 - 300

scaled_y_train = (y_train - 300) / range_size
scaled_y_test = (y_test - 300) / range_size

In [12]:
# Scaled value of an example target of 783.
(783 - 300)/range_size

0.8781818181818182

In [13]:
# Scaled value of an example target of 780 (3 points below the previous example).
(780 - 300)/range_size

0.8727272727272727

In [14]:
# Difference between the two scaled examples above, i.e. what a 3-point gap
# looks like after scaling (the values are pasted from the previous outputs).
0.8781818181818182 - 0.8727272727272727

0.00545454545454549

So a difference of 3 points on the original scale corresponds to roughly 0.00545 on the scaled target.

# Model Evaluation metrics helper functions

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt 

# root-mean-squared-error helper
def rmse(y, pred):
    """Return the square root of the mean squared error between `y` and `pred`."""
    mse = mean_squared_error(y, pred)
    return sqrt(mse)

# tolerance-based accuracy helper
def accuracy(y, pred, cut_off):
    """Return the fraction of predictions whose absolute error is at most `cut_off`.

    `y` and `pred` are subtracted element-wise, so at least one of them must
    support array arithmetic (e.g. a numpy array or pandas Series).
    """
    within_tolerance = np.abs(y - pred) <= cut_off
    hits = within_tolerance.astype(int)
    return np.sum(hits) / len(hits)

# define model fit function
def model_fit(reg, X_train, X_test, y_train, y_test, min_value, max_value):
    """Fit `reg` on the training split and report metrics on clipped predictions.

    Parameters
    ----------
    reg : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature matrices for the two splits.
    y_train, y_test : targets for the two splits (Series, array or list);
        they are flattened to 1-D arrays before use.
    min_value, max_value : bounds that predictions are clipped to before
        any metric is computed.

    Returns
    -------
    tuple
        ``(rmse_train, rmse_test, acc_train, acc_test, r2_train, r2_test)``.
        The same six values are also printed.
    """
    # np.asarray(...).ravel() works for Series, arrays and lists alike;
    # calling .ravel() directly on a pandas Series is deprecated (pandas >= 2.2).
    true_train = np.asarray(y_train).ravel()
    true_test = np.asarray(y_test).ravel()

    reg.fit(X_train, true_train)

    # Predictions outside [min_value, max_value] are pulled back to the bounds.
    pred_train = np.clip(reg.predict(X_train), min_value, max_value)
    pred_test = np.clip(reg.predict(X_test), min_value, max_value)

    # Accuracy tolerance: 3 when working on the raw target; when the upper
    # bound is 1 the target is taken to be scaled and 0.00545 is used instead
    # (the scaled counterpart of 3 derived earlier in the notebook).
    if max_value == 1:
        cut_off = 0.00545
    else:
        cut_off = 3

    rmse_train = rmse(true_train, pred_train)
    rmse_test = rmse(true_test, pred_test)
    acc_train = accuracy(true_train, pred_train, cut_off)
    acc_test = accuracy(true_test, pred_test, cut_off)
    r2_train = r2_score(true_train, pred_train)
    r2_test = r2_score(true_test, pred_test)

    print("\n")
    print("Train RMSE - ", rmse_train)
    print("Test RMSE - ", rmse_test)
    print("\n")
    print("Train Accuracy - ", acc_train)
    print("Test Accuracy - ", acc_test)
    print("\n")
    print("Train R squared - ", r2_train)
    print("Test R squared - ", r2_test)

    return rmse_train, rmse_test, acc_train, acc_test, r2_train, r2_test

In [16]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [17]:
# Baseline: XGBoost regressor constructed without any explicit hyperparameters.
reg = xgb.XGBRegressor()

In [18]:
# model_fit returns six metrics; unpack them so that each name holds the value
# it describes (previously the whole tuple was stored under `r2_test`).
# 300 and 850 are the bounds predictions are clipped to.
rmse_train, rmse_test, acc_train, acc_test, r2_train, r2_test = model_fit(
    reg, X_train, X_test, y_train, y_test, 300, 850
)



Train RMSE -  25.749922203581587
Test RMSE -  30.303113948660062


Train Accuracy -  0.13428571428571429
Test Accuracy -  0.12436666666666667


Train R squared -  0.9528531251465733
Test R squared -  0.9342287132553931


In [19]:
# Tuned configuration: a small learning rate with many shallow trees, plus
# row and column subsampling.
# NOTE(review): `seed=27` and `random_state=42` are both supplied with different
# values; which one takes effect presumably depends on the installed xgboost
# version — confirm and keep only one.
# NOTE(review): `scale_pos_weight` looks like a class-imbalance setting and may
# be irrelevant for a regressor — confirm.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=6000,
    max_depth=4,
    min_child_weight=0,
    gamma=0.6,
    subsample=0.7,
    colsample_bytree=0.7,
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
    random_state=42,
)
reg = xgb.XGBRegressor(**xgb_params)

In [20]:
# model_fit returns six metrics; unpack them so that each name holds the value
# it describes (previously the whole tuple was stored under `r2_test`).
# 300 and 850 are the bounds predictions are clipped to.
rmse_train, rmse_test, acc_train, acc_test, r2_train, r2_test = model_fit(
    reg, X_train, X_test, y_train, y_test, 300, 850
)



Train RMSE -  26.769303333578453
Test RMSE -  28.874080672421766


Train Accuracy -  0.128
Test Accuracy -  0.1205


Train R squared -  0.9490463609909204
Test R squared -  0.9402857266688193


# Pickle files

In [21]:
import pickle

In [22]:
# Bundle everything needed to reproduce the preprocessing and the prediction.
# The list is positional, so a loader must unpack it in exactly this order:
#   [0] selected_columns - feature column labels
#   [1] reg              - the fitted regressor
#   [2] scaler           - the fitted MinMaxScaler
#   [3] median_values    - per-column medians used for imputation
list_to_dump = [selected_columns, reg, scaler, median_values]

with open('model_objects.pkl', 'wb') as handle:
    pickle.dump(list_to_dump, handle)