In [629]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterSampler, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from xgboost import XGBRegressor, plot_importance
import xgboost as xgb
from scipy.stats import uniform

In [630]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

display(train_df.head(), test_df.head())

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [631]:
display(train_df.shape, test_df.shape)

(4209, 378)

(4209, 377)

### Remove columns with a variance of zero

In [632]:
train_df.var(numeric_only=True)

ID      5.941936e+06
y       1.607667e+02
X10     1.313092e-02
X11     0.000000e+00
X12     6.945713e-02
            ...     
X380    8.014579e-03
X382    7.546747e-03
X383    1.660732e-03
X384    4.750593e-04
X385    1.423823e-03
Length: 370, dtype: float64

In [633]:
test_df.var(numeric_only=True)

ID      5.871311e+06
X10     1.865006e-02
X11     2.375861e-04
X12     6.885074e-02
X13     5.734498e-02
            ...     
X380    8.014579e-03
X382    8.715481e-03
X383    4.750593e-04
X384    7.124196e-04
X385    1.660732e-03
Length: 369, dtype: float64

### checking for zero variance in the train data

In [634]:
zero_variance = train_df.var(numeric_only=True)[train_df.var(numeric_only=True) == 0].index.values
train_df = train_df.drop(zero_variance, axis=1)

In [635]:
# drop ID column in both train and test
train_df = train_df.drop('ID', axis=1)
test_df = test_df.drop('ID', axis=1)

In [636]:
display(train_df.head(), train_df.shape)

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


(4209, 365)

In [637]:
train_col = train_df.iloc[:, 1:]
train_col_names = train_col.columns.values

In [638]:
test_df = test_df[train_col_names]

In [639]:
display(test_df.head(), test_df.shape)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


(4209, 364)

### checking for categorical data

In [640]:
display(train_df.describe(include=object), test_df.describe(include=object))

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
count,4209,4209,4209,4209,4209,4209,4209,4209
unique,47,27,44,7,4,29,12,25
top,z,aa,as,c,d,w,g,j
freq,360,833,1659,1942,4205,231,1042,277


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
count,4209,4209,4209,4209,4209,4209,4209,4209
unique,49,27,45,7,4,32,12,25
top,ak,aa,as,c,d,v,g,e
freq,432,826,1658,1900,4203,246,1073,274


In [641]:
train_df['X4'].value_counts

<bound method IndexOpsMixin.value_counts of 0       d
1       d
2       d
3       d
4       d
       ..
4204    d
4205    d
4206    d
4207    d
4208    d
Name: X4, Length: 4209, dtype: object>

dropping x4 since it has almost no variance

In [642]:
train_df.drop('X4', axis=1, inplace=True)
test_df.drop('X4', axis=1, inplace=True)

###  Check for null and unique values for test and train sets

In [643]:
# checking for null values
print(train_df.isnull().sum().sum())
print(test_df.isnull().sum().sum())

0
0


In [644]:
# checking for unique values
print(train_df.nunique().sum())
print(test_df.nunique().sum())

3448
904


### Apply label encoder on train data

In [645]:
train_df.info() 
# found 8 columns that have dtype object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 364 entries, y to X385
dtypes: float64(1), int64(356), object(7)
memory usage: 11.7+ MB


found 8 columns that have dtype object

###  filter out columns with object type

In [646]:
obj_dtype_train = train_df.select_dtypes(include=[object])
obj_dtype_train

Unnamed: 0,X0,X1,X2,X3,X5,X6,X8
0,k,v,at,a,u,j,o
1,k,t,av,e,y,l,o
2,az,w,n,c,x,j,x
3,az,t,n,f,x,l,e
4,az,v,n,f,h,d,n
...,...,...,...,...,...,...,...
4204,ak,s,as,c,aa,d,q
4205,j,o,t,d,aa,h,h
4206,ak,v,r,a,aa,g,e
4207,al,r,e,f,aa,l,u


In [647]:
le = LabelEncoder()

for col in obj_dtype_train.columns:
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.fit_transform(test_df[col])

In [648]:
display(train_df.head(), test_df.head())

Unnamed: 0,y,X0,X1,X2,X3,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,32,23,17,0,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,32,21,19,4,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,20,24,34,2,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,20,21,34,5,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,20,23,34,5,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,X0,X1,X2,X3,X5,X6,X8,X10,X12,X13,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,26,0,22,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,9,6,24,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,0,9,9,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,31,11,13,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,30,8,12,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Apply label encoder on test data

In [649]:
obj_dtype_test = test_df.select_dtypes(include=[object])
obj_dtype_test

0
1
2
3
4
...
4204
4205
4206
4207
4208


In [650]:
le = LabelEncoder()

for col in obj_dtype_test.columns:
    test_df[col] = le.fit_transform(test_df[col])

In [651]:
test_df.head()

Unnamed: 0,X0,X1,X2,X3,X5,X6,X8,X10,X12,X13,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,26,0,22,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,9,6,24,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,0,9,9,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,31,11,13,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,30,8,12,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Perform dimensionality reduction using RFE

In [652]:
features = train_df.iloc[:,1:].values
label = train_df.iloc[:,0].values

model = Ridge(alpha=0.1)
rfe = RFE(estimator=model, step=1)
fit = rfe.fit(features, label)

x_train_rfe = rfe.transform(features)
x_test_rfe = rfe.transform(test_df)

print('Num Featues: %d' % fit.n_features_)
print('Selected Features: %s' % fit.support_)
print('Feature Ranking: %s' % fit.ranking_)
print(x_test_rfe.shape)


Num Featues: 181
Selected Features: [False False False False False False False False False  True False  True
  True  True  True False False False False False  True False False  True
  True  True False False  True  True False False False  True  True False
 False  True False  True  True False  True False  True False False False
  True  True  True False  True False False  True  True  True  True False
 False False False False False False  True False  True  True  True False
 False  True False False False  True False False  True  True  True False
  True  True  True  True  True  True  True  True  True False  True  True
 False  True  True  True  True False  True False False False False  True
 False False  True  True  True  True  True False False False  True  True
  True  True  True  True  True False  True  True False False False  True
 False False  True False False False False  True False False False False
 False  True False False False False  True False False False False  True
 False  True Fa



In [653]:
features = features[:, fit.support_]

In [655]:
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=0.2, random_state=81)

In [656]:
dtrain_reg = xgb.DMatrix(X_train, y_train)
dtest_reg = xgb.DMatrix(X_val, y_val)

In [657]:
param_grid = {
    'max_depth': range(2, 7), # max depth of a tree, larger max_depth causes overfitting
    'gamma': uniform(loc=0.0, scale=3), # min of the loss function reduction required to make a split
    'min_child_weight': range(3, 6), # min sum of weights of all observations required in a child
    'colsample_bytree': uniform(loc=0.1, scale=0.9), # subsample ratio of columns when constructing each tree
    'subsample': uniform(loc=0.1, scale=0.5), # subsample ratio of the training instances
    'learning_rate': uniform(loc=0.01, scale=0.99), # learning rate, smaller values cause overfitting
}

In [658]:
rng = np.random.RandomState(20)
n_iter = 500
param_list = list(ParameterSampler(param_grid, n_iter=n_iter, random_state=rng))

In [659]:
param_list[0]

{'colsample_bytree': 0.6293177209695467,
 'gamma': 2.6931411837282537,
 'learning_rate': 0.8926154221799609,
 'max_depth': 3,
 'min_child_weight': 3,
 'subsample': 0.11794479280842665}

### Base Model

In [660]:
model = xgb.train(param_list[0], dtrain=dtrain_reg, num_boost_round=n_iter)
preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_val, preds, squared=False)
print('RMSE: {}'.format(rmse))

RMSE: 15.709405666692136


In [661]:
evals = [(dtrain_reg, 'train'), (dtest_reg, 'eval')]
model = xgb.train(params=param_list[0], 
                  dtrain=dtrain_reg, 
                  num_boost_round=n_iter, 
                  evals=evals,
                  verbose_eval=250,
                  early_stopping_rounds=50)

[0]	train-rmse:14.51663	eval-rmse:12.68441
[50]	train-rmse:9.67702	eval-rmse:8.79363


### Cross validation

In [662]:
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, verbosity=1)
best_score = float('inf')
counter = 0

for params in param_list:

  xgbr.set_params(**params, eval_metric='rmse', early_stopping_rounds=50)
  results = xgb.cv(params=params,
                 dtrain=dtrain_reg,
                 num_boost_round=n_iter,
                 nfold=5,
                 early_stopping_rounds=50
                 )
  score = abs(results['test-rmse-mean']).min()

  if score < best_score:
      best_score = score
      best_params = params

  if counter % 50 == 0:
        print('iter =', counter, 
              'best_score =', best_score)
  counter += 1
  

iter = 0 best_score = 9.085951957457459
iter = 50 best_score = 8.661563525110267
iter = 100 best_score = 8.661563525110267
iter = 150 best_score = 8.661563525110267
iter = 200 best_score = 8.661563525110267
iter = 250 best_score = 8.661563525110267
iter = 300 best_score = 8.661563525110267
iter = 350 best_score = 8.661563525110267
iter = 400 best_score = 8.661563525110267
iter = 450 best_score = 8.661563525110267


In [663]:
print('best score =', best_score)
print('best params =', best_params)

best score = 8.661563525110267
best params = {'colsample_bytree': 0.5556098351779073, 'gamma': 1.930607616226828, 'learning_rate': 0.26163932663715717, 'max_depth': 2, 'min_child_weight': 3, 'subsample': 0.49443766726939475}


In [664]:
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,78.224108,0.084411,78.223532,0.412931
1,60.568924,0.076004,60.574446,0.464419
2,47.135519,0.038258,47.125415,0.544091
3,37.023244,0.071653,37.020424,0.577114
4,29.360159,0.146046,29.367291,0.698424


### Final model

In [668]:
model = xgb.train(best_params, dtrain=dtrain_reg, num_boost_round=n_iter)
y_test_pred, y_train_pred = model.predict(dtest_reg), model.predict(dtrain_reg)

R2_train, R2_test = r2_score(y_train, y_train_pred), r2_score(y_val, y_test_pred)
MAE_train, MAE_test = mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_val, y_test_pred)
MSE_train, MSE_test = mean_squared_error(y_train, y_train_pred), mean_squared_error(y_val, y_test_pred)
RMSE_train, RMSE_test = np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_val, y_test_pred))
MAPE_train, MAPE_test = np.mean(np.abs((y_train - y_train_pred) / y_train)) * 100, np.mean(np.abs((y_val - y_test_pred) / y_val)) * 100
MDAPE_train, MDAPE_test = np.median(np.abs((y_train - y_train_pred) / y_train)) * 100, np.median(np.abs((y_val - y_test_pred) / y_val)) * 100

print('R2 train, test:', R2_train, R2_test)
print('MAE train, test:', MAE_train, MAE_test)
print('MSE train, test:', MSE_train, MSE_test)
print('RMSE train, test:', RMSE_train, RMSE_test)
print('MAPE train, test:', MAPE_train, MAPE_test)
print('MDAPE train, test:', MDAPE_train, MDAPE_test)

R2 train, test: 0.5921835704442879 0.6346511267269923
MAE train, test: 5.277801578658818 5.08278292150792
MSE train, test: 68.05570537101444 49.45508443545352
RMSE train, test: 8.249588194026078 7.0324309051318465
MAPE train, test: 5.007405952106151 4.981437632453695
MDAPE train, test: 4.102950424819325 4.2762500566809445
