In [26]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from datetime import datetime
from sklearn.model_selection import train_test_split

In [27]:
# Creating a timer to check how long did the code take to execute.

def timer(start_time = None):
    
    if not start_time:
        
        start_time = datetime.now()
        return start_time
    
    elif start_time:
        
        hour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, sec = divmod(temp_sec, 60)
        
        print('\n Time Taken : %i hours %i minutes and %s seconds.' % (hour, tmin, round(sec,2)))
        

In [28]:
otto_train = pd.read_csv('train.csv')

In [29]:
otto_train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [30]:
otto_train.drop(columns='id', inplace=True)

In [31]:
otto_train.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,1,0,0,1,6,1,5,0,0,1,...,0,1,2,0,0,0,0,0,0,Class_1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [32]:
# As the class is not uniformly distributed we have to use stratified KFold

otto_train['target'].value_counts()

Class_2    16122
Class_6    14135
Class_8     8464
Class_3     8004
Class_9     4955
Class_7     2839
Class_5     2739
Class_4     2691
Class_1     1929
Name: target, dtype: int64

In [33]:
# Splitting the data into predictor and response values

X = otto_train.iloc[:,:93]
y = otto_train['target']

In [34]:
# Label Encoding the y values then One-hot-encoding as we have more than two classes

y_label_encoded = LabelEncoder().fit_transform(y)
Y = pd.Series(y_label_encoded)
Y = pd.to_numeric(Y)
Y = Y.astype('int64')

In [35]:
X.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,6,1,5,0,0,1,...,22,0,1,2,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


## Tuning Number Of Decision Trees Using Grid Search

In [36]:
# Creating the XBGClassifier object

model = XGBClassifier(n_estimators = 300, verbose_eval=False, min_child_weight=3,
                     )

In [37]:
param_grid = dict(learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3], subsample = [0.1, 0.3, 0.5, 0.7, 0.9],
                 max_depth = [4, 5, 6, 7], colsample_bylevel = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0])

In [39]:
# In XGBoost the number of trees is handeled by the parameter n_estimators

KFold = StratifiedKFold(n_splits=3,random_state=7)

random_search = RandomizedSearchCV(model, param_grid, scoring='neg_log_loss', n_jobs=4, cv=KFold, verbose=3)

start_time = timer(None)
random_search.fit(X,Y)
timer(start_time)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 36.4min finished


Parameters: { verbose_eval } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time Taken : 0 hours 37 minutes and 29.68 seconds.


In [43]:
random_search.best_params_

{'subsample': 0.9,
 'max_depth': 5,
 'learning_rate': 0.2,
 'colsample_bylevel': 0.5}

In [44]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbose_eval=False,
              verbosity=None)

In [41]:
# Now, we will use the above parameter to increase the efficiency of our model
# First we split the data into predictors and Response variables

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

In [48]:
model = XGBClassifier(n_estimators=300, min_child_weight=3, colsample_bylevel=0.5, learning_rate=0.2, 
                      max_depth=5,subsample=0.9)

In [50]:
start_time = timer(None)
model.fit(X_train, y_train)
timer(start_time)


 Time Taken : 0 hours 0 minutes and 46.28 seconds.


In [51]:
y_output = model.predict(X_test)

In [52]:
model.score(X_test,y_test)

0.8124877571008815

In [53]:
# Extracting the test datset of otto classification

test_data = pd.read_csv('test.csv')

In [56]:
test_data.head()
test_data.drop(columns='id', inplace=True)
test_data.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,0,0,0,0,0,0,0,0,0,3,...,0,0,11,1,20,0,0,0,0,0
1,2,2,14,16,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,2,0
2,0,1,12,1,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,...,0,3,1,0,0,0,0,0,0,0
4,1,0,0,1,0,0,1,2,0,3,...,0,0,0,0,0,0,0,9,0,0


In [55]:
test_data.shape

(144368, 94)

In [57]:
start_time = timer(None)

y_response = model.predict(test_data)

timer(start_time)


 Time Taken : 0 hours 0 minutes and 3.79 seconds.


In [58]:
y_response

array([3, 5, 5, ..., 1, 3, 1], dtype=int64)

In [66]:
y_response = pd.DataFrame(y_response, columns=['target'])

In [71]:
y_response_dataframe = OneHotEncoder(sparse=False).fit_transform(y_response)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [74]:
y_response_df = pd.DataFrame(y_response_dataframe, columns=['Class_1', 'Class_2', 'Class_3','Class_4','Class_5','Class_6','Class_7',
                                           'Class_8','Class_9'])

In [76]:
y_response_df.head()

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [97]:
y_response_df

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [98]:
y_response_df.index = np.arange(1, len(y_response_df)+1)

In [100]:
y_response_df.index.name = 'id'

In [101]:
y_response_df

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [102]:
y_response_df.to_csv('otto_classification_results4', sep=',')