In [142]:
import numpy as np
import pandas as pd
from xgboost import *
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest, GradientBoostingClassifier
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from sklearn.feature_selection import *

from   sklearn.compose            import *
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.linear_model       import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from   sklearn.metrics            import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score # Evaluation metric 2.0 
from   sklearn.pipeline           import Pipeline
from   sklearn.preprocessing      import *
from   sklearn.tree               import DecisionTreeClassifier, ExtraTreeClassifier
from   sklearn.model_selection    import train_test_split
from   sklearn.model_selection    import RandomizedSearchCV
from   sklearn.svm                import LinearSVC
from   sklearn.base               import BaseEstimator
from   sklearn.decomposition      import PCA
from   sklearn.model_selection    import cross_val_score, cross_val_predict
from   sklearn.cluster            import KMeans
from   sklearn.metrics            import plot_confusion_matrix
from   sklearn.ensemble           import StackingClassifier
from   sklearn.pipeline           import make_pipeline


# Read in data and EDA

In [143]:
data = pd.read_csv('data/train_ml2_2021.csv')
test = pd.read_csv('data/test0.csv')

In [144]:
X = data.drop(['target'], axis=1)
y = data['target']

In [145]:
# all features already scaled? 0 to 1 range?
X

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v968,v969,v970,v971,v972,v973,v974,v975,v976,v977
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,1.00,0.77,0.57,0.56,0.40,0.75,0.08,0.14,0.43,0.88
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.39,0.07,0.59,0.06,0.17,0.09,0.04,0.04,0.45,0.14
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.65,0.24,0.42,0.57,0.58,0.37,0.01,0.10,0.38,0.06
3,0,0.70,0.20,0.62,0.41,0.41,0.10,0.80,0.52,0.82,...,0.96,0.04,0.52,0.25,0.07,0.23,0.04,0.76,0.41,0.59
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.99,0.81,0.41,0.16,0.03,0.21,0.10,0.12,0.53,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8297,20,0.91,0.76,0.45,0.81,0.99,0.20,0.36,0.28,0.02,...,0.81,0.75,0.92,0.86,0.84,0.86,0.52,0.04,0.77,0.68
8298,20,0.31,0.93,0.45,0.61,0.99,0.09,0.73,0.25,0.56,...,0.53,0.36,0.93,0.61,0.65,0.65,0.31,0.06,0.89,0.11
8299,20,0.92,0.82,0.52,0.61,0.94,0.67,0.65,0.34,0.25,...,0.40,0.80,0.74,0.94,0.76,0.61,0.72,0.14,0.11,0.91
8300,20,0.93,0.85,0.51,0.64,0.95,0.76,0.76,0.36,0.07,...,0.71,0.33,0.92,0.24,0.89,0.84,0.54,0.13,0.91,0.95


In [146]:
# problem_id = data set id
len(X['problem_id'].unique())

21

In [147]:
X['problem_id'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

In [148]:
# some datasets contribute more than others
# below are perecentage contributions
X['problem_id'].value_counts()/X.shape[0] * 100

16    6.588774
13    6.588774
18    5.649241
19    5.649241
15    4.914478
20    4.914478
1     4.914478
14    4.914478
2     4.914478
3     4.914478
6     4.733799
7     4.673573
5     4.444712
0     4.444712
8     4.444712
17    4.239942
9     4.239942
4     4.179716
12    3.854493
11    3.734040
10    3.047458
Name: problem_id, dtype: float64

In [149]:
# will remove from consideration for now - Danh (I stick to keeping it in)
X1 = X.drop(['problem_id'], axis=1)

In [150]:
# high dimensionality! 
X.shape, y.shape

((8302, 979), (8302,))

In [151]:
# perecent portion of targets - imbalanced!
y.value_counts()/len(y) * 100

0    49.674777
1    38.243797
2     8.781017
3     3.071549
4     0.228861
Name: target, dtype: float64

In [152]:
y.value_counts()

0    4124
1    3175
2     729
3     255
4      19
Name: target, dtype: int64

In [153]:
# to be predicted - only Yannet knows real values
test['target'].value_counts()

0    2041
Name: target, dtype: int64

In [154]:
# no null values
X.isnull().values.sum(), y.isnull().values.sum()

(0, 0)

# Feature Engineering

In [155]:
# balance
# dimensionality reduction
# check variance of each column?

In [156]:
# imbalanced - in a perfect world, each would have 20% representation
y.value_counts()/len(y) * 100

0    49.674777
1    38.243797
2     8.781017
3     3.071549
4     0.228861
Name: target, dtype: float64

In [157]:
# Over-sample minority classes
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

In [158]:
# 20% reached....
y_ros.value_counts()/len(y_ros) * 100

0    20.0
1    20.0
2    20.0
3    20.0
4    20.0
Name: target, dtype: float64

In [159]:
# but, that means the 19 obs for target 4 are now 4,124 obs - 
# that is over 200x representation -
# will lead to overfitting/high variance model
y.value_counts(), y_ros.value_counts()

(0    4124
 1    3175
 2     729
 3     255
 4      19
 Name: target, dtype: int64,
 0    4124
 1    4124
 2    4124
 3    4124
 4    4124
 Name: target, dtype: int64)

In [160]:
# Under-sample majority classes
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X, y)

In [161]:
# 20% reached....
y_rus.value_counts()/len(y_rus) * 100

0    20.0
1    20.0
2    20.0
3    20.0
4    20.0
Name: target, dtype: float64

In [162]:
# but, that means the 4,124 obs for target 0 are now 19 -
# will lead to underfitting/high bias model
y.value_counts(), y_rus.value_counts()

(0    4124
 1    3175
 2     729
 3     255
 4      19
 Name: target, dtype: int64,
 0    19
 1    19
 2    19
 3    19
 4    19
 Name: target, dtype: int64)

In [163]:
# let see how both over- and under- sampling performs

# maybe figure out how to do both undersampling of majority 
# and oversampling of minority to a not so extreme level

In [164]:
# Variance Threshold

In [165]:
X.shape

(8302, 979)

In [166]:
selector = VarianceThreshold()
X_ = selector.fit_transform(X)

In [167]:
X_.shape # no features are constant, none dropped

(8302, 979)

# Modeling

In [23]:
# split
# get a model
# hyperparameter tuning
# evaluate

In [24]:
# non resampled data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75)

In [121]:
# Fit RandomForest on original data
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [122]:
# overfit to training data
y_pred = rf.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
accuracy

1.0

In [123]:
y_pred = rf.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
accuracy

0.7023121387283237

In [124]:
# check out feature importance - descending
sorted(rf.feature_importances_, reverse = True)

[0.004518163617833128,
 0.00447536739562426,
 0.004359714115253675,
 0.0042623575428220005,
 0.00371749702070532,
 0.0037121710605280116,
 0.003290085117583231,
 0.003185295522202025,
 0.003157608809849424,
 0.003012871497968927,
 0.0028627099982661925,
 0.0028381861473817456,
 0.0027713902866918377,
 0.002656797454885737,
 0.0026264527225986225,
 0.0026120707686654527,
 0.0025799901022582243,
 0.0025780907685411638,
 0.0025037296246504122,
 0.0024925027893231188,
 0.002488269356244792,
 0.0024686390057724055,
 0.0024614344968439906,
 0.0024345333106297043,
 0.002385193277564634,
 0.0023325637154129387,
 0.00223150924727242,
 0.002228405383637481,
 0.002189948236823147,
 0.00218140167685926,
 0.0021479637115464118,
 0.0021325361595440624,
 0.002098315958053031,
 0.0020352356929929758,
 0.0019883350701895104,
 0.0019826180253064395,
 0.001977363072097057,
 0.0019704081143554386,
 0.0019669315261297655,
 0.001873715435875909,
 0.0018583098170190818,
 0.0017966499965105646,
 0.00179544153

In [194]:
# the dataset id column is the most important by a lot!
example[:5]

[(0, 0.032054969380460846),
 (1, 0.021418343769760928),
 (2, 0.026547024136874854),
 (3, 0.029243423405607632),
 (4, 0.02466870629526944)]

In [114]:
# lets get the column's index for most important features
most_important = list()
for ind,x in enumerate(rf.feature_importances_):
    if x > 0.002:
        most_important.append(ind)

In [115]:
len(most_important), most_important[:5]

(38, [0, 5, 13, 117, 140])

In [136]:
# removing index 0 - dataset id 
most_important = most_important[1:]

In [137]:
# name refers to chosen threshold for importance
X001_train = X_train.iloc[:,most_important]
X001_train

Unnamed: 0,v4,v12,v116,v139,v140,v230,v253,v258,v287,v363,...,v727,v733,v784,v789,v847,v915,v916,v925,v929,v946
7152,0.01,0.35,0.66,0.31,0.99,0.33,0.40,0.62,0.67,0.93,...,0.98,0.02,0.57,0.05,0.16,0.69,0.80,0.35,0.31,0.98
4620,0.26,0.96,0.97,0.13,0.37,0.51,0.18,0.58,0.32,0.09,...,0.41,0.62,0.90,0.21,0.19,0.50,0.70,0.99,0.09,0.07
6381,0.05,0.48,0.92,0.76,0.32,0.37,0.42,0.58,0.97,0.89,...,0.77,0.74,0.71,0.96,0.92,0.57,0.60,0.74,0.37,0.79
8146,1.00,0.19,0.25,0.87,0.99,0.52,0.96,0.63,0.39,0.85,...,0.69,0.75,0.92,0.98,0.62,0.06,0.81,0.80,0.07,0.88
5897,0.99,0.39,0.05,0.77,0.99,0.30,0.97,0.70,0.63,1.00,...,0.15,0.04,0.98,0.74,0.88,0.41,0.91,0.66,0.07,0.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,0.02,0.14,0.54,0.15,0.02,0.61,0.10,0.18,0.78,0.39,...,0.55,0.39,0.01,0.24,0.08,0.48,0.03,0.58,0.55,0.12
3879,0.98,0.83,0.48,0.77,1.00,0.75,0.98,0.64,0.54,0.18,...,0.88,1.00,0.62,0.96,0.93,0.16,0.89,0.01,0.07,0.87
6845,0.21,1.00,0.14,0.30,0.50,0.11,0.30,0.30,0.93,0.48,...,0.72,0.02,0.63,0.97,0.69,0.69,0.14,0.38,0.50,0.27
6350,0.05,0.79,0.04,0.91,0.21,0.66,0.35,0.37,0.08,0.06,...,0.91,0.45,0.60,0.10,0.06,0.47,0.40,0.95,0.61,0.05


In [138]:
X001_valid = X_valid.iloc[:,most_important]
X001_valid

Unnamed: 0,v4,v12,v116,v139,v140,v230,v253,v258,v287,v363,...,v727,v733,v784,v789,v847,v915,v916,v925,v929,v946
449,0.08,0.74,0.19,0.97,0.67,0.43,0.24,0.05,0.58,0.25,...,0.46,0.92,0.95,0.62,0.67,0.81,0.98,0.79,0.77,0.65
6874,0.61,0.02,0.13,0.30,0.55,0.74,0.09,0.57,0.86,0.29,...,0.39,0.22,0.70,0.78,0.64,0.36,0.41,0.28,0.61,0.24
5829,0.01,0.78,0.06,0.19,0.02,0.43,0.03,0.52,0.28,0.31,...,0.72,0.01,0.19,0.03,0.15,0.03,0.22,0.05,0.02,0.60
4708,0.33,0.07,0.13,0.15,0.04,0.34,0.88,0.28,0.62,0.22,...,0.09,0.96,0.43,0.30,0.20,0.16,0.96,0.05,0.17,0.84
2595,0.86,0.31,0.57,0.55,0.57,0.93,0.90,0.42,0.29,0.81,...,0.83,0.49,0.24,0.99,0.61,0.34,0.48,0.18,0.35,0.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2559,0.82,0.83,0.53,0.39,0.96,0.94,0.94,0.38,0.03,0.26,...,0.70,0.47,0.32,1.00,0.51,0.01,0.52,0.91,0.60,0.93
4709,0.75,0.98,0.79,0.92,0.05,0.86,0.75,0.24,0.62,0.40,...,0.93,0.26,0.76,0.27,0.40,0.88,0.94,0.46,0.69,0.83
7527,0.98,0.78,0.40,0.69,0.84,0.99,0.38,0.38,0.50,0.91,...,0.08,0.41,0.96,1.00,0.38,0.69,0.53,0.09,0.85,0.49
6767,0.28,0.01,0.35,0.34,0.48,0.13,0.09,0.43,0.07,0.23,...,0.64,0.80,0.69,0.84,0.58,0.58,0.46,0.39,0.67,0.74


In [139]:
# Fit RandomForest on most important features
rf = RandomForestClassifier()
rf.fit(X001_train, y_train)

RandomForestClassifier()

In [140]:
# overfit to training data
y_pred = rf.predict(X001_train)
accuracy = accuracy_score(y_train, y_pred)
accuracy

1.0

In [141]:
y_pred = rf.predict(X001_valid)
accuracy = accuracy_score(y_valid, y_pred)
accuracy

0.6917148362235067

In [34]:
# Fit XGB on orginal data
model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [37]:
# train error - overfit to training data
pred = model.predict(X_train)
accuracy = accuracy_score(y_train, pred)
accuracy

1.0

In [36]:
# validation error
pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, pred)
accuracy

0.7273603082851637

### With Oversampled Data

In [176]:
# over-sampled data - performs too well, def overfit
X_ros_train, X_ros_valid, y_ros_train, y_ros_valid = train_test_split(X_ros, y_ros, train_size=0.75)

In [177]:
# Fit RandomForest on oversampled data
rf_over = RandomForestClassifier()
rf_over.fit(X_ros_train, y_ros_train)

RandomForestClassifier()

In [178]:
# train error - overfit to training data
y_ros_pred = rf_over.predict(X_ros_train)
accuracy_ros = accuracy_score(y_ros_train, y_ros_pred)
accuracy_ros

1.0

In [179]:
# validation error
y_ros_pred = rf_over.predict(X_ros_valid)
accuracy_ros = accuracy_score(y_ros_valid, y_ros_pred)
accuracy_ros

0.9103782735208535

In [187]:
# check out feature importance - descending
sorted(rf_over.feature_importances_, reverse = True)[:]

[0.012598100684561814,
 0.010123197659670909,
 0.00981948001048126,
 0.008832399836845816,
 0.007235385776469262,
 0.007203986906198293,
 0.0071284060455284026,
 0.007079033202572015,
 0.006726199919374707,
 0.006404154713978157,
 0.005723804777407513,
 0.005639617891155511,
 0.004964955477869256,
 0.004500240128649535,
 0.004053915645112182,
 0.003985684196842678,
 0.003967301241270802,
 0.003966946702432126,
 0.003947371280687471,
 0.00392944845300316,
 0.0039055714273108777,
 0.0038287174974636386,
 0.0036182578836112295,
 0.003506363398403674,
 0.0033420466428471475,
 0.0032688876747082067,
 0.0032537374792193548,
 0.003186019222775993,
 0.0031044110455642217,
 0.0030136315871171953,
 0.002915266166576837,
 0.0028806041898687624,
 0.0028591668364194023,
 0.0028533806417332472,
 0.0028409814079390977,
 0.0028094471668764056,
 0.0027849144930198015,
 0.002762072641016696,
 0.0027569171127223035,
 0.0027320916842569565,
 0.002706152461274601,
 0.002650489023999157,
 0.0026358858408859

In [186]:
# the dataset id column is the most important by a lot!
rf_over.feature_importances_[:10]

array([0.0125981 , 0.00058629, 0.00066182, 0.00067256, 0.00060698,
       0.00140491, 0.00081018, 0.00086387, 0.00158845, 0.00076848])

In [188]:
# lets get the column's index for most important features
most_important = list()
for ind,x in enumerate(rf_over.feature_importances_):
    if x > 0.001:
        most_important.append(ind)

In [189]:
# name refers to chosen threshold for importance
X001_train = X_train.iloc[:,most_important]
X001_train

Unnamed: 0,problem_id,v4,v7,v11,v12,v15,v17,v26,v29,v36,...,v940,v945,v951,v953,v954,v957,v958,v959,v963,v974
7152,18,0.01,0.68,0.64,0.35,0.63,0.20,0.47,0.23,0.95,...,0.93,0.07,0.05,0.06,0.84,0.67,0.92,0.30,0.80,0.10
4620,12,0.26,0.67,0.19,0.96,0.68,0.31,0.13,0.18,0.22,...,0.54,0.91,0.48,0.51,0.51,0.97,0.40,1.00,0.50,0.84
6381,16,0.05,0.69,0.38,0.48,0.38,0.36,0.32,0.08,0.60,...,0.95,0.80,0.57,0.38,0.51,1.00,0.67,1.00,0.60,0.30
8146,20,1.00,0.62,0.17,0.19,0.46,0.13,0.48,0.60,0.35,...,0.98,0.74,0.74,0.38,0.40,0.91,0.80,0.80,1.00,0.23
5897,15,0.99,0.45,0.51,0.39,0.46,0.34,0.88,0.48,0.06,...,0.10,0.68,0.21,0.29,0.40,0.85,0.86,0.68,0.98,0.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,2,0.02,0.42,0.13,0.14,0.62,0.51,0.25,0.66,0.33,...,0.64,0.94,0.12,0.07,0.12,0.16,0.40,0.06,0.07,0.33
3879,10,0.98,0.75,0.77,0.83,0.43,0.25,0.75,0.51,0.37,...,0.16,0.32,0.92,0.88,0.66,0.52,0.75,0.75,0.11,0.53
6845,17,0.21,0.08,1.00,1.00,0.05,0.93,0.88,0.42,0.83,...,0.99,0.52,0.98,0.95,0.93,0.34,0.37,0.34,0.68,0.20
6350,16,0.05,0.80,0.49,0.79,0.48,0.61,0.45,0.09,0.53,...,0.72,0.65,0.36,0.37,0.50,0.01,0.59,0.13,0.32,0.51


In [190]:
X001_valid = X_valid.iloc[:,most_important]
X001_valid

Unnamed: 0,problem_id,v4,v7,v11,v12,v15,v17,v26,v29,v36,...,v940,v945,v951,v953,v954,v957,v958,v959,v963,v974
449,1,0.08,0.55,0.09,0.74,0.89,0.69,0.28,0.14,0.46,...,0.27,0.76,0.01,0.92,0.15,0.02,0.38,0.93,0.26,0.60
6874,17,0.61,0.08,0.86,0.02,0.17,0.10,0.90,0.71,1.00,...,0.01,0.52,0.19,0.02,0.71,0.41,0.88,0.88,0.43,0.31
5829,15,0.01,0.50,0.61,0.78,0.40,0.14,0.19,0.49,0.13,...,0.12,0.56,0.82,0.32,0.53,0.74,0.19,0.97,0.02,0.75
4708,13,0.33,0.90,0.71,0.07,0.11,0.81,0.79,0.39,0.15,...,0.05,0.88,0.89,0.08,0.28,0.99,0.53,0.17,0.96,0.41
2595,6,0.86,0.49,0.75,0.31,0.52,0.31,0.25,0.99,0.63,...,0.26,0.37,0.34,0.83,0.26,0.26,0.31,0.73,0.15,0.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2559,6,0.82,0.60,0.98,0.83,0.51,0.10,0.78,0.94,0.29,...,0.88,0.24,0.24,0.94,0.36,0.76,0.93,0.11,0.99,0.56
4709,13,0.75,0.10,0.04,0.98,0.15,0.14,0.87,0.62,0.08,...,0.23,0.73,0.84,0.90,0.32,0.95,0.61,0.79,0.85,0.43
7527,19,0.98,0.44,0.99,0.78,0.85,0.86,0.62,0.96,0.66,...,0.67,0.20,0.54,0.98,0.38,0.58,0.20,0.59,0.57,0.19
6767,17,0.28,0.05,0.26,0.01,0.21,0.75,0.19,0.37,1.00,...,0.07,0.54,0.42,0.02,0.60,0.46,0.73,0.32,0.52,0.21


In [191]:
# Fit RandomForest on oversampled's model most important features 
rf = RandomForestClassifier()
rf.fit(X001_train, y_train)

RandomForestClassifier()

In [192]:
y_pred = rf.predict(X001_train)
accuracy = accuracy_score(y_train, y_pred)
accuracy

1.0

In [193]:
y_pred = rf.predict(X001_valid)
accuracy = accuracy_score(y_valid, y_pred)
accuracy

0.708092485549133

In [43]:
# Fit XGB on Oversampled data
xgb_over = XGBClassifier(use_label_encoder=False,
                         n_estimators=500,
                         eval_metric = 'logloss')

xgb_over.fit(X_ros_train, y_ros_train)

0.9173617846750728

In [44]:
# train error
pred_ros = xgb_over.predict(X_ros_train)
accuracy_ros = accuracy_score(y_ros_train, pred_ros)
accuracy_ros

1.0

In [45]:
# validation error
pred_ros = xgb_over.predict(X_ros_valid)
accuracy_ros = accuracy_score(y_ros_valid, pred_ros)
accuracy_ros

0.9173617846750728

In [None]:
# XGB with Lower max_depth to reduce overfitting - not as good
xgb_over_low_md = XGBClassifier(use_label_encoder=False,
                                n_estimators=500,
                                eval_metric = mlogloss,
                                max_depth=3)

xgb_over_low_md.fit(X_ros_train, y_ros_train)
pred_ros = xgb_over_low_md.predict(X_ros_valid)
accuracy_ros = accuracy_score(y_ros_valid, pred_ros)
accuracy_ros

### With Undersampled Data

In [None]:
# under-sampled data - performs horribly
X_rus_train, X_rus_valid, y_rus_train, y_rus_valid = train_test_split(X_rus, y_rus, train_size=0.75)

In [None]:
rf = RandomForestClassifier()
rf.fit(X_rus_train, y_rus_train)
y_rus_pred = rf.predict(X_rus_valid)
accuracy_rus = accuracy_score(y_rus_valid, y_rus_pred)
accuracy_rus

In [None]:
model = XGBClassifier(use_label_encoder=False, eval_metric = mlogloss)
model.fit(X_rus_train, y_rus_train)
pred_rus = model.predict(X_rus_valid)
accuracy_rus = accuracy_score(y_rus_valid, pred_rus)
accuracy_rus

### Stacked Models on Oversampled Data

In [None]:
# # Stacked models - XGB and Logistic
# estimators = [('xgb_clf', XGBClassifier(use_label_encoder=False,
#                                         n_estimators=500,
#                                         eval_metric = mlogloss,
#                                         random_state=1))]

# clf_stacked = StackingClassifier(estimators=estimators, 
#                                 final_estimator=LogisticRegression(max_iter=500,
#                                                                    random_state=1))

# clf_stacked.fit(X_ros_train, y_ros_train)
# y_pred = clf_stacked.predict(X_ros_valid)
# accuracy = accuracy_score(y_ros_valid, y_pred)
# print(f"Accuracy Score: {accuracy}")

In [None]:
# Evaluation of stacked models with cross validation
cv_accuracy_score = cross_val_score(clf_stacked, X, y, cv=5,scoring='accuracy')
print("Mean 5-Fold accuracy score: {}".format(np.mean(cv_accuracy_score)))

# Get final predictions

In [None]:
# apply same engineering/transformations as you 
# did to train and valid sets!

In [None]:
# Fit chosen model to whole train data set
# clf_stacked.fit(X,y)

In [None]:
X_test = test.drop(['target', 'obs_id'], axis=1)
y_submission = test[['target','obs_id']]
y_submission = y_submission.set_index('obs_id')

In [None]:
# Find final predictions
# y_pred = xgb_over.predict(X_test)

In [None]:
#
y_submission['target'] = pd.Series(y_pred)

In [None]:
y_submission

In [None]:
y_submission.to_csv('stacked_pred.csv')