In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
# Read the training and test tables (CSV) and the sample submission workbook (Excel).
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))
sub = pd.read_excel('Sample_Submission.xlsx')

In [3]:
# (rows, columns) of the training set, the test set and the sample submission, in that order.
tuple(frame.shape for frame in (train, test, sub))

((620, 29), (266, 28), (266, 5))

In [4]:
# Preview the first three training rows: the feature columns plus the `grade` target.
train.head(3)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f19,f20,f21,f22,f23,f24,f25,f26,f27,grade
0,1.848564,-0.26425,-0.461423,0.4094,1.305455,2.329398,0.370965,0.090167,0.107958,0.0,...,0.085505,0.233285,-1.080663,0.443257,-0.406121,-0.687687,0.271886,3.727218,0.102129,2
1,-0.825098,-0.26425,3.032397,-2.442599,1.305455,-0.276144,0.370965,0.090167,0.107958,0.0,...,0.085505,0.233285,-1.080663,-0.232546,-0.406366,-0.687687,0.271886,-0.232472,0.102129,4
2,1.848564,-0.26425,-0.461423,0.4094,1.305455,2.329398,0.370965,0.090167,0.107958,0.0,...,0.085505,0.233285,0.925358,1.459782,1.221876,1.877777,0.271886,-0.232472,0.102129,2


In [5]:
# Number of training rows per grade. The recorded output shows a strong imbalance
# (grade 2 dominates, grade 0 has only a handful of rows), which is why the later
# splits are stratified on the target.
train['grade'].value_counts()

2    472
1     68
3     47
4     27
0      6
Name: grade, dtype: int64

In [6]:
# Preview the expected submission layout.
# NOTE(review): looks like one indicator column per grade (0-4) — confirm against the competition rules.
sub.head(3)

Unnamed: 0,0,1,2,3,4
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,1,0,0


In [7]:
# Optional sanity check, left disabled: per-column count of missing values in the training set.
#train.isnull().sum()

In [8]:
# Merge train and test into a single frame with a fresh 0..n-1 index (train rows first).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the
# supported equivalent and yields the same rows and column order.
# Rows coming from `test` have no `grade`, so that column is NaN for them.
df = pd.concat([train, test], ignore_index=True)

In [9]:
# Split the merged frame back apart: rows with a grade are training data,
# rows whose grade is missing are the test rows to predict.
has_grade = df['grade'].notnull()
train_df = df[has_grade]
# Drop the (all-NaN) target from the test part without mutating a slice of `df` in place;
# an in-place drop on a filtered frame raises SettingWithCopyWarning and makes the cell
# non-idempotent on re-run.
test_df = df[~has_grade].drop(columns=['grade'])

In [10]:
# Feature matrix (everything except the target) and target vector for the labelled rows.
# NOTE(review): `Unnamed: 0` (the row-number column visible in the training preview) is
# still part of X and will be used as a feature — confirm that is intended.
# NOTE(review): `grade` is presumably float here because the merged frame holds NaN for
# the test rows — verify the classifier accepts float class labels in your xgboost version.
X = train_df.drop(columns=['grade'])
y = train_df['grade'].to_numpy()

# Hold out 25% of the labelled rows, stratified on the grade, for a quick validation score.
X_train, X_cv, y_train, y_cv = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=1
)

In [11]:
# Baseline: an XGBoost classifier with default hyper-parameters, fitted on the training
# split; class probabilities are predicted for the hold-out split.
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)
y_pred = xgb.predict_proba(X_cv)

# Multi-class log loss on the hold-out split (lower is better).
log_loss(y_cv, y_pred)

0.09115066029823332

In [12]:
# Alias for the unlabelled test features used by the cross-validation loop below
# (same object as `test_df`, no copy is made).
Xtest = test_df

In [13]:
# 6-fold stratified cross-validation.
# For every fold a fresh XGBoost classifier is trained on the in-fold rows, scored with
# log loss on the held-out rows, and used to predict class probabilities for the test
# set; the per-fold test predictions are collected so they can be averaged afterwards.
err = []          # log loss of each fold on its held-out part
y_pred_tot = []   # one (n_test_rows, n_classes) probability array per fold

fold = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

for train_index, test_index in fold.split(X, y):

    # `split` yields positional indices, so rows must be selected with `.iloc`;
    # `.loc` would only be correct while X's index happens to be exactly 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict_proba(X_test)

    # Compute the fold score once and reuse it for both the printout and the record.
    fold_loss = log_loss(y_test, y_pred)
    print("Log Loss:", fold_loss)
    err.append(fold_loss)

    # Test-set probabilities from this fold's model.
    p = xgb.predict_proba(Xtest)
    y_pred_tot.append(p)

Log Loss: 0.130434659164306
Log Loss: 0.07391394816085477
Log Loss: 0.2244337599602182
Log Loss: 0.14362498297137366
Log Loss: 0.18121509278603962
Log Loss: 0.10370132110821359


In [14]:
# Average log loss across the cross-validation folds.
np.asarray(err).mean(axis=0)

0.14288729402516764

In [15]:
# Element-wise average of the per-fold test probabilities (fold axis collapsed).
np.stack(y_pred_tot).mean(axis=0)

array([[5.1788043e-05, 2.2935931e-04, 9.9957418e-01, 5.3117168e-05,
        9.1531219e-05],
       [1.4595404e-04, 4.0418815e-02, 2.1245487e-03, 9.5709133e-01,
        2.1933320e-04],
       [5.0315834e-05, 6.8664143e-05, 9.9977618e-01, 4.9299084e-05,
        5.5526016e-05],
       ...,
       [1.4377552e-03, 4.9046257e-01, 5.0313455e-01, 3.0291444e-03,
        1.9359970e-03],
       [1.9963169e-03, 4.7785866e-01, 5.0508344e-01, 1.7274638e-03,
        1.3334109e-02],
       [9.8708893e-05, 9.0410927e-04, 9.9824739e-01, 1.2818500e-04,
        6.2168529e-04]], dtype=float32)

In [16]:
# Final test-set probabilities: the per-fold predictions averaged over the fold axis.
y_pred = np.stack(y_pred_tot).mean(axis=0)

In [17]:
# Wrap the averaged probabilities in a DataFrame (one column per class, default integer
# column labels) and show its first five rows.
sub = pd.DataFrame(data=y_pred)
sub.head()

Unnamed: 0,0,1,2,3,4
0,5.2e-05,0.000229,0.999574,5.3e-05,9.2e-05
1,0.000146,0.040419,0.002125,0.957091,0.000219
2,5e-05,6.9e-05,0.999776,4.9e-05,5.6e-05
3,0.000708,0.000544,0.026243,0.971162,0.001343
4,9.2e-05,0.000126,0.996964,0.002723,9.5e-05


In [18]:
# Write the submission to an Excel workbook in the working directory, without the row index.
sub.to_excel('s1.xlsx', index=False)