In [32]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Train/Test splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Class Imbalance
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Classification models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, 
GradientBoostingClassifier, AdaBoostClassifier)
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.ensemble import BalancedBaggingClassifier
from mlxtend.classifier import EnsembleVoteClassifier


# Neural networks
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend

# Error
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [38]:
#importing the data
X = pd.read_csv('X_train.csv', float_precision='high').drop('id', axis=1)
X_final = pd.read_csv('X_test.csv', float_precision='high').drop('id', axis=1)
y = pd.read_csv('y_train.csv', float_precision='high').drop('id', axis=1)

# replacing the missing values with the median of that column
#X = X.fillna(X.median())

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
#sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
#sss.get_n_splits(X, y)
X.head(10)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
0,-1.099144,0.918165,-1.227394,0.887061,1.182734,-0.371802,-0.127331,1.201702,1.825839,2.944655,...,-1.111422,1.566261,-1.656656,-0.412988,2.803183,-3.253815,-2.042599,3.497246,-1.275422,2.440708
1,0.366477,0.933802,0.061234,0.430073,1.029673,-0.613771,0.364698,1.140867,0.182811,-0.344876,...,-0.672386,-0.003721,-0.496326,0.672818,-0.546066,-0.227112,0.291441,-0.150495,0.156421,0.714252
2,-0.425879,-0.802152,0.002718,0.820036,1.490237,-0.888121,0.769524,1.05902,0.854806,-0.077359,...,-1.031358,-0.085724,-0.557461,-0.091904,-0.123858,0.387162,1.031941,0.766522,0.339105,-0.046364
3,0.822922,-0.843041,-0.734624,-4e-05,0.123783,-1.777226,0.364601,0.425521,1.265122,0.734897,...,-0.118974,1.271367,-1.518161,-0.315441,0.218074,-1.880067,-0.49517,1.418946,-0.201938,1.475484
4,-0.285342,-0.641116,0.436524,-0.618663,0.319982,-1.160489,0.528379,0.3373,-0.15621,-0.756133,...,-0.99775,-1.008573,-0.404263,0.304188,0.313034,-0.514287,0.701526,0.473238,-0.046099,0.492881
5,-0.342445,0.199332,-0.09038,0.458183,0.891484,-0.447942,0.327099,-0.12995,0.453594,0.089181,...,-0.786505,0.208726,-1.030579,-0.362479,0.749629,-1.429047,-0.155704,0.745928,-0.696751,1.228423
6,-0.043826,-0.181633,-0.322609,0.346912,0.077354,-1.104049,0.541301,0.254954,-0.099049,-0.396617,...,-1.064404,-0.096151,-0.445242,-0.284852,0.563819,0.12001,0.389729,0.42435,-0.60871,0.461548
7,-0.681491,-0.673945,-0.588701,0.429639,-0.098095,-1.521309,0.258657,0.127858,1.301962,-0.10659,...,-0.382918,0.850127,-0.611081,-1.367277,0.323803,-1.988082,-0.513081,1.560772,-0.284178,1.056016
8,-0.03029,0.81089,0.199345,0.200558,0.509789,-0.670375,-1.552185,0.669737,1.494708,1.821085,...,-2.753024,0.722741,-1.031887,-1.995433,1.653382,-3.262406,-0.529423,2.028094,0.150517,-1.492435
9,0.201803,-0.342499,0.699407,0.679692,0.908805,-0.647258,0.804946,0.653396,1.14953,-1.027472,...,-0.962504,0.131858,-0.836563,-0.53906,0.745173,-1.490457,0.596143,1.488954,0.198124,0.968845


## Exploring the data

In [40]:
# Number of each class in the training set
y.y.value_counts()

1    3600
2     600
0     600
Name: y, dtype: int64

## Classes Imbalance

In [42]:
# concatenate our training data back together
#X = pd.concat([X_train, y_train], axis=1)

X = pd.concat([X, y], axis=1)
# separate minority and majority classes
class_0 = X[X.y==0]
class_1 = X[X.y==1]
class_2 = X[X.y==2]

# upsample minority
class_0 = resample(class_0, 
                   replace=True, # sample with replacement
                   n_samples=len(class_1), # match number in majority class
                   random_state=27) # reproducible results
# upsample minority
class_2 = resample(class_2, 
                   replace=True, # sample with replacement
                   n_samples=len(class_1), # match number in majority class
                   random_state=27) # reproducible results


# combine majority and upsampled minority
upsampled = pd.concat([class_0, class_1, class_2])

y_upsampled = upsampled.y
X_upsampled = upsampled.drop('y', axis=1)

In [1]:
# check new class counts
print(y_train_upsampled.value_counts())
X_train_upsampled.shape

NameError: name 'y_train_upsampled' is not defined

## SMOTE - Synthetic Minority Over-sampling Technique

In [36]:
sm = SMOTE(sampling_strategy={0: X.shape[0], 1: X.shape[0], 2: X.shape[0]}, random_state=27)
X, y = sm.fit_sample(X, y)


In [49]:
X.head(5)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
148,-0.096023,0.130199,-0.060079,0.291787,0.959385,-0.692944,-0.311773,0.704975,-0.271186,-0.563955,...,-0.232652,-0.043905,-0.736941,0.994046,0.429387,0.199501,0.476326,0.747109,0.103769,-0.632316
4347,-1.048684,0.273354,-0.654514,0.323813,0.849184,-0.067237,-0.488956,1.691567,-0.409806,-1.329159,...,-0.358177,0.472728,-0.250097,0.711923,1.117118,-0.345489,0.640481,0.880876,-0.711542,0.078018
2342,0.415622,-0.125793,-0.090027,-0.277413,-0.303201,-0.224223,-0.745746,0.362637,0.779969,-0.65677,...,-1.236626,0.806601,0.735825,1.30496,0.327683,-0.326636,1.535137,0.719197,0.291618,-0.367624
178,0.056069,-0.169683,-0.376146,0.55798,0.46754,-1.3864,0.615987,0.794732,-0.043987,-1.261437,...,-0.647287,-0.239682,-0.428079,-0.616974,0.656375,0.53902,0.572355,0.905697,0.303488,-0.095536
4317,0.486108,-0.41824,0.875455,-0.467575,0.717754,-0.408097,0.00038,1.011524,-0.020505,-0.143252,...,-0.892487,0.029166,0.371115,0.085639,0.976408,0.175409,0.238641,0.080604,-0.110289,-0.485794


# -- Classifiers -- 

### Linear models

In [None]:
lr = LogisticRegression(solver='liblinear').fit(X_train_upsampled, y_train_upsampled)
y_pred = lr.predict(X_test)

In [None]:
rg = RidgeClassifier(solver='liblinear').fit(X_train_upsampled, y_train_upsampled)
y_pred = rg.predict(X_test)

### Balanced Random Forest

In [None]:
# Can be used on the dataset without upsampling first 
bbc = BalancedBaggingClassifier(random_state=42).fit(X_train, y_train)
y_pred = bbc.predict(X_test)

### Random Forest 

In [None]:
rfc = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_pred = rfc.predict(X_test)

### K-Nearest Neighbor

In [None]:
knn = KNeighborsClassifier()

### Support Vector Machine

In [None]:
svc = SVC()

# Bagging

In [None]:
seed = 1075
np.random.seed(seed)

rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()

clf_array = [rf, knn, et, svc, rg]
bagging_pred = []
for clf in clf_array:
    #vanilla_scores = cross_val_score(clf, X_train_upsampled, y_train_upsampled, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.4, max_features=3, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X_train_upsampled, y_train_upsampled, cv=10, n_jobs=-1)
    

    print("Mean:",vanilla_scores.mean() ,"std:", vanilla_scores.std(), "Type: ", clf.__class__.__name__)
    print("Mean:",bagging_scores.mean() ,"std:", bagging_scores.std(), "Type: Bagging", clf.__class__.__name__)
    

## Voting Classifier

In [None]:
rf = BaggingClassifier(RandomForestClassifier())
et = BaggingClassifier(ExtraTreesClassifier())
knn = BaggingClassifier(KNeighborsClassifier())
svc = BaggingClassifier(SVC())
rg = BaggingClassifier(RidgeClassifier())

eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard').fit(X_train_upsampled, y_train_upsampled)
y_pred = eclf.predict(X_test)


# Boosting + Bagging + Voting

In [44]:
rf = BaggingClassifier(RandomForestClassifier())
#et = BaggingClassifier(ExtraTreesClassifier())
knn = BaggingClassifier(KNeighborsClassifier())
svc = BaggingClassifier(SVC())
rg = BaggingClassifier(RidgeClassifier())
#ada_boost = AdaBoostClassifier()
#grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()
boost_array = [ada_boost, grad_boost, xgb_boost]

evclf = EnsembleVoteClassifier(clfs=[xgb_boost, rf, knn, svc, rg], voting='hard').fit(X, y)
#y_pred_ = evclf.predict(X_final)


## Accuracy 

In [None]:
BMAC = balanced_accuracy_score(y_test, y_pred_)

print(BMAC)

In [45]:
# Cross-validation of the results
cv_score = cross_val_score(evclf, X, y, cv=4, scoring=('balanced_accuracy'))
print('The mean cross-validation score is : ',cv_score.mean())
print(cv_score)

The mean cross-validation score is :  0.9096296296296296
[0.9062963  0.90814815 0.91259259 0.91148148]


## Neural network

In [None]:
# Normalizing the X values
X_train_normalized = (X_train_upsampled - X_train_upsampled.mean()) / X_train_upsampled.std()
X_test_normalized = (X_test - X_test.mean()) / X_test.std()

# Normalizing y values
#y_train_mean, y_train_std = y_train_upsampled.to_frame().mean(axis=1), y_train_upsampled.to_frame().std(axis=1)
#y_train_normalized = ((y_train_upsampled - y_train_mean) / y_train_std).values


In [None]:
def balanced_recall(y_true, y_pred):
    """
    Computes the average per-column recall metric
    for a multi-class classification problem
    """ 
    true_positives = backend.sum(backend.round(backend.clip(y_true * y_pred, 0, 1)), axis=0)  
    possible_positives = backend.sum(backend.round(backend.clip(y_true, 0, 1)), axis=0)   
    recall = true_positives / (possible_positives + backend.epsilon())    
    balanced_recall = backend.mean(recall)
    return balanced_recall

In [None]:
def create_network(number_of_features):
    model = Sequential([Dense(number_of_features, activation='relu'),
                        Dense(1024, activation='relu'),
                        Dense(512, activation='relu'),
                        Dropout(0.1),
                        Dense(1024, activation='relu'),
                        Dense(1024, activation='relu'),
                        Dropout(0.1),
                        Dense(1024, activation='relu'),
                        Dropout(0.1),
                        Dense(512, activation='relu'),
                        Dense(256, activation='relu'),
                        Flatten(),
                        Dense(128,activation = 'sigmoid'),                         
                        Dense(3,activation = 'softmax')])

    model.compile(optimizer = 'adam',loss='sparse_categorical_crossentropy',metrics = [balanced_accuracy_score])
        
    return model

In [None]:
# Choosing only the relevant features
#X_train_relevant = X_train_normalized[features]
#X_test_relevant = X_test_normalized[features]

# creating NN
model = create_network(number_of_features = X.shape[1])

# Fitting NN
model.fit(X.values, y.values, epochs = 10)

# Predicting classes
y_pred = model.predict(X_final).argmax(axis=1)
#y_pred = y_pred * y_train_std + y_train_mean

# Saving the score
#score = balanced_accuracy_score(y_test, y_pred.argmax(axis=1))
#scores.append(score)

# saving the model
#models.append(model)

# saving the predictions
#predictions.append(y_pred)

In [None]:
score

In [55]:
X_final.head(5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
0,0.393081,-1.266585,-0.534465,-0.017727,0.870355,-1.243459,-0.194805,3.369338,0.51803,1.014564,...,-1.642932,0.524188,-1.297014,-0.299613,2.391213,-1.134149,-1.061005,3.20822,-0.057589,0.591722
1,0.97305,0.121111,-0.051525,0.002681,0.020402,-0.203451,-0.851999,0.623676,-0.142667,-1.082818,...,-0.802205,-0.881559,0.044385,0.104938,1.400879,0.440617,0.250877,0.920801,0.138718,-0.052341
2,0.038292,-0.995839,0.087764,1.868466,-0.438614,-0.226892,0.39637,0.848138,0.591847,0.904976,...,-0.50947,1.053428,-1.08273,-0.822194,1.164689,-2.721737,-1.270373,2.793598,0.333958,0.790519
3,0.650431,0.32494,0.404872,0.02838,0.848341,-0.533394,0.237058,1.755244,-0.291717,-1.512967,...,-0.343155,-0.255553,0.723408,0.689066,0.756997,0.983355,0.660556,-0.030405,0.180313,-0.427872
4,0.345413,0.176561,-0.427172,-0.057769,0.865265,-1.274553,-1.041643,0.644448,1.667168,0.932939,...,-0.688064,1.845356,-1.246581,-0.986015,1.381274,-1.549053,-2.936785,2.497881,0.276085,0.167375


# Final Predictions

In [54]:
X_final = pd.read_csv('X_test.csv', float_precision='high').drop('id', axis=1)


In [56]:
y_pred_ = evclf.predict(X_final)

#y_final_pred['y'] = model.predict(X_final).argmax(axis=1)
#y_final_pred.head(20)

In [57]:
y_final_pred = pd.read_csv('sample.csv')
y_final_pred['y'] = pd.DataFrame(y_pred_) 
y_final_pred.to_csv('bagging_voting.csv')

In [None]:
#np.savetxt('pred_nn.gz', y_pred)

4100