In [10]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Train/Test splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Class Imbalance
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Neural networks
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K

# Error
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing the data
X = pd.read_csv('X_train.csv', float_precision='high').drop('id', axis=1)
y = pd.read_csv('y_train.csv', float_precision='high').drop('id', axis=1)

# replacing the missing values with the median of that column
X = X.fillna(X.median())

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
#sss.get_n_splits(X, y)

## Exploring the data

In [7]:
# Number of each class in the training set
y.y.value_counts()


1    3600
2     600
0     600
Name: y, dtype: int64

## Classes Imbalance

In [3]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
class_0 = X[X.y==0]
class_1 = X[X.y==1]
class_2 = X[X.y==2]

# upsample minority
class_0 = resample(class_0, 
                   replace=True, # sample with replacement
                   n_samples=len(class_1), # match number in majority class
                   random_state=27) # reproducible results
# upsample minority
class_2 = resample(class_2, 
                   replace=True, # sample with replacement
                   n_samples=len(class_1), # match number in majority class
                   random_state=27) # reproducible results


# combine majority and upsampled minority
upsampled = pd.concat([class_0, class_1, class_2])

# check new class counts
print(upsampled.y.value_counts())

y_train_upsampled = upsampled.y
X_train_upsampled = upsampled.drop('y', axis=1)

2    2868
1    2868
0    2868
Name: y, dtype: int64


## SMOTE - Synthetic Minority Over-sampling Technique

In [4]:
sm = SMOTE(sampling_strategy={0: X_train.shape[0], 1: X_train.shape[0], 2: X_train.shape[0]}, random_state=27)
X_train_SMOTE, y_train_SMOTE = sm.fit_sample(X_train, y_train)

## Logistic Regression

In [4]:
lr = LogisticRegression(solver='liblinear').fit(X_train_upsampled, y_train_upsampled)
y_pred = lr.predict(X_test)

## Balanced Random Forest

In [12]:
# train model
#rfc = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
# predict on test set
#y_pred = rfc.predict(X_test)

bbc = BalancedBaggingClassifier(random_state=42).fit(X_train, y_train)

# predict on test set
y_pred = bbc.predict(X_test)

## Accuracy 

In [13]:
BMAC = balanced_accuracy_score(y_test, y_pred)

print(BMAC)

0.6104082349873595


In [5]:
# Cross-validation of the results
cv_score = cross_val_score(lr, X_train, y_train, cv=4, scoring=('balanced_accuracy'))
print('The mean cross-validation score is : ',cv_score.mean())
print(cv_score)

The mean cross-validation score is :  0.5805243995051563
[0.57407915 0.60240864 0.58965466 0.55595514]


## Neural network

In [72]:
# Normalizing the X values
X_train_normalized = (X_train_upsampled - X_train_upsampled.mean()) / X_train_upsampled.std()
X_test_normalized = (X_test - X_test.mean()) / X_test.std()

# Normalizing y values
#y_train_mean, y_train_std = y_train_upsampled.to_frame().mean(axis=1), y_train_upsampled.to_frame().std(axis=1)
#y_train_normalized = ((y_train_upsampled - y_train_mean) / y_train_std).values



pandas.core.series.Series

In [9]:


def balanced_recall(y_true, y_pred):
    """
    Computes the average per-column recall metric
    for a multi-class classification problem
    """ 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)  
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)   
    recall = true_positives / (possible_positives + K.epsilon())    
    balanced_recall = K.mean(recall)
    return balanced_recall

ModuleNotFoundError: No module named 'backend'

In [6]:
def create_network(number_of_features):
    model = Sequential([Dense(number_of_features, activation='relu'),
                        Dense(256, activation='relu'),
                        Flatten(),
                        Dense(128,activation = 'sigmoid'),                         
                        Dense(10,activation = 'softmax')])

    model.compile(optimizer = 'adam',loss='sparse_categorical_crossentropy',metrics = [balanced_recall])
        
    return model

In [7]:
# Choosing only the relevant features
#X_train_relevant = X_train_normalized[features]
#X_test_relevant = X_test_normalized[features]

# creating NN
model = create_network(number_of_features = X_train_upsampled.shape[1])

# Fitting NN
model.fit(X_train_upsampled.values, y_train_upsampled.values, epochs = 5)

# Predicting values
y_pred = model.predict(X_test)
#y_pred = y_pred * y_train_std + y_train_mean

# Saving the score
score = balanced_accuracy_score(y_test, y_pred)
scores.append(score)

# saving the model
models.append(model)

# saving the predictions
predictions.append(y_pred)

TypeError: object of type 'Tensor' has no len()

# Final Predictions

In [None]:
X_final = pd.read_csv('X_test.csv', float_precision='high')