In [1]:
# scoring metrics
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error
# model selection stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import scipy
# scaling and normalisation metrics
from sklearn import preprocessing
# classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV
from sklearn.svm import SVC
# graphical stuff
import matplotlib.pyplot as plt
%matplotlib inline
# feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# dimensionality reduction
from sklearn.decomposition import PCA

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced

from collections import Counter
import numpy as np
import pandas as pd
import warnings

# helper function to print the data
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))

# helper function to build the model
def build_lr_model_for_data(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, test_size=0.2)
    pipeline = make_pipeline(LogisticRegression())
    model = pipeline.fit(X_train, y_train)
    return (X_test, y_test, model)

# helper function to build the model
def build_svc_model_for_data(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, test_size=0.2)
    pipeline = make_pipeline(SVC())
    model = pipeline.fit(X_train, y_train)
    return (X_test, y_test, model)

In [2]:
# Read data and remove the feature titles
df = pd.read_csv('cc_clients.csv')
data = df.drop(df.index[[0]])
print('before: {}'.format(data.shape))
data.drop_duplicates()
print('after: {}'.format(data.shape))

data.head()


before: (30000, 25)
after: (30000, 25)


Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# separate the data into features and target
training_data = np.asarray(data.loc[:, 'X1':'X23']).astype(np.float)
target_data = np.asarray(data['Y']).astype(np.float)

<h3>Statistics to see how the dataset is transformed

In [4]:
# Results from over/undersampling vs. normal distribution
print("Normal data distribution: {}".format(Counter(target_data)))

X_nearmiss, y_nearmiss = NearMiss().fit_sample(training_data, target_data)
print("NearMiss(Undersampling) data distribution: {}".format(Counter(y_nearmiss)))

X_smote, y_smote = SMOTE().fit_sample(training_data, target_data)
print("SMOTE(Oversampling) data distribution: {}".format(Counter(y_smote)))

Normal data distribution: Counter({0.0: 23364, 1.0: 6636})
NearMiss(Undersampling) data distribution: Counter({0.0: 6636, 1.0: 6636})
SMOTE(Oversampling) data distribution: Counter({1.0: 23364, 0.0: 23364})


In [5]:
# declare classification models(LR, SVM)
classifier_lr = LogisticRegression
classifier_svm = SVC()

<h3>NearMiss undersampling vs SMOTE oversampling

In [11]:
# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Normal model
pipeline = make_pipeline(classifier_lr())
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nearmiss_pipeline = make_pipeline_imb(NearMiss(), classifier_lr(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), classifier_lr(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing) 
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Score the models using the test data
print('Logistic Regression accuracy, no class distribution {}'.format(pipeline.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss undersampling {}'.format(nearmiss_pipeline.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE oversampling {}'.format(smote_pipeline.score(X_test, y_test)))

Logistic Regression accuracy, no class distribution 0.7811666666666667
Logistic Regression accuracy, NearMiss undersampling 0.336
Logistic Regression accuracy, SMOTE oversampling 0.5586666666666666


In [None]:
# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Normal model
pipeline = make_pipeline(classifier_svm)
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier_svm)
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Score the models using the test data
print('SVM accuracy, no class distribution {}'.format(pipeline.score(X_test, y_test)))
print('SVM accuracy, NearMiss undersampling {}'.format(nearmiss_pipeline.score(X_test, y_test)))
print('SVM accuracy, SMOTE oversampling {}'.format(smote_pipeline.score(X_test, y_test)))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)

normal classification
accuracy: 0.7811666666666667
precision: 0.5
recall: 0.0007616146230007616
f1: 0.0015209125475285172

SMOTE classification
accuracy: 0.5586666666666666
precision: 0.2956841138659321
recall: 0.7357197258187357
f1: 0.4218340611353712

NearMiss classification
accuracy: 0.336
precision: 0.18134096874254355
recall: 0.5788271134805788
f1: 0.2761627906976744


<h2>Rescaling and Normalisation

<h3>Declaring scaling instances

In [13]:
# Doing MinMax and Standard scaling and analysing results

min_max = preprocessing.MinMaxScaler()
training_minmax = min_max.fit_transform(training_data)

std = preprocessing.StandardScaler()
training_std = std.fit_transform(training_data)

training_l1 = preprocessing.normalize(training_data, norm="l1")

training_l2 = preprocessing.normalize(training_data, norm="l2")

<h3>Computing MSE for Logistic Regression using several scaling and normalisation techniques

In [14]:
# MSE of Logistic Regression with MinMax, Scaling and Normalisation

X_test, y_test, model = build_lr_model_for_data(training_data, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with nothing: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_minmax, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with MinMax: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_std, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Standard Scaler: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_l1, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Normalisation(L1): {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_l2, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Normalisation(L2): {}".format(mean_squared_error(y_test, prediction)))

MSE of Logistic Regression with nothing: 0.21883333333333332
MSE of Logistic Regression with MinMax: 0.19
MSE of Logistic Regression with Standard Scaler: 0.19016666666666668
MSE of Logistic Regression with Normalisation(L1): 0.21883333333333332
MSE of Logistic Regression with Normalisation(L2): 0.21916666666666668


<h3>Using scaling/normalisation techniques with Logistic Regression

In [28]:
# Using LR with normal training data, MinMax and Standard scaling
#####################################################################################################################

# Using LR with training data
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('LR training set accuracy(training data): {}'.format(lr_model.score(X_train, y_train)))
print('LR test set accuracy(training data): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using LR with MinMax scaling
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

print('')
# print('LR training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('LR test set accuracy(MinMax): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using LR with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

print('')
# print('LR training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('LR test set accuracy(Standard scaling): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using LR with Normalisation(L1)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

print('')
# print('LR training set accuracy(Normalisation, l1): {}'.format(lr_model.score(X_train, y_train)))
print('LR test set accuracy(Normalisation, l1): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using LR with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

print('')
# print('LR training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('LR test set accuracy(Normalisation, l2): {}'.format(lr_model.score(X_test, y_test)))

LR test set accuracy(training data): 0.7788333333333334

LR test set accuracy(MinMax): 0.8075

LR test set accuracy(Standard scaling): 0.8078333333333333

LR test set accuracy(Normalisation, l1): 0.7788333333333334

LR test set accuracy(Normalisation, l2): 0.7788333333333334


<h3>Adding sampling techniques to scaling/normalisation techniques

In [30]:
# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_minmax = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_std = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l1 = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l2 = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)

print('Logistic Regression accuracy, SMOTE, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))


print()
#######################################################################################################################
#######################################################################################################################

# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_minmax = make_pipeline_imb(NearMiss(), classifier_lr())
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_std = make_pipeline_imb(NearMiss(), classifier_lr())
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_l1 = make_pipeline_imb(NearMiss(), classifier_lr())
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_l2 = make_pipeline_imb(NearMiss(), classifier_lr())
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)


print('Logistic Regression accuracy, NearMiss, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))

Logistic Regression accuracy, SMOTE, MinMax 0.7811666666666667
Logistic Regression accuracy, SMOTE, Standard scaling 0.7811666666666667
Logistic Regression accuracy, SMOTE, Normalisation(L1) 0.45466666666666666
Logistic Regression accuracy, SMOTE, Normalisation(L2) 0.5921666666666666

Logistic Regression accuracy, NearMiss, MinMax 0.3978333333333333
Logistic Regression accuracy, NearMiss, Standard scaling 0.21883333333333332
Logistic Regression accuracy, NearMiss, Normalisation(L1) 0.7408333333333333
Logistic Regression accuracy, NearMiss, Normalisation(L2) 0.3695


In [24]:
# MSE of SVC with MinMax, Scaling and Normalisation

X_test, y_test, model = build_svc_model_for_data(training_data, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with nothing: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_minmax, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with MinMax: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_std, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Standard Scaler: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_l1, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Normalisation(L1): {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_l2, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Normalisation(L2): {}".format(mean_squared_error(y_test, prediction)))

MSE of Support Vector Machine with nothing: 0.21933333333333332
MSE of Support Vector Machine with MinMax: 0.212
MSE of Support Vector Machine with Standard Scaler: 0.18066666666666667
MSE of Support Vector Machine with Normalisation(L1): 0.21883333333333332
MSE of Support Vector Machine with Normalisation(L2): 0.21883333333333332


In [None]:
# classification reports
print(classification_report(y_test, prediction))
# print(classification_report_imbalanced(y_test, nearmiss_prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

In [32]:
# lr_model100 = LogisticRegression(C=0.01)
# lr_model100 = lr_model.fit(X_train, y_train)

# print('Accuracy on the training set, using Logistic Regression: {}'.format(lr_model100.score(X_train, y_train)))
# print('Accuracy on the test set, using Logistic Regression: {}'.format(lr_model100.score(X_test, y_test)))

<h3>Using scaling/normalisation techniques with Support Vector Machines

In [35]:
# Using SVMs with normal training data, MinMax and Standard scaling
#####################################################################################################################

# Using SVMs with training data
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(training data): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(training data): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

print('')
# print('SVM training set accuracy(Standard scaling): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Standard scaling): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Normalisation(l1)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

print('')
# print('SVM training set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Normalisation(l2)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

print('')
# print('SVM training set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Normalisation, l2): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################


SVM test set accuracy(training data): 0.7791666666666667

SVM test set accuracy(Standard scaling): 0.8158333333333333

SVM test set accuracy(Normalisation, l1): 0.7788333333333334

SVM test set accuracy(Normalisation, l2): 0.7788333333333334


<h3>Adding sampling techniques to scaling/normalisation techniques

In [37]:
# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_minmax = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_std = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l1 = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l2 = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)

print('Support Vector Machine accuracy, SMOTE, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))


print()
#######################################################################################################################
#######################################################################################################################

# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_minmax = make_pipeline_imb(NearMiss(), classifier_svm)
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_std = make_pipeline_imb(NearMiss(), classifier_svm)
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_l1 = make_pipeline_imb(NearMiss(), classifier_svm)
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
smote_pipeline_l2 = make_pipeline_imb(NearMiss(), classifier_svm)
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)


print('Support Vector Machine accuracy, NearMiss, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))

Support Vector Machine accuracy, SMOTE, MinMax 0.588
Support Vector Machine accuracy, SMOTE, Standard scaling 0.588
Support Vector Machine accuracy, SMOTE, Normalisation(L1) 0.588
Support Vector Machine accuracy, SMOTE, Normalisation(L2) 0.588

Support Vector Machine accuracy, NearMiss, MinMax 0.44
Support Vector Machine accuracy, NearMiss, Standard scaling 0.44
Support Vector Machine accuracy, NearMiss, Normalisation(L1) 0.44
Support Vector Machine accuracy, NearMiss, Normalisation(L2) 0.44


<h3> SelectPercentile

In [None]:
# Using SelectPercentile technique on Logistic Regression and scaling methods used previously
sp_model = SelectPercentile(percentile=40)

#####################################################################################################################
# LR, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
print('LR with all features: {}'.format(lr_model.score(X_test, y_test)))

sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection[SelectPercentile]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))
#####################################################################################################################
print('')

# LR, Standard scaling, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)
print('LR with all features(Standard scaling): {}'.format(lr_model.score(X_test, y_test)))

sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection(Standard scaling)[SelectPercentile]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection(Standard scaling)[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))
#####################################################################################################################
print('')
# LR, L1 normalisation, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)
  
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)
print('LR with all features(Normalisation, l1): {}'.format(lr_model.score(X_test, y_test)))

sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection(Normalisation, l1)[SelectPercentile]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection(Normalisation, l1)[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))

<h3>SelectFromModel

In [None]:
# Using SelectFromModel technique on Logistic Regression and scaling methods used previously
model = SelectFromModel(LogisticRegression(C=0.01, dual=False))
#####################################################################################################################
# LR, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
print('LR with all features: {}'.format(lr_model.score(X_test, y_test)))

model.fit(X_train, y_train)
X_train_selected = model.transform(X_train)
X_test_selected = model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection[SelectFromModel]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))
#####################################################################################################################
print('')

# LR, Standard scaling, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)
print('LR with all features(Standard scaling): {}'.format(lr_model.score(X_test, y_test)))

model.fit(X_train, y_train)
X_train_selected = model.transform(X_train)
X_test_selected = model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection(Standard scaling)[SelectFromModel]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection(Standard scaling)[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))
#####################################################################################################################
print('')

# LR, L1 normalisation, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)
print('LR with all features(Normalisation, l1): {}'.format(lr_model.score(X_test, y_test)))

model.fit(X_train, y_train)
X_train_selected = model.transform(X_train)
X_test_selected = model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('LR with feature selection(Normalisation, l1)[SelectFromModel]: {}'.format(lr_model.score(X_test_selected, y_test)))

svm_model = SVC(kernel='rbf', C = 1.0)
svm_model.fit(X_train_selected, y_train)
print('SVM with feature selection(Normalisation, l1)[SelectPercentile]: {}'.format(svm_model.score(X_test_selected, y_test)))
#####################################################################################################################

<h3> PCA Dimensionality reduction

In [None]:
# Pipelines
estimators = [('reduce_dim', PCA(n_components=10)), ('clf', SVC())]

#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))

#####################################################################################################################
print('')
estimators = [('reduce_dim', PCA(n_components=10, whiten=True)), ('clf', SVC())]

#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 10 components: {}'.format(pipe.score(X_test, y_test)))

#####################################################################################################################
print('')
estimators = [('reduce_dim', PCA(n_components=5)), ('clf', SVC())]

#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 5 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 5 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 5 components: {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('PCA with 5 components: {}'.format(pipe.score(X_test, y_test)))

In [None]:
# Hyper-parameter tuning
#####################################################################################################################

# GridSearch
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [0.01, 0.1, 1]},
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1]}
]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, scoring='f1_macro')

print('Training data')
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)
clf.fit(X_train, y_train)
clf.best_params_
y_true, y_pred = y_test, clf.predict(X_test)

print('Standard scaling')
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)
clf.fit(X_train, y_train)
clf.best_params_
y_true, y_pred = y_test, clf.predict(X_test)

print('Normalisation L1')
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)
clf.fit(X_train, y_train)
clf.best_params_
y_true, y_pred = y_test, clf.predict(X_test)


print(classification_report(y_true, y_pred))

In [None]:
# Model evaluation(chateau)

# GridSearchCV and cross_val_score take different scoring parameters
# classification report
# confusion matrices

<h1> To do next..


In [None]:
# feat evaluation

# add SMOTE to experiments
# cross validation
# visualisation(scatter)

In [None]:
# Cross validation
logreg = LogisticRegression()
print('Logistic regression cross-validation accuracy: %0.4f' % cross_val_score(logreg, training_data, target_data, cv=10, scoring='accuracy').mean())

svm = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, training_data, target_data, cv=10, scoring='accuracy')
print('SVM cross-validation accuracy: %0.2f' % (cross_val_score(svm, training_data, target_data, cv=10, scoring='accuracy').mean())
#####################################################################################################################

# 10-Fold Cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
auc = []
for train, test in kf.split(X_train, y_train):
    pipeline = make_pipeline_imb(SMOTE(), classifier_lr(random_state=42))
    model = pipeline.fit(X_train[train], y_train[train])
    prediction = model.predict(X_train[test])

    accuracy.append(pipeline.score(X_train[test], y_train[test]))
    precision.append(precision_score(y_train[test], prediction))
    recall.append(recall_score(y_train[test], prediction))
    f1.append(f1_score(y_train[test], prediction))
    auc.append(roc_auc_score(y_train[test], prediction))

In [None]:
# Printing mean of several metrics after 5-fold cross validation
print("Mean of scores 5-fold:")
print("Accuracy: {}".format(np.mean(accuracy)))
print("Precision: {}".format(np.mean(precision)))
print("Recall: {}".format(np.mean(recall)))
print("F1: {}".format(np.mean(f1)))
print("Auc: {}".format(np.mean(auc)))