In [None]:
# scoring metrics
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error
# model selection stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import scipy
# scaling and normalisation metrics
from sklearn import preprocessing
# classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
# graphical stuff
import matplotlib.pyplot as plt
%matplotlib inline
# feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# dimensionality reduction
from sklearn.decomposition import PCA

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced

from collections import Counter
import numpy as np
import pandas as pd
import warnings

# helper function to print the data
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))

# helper function to build the model
def build_lr_model_for_data(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, test_size=0.2)
    pipeline = make_pipeline(LogisticRegression())
    model = pipeline.fit(X_train, y_train)
    return (X_test, y_test, model)

# helper function to build the model
def build_svc_model_for_data(data, target):
    X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, test_size=0.2)
    pipeline = make_pipeline(SVC())
    model = pipeline.fit(X_train, y_train)
    return (X_test, y_test, model)

In [None]:
# Read data and remove the feature titles
df = pd.read_csv('cc_clients.csv')
data = df.drop(df.index[[0]])
print('before: {}'.format(data.shape))
data.drop_duplicates()
print('after: {}'.format(data.shape))

data.head()


In [None]:
# separate the data into features and target
training_data = np.asarray(data.loc[:, 'X1':'X23']).astype(np.float)
target_data = np.asarray(data['Y']).astype(np.float)

<h3>Statistics to see how the dataset is transformed

In [None]:
# Results from over/undersampling vs. normal distribution
print("Normal data distribution: {}".format(Counter(target_data)))

X_nearmiss, y_nearmiss = NearMiss().fit_sample(training_data, target_data)
print("NearMiss(Undersampling) data distribution: {}".format(Counter(y_nearmiss)))

X_smote, y_smote = SMOTE().fit_sample(training_data, target_data)
print("SMOTE(Oversampling) data distribution: {}".format(Counter(y_smote)))

In [None]:
# declare classification models(LR, SVM)
classifier_lr = LogisticRegression
classifier_svm = SVC()

<h3>NearMiss undersampling vs SMOTE oversampling(Logistic Regression and Support Vector Machines)<h4> - Logistic Regression

In [None]:
# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Normal model
pipeline = make_pipeline(classifier_lr(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier_lr(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(random_state=42), classifier_lr(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing) 
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Score the models using the test data
print('Logistic Regression accuracy, no class distribution {}'.format(pipeline.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss undersampling {}'.format(nearmiss_pipeline.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE oversampling {}'.format(smote_pipeline.score(X_test, y_test)))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)

<h4> - Support Vector Machine

In [None]:
# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Normal model
pipeline = make_pipeline(classifier_svm)
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier_svm)
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

# Score the models using the test data
print('SVM accuracy, no class distribution {}'.format(pipeline.score(X_test, y_test)))
print('SVM accuracy, NearMiss undersampling {}'.format(nearmiss_pipeline.score(X_test, y_test)))
print('SVM accuracy, SMOTE oversampling {}'.format(smote_pipeline.score(X_test, y_test)))

<h4> - Comparing performance of normal vs. SMOTE and NearMiss techniques

In [None]:
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, test_size=0.2, random_state=42)

print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)

<h2>Rescaling and Normalisation (Logistic Regression and Support Vector Machines)<h3>Logistic Regression<h4> - Declaring scaling models

In [None]:
# Doing MinMax and Standard scaling and analysing results

min_max = preprocessing.MinMaxScaler()
training_minmax = min_max.fit_transform(training_data)

std = preprocessing.StandardScaler()
training_std = std.fit_transform(training_data)

training_l1 = preprocessing.normalize(training_data, norm="l1")

training_l2 = preprocessing.normalize(training_data, norm="l2")

<h4> - Computing MSE for Logistic Regression

In [None]:
# MSE of Logistic Regression with MinMax, Scaling and Normalisation

X_test, y_test, model = build_lr_model_for_data(training_data, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with nothing: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_minmax, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with MinMax: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_std, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Standard Scaler: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_l1, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Normalisation(L1): {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_lr_model_for_data(training_l2, target_data)
prediction = model.predict(X_test)
print("MSE of Logistic Regression with Normalisation(L2): {}".format(mean_squared_error(y_test, prediction)))

In [None]:
# # classification reports
# print(classification_report(y_test, prediction))
# # print(classification_report_imbalanced(y_test, nearmiss_prediction))
# print(classification_report_imbalanced(y_test, smote_prediction))

<h4> - Using scaling/normalisation techniques

In [None]:
# Using Logistic Regression with normal training data, MinMax and Standard scaling
#####################################################################################################################

# Using Logistic Regression with training data
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('LR training set accuracy(training data): {}'.format(lr_model.score(X_train, y_train)))
print('Logistic Regression accuracy(training data): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using Logistic Regression with MinMax scaling
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('Logistic Regression training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('Logistic Regression accuracy(MinMax): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using Logistic Regression with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('Logistic Regression training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('Logistic Regression accuracy(Standard scaling): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using Logistic Regression with Normalisation(L1)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('LR training set accuracy(Normalisation, l1): {}'.format(lr_model.score(X_train, y_train)))
print('Logistic Regression accuracy(Normalisation, l1): {}'.format(lr_model.score(X_test, y_test)))
#####################################################################################################################

# Using Logistic Regression with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, random_state=42, test_size=0.2)

# Using Logistic Regression on data
lr_model = LogisticRegression()
lr_model = lr_model.fit(X_train, y_train)

# print('LR training set accuracy(Standard scaling): {}'.format(lr_model.score(X_train, y_train)))
print('Logistic Regression accuracy(Normalisation, l2): {}'.format(lr_model.score(X_test, y_test)))

<h4> - Adding sampling techniques to scaling/normalisation techniques

In [None]:
# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_minmax = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_std = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l1 = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l2 = make_pipeline_imb(SMOTE(), classifier_lr())
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)

print('Logistic Regression accuracy, SMOTE, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Logistic Regression accuracy, SMOTE, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))

print()
#######################################################################################################################
#######################################################################################################################

# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_minmax = make_pipeline_imb(NearMiss(), classifier_lr())
nm_model_minmax = nm_pipeline_minmax.fit(X_train, y_train)
nm_prediction_minmax = nm_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_std = make_pipeline_imb(NearMiss(), classifier_lr())
nm_model_std = nm_pipeline_std.fit(X_train, y_train)
nm_prediction_std = nm_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_l1 = make_pipeline_imb(NearMiss(), classifier_lr())
nm_model_l1 = nm_pipeline_l1.fit(X_train, y_train)
nm_prediction_l1 = nm_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_l2 = make_pipeline_imb(NearMiss(), classifier_lr())
nm_model_l2 = nm_pipeline_l2.fit(X_train, y_train)
nm_prediction_l2 = nm_model_l2.predict(X_test)


print('Logistic Regression accuracy, NearMiss, MinMax {}'.format(nm_pipeline_minmax.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Standard scaling {}'.format(nm_pipeline_std.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Normalisation(L1) {}'.format(nm_pipeline_l1.score(X_test, y_test)))
print('Logistic Regression accuracy, NearMiss, Normalisation(L2) {}'.format(nm_pipeline_l2.score(X_test, y_test)))

<h3>Support Vector Machine<h4> - Declaring scaling models

In [None]:
# MSE of SVC with MinMax, Scaling and Normalisation

X_test, y_test, model = build_svc_model_for_data(training_data, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with nothing: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_minmax, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with MinMax: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_std, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Standard Scaler: {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_l1, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Normalisation(L1): {}".format(mean_squared_error(y_test, prediction)))

X_test, y_test, model = build_svc_model_for_data(training_l2, target_data)
prediction = model.predict(X_test)
print("MSE of Support Vector Machine with Normalisation(L2): {}".format(mean_squared_error(y_test, prediction)))

In [None]:
# # classification reports
# print(classification_report(y_test, prediction))
# # print(classification_report_imbalanced(y_test, nearmiss_prediction))
# print(classification_report_imbalanced(y_test, smote_prediction))

<h4> - Using scaling/normalisation techniques

In [None]:
# Using SVMs with normal training data, MinMax and Standard scaling
#####################################################################################################################

# Using SVMs with training data
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)

svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(training data): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(training data): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with MinMax scaling
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(Standard scaling): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(MinMax scaling): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Standard scaling
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(Standard scaling): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Standard scaling): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Normalisation(l1)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################
# Using SVMs with Normalisation(l2)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, stratify=target_data, random_state=42, test_size=0.2)

# Using SVMs on data
svm_model = SVC()
svm_model.fit(X_train, y_train)

svm_model.score(X_test, y_test)

# print('SVM training set accuracy(Normalisation, l1): {}'.format(svm_model.score(X_train, y_train)))
print('SVM test set accuracy(Normalisation, l2): {}'.format(svm_model.score(X_test, y_test)))
#####################################################################################################################

<h4> - Adding sampling techniques to scaling/normalisation techniques

In [None]:
# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_minmax = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_minmax = smote_pipeline_minmax.fit(X_train, y_train)
smote_prediction_minmax = smote_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_std = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_std = smote_pipeline_std.fit(X_train, y_train)
smote_prediction_std = smote_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l1 = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_l1 = smote_pipeline_l1.fit(X_train, y_train)
smote_prediction_l1 = smote_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Minority oversampling(SMOTE)
smote_pipeline_l2 = make_pipeline_imb(SMOTE(), classifier_svm)
smote_model_l2 = smote_pipeline_l2.fit(X_train, y_train)
smote_prediction_l2 = smote_model_l2.predict(X_test)

print('Support Vector Machine accuracy, SMOTE, MinMax {}'.format(smote_pipeline_minmax.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Standard scaling {}'.format(smote_pipeline_std.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Normalisation(L1) {}'.format(smote_pipeline_l1.score(X_test, y_test)))
print('Support Vector Machine accuracy, SMOTE, Normalisation(L2) {}'.format(smote_pipeline_l2.score(X_test, y_test)))

print()
#######################################################################################################################
#######################################################################################################################

# Scaling, Normalisation with class distribution techniques on Logistic Regression

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_minmax = make_pipeline_imb(NearMiss(), classifier_svm)
nm_model_minmax = nm_pipeline_minmax.fit(X_train, y_train)
nm_prediction_minmax = nm_model_minmax.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_std = make_pipeline_imb(NearMiss(), classifier_svm)
nm_model_std = nm_pipeline_std.fit(X_train, y_train)
nm_prediction_std = nm_model_std.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_l1 = make_pipeline_imb(NearMiss(), classifier_svm)
nm_model_l1 = nm_pipeline_l1.fit(X_train, y_train)
nm_prediction_l1 = nm_model_l1.predict(X_test)

#######################################################################################################################

# Split the data into 80/20(training/testing)
X_train, X_test, y_train, y_test = train_test_split(training_l2, target_data, test_size=0.2, random_state=42)

# Majority undersampling(NearMiss)
nm_pipeline_l2 = make_pipeline_imb(NearMiss(), classifier_svm)
nm_model_l2 = nm_pipeline_l2.fit(X_train, y_train)
nm_prediction_l2 = nm_model_l2.predict(X_test)


print('Support Vector Machine accuracy, NearMiss, MinMax {}'.format(nm_pipeline_minmax.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Standard scaling {}'.format(nm_pipeline_std.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Normalisation(L1) {}'.format(nm_pipeline_l1.score(X_test, y_test)))
print('Support Vector Machine accuracy, NearMiss, Normalisation(L2) {}'.format(nm_pipeline_l2.score(X_test, y_test)))

<h3>Feature Selection (SelectPercentile, SelectFromModel)<h4> - SelectPercentile

In [None]:
# Using SelectPercentile technique on Logistic Regression and scaling methods used previously
sp_model = SelectPercentile(percentile=40)

#####################################################################################################################
# Logistic Regression, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, using training data: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, NearMiss, using training data: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, SMOTE, using training data: {}'.format(smote_pipeline.score(X_test_selected, y_test)))

########################################################
# SVC
print()
svm_model = SVC()
svm_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, using training data: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, NearMiss, using training data: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, SMOTE, using training data: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
######################################################################################################################
print()
print()
# Logistic Regression, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, MinMax: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, NearMiss, MinMax: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, SMOTE, MinMax: {}'.format(smote_pipeline.score(X_test_selected, y_test)))

########################################################
# SVC
print()
svm_model = SVC()
svm_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, MinMax: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, NearMiss, MinMax: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, SMOTE, MinMax: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
#####################################################################################################################
print()
print()

# Logistic Regression, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, Standard Scaling: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, NearMiss, Standard Scaling: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
# smote_model = smote_pipeline.fit(X_train_selected, y_train)
# smote_prediction = smote_model.predict(X_test_selected)
print('Logistic Regression accuracy, SelectPercentile, SMOTE, Standard Scaling: {}'.format(smote_pipeline.score(X_test_selected, y_test)))

########################################################
# SVC
print()
svm_model = SVC()
svm_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, Standard Scaling: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, NearMiss, Standard Scaling: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, SMOTE, Standard Scaling: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
#####################################################################################################################
print()
print()

# Logistic Regression, Training data, Feature selection
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sp_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, Normalisation(L1): {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, NearMiss, Normalisation(L1): {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectPercentile, SMOTE, Normalisation(L1): {}'.format(smote_pipeline.score(X_test_selected, y_test)))

########################################################
# SVC
print()
svm_model = SVC()
svm_model.fit(X_train, y_train)
X_train_selected = sp_model.transform(X_train)
X_test_selected = sp_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, Normalisation(L1): {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, NearMiss, Normalisation(L1): {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectPercentile, SMOTE, Normalisation(L1): {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################

<h4> - SelectFromModel

In [None]:
# Using SelectFromModel technique on Logistic Regression and scaling methods used previously
sm_model = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False))
# sm_model = SelectFromModel(LassoCV())

#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, using training data: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, NearMiss, using training data: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, SMOTE, using training data: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
########################################################
# SVC
print()
svm_model = SVC()

X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, random_state=42, test_size=0.2)

sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, NearMiss, using training data: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, SMOTE, using training data: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
print('')
print('')
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, MinMax: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, NearMiss, MinMax: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, SMOTE, MinMax: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
########################################################
# SVC
print()
svm_model = SVC()

X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, random_state=42, test_size=0.2)

sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, MinMax: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, NearMiss, MinMax: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, SMOTE, MinMax: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
print('')
print('')
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, Standard scaling: {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, NearMiss, Standard scaling: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, SMOTE, Standard scaling: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
########################################################
# SVC
print()
svm_model = SVC()

X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, random_state=42, test_size=0.2)

sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, Standard scaling: {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, NearMiss, Standard scaling: {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, SMOTE, Standard scaling: {}'.format(smote_pipeline.score(X_test_selected, y_test)))
#####################################################################################################################
print('')
print('')
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, random_state=42, test_size=0.2)

lr_model = LogisticRegression()
sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
lr_model.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, (Normalisation, l1): {}'.format(lr_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), lr_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, NearMiss, (Normalisation, l1): {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), lr_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Logistic Regression accuracy, SelectFromModel, SMOTE, (Normalisation, l1): {}'.format(smote_pipeline.score(X_test_selected, y_test)))
########################################################
# SVC
print()
svm_model = SVC()

X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, random_state=42, test_size=0.2)

sm_model.fit(X_train, y_train)
X_train_selected = sm_model.transform(X_train)
X_test_selected = sm_model.transform(X_test)
svm_model.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, (Normalisation, l1): {}'.format(svm_model.score(X_test_selected, y_test)))

# Majority undersampling(NearMiss)
nm_pipeline = make_pipeline_imb(NearMiss(), svm_model)
nm_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, NearMiss, (Normalisation, l1): {}'.format(nm_pipeline.score(X_test_selected, y_test)))

# Minority oversampling(SMOTE)
smote_pipeline = make_pipeline_imb(SMOTE(), svm_model)
smote_pipeline.fit(X_train_selected, y_train)
print('Support Vector Machine accuracy, SelectFromModel, SMOTE, (Normalisation, l1): {}'.format(smote_pipeline.score(X_test_selected, y_test)))

<h3> PCA Dimensionality reduction

In [None]:
# Pipelines
estimators = [('reduce_dim', PCA(n_components=10)), ('clf', SVC())]
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train) 
print('RBF Kernel, MinMax scaling, PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Standard scaling, PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Normalisation(L1), PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))

#####################################################################################################################
print('')
estimators = [('reduce_dim', PCA(n_components=10)), ('clf', SVC(kernel='linear'))]
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, MinMax scaling, PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Standard scaling, PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Normalisation(L1), PCA(10 dims): {}'.format(pipe.score(X_test, y_test)))

#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
print('')
estimators = [('reduce_dim', PCA(n_components=5)), ('clf', SVC())]
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, MinMax scaling, PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Standard scaling, PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Normalisation(L1), PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
print('')
estimators = [('reduce_dim', PCA(n_components=5)), ('clf', SVC(kernel='linear'))]
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, MinMax scaling, PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Standard scaling, PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Normalisation(L1), PCA(5 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
#####################################################################################################################
print('')

pca = PCA(.95)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pca.fit(X_train)
print('Retaining 95% variance with {} components'.format(pca.n_components_))
########################################################
########################################################
print('')
pca = PCA(.85)
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pca.fit(X_train)
print('Retaining 85% variance with {} components'.format(pca.n_components_))
print('')
######################################################################################################################
estimators = [('reduce_dim', PCA(n_components=3)), ('clf', SVC())]

X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, MinMax scaling, PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Standard scaling scaling, PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('RBF Kernel, Normalisation(L1), PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
print('')
#####################################################################################################################
estimators = [('reduce_dim', PCA(n_components=3)), ('clf', SVC(kernel='linear'))]
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_minmax, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, MinMax scaling, PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Standard scaling, PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################
X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)

pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
print('Linear Kernel, Normalisation(L1), PCA(3 dims): {}'.format(pipe.score(X_test, y_test)))
#####################################################################################################################



<h3>Cross-validation

In [4]:
# 10-Fold Cross validation

logreg = LogisticRegression()
scores = cross_val_score(logreg, training_std, target_data, cv=10)
print('Logistic regression, 10-fold cross-validation: %0.3f (+/- (%.3f))' % (scores.mean(), scores.std()*2))


# svm = SVC(kernel='linear')
# scores = cross_val_score(svm, training_std, target_data, cv=10)
# print('Linear kernel SVM, 10-fold cross-validation: %0.3f (+/- (%.3f))' % (scores.mean(), scores.std()*2))

# svm = SVC()
# scores = cross_val_score(svm, training_std, target_data, cv=10)
# print('RBF Kernel, 10-fold cross-validation: %0.3f (+/- (%.3f))' % (scores.mean(), scores.std()*2))

#####################################################################################################################
#####################################################################################################################
logreg = LogisticRegression()
scores = cross_val_score(logreg, training_std, target_data, cv=10, scoring='recall')
print('Logistic regression recall, 10-fold cross-validation: %0.3f)' % (scores.mean())


svm = SVC(kernel='linear')
scores = cross_val_score(svm, training_std, target_data, cv=10, scoring='recall')
print('Linear kernel SVM recall, 10-fold cross-validation: %0.3f)' % (scores.mean())

svm = SVC()
scores = cross_val_score(svm, training_std, target_data, cv=10, scoring='recall')
print('RBF Kernel recall, 10-fold cross-validation: %0.3f)' % (scores.mean())

SyntaxError: invalid syntax (<ipython-input-4-a7410634f599>, line 23)

In [None]:
# Printing mean of several metrics after 5-fold cross validation
# print("Mean of scores 5-fold:")
# print("Accuracy: {}".format(np.mean(accuracy)))
# print("Precision: {}".format(np.mean(precision)))
# print("Recall: {}".format(np.mean(recall)))
# print("F1: {}".format(np.mean(f1)))
# print("Auc: {}".format(np.mean(auc)))

In [None]:
# Hyper-parameter tuning
#####################################################################################################################

# GridSearch
# tuned_parameters = [
#     {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [0.01, 0.1, 1]},
#     {'kernel': ['linear'], 'C': [0.01, 0.1, 1]}
# ]

# clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, scoring='f1_macro')

# print('Training data')
# X_train, X_test, y_train, y_test = train_test_split(training_data, target_data, stratify=target_data, random_state=42, test_size=0.2)
# clf.fit(X_train, y_train)
# clf.best_params_
# y_true, y_pred = y_test, clf.predict(X_test)

# print('Standard scaling')
# X_train, X_test, y_train, y_test = train_test_split(training_std, target_data, stratify=target_data, random_state=42, test_size=0.2)
# clf.fit(X_train, y_train)
# clf.best_params_
# y_true, y_pred = y_test, clf.predict(X_test)

# print('Normalisation L1')
# X_train, X_test, y_train, y_test = train_test_split(training_l1, target_data, stratify=target_data, random_state=42, test_size=0.2)
# clf.fit(X_train, y_train)
# clf.best_params_
# y_true, y_pred = y_test, clf.predict(X_test)


# print(classification_report(y_true, y_pred))