In [1]:
import pandas as pd
import numpy as np 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE

In [2]:
# load preprocessed data and split

df = pd.read_pickle("data/df_le.pkl")

X = df.drop("target", axis=1)
y = df["target"]


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13410, 13), (5748, 13), (13410,), (5748,))

In [3]:
# Applying 3 ml models 

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
clf_list = [log_clf, rnd_clf, svm_clf]

def ml_clf(X_train, y_train, X_test, y_test, clf_list=clf_list):
  score=[]
  name=[]
  score_frame=pd.DataFrame()
  for clf in clf_list:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # adding to the lists
    score.append(accuracy_score(y_test, y_pred))
    name.append(clf.__class__.__name__)
  score_frame["name"]=name
  score_frame["score"]=score
  return score_frame.sort_values(by=["score"])

In [5]:
ml_clf(X_train, y_train, X_test, y_test)

Unnamed: 0,name,score
2,SVC,0.748086
0,LogisticRegression,0.765658
1,RandomForestClassifier,0.77714


In [7]:
# normalize data

def normalize_data(X_train, X_test):
    names = X_train.columns
    norm = MinMaxScaler().fit(X_train)
    X_train_norm = norm.transform(X_train)
    X_test_norm = norm.transform(X_test)
    return pd.DataFrame(X_train_norm, columns=[names]), pd.DataFrame(X_test_norm, columns=[names])

In [8]:
X_train_norm, X_test_norm = normalize_data(X_train, X_test)

ml_clf(X_train_norm, y_train, X_test_norm, y_test)

Unnamed: 0,name,score
2,SVC,0.765484
0,LogisticRegression,0.766875
1,RandomForestClassifier,0.778706


In [9]:
# Dataset is imbalanced
# Use SMOTE technique to resolve 
sm = SMOTE(random_state=42)
X_smote,y_smote = sm.fit_resample(X,y)

In [10]:
target_count = y_smote.value_counts()
print('Class 0:', target_count[0], '%', round((target_count[0]/target_count.sum())*100,2))
print('Class 1:', target_count[1], '%', round((target_count[1]/target_count.sum())*100,2))
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

Class 0: 14381 % 50.0
Class 1: 14381 % 50.0
Proportion: 1.0 : 1


In [11]:
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote,y_smote,test_size=0.3, random_state=42)
X_train_smote_norm, X_test_smote_norm = normalize_data(X_train_smote, X_test_smote)

ml_clf(X_train_smote_norm, y_train_smote, X_test_smote_norm, y_test_smote)

Unnamed: 0,name,score
0,LogisticRegression,0.734732
2,SVC,0.786766
1,RandomForestClassifier,0.83428
