In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# --- Notebook-wide settings -------------------------------------------------
# Blanket-silence every warning for the rest of the session (this also hides
# pandas / library deprecation notices, so disable it when debugging).
warnings.filterwarnings(action="ignore")
# Use the same matplotlib theme for all figures.
plt.style.use('fivethirtyeight')
# pd.set_option('display.max_colwidth', None)
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
def confusion_matrix_customize(y_test,y_pred):
    """Build a labelled, colour-graded confusion matrix for a two-class problem.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_test``.

    Returns
    -------
    pandas Styler
        The confusion-matrix counts with rows named 'Actual ...' and columns
        named 'Predict ...', shaded with the 'Blues' colour map.

    Notes
    -----
    The row/column names are hard-coded for exactly two classes, so the
    matrix returned by ``confusion_matrix`` must be 2x2.
    NOTE(review): presumably the first row/column is the negative class
    (label 0) -- confirm against the label encoding.
    """
    counts = confusion_matrix(y_test,y_pred)
    table = pd.DataFrame(
        counts,
        index=['Actual Negative','Actual Positive'],
        columns=['Predict Negative','Predict Positive'],
    )
    shaded = table.style.background_gradient(cmap='Blues')
    return shaded

def model_running(train_data,validation_data,algorithm,scale_data=False,logreg=False,logreg_thres=0.4):
    """Fit ``algorithm`` on the training frame and report metrics on the validation frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; must contain a ``'label'`` column, which is used as
        the target. Every other column is used as a feature.
    validation_data : pandas.DataFrame
        Hold-out rows with the same layout as ``train_data``.
    algorithm : estimator
        Object exposing ``fit`` and ``predict`` (and ``predict_proba`` when
        ``logreg`` is set). It is fitted in place and returned.
    scale_data : bool, default False
        When truthy, standardise the features with a ``StandardScaler``
        fitted on the training features only.
    logreg : bool, default False
        When truthy, derive class predictions by thresholding
        ``predict_proba(...)[:, 1]`` instead of calling ``predict``.
    logreg_thres : float, default 0.4
        Threshold used when ``logreg`` is set; a probability ``>=`` the
        threshold is predicted as 1, anything else as 0.

    Returns
    -------
    tuple
        ``(styled_confusion_matrix, fitted_model)`` where the confusion
        matrix is computed on ``validation_data``.

    Notes
    -----
    The printed "test set" figures are computed on ``validation_data``; the
    wording is kept so previously recorded outputs stay comparable.
    """
    X_train = train_data.drop('label',axis=1)
    y_train = train_data['label']
    X_test = validation_data.drop('label',axis=1)
    y_test = validation_data['label']

    if scale_data:
        scaler = StandardScaler()
        # Fit on the training features only, then reuse the same statistics
        # for the validation features. Keep the original index so the scaled
        # frames stay aligned with y_train / y_test.
        X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
        X_test = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns,index=X_test.index)

    # Fitting is identical for both prediction modes, so do it once.
    model = algorithm
    model.fit(X_train,y_train)

    if logreg:
        def predict_labels(features):
            # Custom decision threshold on the second predict_proba column.
            return np.where(model.predict_proba(features)[:,1]>=logreg_thres,1,0)
    else:
        predict_labels = model.predict

    y_train_pred = predict_labels(X_train)
    y_test_pred = predict_labels(X_test)

    print('Accuracy of train set:',np.round(accuracy_score(y_train,y_train_pred),2))
    print('Accuracy of test set:',np.round(accuracy_score(y_test,y_test_pred),2))
    print(classification_report(y_test,y_test_pred))

    return confusion_matrix_customize(y_test,y_test_pred),model

- Version 1: load the train / validation / test splits of dataset version 1

In [3]:
# Read the three version-1 splits (paths are relative to the notebook's working directory).
train_v1, validation_v1, test_v1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_for_model/train_v1.csv',
        'data_for_model/validation_v1.csv',
        'data_for_model/test_v1.csv',
    )
)

- Version 2: load the train / validation / test splits of dataset version 2

In [4]:
# Read the three version-2 splits (paths are relative to the notebook's working directory).
train_v2, validation_v2, test_v2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_for_model/train_v2.csv',
        'data_for_model/validation_v2.csv',
        'data_for_model/test_v2.csv',
    )
)

In [5]:
# XGB -- baseline on the untouched version-1 split:
# no feature scaling, class predictions taken straight from predict().
cm, model = model_running(
    train_v1,
    validation_v1,
    XGBClassifier(random_state=44),
    scale_data=False,
    logreg=False,
)
cm

Accuracy of train set: 0.73
Accuracy of test set: 0.64
              precision    recall  f1-score   support

           0       0.65      0.94      0.77     11437
           1       0.34      0.06      0.10      6022

    accuracy                           0.64     17459
   macro avg       0.50      0.50      0.44     17459
weighted avg       0.54      0.64      0.54     17459



Unnamed: 0,Predict Negative,Predict Positive
Actual Negative,10744,693
Actual Positive,5670,352


- Flag anomalies in the training data (IsolationForest fitted separately on each class) and flip their labels instead of removing them

In [6]:
from sklearn.ensemble import IsolationForest


def _flag_class_outliers(class_rows):
    """Fit an IsolationForest on the rows of one class and tag each row.

    Parameters
    ----------
    class_rows : pandas.DataFrame
        Rows of ``train_v1`` belonging to a single class. All columns of the
        frame (including ``'label'``) are passed to the detector, exactly as
        the frame is given.

    Returns
    -------
    tuple
        ``(flagged, detector)``: ``flagged`` is a copy of ``class_rows`` with
        an extra ``'iso'`` column holding ``detector.predict`` (this cell
        treats ``-1`` as an outlier); ``detector`` is the fitted forest.
    """
    detector = IsolationForest(random_state=3107,contamination=0.2,max_features=19,max_samples=4000)
    detector.fit(class_rows)
    # Write the flag onto an explicit copy: assigning a new column to a
    # filtered slice of train_v1 is what triggers SettingWithCopyWarning.
    flagged = class_rows.copy()
    flagged['iso'] = detector.predict(class_rows)
    return flagged, detector


# One detector per class, so "anomalous" means "atypical for its own class".
zero, clf = _flag_class_outliers(train_v1[train_v1['label']==0])
one, clf = _flag_class_outliers(train_v1[train_v1['label']==1])

# Flagged rows are kept but relabelled to the opposite class (they are not
# dropped), so temp_train has the same number of rows as train_v1.
temp_train = train_v1.copy()
temp_train.loc[zero[zero['iso']==-1].index,'label'] = 1
temp_train.loc[one[one['iso']==-1].index,'label'] = 0

# The validation split is left untouched; only the training labels change.
temp_validation = validation_v1.copy()

In [7]:
# XGB -- same settings as the baseline, but trained on the relabelled
# training frame and evaluated on the unchanged validation copy.
cm, model = model_running(
    temp_train,
    temp_validation,
    XGBClassifier(random_state=44),
    scale_data=False,
    logreg=False,
)
cm

Accuracy of train set: 0.74
Accuracy of test set: 0.59
              precision    recall  f1-score   support

           0       0.65      0.80      0.72     11437
           1       0.34      0.20      0.25      6022

    accuracy                           0.59     17459
   macro avg       0.50      0.50      0.49     17459
weighted avg       0.55      0.59      0.56     17459



Unnamed: 0,Predict Negative,Predict Positive
Actual Negative,9182,2255
Actual Positive,4840,1182


In [8]:
# Light GBM -- trained on the relabelled training frame and evaluated on the
# unchanged validation copy (no scaling, predictions from predict()).
cm, model = model_running(
    temp_train,
    temp_validation,
    LGBMClassifier(random_state=44),
    scale_data=False,
    logreg=False,
)
cm

Accuracy of train set: 0.69
Accuracy of test set: 0.6
              precision    recall  f1-score   support

           0       0.66      0.81      0.72     11437
           1       0.35      0.20      0.25      6022

    accuracy                           0.60     17459
   macro avg       0.50      0.50      0.49     17459
weighted avg       0.55      0.60      0.56     17459



Unnamed: 0,Predict Negative,Predict Positive
Actual Negative,9246,2191
Actual Positive,4834,1188
