# In this notebook I will present my models which used for predictions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df = pd.read_csv('../creditcard.csv')

In [3]:
X = df.iloc[:, :-1]
y = df['Class']

In [4]:
und = RandomUnderSampler()
X_und, y_und = und.fit_resample(X, y)

In [5]:
Counter(y_und)

Counter({0: 492, 1: 492})

In [23]:
# Lets concat undersampled X and y
df_und = pd.concat([X_und, y_und], axis=1)
df_und.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,88389.0,-3.708227,3.670773,-3.267675,-1.057623,-0.531656,0.465471,-1.261994,2.688371,0.276476,...,0.440956,1.537195,0.233305,-1.60459,-0.390139,-0.089148,0.482076,0.182497,1.79,0
1,42453.0,-0.237118,0.223022,1.254576,-1.556033,-0.273281,-1.211332,0.30191,-0.275672,-0.88912,...,0.040419,0.208583,-0.077185,0.289889,-0.026483,-0.472985,0.102522,-0.121395,24.9,0
2,80490.0,1.160366,0.03797,0.557385,0.518714,-0.484127,-0.355001,-0.229549,0.162624,0.021896,...,-0.184544,-0.61696,0.199718,0.172904,0.030826,0.099903,-0.026926,0.003737,0.89,0
3,43887.0,-6.46628,-5.120985,0.881461,-0.412296,0.023292,0.302638,0.745833,-0.799876,0.722063,...,-1.191141,0.79361,1.395308,-0.153513,1.115056,-0.264281,-1.170124,2.766348,253.68,0
4,67504.0,1.2508,-1.222272,1.714089,-0.223284,-2.18289,0.285805,-1.781897,0.463338,0.725618,...,0.182862,0.757754,-0.055848,0.566429,0.358175,-0.069724,0.070048,0.015238,4.8,0


## Lets build our first Pipeline

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

In [7]:
algos = {
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GaussianNB' : GaussianNB(),
    'LogisticRegression' : LogisticRegression(),
}

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_und, y_und)

In [9]:
for name, classifier in algos.items():
    pipeline = Pipeline(steps=[
        ('classifier', classifier)
    ])
    pipeline.fit(X_train, y_train)
    y_predicted=pipeline.predict_proba(X_test)
    print(name,'ROC SCORE', roc_auc_score(y_test, y_predicted[:, 1]))

DecisionTreeClassifier ROC SCORE 0.8862809917355371
KNeighborsClassifier ROC SCORE 0.6052231404958679
GaussianNB ROC SCORE 0.9654876033057851
LogisticRegression ROC SCORE 0.9474380165289256


### As we can see Gaussian Naive Bayes along side with Logistic regression performed well on the data
### Now I will try ensemble methods

In [10]:
ensem = {
    'RandomForestClassifier' : RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LGBMClassifier' : LGBMClassifier(),
    'CatBoostClassifier' : CatBoostClassifier(verbose=False),
}

In [11]:
for name, classifier in ensem.items():
    pipe_en = Pipeline(steps=[
        ('classifier', classifier)
    ])
    pipe_en.fit(X_train, y_train)
    y_predicted=pipe_en.predict_proba(X_test)
    print(name,'ROC SCORE', roc_auc_score(y_test, y_predicted[:, 1]))

RandomForestClassifier ROC SCORE 0.9655537190082645
GradientBoostingClassifier ROC SCORE 0.9704462809917356
LGBMClassifier ROC SCORE 0.9678016528925619
CatBoostClassifier ROC SCORE 0.9726942148760331


## The accuracy might change depending on train test split so we will try cross validation 

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [15]:
models = {
    'GaussianNB' : GaussianNB(),
    'LogisticRegression' : LogisticRegression(solver='lbfgs', max_iter=100 ),
    'RandomForestClassifier' : RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LGBMClassifier' : LGBMClassifier(),
    'CatBoostClassifier' : CatBoostClassifier(verbose=False),
}

In [14]:
results = []
names = []

for name, model in models.items():
    skfold = StratifiedKFold(n_splits=10)
    cv_results = cross_val_score(model, X_und, y_und, cv=skfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

GaussianNB: 0.965252 (0.032131)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression: 0.970970 (0.026409)
RandomForestClassifier: 0.975022 (0.026845)
GradientBoostingClassifier: 0.976508 (0.023652)
LGBMClassifier: 0.972043 (0.026710)
CatBoostClassifier: 0.978477 (0.022780)


In [16]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

## In the EDA notebook we saw that dimensionality reduction techniques were useful. Let's try predicting on them

In [30]:
X_new = df_und[['V12', 'V14', 'V10', 'V11', 'V17']]
y_new = df_und.iloc[:, -1]

In [29]:
results = []
names = []

for name, model in models.items():
    skfold = StratifiedKFold(n_splits=10)
    pipeline = Pipeline(steps=[
        ('dim_red', PCA()),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    cv_results = cross_val_score(pipeline, X_new, y_new, cv=skfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

GaussianNB: 0.960429 (0.034630)
LogisticRegression: 0.959548 (0.027139)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



RandomForestClassifier: 0.956959 (0.037925)
GradientBoostingClassifier: 0.956636 (0.031182)
LGBMClassifier: 0.952819 (0.038367)
CatBoostClassifier: 0.958152 (0.030567)


In [31]:
results = []
names = []

for name, model in models.items():
    skfold = StratifiedKFold(n_splits=10)
    pipeline = Pipeline(steps=[
        ('dim_red', PCA()),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    cv_results = cross_val_score(pipeline, X_new, y_new, cv=skfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

GaussianNB: 0.963266 (0.029932)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression: 0.958960 (0.029106)
RandomForestClassifier: 0.958221 (0.034424)
GradientBoostingClassifier: 0.937576 (0.051871)
LGBMClassifier: 0.952439 (0.035453)
CatBoostClassifier: 0.956920 (0.030310)
