# In this notebook I will present my models which used for predictions

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df = pd.read_csv('../creditcard.csv')

In [4]:
X = df.iloc[:, :-1]
y = df['Class']

In [5]:
und = RandomUnderSampler()
X_und, y_und = und.fit_resample(X, y)

In [6]:
Counter(y_und)

Counter({0: 492, 1: 492})

## Lets build our first Pipeline

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

In [40]:
algos = {
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GaussianNB' : GaussianNB(),
    'LogisticRegression' : LogisticRegression(),
}

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_und, y_und)

In [42]:
for name, classifier in algos.items():
    pipeline = Pipeline(steps=[
        ('classifier', classifier)
    ])
    pipeline.fit(X_train, y_train)
    y_predicted=pipeline.predict_proba(X_test)
    print(name,'ROC SCORE', roc_auc_score(y_test, y_predicted[:, 1]))

DecisionTreeClassifier ROC SCORE 0.8812296227293899
KNeighborsClassifier ROC SCORE 0.6506420919555526
GaussianNB ROC SCORE 0.9372879100405882
LogisticRegression ROC SCORE 0.9592121897664515


### As we can see Gaussian Naive Bayes along side with Logistic regression performed well on the data
### Now I will try ensemble methods

In [43]:
ensem = {
    'RandomForestClassifier' : RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LGBMClassifier' : LGBMClassifier(),
    'CatBoostClassifier' : CatBoostClassifier(verbose=False),
}

In [46]:
for name, classifier in ensem.items():
    pipe_en = Pipeline(steps=[
        ('classifier', classifier)
    ])
    pipe_en.fit(X_train, y_train)
    y_predicted=pipe_en.predict_proba(X_test)
    print(name,'ROC SCORE', roc_auc_score(y_test, y_predicted[:, 1]))

RandomForestClassifier ROC SCORE 0.9547874110053896
GradientBoostingClassifier ROC SCORE 0.968327899394504
LGBMClassifier ROC SCORE 0.966864062811897
CatBoostClassifier ROC SCORE 0.9672632909707898


## The accuracy might change depending on train test split so we will try cross validation 

In [45]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [47]:
models = {
    'GaussianNB' : GaussianNB(),
    'LogisticRegression' : LogisticRegression(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LGBMClassifier' : LGBMClassifier(),
    'CatBoostClassifier' : CatBoostClassifier(verbose=False),
}

In [52]:
results = []
names = []

for name, model in models.items():
    skfold = StratifiedKFold(n_splits=10)
    cv_results = cross_val_score(model, X_und, y_und, cv=skfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

GaussianNB: 0.963519 (0.030133)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression: 0.974657 (0.021839)
RandomForestClassifier: 0.976802 (0.025509)
GradientBoostingClassifier: 0.976709 (0.023261)
LGBMClassifier: 0.976694 (0.023696)
CatBoostClassifier: 0.982542 (0.019218)
