In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("creditcard_cleaned.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,7418,-0.201182,-1.785541,-1.38638,2.199543,1.525807,2.954481,0.264425,-0.239757,0.953239,...,-0.131055,0.662084,-0.086671,-0.118522,0.700997,0.380318,0.197355,-0.896647,-0.409293,0
1,154234,-0.349231,0.128667,-23.984747,16.697832,-22.209875,9.584969,-16.230439,2.596333,-33.239328,...,5.804551,-12.615023,5.774087,2.750221,0.513411,-1.608804,-0.459624,-4.626127,-0.334561,1
2,173753,-0.209898,0.56465,0.345932,-0.024238,-0.249973,-2.121791,0.461026,-0.265107,0.399168,...,0.177209,0.557694,1.489021,-0.219031,0.356435,0.070467,-0.027019,-0.007897,-0.016354,0
3,222133,0.225693,1.011331,-3.61385,-0.922136,-4.749887,3.373001,-0.545207,-1.171301,-4.172315,...,-0.320541,0.786787,0.893065,1.034907,0.097671,-1.345551,-0.788329,1.055442,0.099971,1
4,15506,0.046539,-1.430146,-21.885434,12.930505,-24.098872,6.203314,-16.466099,-4.459842,-16.519836,...,1.611998,1.762232,-1.579055,-0.951043,0.134565,1.50711,-0.222671,1.527655,0.453699,1


In [4]:
df = df.drop(['Unnamed: 0'], axis = 1)

#### After dropping the columns which have least correlation with 'class' we observe a rise in accuracy

In [5]:
df.drop(['scaled_amount','scaled_time','V8','V13','V15','V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'], axis=1, inplace=True)

In [6]:
X = df.drop('Class', axis = 1)
y = df['Class']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=10000,random_state=0),
    "Logistic Regression with Cross Validation": LogisticRegressionCV(max_iter = 10000, cv=5, random_state=0),
    "Support Vector Classifier": SVC(kernel='linear'),
    "One vs One SVC": NuSVC(),
    "Linear SVC": LinearSVC(),
    "Decision Tree Classifer": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=20,criterion='entropy',random_state=0)
}

In [9]:
from sklearn.model_selection import cross_val_score
scores = []
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "has a training score of", round(training_score.max(), 2) * 100)

Classifiers:  LogisticRegression has a training score of 98.0
Classifiers:  LogisticRegressionCV has a training score of 98.0
Classifiers:  SVC has a training score of 97.0
Classifiers:  NuSVC has a training score of 97.0
Classifiers:  LinearSVC has a training score of 97.0
Classifiers:  DecisionTreeClassifier has a training score of 94.0
Classifiers:  RandomForestClassifier has a training score of 97.0


In [10]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict

for key, classifier in classifiers.items():
    log_pred = cross_val_predict(LogisticRegression(max_iter=10000,random_state=0), X_train, y_train, cv=5)
    log_cv_pred = cross_val_predict(LogisticRegressionCV(max_iter = 10000, cv=5, random_state=0), X_train, y_train, cv=5)
    svc_pred = cross_val_predict(SVC(kernel='linear'), X_train, y_train, cv=5)
    nusvc_pred = cross_val_predict(NuSVC(), X_train, y_train, cv=5)
    linear_svc_pred = cross_val_predict(LinearSVC(), X_train, y_train, cv=5)
    decision_tree_pred = cross_val_predict(DecisionTreeClassifier(criterion='entropy', random_state=0), X_train, y_train, cv=5)
    random_forest_pred = cross_val_predict(RandomForestClassifier(n_estimators=20,criterion='entropy',random_state=0), X_train, y_train, cv=5)

In [11]:
from sklearn.metrics import roc_auc_score

print('Logistic Regression', roc_auc_score(y_train, log_pred))
print('Logistic Regression CV', roc_auc_score(y_train, log_cv_pred))
print('SVC', roc_auc_score(y_train, svc_pred))
print('NuSMC', roc_auc_score(y_train, nusvc_pred))
print('Linear SVC', roc_auc_score(y_train, linear_svc_pred))
print('Decision tree', roc_auc_score(y_train, decision_tree_pred))
print('Random Forest', roc_auc_score(y_train, random_forest_pred))

Logistic Regression 0.9392993342382523
Logistic Regression CV 0.9479413095468942
SVC 0.945472173744425
NuSMC 0.9123779975437917
Linear SVC 0.943003037941956
Decision tree 0.9012507271669575
Random Forest 0.9390763363712753


Both Logistic Regression with Cross validation and Support Vector Classifer give a very high roc-auc score

In [12]:
class_names = ['Fraud', 'Non-Fraud']
for key, classifier in classifiers.items():
    y_pred = classifier.predict(X_test)
    print("Classifiers: ", classifier.__class__.__name__)
    print(classification_report(y_test, y_pred, target_names=class_names))
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print("------------------------------------------------------------")

Classifiers:  LogisticRegression
              precision    recall  f1-score   support

       Fraud       0.88      0.97      0.92        87
   Non-Fraud       0.97      0.89      0.93       110

    accuracy                           0.92       197
   macro avg       0.92      0.93      0.92       197
weighted avg       0.93      0.92      0.92       197

[[84  3]
 [12 98]]
0.9238578680203046
------------------------------------------------------------
Classifiers:  LogisticRegressionCV
              precision    recall  f1-score   support

       Fraud       0.88      0.98      0.92        87
   Non-Fraud       0.98      0.89      0.93       110

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197

[[85  2]
 [12 98]]
0.9289340101522843
------------------------------------------------------------
Classifiers:  SVC
              precision    recall  f1-score   support

       

SVC(Support Vector Classifier) is the best model among others.
The F1 score is higher than other algorithms.
The trade-off between precision and recall is well balanced.

In [13]:
classifier = SVC(kernel='linear')
joblib.dump(classifier,"model.joblib")

['model.joblib']