In [1]:
import pandas as pd

df = pd.read_csv("./data/cleaned_creditcard.csv")
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_amount,scaled_time
0,-0.911547,0.848781,2.058419,-0.369295,0.534314,0.293869,0.606555,0.202376,-0.55739,-0.821807,...,0.77149,-0.540937,-0.460555,0.830709,-0.516034,0.063451,0.055119,0,-0.013973,0.462376
1,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,-5.052502,...,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1,-0.29344,-0.19367
2,0.376472,0.07403,-0.557307,-1.619405,0.106406,-0.125209,0.07317,0.023187,-0.780447,0.069638,...,0.459547,-0.09469,-1.216613,-0.673362,-0.215961,0.001256,0.066803,0,0.276672,0.445987
3,-2.356348,1.74636,-6.374624,1.772205,-3.439294,1.457811,-0.362577,1.443791,-1.927359,-6.564659,...,0.621203,0.964817,-0.619437,-1.732613,0.108361,1.130828,0.415703,1,9.8639,0.637343
4,-0.234922,0.355413,1.972183,-1.255593,-0.681387,-0.665732,0.05911,-0.003153,1.122451,-1.481246,...,0.912107,-0.286338,0.451208,0.188315,-0.531846,0.123185,0.039581,1,-0.29344,-0.47181


In [2]:
X = df.drop('Class', axis=1)
Y = df['Class']

In [3]:
from sklearn.model_selection import train_test_split

# split the dataset into dependent and independent features
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.20, random_state=42)

In [4]:
# Turn the values into an array for feeding the classification algorithms.
x_train = x_train.values
x_test = x_test.values
y_train = y_train.values
y_test = y_test.values

Model Building
1. Logistic Regression
2. KNN
3. SVC
4. DecisionTreeClassifier
5. RandomForestClassifier

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier()
}

In [7]:
from sklearn.model_selection import cross_val_score

for key, classifier in classifiers.items():
    classifier.fit(x_train,y_train)
    training_score = cross_val_score(classifier, x_train, y_train,cv=10)
    print()
    print(classifier, " : Training accuracy score is", round(training_score.mean(),2)*100, "%")


LogisticRegression()  : Training accuracy score is 93.0 %

KNeighborsClassifier()  : Training accuracy score is 92.0 %

SVC()  : Training accuracy score is 93.0 %

DecisionTreeClassifier()  : Training accuracy score is 90.0 %

RandomForestClassifier()  : Training accuracy score is 93.0 %


In [9]:
# Prediction

for key, classifier in classifiers.items():
    prediction_score = classifier.predict(x_test)
    print(classifier, " : Prediction accuracy score is", round(prediction_score.mean(),2)*100, "%")

LogisticRegression()  : Prediction accuracy score is 53.0 %
KNeighborsClassifier()  : Prediction accuracy score is 52.0 %
SVC()  : Prediction accuracy score is 52.0 %
DecisionTreeClassifier()  : Prediction accuracy score is 56.99999999999999 %
RandomForestClassifier()  : Prediction accuracy score is 54.0 %


# Observation

Clearly we can see this is the case of overfitting. Here bias is hight but varience is low. This leads to overfitting.

To overcome overfitting, Lets use grid search cv for getting best parameter

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")


classifiers = {
    'log_reg': LogisticRegression(solver='liblinear'),
    'knears_neighbors': KNeighborsClassifier(),
    'svc': SVC(),
    'tree_clf': DecisionTreeClassifier(),
    'forest_clf': RandomForestClassifier()
}

def bestparameter():
    param_list = {
        'log_reg': {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        'knears_neighbors': {"n_neighbors": list(range(2, 5, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
        'svc': {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
        'tree_clf': {"criterion": ["gini", "entropy"], "max_depth": list(range(2, 4, 1)),
                     "min_samples_leaf": list(range(5, 7, 1))},
        'forest_clf': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    }

    for clf_name, clf in classifiers.items():
        random_search = RandomizedSearchCV(clf, param_distributions=param_list[clf_name], n_iter=10, cv=5,
                                           scoring='accuracy', random_state=42)
        random_search.fit(x_train, y_train)
        print(f"Best Parameters for {clf_name}: {random_search.best_params_}")

        # Update the best estimator in classifiers dictionary
        classifiers[clf_name] = random_search.best_estimator_



def validation_score():

    print("\n\n==============================================================")
    for clf_name, clf in classifiers.items():
        clf_score = cross_val_score(clf, x_train, y_train, cv=5)
        
        print(f'Cross Validation Score of {clf} : \n-- {round(clf_score.mean() * 100, 2)}%')
    print("\n================================================================")

    # Assuming x_test and y_test are your test data
    for clf_name, clf in classifiers.items():
        # Make predictions on the test set
        y_pred = clf.predict(x_test)

        # Evaluate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Test Accuracy of {clf} : \n-- {round(accuracy * 100, 2)}%')


bestparameter()
validation_score()

Best Parameters for log_reg: {'penalty': 'l1', 'C': 0.1}
Best Parameters for knears_neighbors: {'n_neighbors': 3, 'algorithm': 'brute'}
Best Parameters for svc: {'kernel': 'linear', 'C': 0.9}
Best Parameters for tree_clf: {'min_samples_leaf': 5, 'max_depth': 3, 'criterion': 'entropy'}
Best Parameters for forest_clf: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}


Cross Validation Score of LogisticRegression(C=0.1, penalty='l1', solver='liblinear') : 
-- 94.54%
Cross Validation Score of KNeighborsClassifier(algorithm='brute', n_neighbors=3) : 
-- 93.39%
Cross Validation Score of SVC(C=0.9, kernel='linear') : 
-- 94.92%
Cross Validation Score of DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5) : 
-- 91.87%
Cross Validation Score of RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=200) : 
-- 93.64%

Test Accuracy of LogisticRegression(C=0.1, penalty='l1', solver='liblinear') : 
-- 91.88%
Test Accur