In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

In [3]:
# Static analysis of the dataset
# Load creditcard.csv from the working directory into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [4]:
# Report the dataset dimensions as (number of rows, number of columns)
dataset_shape = df.shape
print("Dataset's Shape:", dataset_shape)

# Preview the first rows of the frame
first_rows = df.head()
print(first_rows)
# Class: 1 - fraudulent, 0 - non-fraudulent

Dataset's Shape: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

  

In [26]:
# Data cleaning
# Count the missing values in every column and show the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [6]:
# Standardise the 'Amount' and 'Time' columns
from sklearn.preprocessing import StandardScaler

# Each column gets its own freshly fitted scaler; 'Amount' is processed first,
# then 'Time', and the scaled values overwrite the original columns in df.
# NOTE(review): the scalers are fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook - confirm this is intended.
for column_name in ('Amount', 'Time'):
    column_as_2d = df[column_name].values.reshape(-1, 1)
    df[column_name] = StandardScaler().fit_transform(column_as_2d)

print(df.head())

       Time        V1        V2        V3        V4        V5        V6  \
0 -1.996583 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
1 -1.996583  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361   
2 -1.996562 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499   
3 -1.996562 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203   
4 -1.996541 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921   

         V7        V8        V9  ...       V21       V22       V23       V24  \
0  0.239599  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928   
1 -0.078803  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846   
2  0.791461  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281   
3  0.237609  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575   
4  0.592941 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28    Amount  Class  
0  0.12

In [9]:
print("Isolation Forest")
# Configure the Isolation Forest detector
clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.01,
    max_features=1.0,
    random_state=42,
    n_jobs=-1,
)

# Only these two columns are used to fit the detector and tag outliers
features = ['Time', 'Amount']
df_features = df[features]

# Fit on the selected columns, then label every row of the same frame
y_pred = clf.fit(df_features).predict(df_features)

# Rows labelled -1 are the ones counted as outliers
outlier_mask = y_pred == -1
no_of_outliers = outlier_mask.sum()
print("Number of outliers:", no_of_outliers)

Isolation Forest
Number of outliers: 2849


In [12]:
# One-Class SVM (Support Vector Machine)
print("\nOne Class SVM")
# Build the One-Class SVM detector with an RBF kernel
clf = OneClassSVM(nu=0.01, gamma=0.001, kernel='rbf')

# Fit on df_features, then label every row of the same frame
y_pred = clf.fit(df_features).predict(df_features)

# Rows labelled -1 are the ones counted as outliers
outlier_mask = y_pred == -1
no_of_outliers = outlier_mask.sum()
print("Number of outliers:", no_of_outliers)


One Class SVM
Number of outliers: 2842


In [23]:
# Evaluation and model selection
# Separate the target column (y) from the remaining predictor columns (X)
y = df['Class']
X = df.drop(columns='Class')

# Hold out 30% of the rows for testing; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then reuse it on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an Isolation Forest on the scaled training data
outlier_clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.01,
    max_features=1.0,
    random_state=42,
)
outlier_clf.fit(X_train_scaled)

# Label the training split (-1 marks an outlier) and report the count
y_train_pred = outlier_clf.predict(X_train_scaled)
train_outlier_mask = y_train_pred == -1
no_of_outliers = train_outlier_mask.sum()
print("Number of outliers found in training dataset :", no_of_outliers)

# Same for the test split, using the detector fitted on the training data
y_test_pred = outlier_clf.predict(X_test_scaled)
test_outlier_mask = y_test_pred == -1
no_of_outliers = test_outlier_mask.sum()
print("Number of outliers found in testing dataset :", no_of_outliers)

Number of outliers found in training dataset : 1994
Number of outliers found in testing dataset : 842


In [24]:
# Keep only the rows that were not labelled as outliers (-1) by the detector
# NOTE(review): the test split is filtered as well, so metrics computed on
# X_test_cleaned / y_test_cleaned describe only the retained rows - confirm
# this is the intended evaluation set.
train_inliers = y_train_pred != -1
test_inliers = y_test_pred != -1

X_train_cleaned, y_train_cleaned = X_train_scaled[train_inliers], y_train[train_inliers]
X_test_cleaned, y_test_cleaned = X_test_scaled[test_inliers], y_test[test_inliers]

In [22]:
# Candidate estimators. The decision tree is seeded so that repeated runs give
# the same result, matching the random_state=42 used by the other estimators
# and by train_test_split in this notebook.
classifiers = [LogisticRegression(), DecisionTreeClassifier(random_state=42)]

# Hyper-parameter grids, listed in the same order as `classifiers`
lr_params = {'penalty': ['l2'], 'C': [0.1, 1, 10]}
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7]}
# Grids for a random forest / k-NN model. No matching estimator is present in
# `classifiers`, so these two grids are defined but never searched.
rf_params = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7]}
knn_params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
param_grids = [lr_params, dt_params, rf_params, knn_params]

# Pair every estimator with its grid explicitly rather than through a shared
# index; zip stops at the shorter list, so the extra grids are ignored.
for classifier, param_grid in zip(classifiers, param_grids):
    # Rank candidates by F1 of the positive class (Class == 1). The default
    # scorer is accuracy, which is ~1.00 for every candidate on a target this
    # imbalanced and therefore cannot separate good from bad settings.
    clf = GridSearchCV(classifier, param_grid, cv=5, scoring='f1')
    clf.fit(X_train_cleaned, y_train_cleaned)
    print(classifier.__class__.__name__)
    print(clf.best_params_)
    # After the loop, y_pred holds the predictions of the last estimator only
    y_pred = clf.predict(X_test_cleaned)
    print(classification_report(y_test_cleaned, y_pred))


LogisticRegression
{'C': 0.1, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84547
           1       0.94      0.54      0.68        54

    accuracy                           1.00     84601
   macro avg       0.97      0.77      0.84     84601
weighted avg       1.00      1.00      1.00     84601

DecisionTreeClassifier
{'criterion': 'entropy', 'max_depth': 3}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84547
           1       0.90      0.65      0.75        54

    accuracy                           1.00     84601
   macro avg       0.95      0.82      0.88     84601
weighted avg       1.00      1.00      1.00     84601



In [25]:
# Score the predictions currently held in y_pred against the cleaned test
# labels. y_pred is whatever the preceding grid-search loop assigned last,
# i.e. the predictions of the final estimator in `classifiers`.
acc, prec, rec, f1 = (
    score_fn(y_test_cleaned, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four classification metrics, one per line
print(
    f"Accuracy: {acc}",
    f"Precision: {prec}",
    f"Recall: {rec}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.9997281356012341
Precision: 0.8974358974358975
Recall: 0.6481481481481481
F1 Score: 0.7526881720430108
