In [108]:
# import libraries

# handle dataset 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split


from sklearn import linear_model

# Data set from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data 

## Pre-Processing 
- Read the CSV into a dataframe 
- Drop some columns that do we do not want as the independent variables (Xs)

In [46]:
# split train test dataset 

df = pd.read_csv('creditcard.csv')

In [89]:
# define X and y variables as dataframe 
y = df['Class']
X = df.iloc[:, df.columns != 'Class']
X = X.drop("Time",axis=1)
print(X.head())
X_train, X_OOS_test, y_train, y_OOS_test = train_test_split(X, y, test_size=0.20, random_state=66)

         V1        V2        V3        V4        V5        V6        V7  \
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10  ...       V20       V21       V22       V23  \
0  0.098698  0.363787  0.090794  ...  0.251412 -0.018307  0.277838 -0.110474   
1  0.085102 -0.255425 -0.166974  ... -0.069083 -0.225775 -0.638672  0.101288   
2  0.247676 -1.514654  0.207643  ...  0.524980  0.247998  0.771679  0.909412   
3  0.377436 -1.387024 -0.054952  ... -0.208038 -0.108300  0.005274 -0.190321   
4 -0.270533  0.817739  0.753074  ...  0.408542 -0.009431  0.798278 -0.137458   

        V24       V25       V26       V27       V28  Amount  
0  0.0

## Oversampling 
Since we see that the data is highly imbalanced, we try to oversample the data set with SMOTE. 

In [63]:
from imblearn.over_sampling import SMOTE
# solve error by downgrading to install scikit-learn==1.2.2 
# ''' steps : 
# 1. install pip 
# 2. uninstall sci-kit 
# 3. uninstall imblearn
# 4. install sci-kit 1.2.2 
# 5. install imblearn 
# '''

In [77]:
# Over sample using SMOTE
# -- by inspecting the data, we see that the minority class is extremely class (fraud "Class" == 1)
sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X_train, y_train)

In [7]:
# -- Code to Inspect the data set -- 
# df_oversampled = X_smote
# df_oversampled['Outcome_Variable'] = y_smote
# df_oversampled
# fig, ax = plt.subplots(figsize=(10, 8))
# df_oversampled['Outcome_Variable'].value_counts().plot(kind='bar', ax=ax, fontsize=14)
# ax.set_title('Oversampled Dataset', fontsize=16)
# ax.set_ylabel('Observation counts', fontsize=14)
# ax.set_xlabel('Class', fontsize=14)
# plt.show()


## K-Fold Cross Validation 
We setup K-Fold cross validation for parameters fine-tuning in the models below as a 5 Fold. 

In [110]:
# Set up K-Fold Cross Validation 
n_splits = 5
shuffle = True
random_state = 809
cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
# plot = plot_cv_indices(cv, X_smote, y_smote, n_splits)

## Model 1 : Logistic Regression 

In [9]:
def plot_cv_indices(cv, X, y, n_splits, lw=10):
    '''
    This function plots the Cross validation indices.
    '''
    
    fig, ax = plt.subplots(figsize = (15,8))
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=plt.cm.coolwarm,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes
    ax.scatter(range(len(X)), [ii + 1.5] * len(X), c=y, marker='_', lw=lw, cmap=plt.cm.Paired)

    # Formatting
    yticklabels = list(range(n_splits)) + ['Class']
    ax.set(yticks=np.arange(n_splits+1) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [130]:
# Model 1 : Logistic Model 
logistic_model = LogisticRegression(solver='lbfgs', max_iter=300)
for train_index, test_index in cv.split(X_smote):
    # change to loc to define the rows in the dataframe 
    X_cv_train, X_cv_test, y_cv_train, y_cv_test = X_smote.loc[train_index], X_smote.loc[test_index], y_smote[train_index], y_smote[test_index]
    logistic_model.fit(X_cv_train, y_cv_train)
    #Cross-Validation Prediction Error
    score = logistic_model.score(X_cv_test, y_cv_test)
    print(score)

In [121]:
logit = LogisticRegression(solver='lbfgs', max_iter=1000)
logit.fit(X_smote, y_smote)    
score_OOS = logit.score(X_OOS_test, y_OOS_test)
print ("Logistic Model score :" , score_OOS)
y_pred = logit.predict(X_OOS_test)
print("Logistic Model Recall : " , recall_score(y_OOS_test, y_pred))
print("Logistic Model Precision : ", precision_score(y_OOS_test,y_pred))
print("Probability if you only predict 0s ", 1-round(95/56867,3))

Logistic Model score : 0.9846037709350093
Logistic Model Recall :  0.8631578947368421
Logistic Model Precision :  0.08668076109936575
Probability if you only predict 0s  0.998




## Model 2 : Lasso 

In [15]:
# Model 2 : LASSO 

# Cross Validate the penalty term in lasso
cross_validate_result = {}  
iter_alpha = 0.01
for penalty_term in range(100): 
    print(penalty_term)
    accuracies = [] 
    lasso = linear_model.Lasso(alpha=iter_alpha)
    for train_index, test_index in cv.split(X_smote):
        # change to loc to define the rows in the dataframe 
        X_cv_train, X_cv_test, y_cv_train, y_cv_test = X_smote.loc[train_index], X_smote.loc[test_index], y_smote[train_index], y_smote[test_index]
        logistic_model.fit(X_cv_train, y_cv_train)
        #Cross-Validation Prediction Error
        score = logistic_model.score(X_cv_test, y_cv_test)
        accuracies.append(score)
    cross_validate_result[penalty_term] = (sum(accuracies)/len(accuracies))
    iter_alpha += 0.05
    print("Alpha : " + str(iter_alpha) + " " + str((sum(accuracies)/len(accuracies))))
print(cross_validate_result)
print(max(cross_validate_result, key=cross_validate_result.get))
    

### OLS vs Lasso vs Ridge regression

In [120]:
# OLS 
ols = linear_model.LinearRegression()
ols.fit(X_smote, y_smote)    
y_pred = ols.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("OLS Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("OLS Model Precision : ", precision_score(y_OOS_test,y_pred_classification))
print("--")

# LASSO 
lasso = linear_model.Lasso(alpha=1)
lasso.fit(X_smote, y_smote)
y_pred = lasso.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("LASSO score : " , )
print("LASSO Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("LASSO Model Precision : ", precision_score(y_OOS_test,y_pred_classification))
print("--")

# RIDGE
ridge = linear_model.Ridge(alpha=1)
ridge.fit(X_smote, y_smote)
y_pred = ridge.predict(X_OOS_test)
# turn the continous value into classification via simple >= 0.5 is 1 
y_pred_classification = [1 if x >= 0.5 else 0 for x in y_pred]
print("Ridge Model Recall : " , recall_score(y_OOS_test, y_pred_classification))
print("Ridge Model Precision : ", precision_score(y_OOS_test,y_pred_classification))

OLS Model Recall :  0.7684210526315789
OLS Model Precision :  0.08805790108564536
--
LASSO score : 
LASSO Model Recall :  0.7789473684210526
LASSO Model Precision :  0.09762532981530343
--
Ridge Model Recall :  0.7684210526315789
Ridge Model Precision :  0.08805790108564536


Looks like the curse of dimensionality. 

## Random Forest 

In [122]:

random_forest = RandomForestClassifier(n_estimators = 100, max_depth=10, random_state=0)
random_forest.fit(X_smote, y_smote)


In [123]:
y_pred = random_forest.predict(X_OOS_test)
print("RND Forest Model Recall : " , recall_score(y_OOS_test, y_pred))
print("RND Forest Precision : ", precision_score(y_OOS_test,y_pred))

RND Forest Model Recall :  0.8526315789473684
RND Forest Precision :  0.4879518072289157


## Random forest with LASSO selected variables 

In [101]:
print(X_smote.columns)
print(lasso.coef_)
print(lasso.sparse_coef_)

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')
[-0.          0.         -0.00326987  0.         -0.         -0.
 -0.          0.         -0.         -0.          0.         -0.
 -0.         -0.03410666 -0.         -0.         -0.         -0.
  0.          0.          0.         -0.         -0.         -0.
  0.         -0.          0.          0.          0.00014124]
  (0, 2)	-0.003269866219385262
  (0, 13)	-0.03410666159760446
  (0, 28)	0.00014124466427369918


In [94]:
# Random forest + LASSO 
X_train_rf_lasso = X_smote[["V3","V14","Amount"]]
rf_lasso = RandomForestClassifier(max_depth=2)
rf_lasso.fit(X_train_rf_lasso, y_smote)



  (0, 2)	-0.003269866219385262
  (0, 13)	-0.03410666159760446
  (0, 28)	0.00014124466427369918


In [95]:
X_test_rf_lasso_OOS = X_OOS_test[["V3","V14","Amount"]]
y_pred = clf.predict(X_test_rf_lasso_OOS)
print("RND Forest with LASSO Model Recall : " , recall_score(y_OOS_test, y_pred))
print("RND Forest with LASSO Precision : ", precision_score(y_OOS_test,y_pred))

RND Forest with LASSO Model Recall :  0.8526315789473684
RND Forest with LASSO Precision :  0.0472027972027972


## Using Param Grid to search for best value with K-Fold Cross Validation 

In [128]:

param_grid = [{'n_estimators': [200,225,250], 'max_depth' :[10,20,30]}]
random_forest_cv = RandomForestClassifier()
grid_cv = GridSearchCV(estimator=random_forest_cv, cv=cv, param_grid=param_grid, n_jobs = 3)
grid_cv.fit(X_train_rf_lasso, y_smote)

In [129]:
print(grid_cv.best_estimator_)


RandomForestClassifier(max_depth=30, n_estimators=250)
