In [149]:
# import libraries

# handle dataset 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score
import warnings

from sklearn import linear_model

# Data set from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data 

In [136]:
# split train test dataset 

df = pd.read_csv('creditcard.csv')

In [137]:
# define X and y variables as dataframe 
y = df['Class']
X = df.iloc[:, df.columns != 'Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=66)

In [138]:
from imblearn.over_sampling import SMOTE
# solve error by downgrading to install scikit-learn==1.2.2 
# ''' steps : 
# 1. install pip 
# 2. uninstall sci-kit 
# 3. uninstall imblearn
# 4. install sci-kit 1.2.2 
# 5. install imblearn 
# '''

In [139]:
# Over sample using SMOTE
# -- by inspecting the data, we see that the minority class is extremely class (fraud "Class" == 1)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [140]:
# -- Code to Inspect the data set -- 
# df_oversampled = X_res
# df_oversampled['Outcome_Variable'] = y_res
# df_oversampled
# fig, ax = plt.subplots(figsize=(10, 8))
# df_oversampled['Outcome_Variable'].value_counts().plot(kind='bar', ax=ax, fontsize=14)
# ax.set_title('Oversampled Dataset', fontsize=16)
# ax.set_ylabel('Observation counts', fontsize=14)
# ax.set_xlabel('Class', fontsize=14)
# plt.show()


In [141]:
# Set up K-Fold Cross Validation 
n_splits = 5
shuffle = True
random_state = 809
cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
# plot = plot_cv_indices(cv, X_res, y_res, n_splits)

In [142]:
def plot_cv_indices(cv, X, y, n_splits, lw=10):
    '''
    This function plots the Cross validation indices.
    '''
    
    fig, ax = plt.subplots(figsize = (15,8))
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=plt.cm.coolwarm,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes
    ax.scatter(range(len(X)), [ii + 1.5] * len(X), c=y, marker='_', lw=lw, cmap=plt.cm.Paired)

    # Formatting
    yticklabels = list(range(n_splits)) + ['Class']
    ax.set(yticks=np.arange(n_splits+1) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [148]:
# Model 1 : Logistic Model 
warnings.filterwarnings("default")

logistic_model = LogisticRegression(solver='lbfgs', max_iter=100)
for train_index, test_index in cv.split(X_res):
    # change to loc to define the rows in the dataframe 
    X_train, X_test, y_train, y_test = X_res.loc[train_index], X_res.loc[test_index], y_res[train_index], y_res[test_index]
    model.fit(X_train, y_train)
    #Cross-Validation Prediction Error
    score = model.score(X_test, y_test)
    

Score: 0.9742470872719279


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.9771485727475572


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score: 0.9766539531100583
Score: 0.9762692489475593
Score: 0.9777311247650556


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [144]:
model = LogisticRegression()
model.fit(X_res, y_res)    
score_OOS = model.score(X_test, y_test)
print (score_OOS)
y_pred = model.predict(X_test)
print("Logistic Model Recall : " , recall_score(y_test, y_pred))

0.9772365051275569
Logistic Model Recall :  0.9687935111720114


In [145]:
print("Logistic Model Precision : ", precision_score(y_test,y_pred))

Logistic Model Precision :  0.9851890142491122


In [163]:
# Model 2 : LASSO 

cross_validate_result = {}  
# Cross Validate the penalty term in lasso
iter_alpha = 0.01
for penalty_term in range(100): 
    accuracies = [] 
    lasso = linear_model.Lasso(alpha=iter_alpha)
    for train_index, test_index in cv.split(X_res):
        # change to loc to define the rows in the dataframe 
        X_train, X_test, y_train, y_test = X_res.loc[train_index], X_res.loc[test_index], y_res[train_index], y_res[test_index]
        model = lasso.fit(X_train, y_train)
        #Cross-Validation Prediction Error
        score = model.score(X_test, y_test)
        accuracies.append(score)
    cross_validate_result[penalty_term] = (sum(accuracies)/len(accuracies))
    iter_alpha += 0.05
    print("Alpha : " + str(iter_alpha) + " " + str((sum(accuracies)/len(accuracies))))
print(cross_validate_result)
print(max(cross_validate_result, key=cross_validate_result.get))
    


Alpha : 0.060000000000000005 0.6871101974673335
Alpha : 0.11000000000000001 0.6605438044286925
Alpha : 0.16000000000000003 0.6536410838779648
Alpha : 0.21000000000000002 0.6442798366140133
Alpha : 0.26 0.6320614260441935
Alpha : 0.31 0.6203177912573684
Alpha : 0.36 0.6145625222833477
Alpha : 0.41 0.6077971194414591
Alpha : 0.45999999999999996 0.6000217808070734
Alpha : 0.51 0.5912374956045621
Alpha : 0.56 0.5814424355211354
Alpha : 0.6100000000000001 0.5706385495553142
Alpha : 0.6600000000000001 0.5588239032962512
Alpha : 0.7100000000000002 0.5459993396007043
Alpha : 0.7600000000000002 0.5321652700410698


KeyboardInterrupt: 

In [167]:
# Somehow LASSO is performing worst?? 
model = linear_model.LinearRegression()
model.fit(X_res, y_res)    
score_OOS = model.score(X_test, y_test)
y_pred = model.predict(X_test)
print(score_OOS)

model = linear_model.Lasso(alpha=0.01)
model.fit(X_res, y_res)
score_OOS = model.score(X_test, y_test)
y_pred = model.predict(X_test)
print(score_OOS)

0.7007530999881367
0.6888412900277909
