# Logistische Regression CV

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

os.getcwd()
%cd "C:\Users\Dennis\OneDrive\Dokumente\03_Master BAOR\05_Kurse\01_Business Analytics\04_Data Analytics Challenge"

ccdata = pd.read_csv('creditcard.csv')

C:\Users\Dennis\OneDrive\Dokumente\03_Master BAOR\05_Kurse\01_Business Analytics\04_Data Analytics Challenge


In [2]:
# Small dataset
#ccdata = ccdata.iloc[:100000, :]

# Balance of dataset target values
display(ccdata.Class.value_counts())

# Drop feature 'Time'
ccdata.drop('Time', axis=1, inplace=True)

0    284315
1       492
Name: Class, dtype: int64

In [3]:
X = ccdata.iloc[:, :-1].to_numpy()
y = ccdata.iloc[:, -1].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.95, random_state=1, stratify=ccdata['Class'], shuffle=True)

print('{}\n'.format(Counter(y_train)))
print('Ratio of target value 1 in ccdata: {:.2f} %'.format(100*np.sum(ccdata['Class'] == 1)/len(ccdata['Class'])))
print('Ratio of target value 1 in train_set: {:.2f} %'.format(100*np.sum(y_train == 1)/len(y_train)))
print('Ratio of target value 1 in test_set: {:.2f} %'.format(100*np.sum(y_test == 1)/len(y_test)))

Counter({0: 270099, 1: 467})

Ratio of target value 1 in ccdata: 0.17 %
Ratio of target value 1 in train_set: 0.17 %
Ratio of target value 1 in test_set: 0.18 %


In [25]:
# Smote the dataset until its balanced
smote = SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5)
X_res, y_res = smote.fit_resample(X_train, y_train)
Counter(y_res)

Counter({0: 270099, 1: 270099})

### Cross-Validation
### L1 Regularization

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

max_R2 = -np.inf
best_expo = None
reg_terms = [0.1, 1, 10]
reg_terms2 = [10**i for i in [-3, -2, -1, 0, 1, 2, 3]] #[0.001, 0.01, 0.1, 1, 10, 100, 1000]

for term in reg_terms2: #find the optimal hyperparameter for C
  pipe = Pipeline([("scaler", StandardScaler()), #first: name of pipeline, second: Transformer or Predictor
                  ("LogReg", LogisticRegression(penalty='l1', C=term, max_iter=3000, solver='liblinear'))])

  scores = cross_val_score(pipe,
                          X_train,
                          y_train,
                          cv=5, # number of folds
                          scoring='roc_auc')

  cv_R2 = np.mean(scores)
  if cv_R2 > max_R2:
    max_R2 = cv_R2
    best_expo = term
  print(f"mean CV R^2 ({term=}):", np.round(cv_R2, 4))
  print(f"std CV R^2 ({term=}):", np.round(np.std(scores), 4), '\n')

print("Best CV R^2:", max_R2)
print("Best expo:", best_expo)

mean CV R^2 (term=0.001): 0.9468
std CV R^2 (term=0.001): 0.0063 

mean CV R^2 (term=0.01): 0.9709
std CV R^2 (term=0.01): 0.0093 

mean CV R^2 (term=0.1): 0.9786
std CV R^2 (term=0.1): 0.0074 

mean CV R^2 (term=1): 0.9765
std CV R^2 (term=1): 0.0071 

mean CV R^2 (term=10): 0.9761
std CV R^2 (term=10): 0.0069 

mean CV R^2 (term=100): 0.9761
std CV R^2 (term=100): 0.0069 

mean CV R^2 (term=1000): 0.9761
std CV R^2 (term=1000): 0.0069 

Best CV R^2: 0.9785761933580515
Best expo: 0.1


### L2 Regularization

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

max_R2 = -np.inf
best_expo = None
reg_terms = [0.1, 1, 10]
reg_terms2 = [10**i for i in [-3, -2, -1, 0, 1, 2, 3]] #[0.001, 0.01, 0.1, 1, 10, 100, 1000]

for term in reg_terms2:
  pipe = Pipeline([("scaler", StandardScaler()), #first: name of pipeline, second: Transformer or Predictor
                  ("LogReg", LogisticRegression(penalty='l2', C=term, max_iter=3000, solver='lbfgs'))]) #solver='saga'

  scores = cross_val_score(pipe,
                          X_train,
                          y_train,
                          cv=5, # number of folds
                          scoring='roc_auc')
  #print(scores)
  cv_R2 = np.mean(scores)
  if cv_R2 > max_R2:
    max_R2 = cv_R2
    best_expo = term
  print(f"mean CV R^2 ({term=}):", np.round(cv_R2, 4))
  print(f"std CV R^2 ({term=}):", np.round(np.std(scores), 4), '\n')

print("Best CV R^2:", max_R2)
print("Best expo:", best_expo)

mean CV R^2 (term=0.001): 0.982
std CV R^2 (term=0.001): 0.0076 

mean CV R^2 (term=0.01): 0.9813
std CV R^2 (term=0.01): 0.0073 

mean CV R^2 (term=0.1): 0.9775
std CV R^2 (term=0.1): 0.0074 

mean CV R^2 (term=1): 0.9763
std CV R^2 (term=1): 0.007 

mean CV R^2 (term=10): 0.9761
std CV R^2 (term=10): 0.0069 

mean CV R^2 (term=100): 0.9761
std CV R^2 (term=100): 0.0069 

mean CV R^2 (term=1000): 0.9761
std CV R^2 (term=1000): 0.0069 

Best CV R^2: 0.9819651806297547
Best expo: 0.001


### L1 and L2 Regularization

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

max_R2 = -np.inf
best_expo = None
reg_terms = [0.1, 1, 10]
reg_terms2 = [10**i for i in [-3, -2, -1, 0, 1, 2, 3]] #[0.001, 0.01, 0.1, 1, 10, 100, 1000]

for term in reg_terms:
  pipe = Pipeline([("scaler", StandardScaler()), #first: name of pipeline, second: Transformer or Predictor
                  ("LogReg", LogisticRegression(penalty='elasticnet', C=10**term, max_iter=3000, solver='saga'))]) #solver='saga'

  scores = cross_val_score(pipe,
                          X_train,
                          y_train,
                          cv=5, # number of folds
                          scoring='roc_auc')

  cv_R2 = np.mean(scores)
  if cv_R2 > max_R2:
    max_R2 = cv_R2
    best_expo = term
  print(f"mean CV R^2 ({term=}):", np.round(cv_R2, 4))
  print(f"std CV R^2 ({term=}):", np.round(np.std(scores), 4), '\n')

print("Best CV R^2:", max_R2)
print("Best expo:", best_expo)

### Grid Search

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV

# pipe = Pipeline([("scaler", StandardScaler()),
#                  ("LogReg", LogisticRegression())])

# C = [10**expo for expo in [ -2, -1, 0, 1, 2]] #[0.001, 0.01, 0.1, 1, 10, 100, 1000]

# # The following builds a dictionary of lists of values for the hyper-parameters
# # NOTE: The name of the parameter `alpha` of the pipeline step called `ridge`
# #       is `ridge__alpha`.
# param_grid = {"LogReg Cs" : C} 

# grid = GridSearchCV(pipe, # model for which good hyperparameters should be found
#                     param_grid=param_grid, # dictionary determining the parameters to search
#                     cv=5) # determine the value of k for k-fold CV (here, k=5)

# # Calling `fit` performs a search for the best hyper-parameter values using k-fold CV
# # Furthermore, it fits a model on the FULL TRAINING DATA,
# # using the best found choice for the hyper-parameters
# grid.fit(X_res, y_res)


# # We can access the best parameter choice and the corresponding CV score as follows
# print("Best CV score:", grid.best_score_)
# print("Best parameter:", grid.best_params_)

# # If we use `grid.score` or `grid.predict`, this uses the model trained on the
# # full training data, with the best hyper-parameter found using k-fold CV
# print("Test set score:", grid.score(X_test, y_test))

### Test AUC-Score with tuned model

In [38]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# Without SMOTE
logreg = LogisticRegression(penalty='l2', C=0.001, max_iter=3000, solver='lbfgs')
logreg.fit(X_train, y_train)

print('AUC on train_set imbalanced dataset: {}'.format(roc_auc_score(y_train, logreg.predict(X_train)))) #0.9819
print('AUC on test_set imbalanced dataset: {}\n'.format(roc_auc_score(y_test, logreg.predict(X_test))))

AUC on train_set imbalanced dataset: 0.768671825644262
AUC on test_set imbalanced dataset: 0.7799648283624087



In [42]:
# With SMOTE
logreg = LogisticRegression(penalty='l2', C=0.001, max_iter=3000, solver='lbfgs')
logreg.fit(X_res, y_res)

print('AUC on train_set smoted dataset: {}'.format(roc_auc_score(y_res, logreg.predict(X_res))))
print('AUC on test_set smoted dataset: {}'.format(roc_auc_score(y_test, logreg.predict(X_test))))

AUC on train_set smoted dataset: 0.9539465159071303
AUC on test_set smoted dataset: 0.9530008441193021
