In [1]:
import pickle
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
# Load the aggregated train features, train labels and test features.
# All three files are gzip-compressed pickles; pickle can execute arbitrary
# code on load, so only read files from a trusted source.
X, y, test = (
    pd.read_pickle(pickle_path, compression='gzip')
    for pickle_path in (
        '/kaggle/input/amex-imputed-aggregate-data/X_train_agg.pkl',
        '/kaggle/input/amex-imputed-aggregate-data/y_train_agg.pkl',
        '/kaggle/input/amex-imputed-aggregate-data/X_test_agg.pkl',
    )
)

Function to calculate the competition's evaluation metric (source: https://www.kaggle.com/code/inversion/amex-competition-metric-python)

In [3]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the competition score M = 0.5 * (G + D).

    G is the normalized weighted Gini coefficient and D is the share of
    positives captured in the top 4% (by weight) of the ranking.  In both
    parts a row with ``target == 0`` gets weight 20 and every other row
    gets weight 1.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column.  Rows are matched to ``y_true``
        by index label (via ``pd.concat(axis='columns')``), not by position,
        so both frames must carry the same index labels.

    Returns
    -------
    float
        The mean of G and D.  If ``y_true`` contains no positive rows the
        divisions below are by zero and the result is not a finite score.
    """

    def _ranked_with_weights(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Shared preamble: align on index, rank by descending prediction and
        # attach the row weights.  np.where is the vectorized equivalent of
        # applying `20 if x == 0 else 1` to every row.
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        # rows whose cumulative weight still fits inside the 4% budget
        within_cutoff = df['weight'].cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        # cumulative share of total weight seen so far
        random = (df['weight'] / df['weight'].sum()).cumsum()
        weighted_pos = df['target'] * df['weight']
        # cumulative share of weighted positives seen so far
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * df['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalize by the Gini obtained when the labels themselves are used
        # as the ranking score.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

## Use StratifiedKFold() to train and predict on val and test sets

We reuse the hyperparameters from our earlier `LogisticRegression()` tuning (which was done without K-fold cross-validation):

* max_iter=290
* C=100

In [4]:
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=20)

# Out-of-fold buffers: every training row is written once, by the model of
# the fold in which that row was held out.
val_preds = np.zeros(X.shape[0])
val_class = np.zeros(X.shape[0])
# Test-set probabilities, averaged over the per-fold models.
test_preds = np.zeros(test.shape[0])

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f'\nFold {fold + 1}')

    # slice this fold's train / validation rows by position
    X_train = X.iloc[train_index].reset_index(drop=True)
    X_val = X.iloc[val_index].reset_index(drop=True)
    y_train = y.iloc[train_index].reset_index(drop=True)
    y_val = y.iloc[val_index].reset_index(drop=True)

    # fit a fresh model for this fold
    model = LogisticRegression(max_iter=290, C=100)
    model = model.fit(X_train, y_train)

    # store positive-class probabilities and hard labels for the held-out rows
    val_preds[val_index] += model.predict_proba(X_val)[:, 1]
    val_class[val_index] += model.predict(X_val)

    # score the whole test set with this fold's model and add its
    # 1/num_folds share to the running average
    test_proba = model.predict_proba(test)[:, 1]
    test_preds += test_proba / num_folds


Fold 1

Fold 2

Fold 3

Fold 4

Fold 5


In [5]:
# Wrap the out-of-fold probabilities in a frame derived from y, with the
# scores in a 'prediction' column as amex_metric expects.
y_true_df = pd.DataFrame(y)
y_preds = (
    y_true_df
    .rename(columns={"target": "prediction"})
    .assign(prediction=val_preds)
)

# report the competition metric, then the per-class report on the hard labels
m_score = amex_metric(y_true_df, y_preds)
print(f'\nM score: {m_score}')
print(f'\n{classification_report(y, val_class)}')


M score: 0.7789007070595015

              precision    recall  f1-score   support

           0       0.93      0.94      0.93    340085
           1       0.82      0.78      0.80    118828

    accuracy                           0.90    458913
   macro avg       0.87      0.86      0.87    458913
weighted avg       0.90      0.90      0.90    458913



## Generate submission

Note: because a separate model is trained for each fold and the test predictions are averaged across the folds, there is no single fitted model to export as we did for our other models.

In [6]:
# Build the submission in its own frame rather than overwriting `test`.
# Reassigning `test` made this cell non-idempotent: on a second run the
# original index had already been turned into a column and then dropped, so
# the CSV would be written with a meaningless 0..n-1 'index' column, and the
# cross-validation cell could no longer be re-run on the feature matrix.
#
# reset_index() moves test's index into a regular column; index=False then
# keeps the fresh RangeIndex out of the CSV.
submission = (
    pd.DataFrame({'prediction': test_preds}, index=test.index)
    .reset_index()
)
submission.to_csv('logistic_regression_submission.csv', index=False)