In [1]:
import pickle
import gc
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
X = pd.read_feather('../input/amex-imputed-and-1hot-encoded/X_train.ftr')
y = pd.read_feather('../input/amex-imputed-and-1hot-encoded/y_train.ftr')

X = X.set_index('customer_ID')
y = y.set_index('customer_ID')['target']

Function to calculate competition's evaluation metric: https://www.kaggle.com/code/inversion/amex-competition-metric-python

In [3]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

## Train model

In [4]:
model = LogisticRegression()
model = model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
proba_preds = model.predict_proba(X)[:, 1]
preds = model.predict(X)

In [6]:
print(f'M = {amex_metric(pd.DataFrame(y), pd.DataFrame(proba_preds, index=y.index, columns=["prediction"]))}')

M = 0.6835356094957665


In [7]:
print(classification_report(y, preds))

              precision    recall  f1-score   support

           0       0.90      0.93      0.92   4153582
           1       0.76      0.70      0.73   1377869

    accuracy                           0.87   5531451
   macro avg       0.83      0.82      0.82   5531451
weighted avg       0.87      0.87      0.87   5531451



In [8]:
del X, y
gc.collect()

125

## Save model and predict on test set

In [9]:
pickle.dump(model, open('baseline_logistic_regression_model.sav', 'wb'))

In [10]:
X_test_1 = pd.read_feather('../input/amex-imputed-and-1hot-encoded/X_test_1.ftr')
X_test_1 = X_test_1.set_index('customer_ID')

In [11]:
X_test_2 = pd.read_feather('../input/amex-imputed-and-1hot-encoded/X_test_2.ftr')
X_test_2 = X_test_2.set_index('customer_ID')

In [12]:
preds_1 = pd.DataFrame(model.predict_proba(X_test_1)[:, 1], index=X_test_1.index, columns=['prediction'])
preds_2 = pd.DataFrame(model.predict_proba(X_test_2)[:, 1], index=X_test_2.index, columns=['prediction'])

In [13]:
submission = pd.concat([preds_1, preds_2])

# predictions only need to be for each customer
submission = submission.groupby('customer_ID').agg(['last'])
submission.columns = submission.columns.droplevel(1)

# predictions need to be doubles 
submission['prediction'] = submission['prediction'].astype('double')

# index needs to be removed from submission csv
submission = submission.reset_index()
submission.to_csv('baseline_logistic_regression_submission.csv', index=False)