In [1]:
import pickle
import gc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
X = pd.read_pickle('/kaggle/input/amex-imputed-aggregate-data/X_train_agg.pkl', compression='gzip')
y = pd.read_pickle('/kaggle/input/amex-imputed-aggregate-data/y_train_agg.pkl', compression='gzip')

Function to calculate competition's evaluation metric: https://www.kaggle.com/code/inversion/amex-competition-metric-python

In [3]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

## Generating the model


In [4]:
m_score = 0
target_m_score = 0.76
final_model = None

# loop through different iterations of LogisticRegression() model until a model with expected M score is produced
while m_score < target_m_score:
    # add validation with train_test_split since aggregate data is not time series based
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2)
    
    model = RandomForestClassifier(max_depth=20)
    model = model.fit(X_train, y_train)
    
    # use predict_proba() for M score and predict() for classification_report()
    proba_preds = model.predict_proba(X_val)[:, 1]
    preds = model.predict(X_val)
    
    # calculate M score on validation set 
    m_score = amex_metric(pd.DataFrame(y_val), pd.DataFrame(proba_preds, index=y_val.index, columns=["prediction"]))
    final_model = model
    
    print(f'M = {m_score}')
    
    # show classification report for final model 
    if m_score >= target_m_score:
        print('\n', classification_report(y_val, preds))

M = 0.7596917967271029
M = 0.7652776121882127

               precision    recall  f1-score   support

           0       0.93      0.93      0.93     68154
           1       0.80      0.79      0.80     23629

    accuracy                           0.90     91783
   macro avg       0.86      0.86      0.86     91783
weighted avg       0.89      0.90      0.90     91783



## Saving and using the model to predict on test set 

In [5]:
pickle.dump(final_model, open('random_forest_classifier_model.sav', 'wb'))

In [6]:
X_test = pd.read_pickle('/kaggle/input/amex-imputed-aggregate-data/X_test_agg.pkl', compression='gzip')

submission = pd.DataFrame(final_model.predict_proba(X_test)[:, 1], index=X_test.index, columns=['prediction'])

# index needs to be removed from submission csv
submission = submission.reset_index()
submission.to_csv('random_forest_classifier_submission.csv', index=False)