In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import lightgbm as lgb
import xgboost as xgb

from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve, mean_squared_error
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier, StackingClassifier 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# neural network model
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from keras.callbacks import EarlyStopping, ModelCheckpoint
from scikeras.wrappers import KerasClassifier
import keras.backend as K


# hyperparameter tuning
from bayes_opt import BayesianOptimization # can try Optuna also if it is better

import warnings
warnings.filterwarnings('ignore')

random.seed(10)

# Pre-processing 

### Time series data

In [2]:
# To obtain time series for full 270 days
l90d = pd.read_csv("train_data_l90d_daily_balance.csv")
n180d = pd.read_csv("train_data_n180d_daily_balance.csv")

df_270 = pd.concat([l90d, n180d], ignore_index=True).drop_duplicates().sort_values(by=['user_id', 'pt_date'])

### User features

In [3]:
def scale_features(df):
    scaler = StandardScaler()
    scaler.fit(df)
    df_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)
    return df_scaled

In [4]:
stable_data = pd.read_csv("user_features_l90_stable_20231111.csv")
growth_data = pd.read_csv("user_features_l90_growth_20231111.csv")

### Model training

In [5]:
# This function performs a train-test split with all available features
# The split is stratified on the label, meaning the proportion of classes will be similar across train % validation sets
# But this doesn't ensure that the WEIGHTS of classes will be similar across train & validation sets
# SMOTE is done to deal with the minority positive class

def get_X_train_y_train(data):
    X = data.drop(columns=['label'])
    y = data['label']

    # Train-test split on user level
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        stratify=y,
                                                        test_size=0.2, 
                                                        random_state=0,
                                                       )
    
    # Store train & validation set users
    train_users = X_train.user_id.tolist()
    test_users = X_test.user_id.tolist()

    # Drop user_id
    X_train = X_train.drop(columns=['user_id'])
    X_test = X_test.drop(columns=['user_id'])
    
    # SMOTE to deal with class imbalance
    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    
    return X_train, y_train, X_test, y_test, train_users, test_users


stable_X_train, stable_y_train, stable_X_test, stable_y_test, stable_train_users, stable_test_users = get_X_train_y_train(stable_data)
growth_X_train, growth_y_train, growth_X_test, growth_y_test, stable_train_users, growth_test_users = get_X_train_y_train(growth_data)

### Feature selection

In [6]:
def get_final_X_train_test(features_lst, subset_type):
    if subset_type.lower() == 'stable':
        return stable_X_train[features_lst], stable_X_test[features_lst]
    else:
        return growth_X_train[features_lst], growth_X_test[features_lst]

# Finalise features to use for each model
lgbm_features_lst_growth = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_ema_crosses',
    'stationary',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn',
]


lgbm_features_lst_stable = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn'
]


xgb_features_lst_growth = ['abs_bal_change_std', 
                        'beta_normalized', 
                        'deposits',
                        'withdrawals', 
                        'label_by_avg_bal', 
                        'trend',
                        'income',
                        'subscription',
                        'stat_sig_positive_kendall']

xgb_features_lst_stable = ['abs_bal_change_std', 
                            'beta_normalized',
                            'label_by_avg_bal', 
                            'trend',
                           'income']
    

logreg_features_lst_growth = ['volatility_stdev', 'volatility_cv',
                               'abs_bal_change_std', 'trend',
                               'deposits', 'num_ema_crosses', 
                               'num_distinct_recurring_tx', 
                               'withdrawal_propn']

logreg_features_lst_stable = ['growth_coeff', 'abs_bal_change_std',
                              'deposits', 'withdrawals', 'ema_7day']

mlp_features_lst_growth = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_ema_crosses',
    'stationary',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn',
]

mlp_features_lst_stable = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn'
]

features_lst_growth = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_ema_crosses',
    'stationary',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn'       
]

features_lst_stable = [
    'abs_bal_change_std', 
    'beta_normalized', 
    'label_by_avg_bal', 
    'trend',
    'num_distinct_recurring_tx',
    'recurring_withdrawals',
    'withdrawal_propn'
]

In [7]:
stable_X_train_lgbm, stable_X_test_lgbm = get_final_X_train_test(lgbm_features_lst_stable, 'stable')
growth_X_train_lgbm, growth_X_test_lgbm = get_final_X_train_test(lgbm_features_lst_growth, 'growth')


stable_X_train_xgb, stable_X_test_xgb = get_final_X_train_test(xgb_features_lst_stable, 'stable')
growth_X_train_xgb, growth_X_test_xgb = get_final_X_train_test(xgb_features_lst_growth, 'growth')


stable_X_train_logreg, stable_X_test_logreg = get_final_X_train_test(logreg_features_lst_stable, 'stable')
growth_X_train_logreg, growth_X_test_logreg = get_final_X_train_test(logreg_features_lst_growth, 'growth')

stable_X_train_logreg = scale_features(stable_X_train_logreg)
stable_X_test_logreg = scale_features(stable_X_test_logreg)
growth_X_train_logreg = scale_features(growth_X_train_logreg)
growth_X_test_logreg = scale_features(growth_X_test_logreg)

stable_X_train_mlp, stable_X_test_mlp = get_final_X_train_test(mlp_features_lst_stable, 'stable')
growth_X_train_mlp, growth_X_test_mlp = get_final_X_train_test(mlp_features_lst_growth, 'growth')

stable_X_train_mlp = scale_features(stable_X_train_mlp)
stable_X_test_mlp = scale_features(stable_X_test_mlp)
growth_X_train_mlp = scale_features(growth_X_train_mlp)
growth_X_test_mlp = scale_features(growth_X_test_mlp)

#### **VIF check for autocorrelation between cols**

In [8]:
def get_vif(X): 

    # Create a DataFrame to store the VIF values
    vif_data = pd.DataFrame()
    vif_data["Features"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    # Display the VIF values
    print(vif_data.sort_values(by='VIF', ascending=False))
    print("\n")

def show_vif(model_name, subset_type):
    model_map = {
        'stable': {
            'lgbm': stable_X_train_lgbm,
            'xgb': stable_X_train_xgb,
            'logreg': stable_X_train_logreg,
            'mlp': stable_X_train_mlp,
        },
        'growth': {
            'lgbm': growth_X_train_lgbm,
            'xgb': growth_X_train_xgb,
            'logreg': growth_X_train_logreg,
            'mlp': growth_X_train_mlp,
        }
    }
    return get_vif(model_map[subset_type][model_name])

# measure multi-collinearity between the features
show_vif('logreg', 'stable')

             Features       VIF
2            deposits  1.797526
3         withdrawals  1.758894
1  abs_bal_change_std  1.369367
0        growth_coeff  1.306846
4            ema_7day  1.072925




### Evaluation Metrics

In [9]:
# function to get metric for a single model
def get_metrics_df(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)
    roc_auc = roc_auc_score(y_test, y_pred, average=None)
    
    # Create a dictionary to hold the metrics
    metrics_dict = {
        'Class': [0, 1],
        'Accuracy': accuracy.tolist(),
        'Precision': precision.tolist(),
        'Recall': recall.tolist(),
        'ROC-AUC': roc_auc.tolist(),
        'F1-Score': f1.tolist(),
 
    }

    # Create a DataFrame from the dictionary
    metrics_df = pd.DataFrame(metrics_dict)

    # Set the 'Class' column as the index
    metrics_df.set_index('Class', inplace=True)

    # Display the DataFrame
    return metrics_df

In [10]:
# function to get metric in a df for all models
def get_all_metrics_df(y_pred1, y_pred2, y_pred3, y_pred4, y_test):
    model_labels = ["LGBM", "XGB", "LogReg", "MLP"]

    # Create an empty list to store the metrics DataFrames
    all_metrics_dfs = []

    # Calculate and store metrics for each model (replace y_pred_modelX and y_test_modelX with your actual data)
    for label, (y_pred, y_test) in zip(model_labels, [(y_pred1, y_test), (y_pred2, y_test), (y_pred3, y_test), (y_pred4, y_test)]):
        metrics_df = get_metrics_df(y_test, y_pred)
        metrics_df["Model"] = label  # Add a 'Model' column to label the metrics
        all_metrics_dfs.append(metrics_df)

    # Concatenate the metrics DataFrames vertically
    all_metrics_df = pd.concat(all_metrics_dfs, axis=0)

    # Reset the index to have a continuous index
    all_metrics_df.reset_index(inplace=True)

    return all_metrics_df

# Models

### Grid Search

In [11]:
# For LightGBM and XGBoost

def get_feature_importance(X_train, best_model):
    # Get feature importances
    feature_importance = best_model.feature_importances_
    feature_names = X_train.columns 

    # Pair feature names with their importance scores
    feature_importance_dict = dict(zip(feature_names, feature_importance))

    # Sort feature importance dictionary by values (importance scores)
    sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))

    top_features = []
    # Print or use the sorted feature importance
    for feature, importance in sorted_feature_importance.items():
        print(f"{feature}: {importance}")
        top_features.append(feature)
    

def grid_search(parameters, model, X_train, y_train, X_test):
    cv_split = StratifiedKFold()

    optimised_model = GridSearchCV(estimator=model, cv=cv_split, param_grid=parameters, scoring='recall')
    optimised_model.fit(X_train, y_train)

    y_pred = optimised_model.predict(X_test)
    
    best_model = optimised_model.best_estimator_
    best_param = optimised_model.best_params_
    best_score = optimised_model.best_score_
    
    print(best_model)
    print(best_score)
    
    get_feature_importance(X_train, best_model)

    return optimised_model, y_pred
    

In [12]:
# For Log Reg
def get_feature_importance1(X_train, best_model):
    # Get feature importances
    coefficients = best_model.coef_[0]
    
    feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.abs(coefficients)})
    feature_importance = feature_importance.sort_values('Importance', ascending=True)
    print(feature_importance)

def grid_search1(parameters, model, X_train, y_train, X_test):
    cv_split = StratifiedKFold()

    optimised_model = GridSearchCV(estimator=model, cv=cv_split, param_grid=parameters, scoring='recall')
    optimised_model.fit(X_train, y_train)

    y_pred = optimised_model.predict(X_test)

    best_model = optimised_model.best_estimator_
    best_param = optimised_model.best_params_
    best_score = optimised_model.best_score_

    print(best_model)
    print(best_score)

    get_feature_importance1(X_train, best_model)

    return optimised_model, y_pred

In [13]:
# For MLP 
def bo_simple_nn(X_train: pd.DataFrame, y_train: pd.DataFrame, 
                 scorer: sklearn.metrics._scorer._PredictScorer, params: dict) -> dict:
    
    # model training
    def nn_cl_bo(neurons, activation, learning_rate,  batch_size, epochs, layers1):
        activationL = ['relu', 'sigmoid', 'softplus', 'softsign', 'tanh', 'selu',
                    'elu', 'exponential', 'relu']
        neurons = round(neurons)
        activation = activationL[round(activation)]
        batch_size = round(batch_size)
        epochs = round(epochs)
        layers1 = round(layers1)

        # MLP architecture
        def nn_cl_fun():
            opt = Adam(learning_rate = learning_rate) # should i optimise for this as well
            nn = Sequential()
            nn.add(Dense(neurons, input_dim=len(X_train.columns), activation=activation))
            for i in range(layers1):
                nn.add(Dense(neurons, activation=activation))
            nn.add(Dense(1, activation='sigmoid'))
            nn.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', recall]) #change to recall?
            return nn
        es = EarlyStopping(monitor='val_recall', mode='max', verbose=0, patience=20) #change to recall?
        nn = KerasClassifier(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size,
                            verbose=0)
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
        score = cross_val_score(nn, X_train, y_train, scoring=scorer, cv=kfold, fit_params={'callbacks':[es]}).mean()
        return score
    
    # Bayesian Optimisation
    nn_bo = BayesianOptimization(nn_cl_bo, params, random_state=111)
    nn_bo.maximize(init_points=10, n_iter=10)

    return nn_bo.max

### Light GBM

In [14]:
# LightGBM

    
def run_gbm(subset_type):
    model = lgb.LGBMClassifier(
        random_state=0,
    )

    model.set_params(verbose=-1) 
    
    parameters = {
        "max_depth": [10],
        "num_leaves": [20],
        "learning_rate": [0.05],
        "n_estimators": [1000],
    }

# set of parameters for grid search optimisation
#     parameters = {
#         "max_depth": [10, 20, 30],
#         "num_leaves": [10, 20, 30],
#         "learning_rate": [0.02, 0.05],
#         "n_estimators": [1000, 1200],
#     }
    
    if subset_type == 'stable':
        model, y_pred = grid_search(parameters, model, stable_X_train_lgbm, stable_y_train, stable_X_test_lgbm)
    else:
        model, y_pred = grid_search(parameters, model, growth_X_train_lgbm, growth_y_train, growth_X_test_lgbm)
        
    return model, y_pred 

In [15]:
gbm_stable, stable_y_pred_gbm = run_gbm('stable')
lgbm_metrics_stable = get_metrics_df(stable_y_test, stable_y_pred_gbm)
lgbm_metrics_stable

LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=1000,
               num_leaves=20, random_state=0, verbose=-1)
0.9854207142662613
abs_bal_change_std: 5249
beta_normalized: 3925
withdrawal_propn: 3780
num_distinct_recurring_tx: 3762
label_by_avg_bal: 1595
trend: 600
recurring_withdrawals: 89


Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.935388,0.99503,0.939591,0.660585,0.966516
1,0.935388,0.045741,0.381579,0.660585,0.08169


In [16]:
gbm_growth, growth_y_pred_gbm = run_gbm('growth')
lgbm_metrics_growth = get_metrics_df(growth_y_test, growth_y_pred_gbm)
lgbm_metrics_growth

LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=1000,
               num_leaves=20, random_state=0, verbose=-1)
0.9618241903502975
abs_bal_change_std: 5524
beta_normalized: 4326
num_distinct_recurring_tx: 3045
withdrawal_propn: 2436
num_ema_crosses: 1491
label_by_avg_bal: 1254
stationary: 424
trend: 412
recurring_withdrawals: 88


Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.849073,0.982721,0.853971,0.814998,0.913833
1,0.849073,0.26268,0.776025,0.814998,0.392501


### XGBoost

In [17]:
# XGB
def run_xgb(subset_type):
    model = xgb.XGBClassifier(
        objective="binary:logistic",  
        random_state=0
    )

# set of parameters for grid search optimisation   
#     parameters = {
#         "max_depth": [10, 20, 30],
#         "learning_rate": [0.05, 0.06, 0.07],
#         "n_estimators": [500, 1000, 1500],
#     }

    #optimised params
    parameters = {
        "max_depth": [10],
        "learning_rate": [0.05],
        "n_estimators": [500],
        }
    
    if subset_type == 'stable':
        model, y_pred = grid_search(parameters, model, stable_X_train_xgb, stable_y_train, stable_X_test_xgb)
    else:
        model, y_pred = grid_search(parameters, model, growth_X_train_xgb, growth_y_train, growth_X_test_xgb)
        
    return model, y_pred


In [18]:
xgb_stable, stable_y_pred_xgb = run_xgb('stable')
xgb_metrics_stable = get_metrics_df(stable_y_test, stable_y_pred_xgb)
xgb_metrics_stable

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
0.94003539379484
label_by_avg_bal: 0.8231279850006104
abs_bal_change_std: 0.06988933682441711
trend: 0.057336147874593735
beta_normalized: 0.026339510455727577
income: 0.023306960240006447


Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.875434,0.99626,0.877783,0.721786,0.933277
1,0.875434,0.033938,0.565789,0.721786,0.064036


In [19]:
xgb_growth, growth_y_pred_xgb = run_xgb('growth')
xgb_metrics_growth = get_metrics_df(growth_y_test, growth_y_pred_xgb)
xgb_metrics_growth

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
0.9531262392597488
withdrawals: 0.7746371626853943
trend: 0.0689874067902565
stat_sig_positive_kendall: 0.05718370899558067
deposits: 0.029127534478902817
label_by_avg_bal: 0.024625379592180252
abs_bal_change_std: 0.021099120378494263
beta_normaliz

Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.846992,0.982206,0.852173,0.810945,0.912581
1,0.846992,0.258749,0.769716,0.810945,0.387302


### Logistics Regression

In [20]:

def run_log_reg(subset_type):
    model = LogisticRegression(
        max_iter=10000,  # Maximum number of iterations
        random_state=0,  # Random seed for reproducibility
        solver = 'saga'
    )
    
# set of parameters for grid search optimisation
#     parameters = {
#         "C": [0.0001, 0.001, 0.01],  # Try different values for C
#         "penalty": ['l1', 'l2']  # Try both L1 and L2 regularization
#     }

    #optimised value
    parameters = {
        "C": [0.01],
        "penalty": ['l1']
    }
    
    if subset_type == 'stable':
        model, y_pred = grid_search1(parameters, model, stable_X_train_logreg, stable_y_train, stable_X_test_logreg)
    else:
        model, y_pred = grid_search1(parameters, model, growth_X_train_logreg, growth_y_train, growth_X_test_logreg)
        
    return model, y_pred
    


In [21]:
log_reg_stable, stable_y_pred_logreg = run_log_reg('stable')
logreg_metrics_stable = get_metrics_df(stable_y_test, stable_y_pred_logreg)
logreg_metrics_stable

LogisticRegression(C=0.01, max_iter=10000, penalty='l1', random_state=0,
                   solver='saga')
0.9149711956843165
              Feature  Importance
0        growth_coeff    0.040391
1  abs_bal_change_std    0.122138
3         withdrawals    1.135660
4            ema_7day    3.493188
2            deposits    3.842446


Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.512734,0.999804,0.509136,0.747989,0.674694
1,0.512734,0.015027,0.986842,0.747989,0.029603


In [22]:
log_reg_growth, growth_y_pred_logreg = run_log_reg('growth')
logreg_metrics_growth = get_metrics_df(growth_y_test, growth_y_pred_logreg)
logreg_metrics_growth

LogisticRegression(C=0.01, max_iter=10000, penalty='l1', random_state=0,
                   solver='saga')
0.8833840052875083
                     Feature  Importance
2         abs_bal_change_std    0.006535
0           volatility_stdev    0.021067
1              volatility_cv    0.284424
3                      trend    0.371020
6  num_distinct_recurring_tx    0.596324
7           withdrawal_propn    0.666639
4                   deposits    1.040451
5            num_ema_crosses    1.854386


Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.62858,0.996521,0.605795,0.787124,0.753518
1,0.62858,0.141409,0.968454,0.787124,0.246785


### MLP

In [31]:
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

In [15]:
# Run the bayesian optimisation function for MLP if you want to find the best parameters

# params_simple_nn = {
#     'neurons': (10, 100),
#     'activation':(0, 8),
#     'learning_rate':(0.01, 1),
#     'batch_size':(200, 1000),
#     'epochs':(20, 100),
#     'layers1':(1,5)
# }

# scorer_rec = make_scorer(recall_score)

# # this is to run the function
# bo_simple_nn(X_train, y_train, scorer_rec, params_simple_nn)

In [32]:
# mlp function with the best parameters after doing bayesian optimisation 
def run_mlp(subset_type):
    
    if subset_type == 'stable':
        learning_rate = 0.04374
        neurons = 95
        layers1 = 4
        epochs = 94
        batch_size = 958
        activation = 'softsign'

        def nn_cl_fun():
            opt = Adam(learning_rate = learning_rate)
            nn = Sequential()
            nn.add(Dense(neurons, input_dim=len(stable_X_train_mlp.columns), activation=activation))
            for i in range(layers1):
                nn.add(Dense(neurons, activation=activation))
            nn.add(Dense(1, activation='sigmoid'))
            nn.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', recall]) 
            return nn
        es = EarlyStopping(monitor='val_recall', mode='max', verbose=0, patience=20)
        nn = KerasClassifier(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size,
                            verbose=0)
        nn.fit(stable_X_train_mlp, stable_y_train, validation_data=(stable_X_test_mlp, stable_y_test), verbose=0)
        y_pred = nn.predict(stable_X_test_mlp)
        return nn, y_pred
    
    else:
        learning_rate = 0.01
        neurons = 68
        layers1 = 5
        epochs = 100
        batch_size = 973
        activation = 'softsign'

        def nn_cl_fun():
            opt = Adam(learning_rate = learning_rate)
            nn = Sequential()
            nn.add(Dense(neurons, input_dim=len(growth_X_train_mlp.columns), activation=activation))
            for i in range(layers1):
                nn.add(Dense(neurons, activation=activation))
            nn.add(Dense(1, activation='sigmoid'))
            nn.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', recall])
            return nn
        es = EarlyStopping(monitor='val_recall', mode='max', verbose=0, patience=20)
        nn = KerasClassifier(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size,
                            verbose=0)
        nn.fit(growth_X_train_mlp, growth_y_train, validation_data=(growth_X_test_mlp, growth_y_test), verbose=0)
        y_pred = nn.predict(growth_X_test_mlp)
        return nn, y_pred

In [36]:
mlp_stable, stable_y_pred_mlp = run_mlp('stable')
mlp_metrics_stable = get_metrics_df(stable_y_test, stable_y_pred_mlp)
mlp_metrics_stable

Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.804182,0.997401,0.804793,0.764239,0.890805
1,0.804182,0.027363,0.723684,0.764239,0.052733


In [37]:
mlp_growth, growth_y_pred_mlp = run_mlp('growth')
mlp_metrics_growth = get_metrics_df(growth_y_test, growth_y_pred_mlp)
mlp_metrics_growth

Unnamed: 0_level_0,Accuracy,Precision,Recall,ROC-AUC,F1-Score
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.752651,0.985764,0.746854,0.792985,0.849838
1,0.752651,0.181818,0.839117,0.792985,0.298876


### Metrics Comparison

In [None]:
stable_all_metrics_df = get_all_metrics_df(stable_y_pred_gbm, stable_y_pred_xgb, stable_y_pred_logreg, stable_y_pred_mlp, stable_y_test)
growth_all_metrics_df = get_all_metrics_df(growth_y_pred_gbm, growth_y_pred_xgb, growth_y_pred_logreg, growth_y_pred_mlp, growth_y_test)

print("Stable Subset")
print(stable_all_metrics_df)
print("")
print("Growth Subset")
print(growth_all_metrics_df)

Stable Subset
   Class  Accuracy  Precision    Recall   ROC-AUC  F1-Score   Model
0      0  0.935388   0.995030  0.939591  0.660585  0.966516    LGBM
1      1  0.935388   0.045741  0.381579  0.660585  0.081690    LGBM
2      0  0.875434   0.996260  0.877783  0.721786  0.933277     XGB
3      1  0.875434   0.033938  0.565789  0.721786  0.064036     XGB
4      0  0.512734   0.999804  0.509136  0.747989  0.674694  LogReg
5      1  0.512734   0.015027  0.986842  0.747989  0.029603  LogReg
6      0  0.804182   0.997401  0.804793  0.764239  0.890805     MLP
7      1  0.804182   0.027363  0.723684  0.764239  0.052733     MLP

Growth Subset
   Class  Accuracy  Precision    Recall   ROC-AUC  F1-Score   Model
0      0  0.849073   0.982721  0.853971  0.814998  0.913833    LGBM
1      1  0.849073   0.262680  0.776025  0.814998  0.392501    LGBM
2      0  0.846992   0.982206  0.852173  0.810945  0.912581     XGB
3      1  0.846992   0.258749  0.769716  0.810945  0.387302     XGB
4      0  0.628580 

# Ensemble Techniques

## Voting

### Growth subset

In [234]:
#choose soft or hard voting
voting_ensemble = VotingClassifier(estimators=[
    ('XGBoost', xgb_growth),
    ('LightGBM', gbm_growth),
    ('Logistic Regression', log_reg_growth),
    ('MLP', mlp_growth),
], voting='soft') 

voting_ensemble.fit(growth_X_train[features_lst_growth], growth_y_train)

# Make predictions with the VotingClassifier
growth_y_pred_voting = voting_ensemble.predict(growth_X_test[features_lst_growth])

recall = recall_score(growth_y_test, growth_y_pred_voting, average=None)

recall_df = pd.DataFrame({
    'Class': ['Non-Growth', 'Growth'],
    'Recall': [recall[0], recall[1]]
})

# Print the DataFrame
print(recall_df)

        Class    Recall
0  Non-Growth  0.864651
1      Growth  0.742902


### Stable subset

In [236]:
#choose soft or hard voting
voting_ensemble = VotingClassifier(estimators=[
    ('XGBoost', xgb_stable),
    ('LightGBM', gbm_stable),
    ('Logistic Regression', log_reg_stable),
#     ('MLP', mlp_stable),
], voting='soft') 

voting_ensemble.fit(stable_X_train[features_lst_stable], stable_y_train)

# Make predictions with the VotingClassifier
stable_y_pred_voting = voting_ensemble.predict(stable_X_test[features_lst_stable])
recall = recall_score(stable_y_test, stable_y_pred_voting, average=None)

recall_df = pd.DataFrame({
    'Class': ['Non-Stable', 'Stable'],
    'Recall': [recall[0], recall[1]]
})

print(recall_df)

        Class    Recall
0  Non-Stable  0.956665
1      Stable  0.342105


## Stacking (for heterogenous models)

### Growth subset

In [241]:
# Create a stacking classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_growth),
        ('gbm', gbm_growth),
        ('log_reg', log_reg_growth),
    ],
    final_estimator=LogisticRegression(),  # You can use a different final estimator if desired
    stack_method='auto'  # Automatically select the best method (can be 'auto', 'predict_proba', or 'decision_function')
)

# Fit the stacking classifier
# Define a new set of features for the stacking clf
stacking_clf.fit(growth_X_train[features_lst_growth], growth_y_train)

# Make predictions
growth_y_pred_stacking = stacking_clf.predict(growth_X_test[features_lst_growth])

recall = recall_score(growth_y_test, growth_y_pred_stacking, average=None)
recall_df = pd.DataFrame({
    'Class': ['Non-Growth', 'Growth'],
    'Recall': [recall[0], recall[1]]
})

print(recall_df)

        Class    Recall
0  Non-Growth  0.880723
1      Growth  0.708202


### Stable subset

In [242]:
# Create a stacking classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_stable),
        ('gbm', gbm_stable),
        ('log_reg', log_reg_stable),
    ],
    final_estimator=LogisticRegression(),  # You can use a different final estimator if desired
    stack_method='auto'  # Automatically select the best method (can be 'auto', 'predict_proba', or 'decision_function')
)

# Fit the stacking classifier
# Define a new set of features for the stacking clf
stacking_clf.fit(stable_X_train[features_lst_stable], stable_y_train)

# Make predictions
stable_y_pred_stacking = stacking_clf.predict(stable_X_test[features_lst_stable])

recall = recall_score(stable_y_test, stable_y_pred_stacking, average=None)
recall_df = pd.DataFrame({
    'Class': ['Non-Stable', 'Stable'],
    'Recall': [recall[0], recall[1]]
})

print(recall_df)

        Class    Recall
0  Non-Stable  0.962956
1      Stable  0.328947


# Filtering

## Get validation set users

In [23]:
# This function returns 1 user-level dataframe of validation set (10k users)
def get_results_df(test_users, y_test, y_pred, user_features):
    
    # Create a DataFrame with validation set users, their true labels, and predicted labels
    results_df = pd.DataFrame({
        'user_id': test_users, 
        'true_label': y_test,
        'predicted_label': y_pred
    })

    results_df = results_df.merge(user_features, on='user_id', how='left')
    
    # Weight of each user relative to the 10k validation set users (not entire portfolio of 50k users)
    results_df['weight'] = results_df['avg_balance'] / np.sum(results_df['avg_balance'])
    return results_df



# This function returns 2 user-level dataframes
# It splits the df generated from the above function into
# 1) user-level dataframe of actual subset
# 2) user-level dataframe of predicted subset
def get_subset_pred_true(results_df):
    
    ####################
    # Predicted subset #
    ####################
    subset_pred = results_df[results_df['predicted_label'] == 1].copy()
    
    # calculate weight relative to subset & obtain weighted stability
    subset_pred['subset_weight'] = subset_pred['avg_balance'] / subset_pred['avg_balance'].sum()
    subset_pred['weighted_stability'] = subset_pred['subset_weight'] * subset_pred['stability_index']
    
    #################
    # Actual subset #
    #################
    subset_true = results_df[results_df['true_label'] == 1].copy()
    
    # calculate weight relative to subset & obtain weighted stability
    subset_true['subset_weight'] = subset_true['avg_balance'] / subset_true['avg_balance'].sum()
    subset_true['weighted_stability'] = subset_true['subset_weight'] * subset_true['stability_index']
    
    return subset_pred, subset_true


In [39]:
# LightGBM
growth_results_df_gbm = get_results_df(growth_test_users, growth_y_test, growth_y_pred_gbm, growth_data)
stable_results_df_gbm = get_results_df(stable_test_users, stable_y_test, stable_y_pred_gbm, stable_data)

growth_pred_gbm, growth_true_gbm = get_subset_pred_true(growth_results_df_gbm)
stable_pred_gbm, stable_true_gbm = get_subset_pred_true(stable_results_df_gbm)

In [40]:
# XGB
growth_results_df_xgb = get_results_df(growth_test_users, growth_y_test, growth_y_pred_xgb, growth_data)
stable_results_df_xgb = get_results_df(stable_test_users, stable_y_test, stable_y_pred_xgb, stable_data)

growth_pred_xgb, growth_true_xgb = get_subset_pred_true(growth_results_df_xgb)
stable_pred_xgb, stable_true_xgb = get_subset_pred_true(stable_results_df_xgb)

In [41]:
# LogReg
growth_results_df_logreg = get_results_df(growth_test_users, growth_y_test, growth_y_pred_logreg, growth_data)
stable_results_df_logreg = get_results_df(stable_test_users, stable_y_test, stable_y_pred_logreg, stable_data)

growth_pred_logreg, growth_true_logreg = get_subset_pred_true(growth_results_df_logreg)
stable_pred_logreg, stable_true_logreg = get_subset_pred_true(stable_results_df_logreg)

In [42]:
# MLP
growth_results_df_mlp = get_results_df(growth_test_users, growth_y_test, growth_y_pred_mlp, growth_data)
stable_results_df_mlp = get_results_df(stable_test_users, stable_y_test, stable_y_pred_mlp, stable_data)

growth_pred_mlp, growth_true_mlp = get_subset_pred_true(growth_results_df_mlp)
stable_pred_mlp, stable_true_mlp = get_subset_pred_true(stable_results_df_mlp)

In [244]:
# Voting
growth_results_df_voting = get_results_df(growth_test_users, growth_y_test, growth_y_pred_voting, growth_data)
stable_results_df_voting = get_results_df(stable_test_users, stable_y_test, stable_y_pred_voting, stable_data)

growth_pred_voting, growth_true_voting = get_subset_pred_true(growth_results_df_voting)
stable_pred_voting, stable_true_voting = get_subset_pred_true(stable_results_df_voting)

In [243]:
# Stacking
growth_results_df_stacking = get_results_df(growth_test_users, growth_y_test, growth_y_pred_stacking, growth_data)
stable_results_df_stacking = get_results_df(stable_test_users, stable_y_test, stable_y_pred_stacking, stable_data)

growth_pred_stacking, growth_true_stacking = get_subset_pred_true(growth_results_df_stacking)
stable_pred_stacking, stable_true_stacking = get_subset_pred_true(stable_results_df_stacking)

## Filtering after prediction

In [84]:
def filter_predicted_growth_subset(growth_pred1):
    
    # Filter for users who exhibit positive trend
    growth_pred1 = growth_pred1[(growth_pred1['stat_sig_positive_kendall'] == 1) | (growth_pred1['trend'] == 2)].copy()
    
    # Calculate growth rate
    balance_growth_rate = (growth_pred1['last_day_balance'] - growth_pred1['avg_balance'])/growth_pred1['avg_balance']
    
    # growth rate will be NaN if avg balance = 0
    # For such cases we use growth coefficient as proxy for growth rate
    growth_pred1['balance_growth_rate'] = np.where(growth_pred1['avg_balance'] != 0,
                                                   balance_growth_rate, 
                                                   growth_pred1['growth_coeff'])
    
    growth_pred1_sorted = growth_pred1.sort_values(by=['balance_growth_rate'], ascending=False)

    # Filter for 5% of subset weight
    growth_pred1_sorted['cumulative_weight'] = growth_pred1_sorted['weight'].cumsum()
    growth_pred1_final = growth_pred1_sorted[growth_pred1_sorted['cumulative_weight'] <= 0.05].copy()
    
    final_weight = growth_pred1_final['weight'] / growth_pred1_final['weight'].sum()
    final_weighted_stability = final_weight * growth_pred1_final['stability_index']
    
    growth_pred1_final['final_weight'] = final_weight
    growth_pred1_final['weighted_stability'].update(final_weighted_stability)
    
    print(growth_pred1_final.true_label.value_counts())
    
    return growth_pred1_final


def filter_predicted_stable_subset(stable_pred1):

    stable_pred1 = stable_pred1[(stable_pred1['stat_sig_positive_kendall'] == 1) | (stable_pred1['trend'] == 2) | ((stable_pred1['stationary'] == 1) & (stable_pred1['trend'] == 1))].copy()
    
    # Can't sort by stability index because we've already established that 
    # the most stable users for the next 180 days were not the most stable for their first 90 days
    stable_pred1_sorted = stable_pred1.sort_values(by=['trend', 'stat_sig_positive_kendall', 'stationary'], ascending=False)
    
    # Filter for 5% of subset weight
    stable_pred1_sorted['cumulative_weight'] = stable_pred1_sorted['weight'].cumsum()
    stable_pred1_final = stable_pred1_sorted[stable_pred1_sorted['cumulative_weight'] <= 0.05].copy()
    
    final_weight = stable_pred1_final['weight'] / stable_pred1_final['weight'].sum()
    final_weighted_stability = final_weight * stable_pred1_final['stability_index']
    
    stable_pred1_final['final_weight'] = final_weight
    stable_pred1_final['weighted_stability'].update(final_weighted_stability)
    
    print(stable_pred1_final.true_label.value_counts())
    
    return stable_pred1_final 



# Run this function
# This function returns 1 user-level dataframe
# It represents the final predicted subset (i.e. predicted subset AFTER filtering)
def filter_predicted_subset(subset, subset_type):
    
    if subset_type == 'growth':
        subset_final = filter_predicted_growth_subset(subset)
        
    else:
        subset_final = filter_predicted_stable_subset(subset)
    
    return subset_final


In [85]:
# LightGBM
growth_pred_gbm_final = filter_predicted_subset(growth_pred_gbm, 'growth')
stable_pred_gbm_final = filter_predicted_subset(stable_pred_gbm, 'stable')

0    545
1    157
Name: true_label, dtype: int64
0    251
1     10
Name: true_label, dtype: int64


In [86]:
# XGB
growth_pred_xgb_final = filter_predicted_subset(growth_pred_xgb, 'growth')
stable_pred_xgb_final = filter_predicted_subset(stable_pred_xgb, 'stable')

0    518
1    159
Name: true_label, dtype: int64
0    302
1     10
Name: true_label, dtype: int64


In [87]:
# LogReg
growth_pred_logreg_final = filter_predicted_subset(growth_pred_logreg, 'growth')
stable_pred_logreg_final = filter_predicted_subset(stable_pred_logreg, 'stable')

0    894
1    104
Name: true_label, dtype: int64
0    1384
1      12
Name: true_label, dtype: int64


In [88]:
# MLP
growth_pred_mlp_final = filter_predicted_subset(growth_pred_mlp, 'growth')
stable_pred_mlp_final = filter_predicted_subset(stable_pred_mlp, 'stable')

0    889
1    135
Name: true_label, dtype: int64
0    391
1      7
Name: true_label, dtype: int64


In [238]:
# Voting
growth_pred_voting_final = filter_predicted_subset(growth_pred_voting, 'growth')
stable_pred_voting_final = filter_predicted_subset(stable_pred_voting, 'stable')

0    1113
1     456
Name: true_label, dtype: int64
0    216
1      6
Name: true_label, dtype: int64


In [245]:
# Stacking
growth_pred_stacking_final = filter_predicted_subset(growth_pred_stacking, 'growth')
stable_pred_stacking_final = filter_predicted_subset(stable_pred_stacking, 'stable')

0    826
1    382
Name: true_label, dtype: int64
0    189
1      9
Name: true_label, dtype: int64


# Benchmarks

## Baseline
We would use the growth rate and stability index of the entire portfolio of the next 180 days actual data as the baseline benchmark for our evaluation. This is the bare minimum as our results should exceed this. 

In [91]:
# function to find the weighted stability index of subset
def get_weighted_stability(n180d, df):

    # Calculate stability_index with next 180 days data
    result = n180d.groupby('user_id').agg(total_balance_std=('total_balance', 'std'),
                                                  avg_balance=('total_balance', 'mean')).reset_index()

    result['cv'] = result['total_balance_std'] / result['avg_balance']
    result['cv_scaled'] = (result['cv'] - min(result['cv'])) / (max(result['cv']) - min(result['cv']))
    result["stability_index"] = 1 - result['cv_scaled']
    
    # sum of final_weight = 1
    # use this weight to generate weighted stability
    if 'final_weight' not in df.columns:
        df['final_weight'] = df['weight'] / df['weight'].sum()
    
    result = df[['user_id', 'final_weight']].merge(result[['user_id', 'stability_index']], on='user_id', how='left')
    result['weighted_stability'] = result['final_weight'] * result['stability_index']
    
    return result['weighted_stability'].sum()


In [92]:
# growth of entire portfolio over next 180 days actual data 
growth_baseline_subset = n180d.copy()
growth_baseline_subset = growth_baseline_subset.groupby('pt_date')['total_balance'].sum().reset_index()
baseline_growth = ((growth_baseline_subset.iloc[-1,1] - growth_baseline_subset.iloc[0,1]) / growth_baseline_subset.iloc[0,1])*100

# stability index of entire portfolio over next 180 days actual data 
stable_baseline_subset = n180d.copy()
stable_baseline_subset = stable_baseline_subset.groupby('user_id').agg(total_balance_std=('total_balance', 'std'),
                                                  avg_balance=('total_balance', 'mean')).reset_index()
stable_baseline_subset['weight'] = stable_baseline_subset['avg_balance'] / np.sum(stable_baseline_subset['avg_balance'])
baseline_stability = get_weighted_stability(n180d, stable_baseline_subset)

print('Baseline (based on next 180 days actual data):')
print(f"Baseline growth rate: ", round(baseline_growth, 3), "%")
print(f"Baseline stability index: ", round(baseline_stability, 5))

Baseline (based on next 180 days actual data):
Baseline growth rate:  -29.701 %
Baseline stability index:  0.97596


## Average Benchmark

This set of benchmark is based on a simple rule of ranking the users in descending order according to their growth coefficient and stability index from the last 90 days and filtering them by their cumulative weight till we hit a threshold of 5% weight of the entire last 90 days portfolio. Based on the respective set of users (growth and stable), we calculate their growth rate and stability index based on their next 180 days actual data and use these results as a benchmark for evaluation. 

In [93]:
# subsetting for the stable and growth based on sorting their growth coefficient and stability index in ascending order
growth_benchmark = growth_data.sort_values(by=['growth_coeff'], ascending=False)
stable_benchmark = stable_data.sort_values(by=['stability_index'], ascending=False)

# filter those who belongs to the 10k test set 
growth_benchmark = growth_benchmark[growth_benchmark['user_id'].isin(growth_test_users)]
stable_benchmark = stable_benchmark[stable_benchmark['user_id'].isin(stable_test_users)]

# filtering the users based on a threshold of 5% weight of entire portfolio
growth_benchmark['weight'] = growth_benchmark['avg_balance'] / growth_benchmark['avg_balance'].sum()
growth_benchmark['cumulative_weight'] = growth_benchmark['weight'].cumsum()
growth_benchmark_subset = growth_benchmark[growth_benchmark['cumulative_weight'] <= 0.05]

stable_benchmark['weight'] = stable_benchmark['avg_balance'] / stable_benchmark['avg_balance'].sum()
stable_benchmark['cumulative_weight'] = stable_benchmark['weight'].cumsum()
stable_benchmark_subset = stable_benchmark[stable_benchmark['cumulative_weight'] <= 0.05]

# feature engineering on the 180 days data
n180d_copy = n180d.copy()
n180d_copy = n180d_copy.sort_values(by=['user_id', 'pt_date'])
n180d_copy = n180d_copy.groupby('user_id').agg(total_balance_std=('total_balance', 'std'),
                                                  avg_balance=('total_balance', 'mean'),
                                              first_day_balance=('total_balance', 'first'),
                                              last_day_balance=('total_balance', 'last')).reset_index()
n180d_copy['weight'] = n180d_copy['avg_balance'] / n180d_copy['avg_balance'].sum()
n180d_copy['cv'] = n180d_copy['total_balance_std'] / n180d_copy['avg_balance']
n180d_copy['cv_scaled'] = (n180d_copy['cv'] - min(n180d_copy['cv'])) / (max(n180d_copy['cv']) - min(n180d_copy['cv']))
n180d_copy["stability_index"] = 1 - n180d_copy['cv_scaled']
n180d_copy['weighted_stability'] = n180d_copy['weight'] * n180d_copy['stability_index']

# growth and stable subset to have the 180 days for growth rate and stability calculation
growth_benchmark_subset  = n180d_copy[n180d_copy['user_id'].isin(growth_benchmark_subset['user_id'])]
stable_benchmark_subset = n180d_copy[n180d_copy['user_id'].isin(stable_benchmark_subset['user_id'])]

# growth rate for benchmark growth subset 
first_day_balance = growth_benchmark_subset['first_day_balance'].sum()
last_day_balance = growth_benchmark_subset['last_day_balance'].sum()
growth_benchmark_growth = ((last_day_balance - first_day_balance) / first_day_balance)*100

# growth rate for benchmark stable subset 
first_day_balance = stable_benchmark_subset['first_day_balance'].sum()
last_day_balance = stable_benchmark_subset['last_day_balance'].sum()
stable_benchmark_growth = ((last_day_balance - first_day_balance) / first_day_balance)*100

print('Growth Subset (based on next 180 days actual data):')
print(f"Growth rate: ", round(growth_benchmark_growth, 3), "%")
print(f"Stability index: ", round(get_weighted_stability(n180d, growth_benchmark_subset), 5))

print('\nStable Subset (based on next 180 days actual data):')
print(f"Growth rate: ", round(stable_benchmark_growth, 3), "%")
print(f"Stability index: ", round(get_weighted_stability(n180d, stable_benchmark_subset), 5))

Growth Subset (based on next 180 days actual data):
Growth rate:  -26.12 %
Stability index:  0.9782

Stable Subset (based on next 180 days actual data):
Growth rate:  -49.469 %
Stability index:  0.98518


## Ideal

This ideal benchmark is the results we aim to achieve. This is based on the subsetting methodology that we use to label the users and train our classification models on (refer to the subsetting notebook file). 

In [94]:
# user_ids with label 1 from stable and growth data 
growth_ideal = growth_data[growth_data['label']==1]
stable_ideal = stable_data[stable_data['label']==1]

# feature engineering on the 180 days data
n180d_copy = n180d.copy()
n180d_copy = n180d_copy.sort_values(by=['user_id', 'pt_date'])
n180d_copy = n180d_copy.groupby('user_id').agg(total_balance_std=('total_balance', 'std'),
                                                  avg_balance=('total_balance', 'mean'),
                                              first_day_balance=('total_balance', 'first'),
                                              last_day_balance=('total_balance', 'last')).reset_index()
n180d_copy['weight'] = n180d_copy['avg_balance'] / n180d_copy['avg_balance'].sum()
n180d_copy['cv'] = n180d_copy['total_balance_std'] / n180d_copy['avg_balance']
n180d_copy['cv_scaled'] = (n180d_copy['cv'] - min(n180d_copy['cv'])) / (max(n180d_copy['cv']) - min(n180d_copy['cv']))
n180d_copy["stability_index"] = 1 - n180d_copy['cv_scaled']
n180d_copy['weighted_stability'] = n180d_copy['weight'] * n180d_copy['stability_index']

# growth and stable subset to have the 180 days for growth rate and stability calculation
growth_ideal_subset  = n180d_copy[n180d_copy['user_id'].isin(growth_ideal['user_id'])]
stable_ideal_subset = n180d_copy[n180d_copy['user_id'].isin(stable_ideal['user_id'])]

# growth rate for ideal growth subset 
first_day_balance = growth_ideal_subset['first_day_balance'].sum()
last_day_balance = growth_ideal_subset['last_day_balance'].sum()
growth_ideal_growth = ((last_day_balance - first_day_balance) / first_day_balance)*100

# growth rate for ideal stable subset 
first_day_balance = stable_ideal_subset['first_day_balance'].sum()
last_day_balance = stable_ideal_subset['last_day_balance'].sum()
stable_ideal_growth = ((last_day_balance - first_day_balance) / first_day_balance)*100

print('Growth Subset (based on next 180 days actual data):')
print(f"Growth rate: ", round(growth_ideal_growth, 3), "%")
print(f"Stability index: ", round(get_weighted_stability(n180d, growth_ideal_subset), 5))

print('\nStable Subset (based on next 180 days actual data):')
print(f"Growth rate: ", round(stable_ideal_growth, 3), "%")
print(f"Stability index: ", round(get_weighted_stability(n180d, stable_ideal_subset), 5))

Growth Subset (based on next 180 days actual data):
Growth rate:  40.068 %
Stability index:  0.99172

Stable Subset (based on next 180 days actual data):
Growth rate:  1.184 %
Stability index:  0.99977


# Evaluation and Graph

In [95]:
# This function takes in 2 user-level dataframes of actual & predicted subset respectively
# It maps and aggregates the total balance of all users in the subset for each day
# Returns 2 dataframes in time series format (pt_date & total_balance), 
# one for actual subset and one for predicted subset
def get_plotting_df(subset_pred, subset_benchmark):
    
    subset_pred_users = subset_pred.user_id.tolist()
    subset_benchmark_users = subset_benchmark.user_id.tolist()

    subset_pred_270 = df_270[df_270['user_id'].isin(subset_pred_users)]\
    .groupby('pt_date')['total_balance'].sum().reset_index()

    subset_benchmark_270 = df_270[df_270['user_id'].isin(subset_benchmark_users)]\
    .groupby('pt_date')['total_balance'].sum().reset_index() 

    return subset_pred_270, subset_benchmark_270



# This function plots the predicted subset against the actual subset
# With the time series dataframes obtained from the above function
def plot_pred_against_actual(subset_pred_270, subset_benchmark_270, 
                             subset_pred, subset_benchmark,
                             subset_type):
    
    plt.figure(figsize=(12, 3))

    # Plot the lines for each column
    plt.plot(subset_pred_270['pt_date'], subset_pred_270['total_balance'], label=f"Predicted {subset_type} subset")
    plt.plot(subset_benchmark_270['pt_date'], subset_benchmark_270['total_balance'], label=f"Benchmark {subset_type} subset")

    # Set labels for the axes and the legend
    plt.xlabel('Date')
    plt.ylabel('Balance')
    plt.title(f'Predicted vs Benchmark {subset_type} Subset Balances Over Time')
    plt.xticks(np.arange(0, len(subset_pred_270['pt_date']), 30), rotation=45)  # Rotate x-axis labels for better readability
    plt.axvline(x='2023-03-01', color='r', linestyle='--', label='End of last 90 days')
    plt.legend()
    plt.show()

    ###################
    
    num_users_pred = len(subset_pred.user_id.tolist())
    num_users_baseline = len(n180d['user_id'].unique())
    num_users_benchmark = len(subset_benchmark)
    if subset_type == 'growth':
        num_users_ideal = len(growth_data[growth_data['label']==1])
    else:
        num_users_ideal = len(stable_data[stable_data['label']==1])

    weight_pred = subset_pred.weight.sum()
    weight_baseline = 1
    weight_benchmark = subset_benchmark.weight.sum()
    if subset_type == 'growth':
        weight_ideal = growth_ideal_subset['weight'].sum()
    else:
        weight_ideal = stable_ideal_subset['weight'].sum()


    # Last 90 days metrics
    weighted_stability_pred = subset_pred.weighted_stability.sum()
    weighted_stability_true = subset_benchmark.weighted_stability.sum()
    
    starting_bal_pred = subset_pred.first_day_balance.sum()
    ending_bal_pred = subset_pred.last_day_balance.sum()
    growth_pred = (ending_bal_pred - starting_bal_pred) / starting_bal_pred
    
    
    # Next 180 days metrics
    growth_pred_n180 = (subset_pred_270.iloc[-1]['total_balance'] - subset_pred_270.iloc[90]['total_balance']) / subset_pred_270.iloc[90]['total_balance']
    
    weighted_stability_pred_n180 = get_weighted_stability(n180d, subset_pred)
    
    ####################
    # Predicted subset #
    ####################
    
    print(f"Predicted {subset_type} subset (based on last 90 days data):")
    print(f"Weight: ", round(weight_pred,5))
    print(f"No. of users: ", num_users_pred)
    
    print(f"Growth rate: ", round(growth_pred_n180*100, 3), "%")
    print(f"Stability index: ", round(weighted_stability_pred_n180, 5))

    
    #################
    #    Baseline   #
    #################

    print(f'\nBaseline (based on next 180 days data):')
    print(f"Weight: ", round(weight_baseline,5))
    print(f"No. of users: ", num_users_baseline)
    
    print(f"Growth rate: ", round(baseline_growth, 3), "%")
    print(f"Stability index: ", round(baseline_stability, 5))
    
    
    #################
    #   Benchmark   #
    #################
    
    print(f"\nBenchmark {subset_type} subset (based on next 180 days data):")
    print(f"Weight: ", round(weight_benchmark,5))
    print(f"No. of users: ", num_users_benchmark)
    
    if subset_type =='growth':
        print(f"Growth rate: ", round(growth_benchmark_growth, 3), "%")
        print(f"Stability index: ", round(get_weighted_stability(n180d, growth_benchmark_subset), 5))
    else:
        print(f"Growth rate: ", round(stable_benchmark_growth, 3), "%")
        print(f"Stability index: ", round(get_weighted_stability(n180d, stable_benchmark_subset), 5))
    
    #################
    #    ideal     #
    #################

    print(f'\nIdeal {subset_type} (based on next 180 days data):')
    print(f"Weight: ", round(weight_ideal,5))
    print(f"No. of users: ", num_users_ideal)

    if subset_type =='growth':
        print(f"Growth rate: ", round(growth_ideal_growth, 3), "%")
        print(f"Stability index: ", round(get_weighted_stability(n180d, growth_ideal_subset), 5))
    else:
        print(f"Growth rate: ", round(stable_ideal_growth, 3), "%")
        print(f"Stability index: ", round(get_weighted_stability(n180d, stable_ideal_subset), 5))
        

### Plot of actual vs predicted subsets BEFORE final filtering

In [98]:
# Can uncomment to see
# LightGBM
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_gbm, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_gbm, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_gbm, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_gbm, stable_benchmark_subset, 
#                          'stable')


# Add XGB code here
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_xgb, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_xgb, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_xgb, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_xgb, stable_benchmark_subset, 
#                          'stable')

# LogReg
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_logreg, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_logreg, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_logreg, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_logreg, stable_benchmark_subset, 
#                          'stable')


# MLP
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_mlp, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_mlp, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_mlp, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_mlp, stable_benchmark_subset, 
#                          'stable')

# Voting
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_voting, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_voting, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_voting, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_voting, stable_benchmark_subset, 
#                          'stable')

# Stacking
# growth_pred_270, growth_benchmark_270 = get_plotting_df(growth_pred_stacking, growth_benchmark_subset)
# stable_pred_270, stable_benchmark_270 = get_plotting_df(stable_pred_stacking, stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_270, growth_benchmark_270, 
#                          growth_pred_stacking, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_270, stable_benchmark_270,  
#                          stable_pred_stacking, stable_benchmark_subset, 
#                          'stable')

### Plot of actual vs predicted subsets AFTER final filtering

In [97]:
# LightGBM
# growth_pred_gbm_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_gbm_final, growth_benchmark_subset)
# stable_pred_gbm_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_gbm_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_gbm_270_final, growth_benchmark_270,
#                          growth_pred_gbm_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_gbm_270_final, stable_benchmark_270,  
#                          stable_pred_gbm_final, stable_benchmark_subset, 
#                          'stable')

# XGB
# growth_pred_xgb_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_xgb_final, growth_benchmark_subset)
# stable_pred_xgb_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_xgb_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_xgb_270_final, growth_benchmark_270,
#                          growth_pred_xgb_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_xgb_270_final, stable_benchmark_270,  
#                          stable_pred_xgb_final, stable_benchmark_subset, 
#                          'stable')

#LogReg
# growth_pred_logreg_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_logreg_final, growth_benchmark_subset)
# stable_pred_logreg_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_logreg_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_logreg_270_final, growth_benchmark_270,
#                          growth_pred_logreg_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_logreg_270_final, stable_benchmark_270,  
#                          stable_pred_logreg_final, stable_benchmark_subset, 
#                          'stable')

# MLP
# growth_pred_mlp_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_mlp_final, growth_benchmark_subset)
# stable_pred_mlp_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_mlp_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_mlp_270_final, growth_benchmark_270,
#                          growth_pred_mlp_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_mlp_270_final, stable_benchmark_270,  
#                          stable_pred_mlp_final, stable_benchmark_subset, 
#                          'stable')

# # Voting
# growth_pred_voting_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_voting_final, growth_benchmark_subset)
# stable_pred_voting_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_voting_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_voting_270_final, growth_benchmark_270,
#                          growth_pred_voting_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_voting_270_final, stable_benchmark_270,  
#                          stable_pred_voting_final, stable_benchmark_subset, 
#                          'stable')

# Stacking
# growth_pred_stacking_270_final, growth_benchmark_270 = get_plotting_df(growth_pred_stacking_final, growth_benchmark_subset)
# stable_pred_stacking_270_final, stable_benchmark_270 = get_plotting_df(stable_pred_stacking_final,  stable_benchmark_subset)

# plot_pred_against_actual(growth_pred_stacking_270_final, growth_benchmark_270,
#                          growth_pred_stacking_final, growth_benchmark_subset, 
#                          'growth')

# plot_pred_against_actual(stable_pred_stacking_270_final, stable_benchmark_270,  
#                          stable_pred_stacking_final, stable_benchmark_subset, 
#                          'stable')

# Export subset users

In [76]:
def export(best_model, subset_type):
    result = pd.DataFrame()

    # lightgbm model
    if best_model == 'lgbm' and subset_type == 'growth':
        result['user_id'] = growth_pred_gbm_final['user_id']
    elif best_model == 'lgbm' and subset_type == 'stable':
        result['user_id'] = stable_pred_gbm_final['user_id']
    # xgboost model
    elif best_model == 'xgb' and subset_type == 'growth':
        result['user_id'] = growth_pred_xgb_final['user_id']
    elif best_model == 'xgb' and subset_type == 'stable':
        result['user_id'] = stable_pred_xgb_final['user_id']
    # logreg 
    elif best_model == 'logreg' and subset_type == 'growth':
        result['user_id'] = growth_pred_logreg_final['user_id']
    elif best_model == 'logreg' and subset_type == 'stable':
        result['user_id'] = stable_pred_logreg_final['user_id']
    # mlp 
    elif best_model == 'mlp' and subset_type == 'growth':
        result['user_id'] = growth_pred_mlp_final['user_id']
    elif best_model == 'mlp' and subset_type == 'stable':
        result['user_id'] = stable_pred_mlp_final['user_id']
    # voting
    elif best_model == 'voting' and subset_type == 'growth':
        result['user_id'] = growth_pred_voting_final['user_id']
    elif best_model == 'voting' and subset_type == 'stable':
        result['user_id'] = stable_pred_voting_final['user_id']
    # stacking
    elif best_model == 'stacking' and subset_type == 'growth':
        result['user_id'] = growth_pred_stacking_final['user_id']
    elif best_model == 'stacking' and subset_type == 'stable':
        result['user_id'] = stable_pred_stacking_final['user_id']
    else:
        raise ValueError("Invalid combination of best_model and subset_type")

    print('user ids exported to ' + best_model + '_' + subset_type + '_' + 'userid.csv')
    # Export to CSV
    result.to_csv(best_model + '_' + subset_type + '_' + 'userid.csv', index=False)



In [77]:
# user id of growth subset for stacking

#inputs: best_model: [lgbm, xgb, logreg, mlp, voting, stacking]
#        subset_type: [table, growth]

export('logreg', 'growth')

user ids exported to logreg_growth_userid.csv
