# Set-up

In [None]:
# Import dependencies

import itertools
import warnings
from pathlib import Path

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklift.models import ClassTransformation
from xgboost import XGBClassifier
from ydata_profiling import ProfileReport

# configurations for this notebook
# Show every column when a wide dataframe is displayed
pd.set_option('display.max_columns', None)
# NOTE(review): this silences ALL warnings for the whole session, including
# deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings(action='ignore')

In [None]:
# The raw CSV is too big to be imported with pandas, so it is read lazily with dask
df = dd.read_csv('./data/65d4f0fcb8af9_amex_campus_challenge_train_3.csv')

# For a dask dataframe the row count is lazy and must be computed explicitly,
# whereas the column count is available immediately
lazy_nrows, ncols = df.shape
nrows = lazy_nrows.compute()

print("The dataset has ", nrows, "rows and ", ncols, "columns.")

# Persist the data as parquet so later cells can reload it more efficiently
df.to_parquet('./data/train_data.parquet', engine='pyarrow')

In [None]:
# Code provided by amex. Do not edit!

### Scoring function for participating teams :
def incr_act_top10(input_df: pd.DataFrame,
                   pred_col: str,
                   cm_key='customer',
                   treated_col='ind_recommended',
                   actual_col='activation') -> float:
    '''
    Function that returns the incremental activation score for the AMEX Singapore Hackathon 2024

    For every customer the rows are ranked by `pred_col` (highest first, ties broken by
    row order) and only the top 10 rows per customer are kept. The score is the mean of
    `actual_col` among kept rows with `treated_col == 1` minus the mean among kept rows
    with `treated_col == 0`.

    input_df : pandas Dataframe which has customer, ind_recommended, activation and pred_col
               (it is NOT modified; the function works on a copy of the needed columns)
    pred_col : name of your prediction score variable
    cm_key : customer unique ID (do not change)
    treated_col : indicator variable whether a merchant was recommended
    actual_col : whether a CM had transacted at a given merchant (target variable)

    Returns - incremental activation

    Raises - ValueError if the top-10 rows contain no recommended rows or no
             non-recommended rows, because the difference is then undefined.
    '''

    # Copy only the columns that are needed so the caller's dataframe is neither
    # type-coerced nor given an extra `rank_per_cm1` column as a side effect.
    scored_df = input_df[[cm_key, treated_col, actual_col, pred_col]].copy()

    # for correcting variable types (non-numeric entries become NaN)
    numeric_cols = [treated_col, actual_col, pred_col]
    scored_df[numeric_cols] = scored_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # rank predictions within each customer; method='first' breaks ties by row order
    scored_df['rank_per_cm1'] = scored_df.groupby(cm_key)[pred_col].rank(method='first', ascending=False)

    top10_df = scored_df.loc[scored_df['rank_per_cm1'] <= 10, :]

    agg_df = top10_df.groupby(treated_col, as_index=False).agg({actual_col: 'mean'})
    agg_df.columns = [treated_col, 'avg_30d_act']

    print(agg_df)

    def _group_rate(flag: int) -> float:
        # Mean activation of one treatment group; fail loudly if the group is absent
        rate = agg_df.loc[agg_df[treated_col] == flag, 'avg_30d_act']
        if rate.empty:
            raise ValueError(
                f"No rows with {treated_col} == {flag} among the top-10 predictions; "
                "incremental activation is undefined."
            )
        # .iloc[0] instead of float(Series), which is deprecated in recent pandas
        return float(rate.iloc[0])

    recommended_avg_30d_act = _group_rate(1)
    not_recommended_avg_30d_act = _group_rate(0)

    return (recommended_avg_30d_act - not_recommended_avg_30d_act)

# Data Cleaning

In [None]:
# Reload the training data lazily from the parquet copy written by the set-up section
df = dd.read_parquet('./data/train_data.parquet', engine='pyarrow')
# Preview the first rows
df.head()

In [None]:
# Display list of all columns with at least one NA
# (.compute() materialises the lazy dask result as a pandas Series)
null_values_per_column = df.isna().sum().compute().sort_values(ascending=False)
columns_with_null_values = null_values_per_column[null_values_per_column > 0]

# Lift the row display limit only for this print. The context manager restores
# whatever value was set before, even if printing raises.
with pd.option_context('display.max_rows', None):
    print(columns_with_null_values)

In [None]:
# Remove columns with > 50% missingness
nrows = df.shape[0].compute()

# Materialise the per-column null counts so the boolean mask used below is a
# concrete pandas Series rather than a lazy dask object.
null_counts = df.isnull().sum().compute()
cols_to_drop = null_counts.index[null_counts > nrows / 2]
df = df.drop(columns = cols_to_drop)

# Remove rows with > 50% missingness in the remaining 35 features.
# `thresh` counts non-null values across ALL columns, not only features.
# NOTE(review): 22 is presumably 18 (just over half of 35 features) + 4 non-feature
# columns (customer, merchant, ind_recommended, activation) that are assumed to be
# always populated -- TODO confirm.
df = df.dropna(thresh = 22)

In [None]:
# Replace -999 values with NaN in the two columns that contain them
# (presumably -999 is a missing-value sentinel -- TODO confirm)
for sentinel_col in ['merchant_spend_06', 'customer_profile_01']:
    df[sentinel_col] = df[sentinel_col].mask(df[sentinel_col] == -999)

## Missing Data Imputation

I first attempted parallelized imputation with dask, but it failed due to memory limitations, so the imputation below uses scikit-learn's `SimpleImputer` instead.

Imputation strategy: 
- mean imputation for non-skewed numerical variables;
- median imputation for skewed numerical variables;
- mode imputation for categorical variable.

In [None]:
# Imputing missing data using the standard (non-parallel) scikit-learn imputers.

# scikit-learn returns NumPy arrays, which cannot be assigned back into a lazy dask
# dataframe, so materialise the data in pandas first.
# NOTE(review): assumes the frame fits in memory after the sparse columns/rows were
# removed above -- TODO confirm on the target machine.
if isinstance(df, dd.DataFrame):
    df = df.compute()

mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# non-skewed numerical variables -> mean
cols_mean = ['customer_spend_02', 'customer_spend_03', 'merchant_spend_06', 'customer_merchant_03',
             'customer_digital_activity_21', 'customer_digital_activity_22', 'merchant_profile_02',
             'merchant_spend_09', 'merchant_profile_03', 'customer_digital_activity_01',
             'customer_profile_04', 'customer_spend_07']
# skewed numerical variables -> median
cols_median = ['customer_spend_01', 'customer_industry_spend_01', 'customer_industry_spend_02',
               'customer_industry_spend_03', 'customer_industry_spend_04', 'customer_industry_spend_05',
               'customer_spend_05', 'customer_spend_06', 'merchant_spend_01', 'merchant_spend_02',
               'merchant_spend_03', 'merchant_spend_04', 'merchant_spend_05', 'merchant_spend_07',
               'merchant_spend_08', 'customer_profile_01', 'customer_profile_02', 'distance_04',
               'merchant_spend_10', 'customer_profile_03', 'customer_digital_activity_02', 'distance_05']
# categorical variable -> mode
cols_mode = ['merchant_profile_01']

# SimpleImputer computes its statistic independently for every column, so each group
# is imputed in a single call. This also leaves every imputer fitted on its whole
# column group instead of only on the last column of a loop.
for imputer, cols in ((mean_imputer, cols_mean),
                      (median_imputer, cols_median),
                      (mode_imputer, cols_mode)):
    df[cols] = imputer.fit_transform(df[cols])

In [None]:
# Save the imputed dataset; the feature-selection section reloads it from this file
df.to_parquet('./data/train_data_imputed.parquet', engine='pyarrow')

# Feature Selection

In [None]:
# Import imputed dataset (with pandas this time, so `df` is an in-memory dataframe from here on)
df = pd.read_parquet('./data/train_data_imputed.parquet', engine = 'pyarrow')

## Data profiling

In [None]:
# Leave the customer and merchant columns out of the profiling / correlation input
df_feats = df.drop(columns=['customer', 'merchant'])

In [None]:
# Generate profile report
# NOTE(review): minimal = True presumably keeps the report tractable on a dataset of this size -- confirm
profile = ProfileReport(df_feats, title = 'Profiling Report', minimal = True)
# Write the report as an HTML file in the current working directory
profile.to_file("profiling report_imputed_minimal.html")

## Correlation Matrix

In [None]:
# Pairwise correlation between all profiled columns
corr_mat = df_feats.corr() # as i did this locally as well, this might not run on colab

# Create the (large) figure and its axes explicitly, then draw the annotated heatmap on them
fig, ax = plt.subplots(figsize=(40, 30))
heatmap = sns.heatmap(corr_mat, vmin=-1, vmax=1, cmap='BrBG', annot=True, ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)

# Export as png
fig.savefig("corr_heatmap_annot_v2.png")

### Findings from correlation matrix:

- `merchant_spend_02` highly correlated with `merchant_spend_08` (0.96)
  - merchants' customers and merchants' transactions. makes sense.
  - let's keep merchants' transactions as info on merchant customers is probably embedded within it.

- `merchant_spend_01` highly correlated with `merchant_spend_03` (0.88)
  - both are dummy variables.
  - drop ms03 (`merchant_spend_03`).

- `customer_spend_05`, `customer_profile_01`, and `customer_profile_02` all highly correlated with one another (cp01-cp02: 0.95; cp01-cs05: 0.79, cp02-cs05: 0.78)
  - customer amount spent, and the other 2 are dummy variables.
  - drop cp01 and cp02.

- `customer_industry_spend_02`, `customer_industry_spend_04` and `customer_industry_spend_05` all highly correlated with one another (cis02-cis04: 0.71, cis02-cis05: 0.85, cis04-cis05: 0.83)
  - cis04 is customer industry transaction, other 2 are dummy variables.
  - drop cis02 and cis05.

- `customer_spend_02`, `customer_spend_03`, `customer_spend_06`, `customer_spend_07` all highly correlated with one another (cs02-cs03: 0.77, cs02-cs06: 0.62, cs02-cs07: 0.72, cs03-cs06: 0.7, cs03-cs07: 0.83, cs06-cs07: 0.74)
  - cs03 is customer unique merchants, cs06 is customer transactions, cs07 is days with spend. cs02 is dummy.
  - drop cs02 as it's dummy thus no meaning to us.
  - also drop cs03 and cs07. rationale is that cs06 which is customer transactions probably embeds the info within cs03 and cs07.

Moderate correlations:

- `merchant_spend_09` with `merchant_profile_03` (0.67)
  - both dummy.

- `customer_profile_03` with `customer_spend_07` (0.61)
  - cs07 is days with spend, cp03 is dummy.
  - drop cp03


In [None]:
# cis02, cis05, cs02, cp01, cp02, cp03: highly correlated w other variables,
# and considered dummy variables w not much meaning for us
correlated_dummy_cols = ['customer_industry_spend_02', 'customer_industry_spend_05',
                         'customer_spend_02', 'customer_profile_03',
                         'customer_profile_01', 'customer_profile_02']

# Dummy variables with MANY zeros: we don't know what exactly they represent or the
# significance of these zeros, so including them may result in biased estimates.
zero_heavy_dummy_cols = ['customer_digital_activity_21', 'customer_digital_activity_22',
                         'merchant_spend_03', 'merchant_spend_04', 'merchant_spend_05']

# merchant_spend_02: info likely embedded within merchant_spend_08.
# customer_spend_03 / customer_spend_07: info likely embedded within customer_spend_06.
embedded_info_cols = ['merchant_spend_02', 'customer_spend_03', 'customer_spend_07']

# Remove all three groups in one pass
df = df.drop(columns=correlated_dummy_cols + zero_heavy_dummy_cols + embedded_info_cols)

In [None]:
# Imputation can leave fractional values in columns that should hold whole numbers
# or 2dp amounts, so round them back.

# Columns rounded to whole numbers and stored as integers
vars_int = ['customer_industry_spend_04', 'merchant_spend_06', 'merchant_spend_08',
            'merchant_profile_03', 'customer_spend_06', 'merchant_spend_09']
df[vars_int] = df[vars_int].round().astype(int)

# Columns rounded to 2dp floats
vars_twodp = ['customer_spend_05', 'customer_industry_spend_03', 'merchant_spend_07']
df[vars_twodp] = df[vars_twodp].round(2)

## PCA

In [None]:
# Use PCA to reduce groups of dummy variables in same feature category into 1 feature

# One sub-frame per feature category; each is later collapsed to a single PCA component
merchant_spend_df = df.loc[:, ['merchant_spend_01', 'merchant_spend_06', 'merchant_spend_09']]
merchant_prof_df = df.loc[:, ['merchant_profile_02', 'merchant_profile_03']]
distance_df = df.loc[:, ['distance_04', 'distance_05']]

# The keys become the column names of the reduced features
dfs_to_transform = dict(merchant_spend=merchant_spend_df,
                        merchant_profile=merchant_prof_df,
                        distance=distance_df)

def reduce_dimensions(df, random_state=42):
  '''
  Takes a dataframe and scales it to standard normal,
  then uses PCA to reduce columns in the dataframe to 1 column.

  Arguments:
  - df: dataframe with shape (x, y).
  - random_state: seed passed to PCA so the result is reproducible between runs
    when scikit-learn selects a randomized SVD solver (default 42).

  Output: 
  - array of shape (x, 1)

  Note: the scaler and PCA are fitted inside this function and discarded, so the
  same fitted transformation cannot be re-applied to another dataset from here.
  '''
  # scale every column to zero mean / unit variance so no column dominates the component
  scaled = StandardScaler().fit_transform(df)

  # apply PCA to reduce variables to 1 column
  pca = PCA(n_components = 1, random_state = random_state)
  principal_comps = pca.fit_transform(scaled)

  return principal_comps

# initialise empty dataframe
result = pd.DataFrame()

# iteratively run PCA on each dataframe and concat them to the result dataframe
for key, value in dfs_to_transform.items():
  principal_comps = reduce_dimensions(value)
  reduced_df = pd.DataFrame(data = principal_comps, columns=[key])
  result = pd.concat([result, reduced_df], axis = 1)

result.head()

In [None]:
# Swap the original dummy-variable columns for the principal component of their category
pca_source_cols = ['merchant_spend_01', 'merchant_spend_06', 'merchant_spend_09',
                   'merchant_profile_02', 'merchant_profile_03', 'distance_04', 'distance_05']

# reset_index gives df a fresh 0..n-1 index before the column-wise concat with `result`
df_without_sources = df.drop(columns=pca_source_cols).reset_index(drop=True)
df = pd.concat([df_without_sources, result], axis=1)

print(f'Number of features remaining = {df.shape[1] - 4}')

In [None]:
# Checkpoint for the partially cleaned dataset to reduce workload next time.
partial_clean_path = Path('./data/train_data_partial_cleaned_040324.parquet')

# Write the checkpoint only when it does not exist yet, so a fresh
# Restart & Run All works without manually un-commenting a save line.
# NOTE: an existing file is reused as-is; delete it after changing any cleaning step above.
if not partial_clean_path.exists():
    df.to_parquet(partial_clean_path, engine='pyarrow')

# Read in this dataframe
df = pd.read_parquet(partial_clean_path, engine = 'pyarrow')

# Feature Engineering

In [None]:
# Log-transform skewed numerical variables with minimum value of >0
cols_to_log = ['customer_industry_spend_01', 'customer_industry_spend_03', 'customer_industry_spend_04',
               'customer_spend_01', 'customer_spend_05', 'customer_spend_06',
               'merchant_spend_07', 'merchant_spend_08', 'merchant_spend_10']

# Take the natural log of all listed columns at once and tag them with a `_log` suffix
logged = np.log(df[cols_to_log]).add_suffix('_log')

# Replace the raw columns with their log-transformed versions (appended at the end)
df = pd.concat([df.drop(columns=cols_to_log), logged], axis=1)

In [None]:
# Discretize customer_digital_activity_02 into 5 buckets based on percentile

def discretize(value):
    '''
    Map a value to a bucket index 0-4 using fixed percentile cut-offs.

    returns 0: 0 <= value <= 0.8333        (0-25 percentile)
    returns 1: 0.8333 < value <= 2.1667    (25-50 percentile, up to the median)
    returns 2: 2.1667 < value <= 5.3333    (50-75 percentile)
    returns 3: 5.3333 < value <= 32.3333   (75-95 percentile)
    returns 4: everything else             (top 5 percentile)

    Note: "everything else" also covers negative values and NaN, because they
    satisfy none of the range checks.
    '''
    upper_bounds = (0.8333, 2.1667, 5.3333, 32.3333)

    # Only non-negative values can fall into one of the bounded buckets
    if value >= 0:
        for bucket, bound in enumerate(upper_bounds):
            if value <= bound:
                return bucket

    return len(upper_bounds)

# Bucket every value of customer_digital_activity_02, then drop the raw column
# so only the discretized version remains in the dataframe
df['customer_login_discretized'] = df['customer_digital_activity_02'].apply(discretize)
df = df.drop('customer_digital_activity_02', axis = 1)

In [None]:
# Checkpoint for the fully cleaned dataset.
clean_path = Path('./data/train_data_clean.parquet')

# Save only when the checkpoint does not exist yet, so the notebook runs
# end-to-end on a fresh kernel without un-commenting a save line.
# NOTE: an existing file is reused as-is; delete it after changing any step above.
if not clean_path.exists():
    df.to_parquet(clean_path, engine='pyarrow')

# read fully cleaned dataset
df = pd.read_parquet(clean_path, engine='pyarrow')

# Model Training

Choose uplift model algorithm based on holdout set of training data

In [None]:
# Model inputs: everything except the customer/merchant, treatment and target columns
non_feature_cols = ['customer', 'merchant', 'ind_recommended', 'activation']
df_feats = df.drop(columns=non_feature_cols)

# Standardize continuous features only;
# categorical features and the PCA features are left untouched
unscaled_cols = {'merchant_profile_01', 'customer_login_discretized',
                 'distance', 'merchant_profile', 'merchant_spend'}
cols_to_norm = [col for col in df_feats.columns if col not in unscaled_cols]

scaler = StandardScaler()
df_feats[cols_to_norm] = scaler.fit_transform(df_feats[cols_to_norm])

# Declare `merchant_profile_01` and `customer_login_discretized` as categorical data for xgboost
df_feats = df_feats.astype({'merchant_profile_01': 'category',
                            'customer_login_discretized': 'category'})

## Class Transformation

For more info on this model, visit https://www.uplift-modeling.com/en/latest/user_guide/models/revert_label.html

In [None]:
# Evaluate the Class Transformation model on a holdout split of the training data.
# NOTE(review): the original cell referenced X_train / y_train / treat_train before
# they were defined and X_val / val_results_df were never defined anywhere, so it
# failed on a fresh kernel. The split below (20% holdout, stratified on the treatment
# flag, seed 42) is the reviewer's choice -- adjust if a different holdout was intended.
(X_fit_holdout, X_val,
 y_fit_holdout, _y_val,
 treat_fit_holdout, _treat_val,
 _fit_results_df, val_results_df) = train_test_split(
    df_feats,
    df['activation'],
    df['ind_recommended'],
    df[['customer', 'ind_recommended', 'activation']],
    test_size=0.2,
    random_state=42,
    stratify=df['ind_recommended'],
)
# Own copy, so adding the prediction column does not write into a slice of `df`
val_results_df = val_results_df.copy()

estimator = XGBClassifier(enable_categorical = True)

ct = ClassTransformation(estimator)
ct = ct.fit(X_fit_holdout, y_fit_holdout, treat_fit_holdout)

uplift_ct = ct.predict(X_val)

val_results_df['pred_col'] = uplift_ct

# Calculate IAR 
ct_score = incr_act_top10(input_df = val_results_df, pred_col = 'pred_col')
print(f"Incremental activation rate is {ct_score}") 

In [None]:
# Create X_train, y_train and treat_train dataframes
# These cover the whole cleaned training set; the cross-validation cells split them into folds.
# Note: X_train is the same object as df_feats (no copy is made).
X_train = df_feats
y_train = df['activation']
treat_train = df['ind_recommended']

### Hyperparameter Tuning

In [None]:
# Hyperparameter tuning: exhaustive grid search, scored by the mean incremental
# activation rate over 5 stratified folds.

param_grid = {
    'max_depth': [2, 3, 4],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'lambda': np.power(10., np.arange(0,3)),
    'alpha': np.power(10., np.arange(0,3))
}

skf = StratifiedKFold(n_splits = 5) # 5-fold CV, stratified on the treatment flag

# Initialise best_params and best_score variables
best_params = None
best_score = -np.inf

# Columns needed by the scoring function. Selected once; folds are taken positionally
# below because StratifiedKFold yields positions, not index labels.
scoring_df = df[['customer', 'ind_recommended', 'activation']]

# itertools.product visits the combinations in the same order as nested for-loops would
for max_depth, learning_rate, n_estimators, lmbda, alpha in itertools.product(
        param_grid['max_depth'], param_grid['learning_rate'], param_grid['n_estimators'],
        param_grid['lambda'], param_grid['alpha']):

    # One parameter set per grid point, shared by all folds
    xgb_est_params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate, 
        'n_estimators': n_estimators,
        'lambda': lmbda,
        'alpha': alpha,   
        'seed': 42,
        'enable_categorical': True
    }

    scores = []

    for train_index, val_index in skf.split(X_train, treat_train):

        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold = y_train.iloc[train_index]
        treat_train_fold = treat_train.iloc[train_index]
        # .iloc (positional) + .copy(): correct for any index and safe to add a column to
        val_results_df = scoring_df.iloc[val_index].copy()

        estimator = XGBClassifier(**xgb_est_params)

        # fit the model and make predictions
        ct = ClassTransformation(estimator)
        ct = ct.fit(X_train_fold, y_train_fold, treat_train_fold)
        uplift_ct = ct.predict(X_val_fold)
        val_results_df['pred_col'] = uplift_ct

        # calculate IAR
        ct_score = incr_act_top10(input_df=val_results_df, pred_col='pred_col')
        scores.append(ct_score)

    # calculate average scores across all 5 folds    
    avg_score = np.mean(scores)

    # update best score/params if this is the best model so far
    if avg_score > best_score:
        best_score = avg_score
        best_params = xgb_est_params

    print(f"Parameters: max_depth={max_depth}, learning_rate={learning_rate}, n_estimators={n_estimators}, lambda={lmbda}, alpha={alpha}, score={avg_score}")

# Print out the best parameters and IAR
print(f"Best Incremental Activation Rate: {best_score}")
print(f"Best Parameters: {best_params}")

## RFECV using best model

In [None]:
# Recursive feature elimination with cross-validation: repeatedly fit the tuned model,
# record the mean IAR for the current feature set, then remove the feature with the
# lowest fold-averaged importance, until a single feature is left.

scores = []
eliminated_features = []
features = X_train.columns.tolist()

skf = StratifiedKFold(n_splits = 5) # 5-fold CV

# Hyperparameters are fixed for the whole elimination, so define them once
xgb_est_params = {
    'max_depth':4,
    'learning_rate': 0.2, 
    'n_estimators': 200,
    'lambda': 100,
    'alpha': 10,
    'n_jobs': -1,
    'seed': 42,
    'enable_categorical': True
}

# Columns needed by the scoring function; folds are taken positionally below
scoring_df = df[['customer', 'ind_recommended', 'activation']]

while len(features) > 1:
    
    current_scores = []
    feature_importances = np.zeros(len(features))
    
    for train_index, val_index in skf.split(X_train[features], treat_train):
                            
        X_train_fold = X_train.iloc[train_index][features]
        X_val_fold = X_train.iloc[val_index][features]
        y_train_fold = y_train.iloc[train_index]
        treat_train_fold = treat_train.iloc[train_index]
        # .iloc (positional) + .copy(): correct for any index and safe to add a column to
        val_results_df = scoring_df.iloc[val_index].copy()
        
        estimator = XGBClassifier(**xgb_est_params)
        ct = ClassTransformation(estimator)
        ct = ct.fit(X_train_fold, y_train_fold, treat_train_fold)
        uplift_ct = ct.predict(X_val_fold)
        val_results_df['pred_col'] = uplift_ct

        # Calculate IAR for this fold and accumulate the fold-averaged importances
        ct_score = incr_act_top10(input_df=val_results_df, pred_col='pred_col')
        current_scores.append(ct_score)
        feature_importances += estimator.feature_importances_ / skf.n_splits

    # calculate average scores across all 5 folds    
    avg_score = np.mean(current_scores)
    # Store a COPY of the feature list: `features` is mutated by pop() below, and
    # storing the list itself would make every recorded entry show the final list.
    scores.append((list(features), avg_score))
    
    # eliminate the least important feature
    least_important_feature_index = np.argmin(feature_importances)
    eliminated_features.append(features.pop(least_important_feature_index))
    
    print(f"features remaining: {features}; features eliminated: {eliminated_features}")

best_features, best_score = max(scores, key=lambda x: x[1])
print("Best Features:", best_features)
print("Best Score:", best_score)

Best hyperparameters: 
- max depth: 4
- learning rate: 0.2
- no. of estimators: 200
- lambda: 100
- alpha: 10

Best features:
- `merchant_profile_01`
- `customer_login_discretized`

# Submission

Train Class Transformation model on whole training dataset and test on eval dataset

In [None]:
# import cleaned eval dataset
test_df = pd.read_parquet('./data/eval_data_clean.parquet', engine = 'pyarrow')

# Keep only the two selected features and declare both as categorical data for xgboost.
# astype returns a new dataframe, so test_df itself is left untouched.
X_test = test_df[['merchant_profile_01', 'customer_login_discretized']].astype('category')

# prepare results dataframe
# Explicit copy: a prediction column is added to `result` later, which must not
# be a write into a slice of test_df.
result = test_df[['customer', 'merchant']].copy()

In [None]:
# train model on full training data 
feature_cols = ['merchant_profile_01', 'customer_login_discretized']

# Cast the features to categorical, as was done for the cross-validation inputs
# (df_feats). Without this the final model would be trained on different dtypes
# than it was tuned on and than X_test carries.
X_train_full = df[feature_cols].astype('category')
y_train_full = df.loc[:, 'activation']
treat_train_full = df.loc[:, 'ind_recommended']

xgb_est_params = {
    'max_depth':4,
    'learning_rate': 0.2, 
    'n_estimators': 200,
    'lambda': 100,
    'alpha': 10,
    'n_jobs': -1,
    'seed': 42,
    'enable_categorical': True
}

estimator = XGBClassifier(**xgb_est_params)

ct = ClassTransformation(estimator)
ct = ct.fit(X_train_full, y_train_full, treat_train_full)

# Re-encode the eval features with the TRAINING categories so the category codes
# mean the same thing at fit and predict time. Values never seen in training become NaN.
# NOTE(review): assumes both datasets store these columns with the same value type -- confirm.
X_test_aligned = X_test.astype({col: X_train_full[col].dtype for col in feature_cols})

# use model to predict test data
uplift_ct_test = ct.predict(X_test_aligned)
# assign() returns a new dataframe, so no column is written into a slice
result = result.assign(predicted_score=uplift_ct_test)

In [None]:
# export result dataframe to CSV
submission_path = Path('./output/submission.csv')
# Create the output directory on first run; to_csv does not create missing folders
submission_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(submission_path, index = False)