In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_pickle(("/Users/alyssaliu/Desktop/StatsM148/updated_feature_engineered_data.pkl"))

In [3]:
sampled_df = pd.read_pickle("/Users/alyssaliu/Desktop/StatsM148/updated_clustering_sample_df.pkl")

In [4]:
df

Unnamed: 0,customer_id,account_id,first_event,days_since_start,first_stage,first_event_month,first_event_day,first_event_hour,ideal_journey,promotion_exposure
0,-2147483541,2059949086,12,715,1,10,5,8,0,1
1,-2147481037,-1245770865,12,25,1,8,26,12,0,1
2,-2147474335,-1929965268,2,159,3,4,14,1,1,1
3,-2147474305,1098145658,6,660,5,11,28,22,0,1
4,-2147472167,889210381,12,213,1,2,19,8,1,1
...,...,...,...,...,...,...,...,...,...,...
1728480,2147480051,-148210516,21,1006,7,12,18,0,0,1
1728481,2147480182,-1286764612,12,90,1,6,22,9,0,1
1728482,2147480920,496493954,2,487,3,5,21,6,0,1
1728483,2147482120,-488077144,12,51,1,7,30,15,0,1


In [5]:
model_df = df.drop(['customer_id', 'account_id'], axis = 1)
X = model_df.drop(['ideal_journey'], axis = 1)
y = model_df['ideal_journey'] # we use ideal_journey rather than order_shipped etc because the two variables have a correlation of 0.99

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
# scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [19]:
y_train.value_counts()

ideal_journey
0    946061
1    212023
Name: count, dtype: int64

In [16]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# Initialize XGBoost with parameters that are suitable for imbalanced datasets
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Predictions
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:,1]

# Evaluate the model
print(classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

              precision    recall  f1-score   support

           0       0.87      0.65      0.74    465695
           1       0.27      0.57      0.36    104706

    accuracy                           0.64    570401
   macro avg       0.57      0.61      0.55    570401
weighted avg       0.76      0.64      0.67    570401

ROC AUC Score: 0.6677974466833927


In [21]:
scale_pos_weight

4.462067794531725

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [18]:
# Define a parameter grid to search
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'max_depth': randint(3, 10),
    'colsample_bytree': uniform(0.7, 0.3),
    'min_child_weight': randint(1, 6)
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define the RandomizedSearch object
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, 
                                   scoring='roc_auc', error_score=0, verbose=3, n_jobs=-1, cv=3)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best ROC AUC found: {random_search.best_score_}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END colsample_bytree=0.7084991855763053, learning_rate=0.1597002065318124, max_depth=7, min_child_weight=5, n_estimators=108, subsample=0.8450666423031574;, score=0.665 total time=  20.4s
[CV 2/3] END colsample_bytree=0.7084991855763053, learning_rate=0.1597002065318124, max_depth=7, min_child_weight=5, n_estimators=108, subsample=0.8450666423031574;, score=0.666 total time=  20.4s
[CV 3/3] END colsample_bytree=0.7084991855763053, learning_rate=0.1597002065318124, max_depth=7, min_child_weight=5, n_estimators=108, subsample=0.8450666423031574;, score=0.666 total time=  20.6s
[CV 1/3] END colsample_bytree=0.9713443350599842, learning_rate=0.30212087420444406, max_depth=5, min_child_weight=2, n_estimators=168, subsample=0.819289977307222;, score=0.665 total time=  24.1s
[CV 2/3] END colsample_bytree=0.9713443350599842, learning_rate=0.30212087420444406, max_depth=5, min_child_weight=2, n_estimators=168, subsample=0.819

In [None]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")
classification_report(y_test, y_pred)