In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
from collections import Counter
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from keras.models import Sequential, Model, load_model
import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [29]:
# Load oversampled X
oversampled_X_train = np.load('../Data/oversampled_X_train.npy')
# Load oversampled y
oversampled_y_train = np.load('../Data/overssampled_y_train.npy')
# load smoted_data
smoted_X_y = np.load('../Data/smoted_train_X_and_y.npy')
# load test set
norm_test_x = np.load('../Data/norm_test.npy')
dim_reduced_test_x = np.load('../Data/autoencoded_data_test.npy')
y_test = np.load('../Data/y_test.npy')

def find_sim_buyers(data, label, model):
    non_buyers = data[label==0]
    probability = model.predict_proba(non_buyers)
    index = np.where(np.logical_and(probability>=0.45, probability<0.5))[0]
    return non_buyers[index]

mlp_classification_oversample = load_model('../02__Classification/mlp_classification_oversample.h5')
mlp_classification_smote = load_model('../02__Classification/mlp_classification_smote.h5')

sim_buyers_x = find_sim_buyers(oversampled_X_train, oversampled_y_train, mlp_classification_oversample)
sim_buyers_y = np.zeros((len(sim_buyers_x),1))

# select buyers only
buyer_index = oversampled_y_train>0

oversampled_X_regression = oversampled_X_train[buyer_index]
oversampled_y_regression = oversampled_y_train[buyer_index]
oversampled_y_regression = oversampled_y_regression.reshape((oversampled_y_regression.shape[0],1))

oversampled_X_regression = np.concatenate((oversampled_X_regression, sim_buyers_x))
oversampled_y_regression = np.vstack((oversampled_y_regression,sim_buyers_y))

smote_x = smoted_X_y[:,:-1]
smote_y = smoted_X_y[:,-1]

sim_buyers_xsmote = find_sim_buyers(smote_x, smote_y, mlp_classification_smote)
sim_buyers_ysmote = np.zeros((len(sim_buyers_xsmote),1))

# select buyers only
buyer_index2 = smote_y>0

xsmote_regression = smote_x[buyer_index2]
ysmote_regression = smote_y[buyer_index2]
ysmote_regression = ysmote_regression.reshape((ysmote_regression.shape[0],1))

xsmote_regression = np.concatenate((xsmote_regression, sim_buyers_xsmote))
ysmote_regression = np.vstack((ysmote_regression,sim_buyers_ysmote))



### Modeling w/ Random Forest Regression & XGBoostRegression

In [21]:
rf_model = RandomForestRegressor(random_state=42,  
                             n_estimators=30,
                             max_depth=40, 
                             min_samples_split=3, 
                             min_samples_leaf=2, 
                             bootstrap=True)

rf_model2 = RandomForestRegressor(random_state=42,  
                             n_estimators=30,
                             max_depth=40, 
                             min_samples_split=3, 
                             min_samples_leaf=2, 
                             bootstrap=True)

In [23]:
xgb_model = XGBRegressor(max_depth=40, seed=42)
xgb_model2 = XGBRegressor(max_depth=40, seed=42)

In [25]:
rf_model.fit(oversampled_X_regression, oversampled_y_regression)
rf_model2.fit(xsmote_regression, ysmote_regression)

  """Entry point for launching an IPython kernel.
  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=40,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [30]:
xgb_model.fit(oversampled_X_regression, oversampled_y_regression,
              eval_set=[(norm_test_x, y_test)], 
              eval_metric=['rmse', 'mae'], 
              early_stopping_rounds=10, verbose=False)
xgb_model2.fit(xsmote_regression, ysmote_regression,
              eval_set=[(dim_reduced_test_x, y_test)], 
              eval_metric=['rmse', 'mae'], 
              early_stopping_rounds=10, verbose=False)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=40,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1)

In [32]:
def rmse(targets, predictions):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [50]:
spender_test_index = y_test>0

mean_ltv = y_test[spender_test_index].mean()
rf_pred_os = rf_model.predict(norm_test_x[spender_test_index])
rf_pred_smote = rf_model2.predict(dim_reduced_test_x[spender_test_index])
xgb_pred_os = xgb_model.predict(norm_test_x[spender_test_index])
xgb_pred_smote = xgb_model2.predict(dim_reduced_test_x[spender_test_index])

rf_nrmse_all_os = rmse(y_test[spender_test_index].reshape((rf_pred_os.shape)), rf_pred_os)/mean_ltv
rf_nrmse_all_smote = rmse(y_test[spender_test_index].reshape((rf_pred_smote.shape)), rf_pred_smote)/mean_ltv
xgb_nrmse_all_os = rmse(y_test[spender_test_index].reshape((xgb_pred_os.shape)), xgb_pred_os)/mean_ltv
xgb_nrmse_all_smote = rmse(y_test[spender_test_index].reshape((xgb_pred_smote.shape)), xgb_pred_smote)/mean_ltv
print('nrmse for all spenders using random forest trained on oversampled data: ', '%.4f'%rf_nrmse_all_os)
print('nrmse for all spenders using random forest trained on smote data: ', '%.4f'%rf_nrmse_all_smote)
print('nrmse for all spenders using XGB trained on oversampled data: ', '%.4f'%xgb_nrmse_all_os)
print('nrmse for all spenders using XGB trained on smote data: ', '%.4f'%xgb_nrmse_all_smote)

nrmse for all spenders using random forest trained on oversampled data:  1.5450
nrmse for all spenders using random forest trained on smote data:  1.5279
nrmse for all spenders using XGB trained on oversampled data:  1.6992
nrmse for all spenders using XGB trained on smote data:  1.5696


In [49]:
premium_index = np.where(np.logical_and(y_test>30, y_test<=70))[0]

rf_pred_os_prem = rf_model.predict(norm_test_x[premium_index])
rf_pred_smote_prem = rf_model2.predict(dim_reduced_test_x[premium_index])
xgb_pred_os_prem = xgb_model.predict(norm_test_x[premium_index])
xgb_pred_smote_prem = xgb_model2.predict(dim_reduced_test_x[premium_index])

rf_nrmse_os_prem = rmse(y_test[premium_index].reshape((rf_pred_os_prem.shape)), rf_pred_os_prem)/mean_ltv
rf_nrmse_smote_prem = rmse(y_test[premium_index].reshape((rf_pred_smote_prem.shape)), rf_pred_smote_prem)/mean_ltv
xgb_nrmse_os_prem = rmse(y_test[premium_index].reshape((xgb_pred_os_prem.shape)), xgb_pred_os_prem)/mean_ltv
xgb_nrmse_smote_prem = rmse(y_test[premium_index].reshape((xgb_pred_smote_prem.shape)), xgb_pred_smote_prem)/mean_ltv
print('nrmse for the Premium Users using random forest trained on oversampled data: ', '%.4f'%rf_nrmse_os_prem)
print('nrmse for the Premium Users using random forest trained on smote data: ', '%.4f'%rf_nrmse_smote_prem)
print('nrmse for the Premium Users using XGB trained on oversampled data: ', '%.4f'%xgb_nrmse_os_prem)
print('nrmse for the Premium Users using XGB trained on smote data: ', '%.4f'%xgb_nrmse_smote_prem)

nrmse for the Premium Users using random forest trained on oversampled data:  2.1001
nrmse for the Premium Users using random forest trained on smote data:  2.1152
nrmse for the Premium Users using XGB trained on oversampled data:  2.5021
nrmse for the Premium Users using XGB trained on smote data:  2.4624


In [47]:
rf_pred_os_hi = rf_model.predict(norm_test_x[y_test>100])
rf_pred_smote_hi = rf_model2.predict(dim_reduced_test_x[y_test>100])
xgb_pred_os_hi = xgb_model.predict(norm_test_x[y_test>100])
xgb_pred_smote_hi = xgb_model2.predict(dim_reduced_test_x[y_test>100])

rf_nrmse_os_hi = rmse(y_test[y_test>100].reshape((rf_pred_os_hi.shape)), rf_pred_os_hi)/mean_ltv
rf_nrmse_smote_hi = rmse(y_test[y_test>100].reshape((rf_pred_smote_hi.shape)), rf_pred_smote_hi)/mean_ltv
xgb_nrmse_os_hi = rmse(y_test[y_test>100].reshape((xgb_pred_os_hi.shape)), xgb_pred_os_hi)/mean_ltv
xgb_nrmse_smote_hi = rmse(y_test[y_test>100].reshape((xgb_pred_smote_hi.shape)), xgb_pred_smote_hi)/mean_ltv
print('nrmse for the HIGH SPENDERS using random forest trained on oversampled data: ', '%.4f'%rf_nrmse_os_hi)
print('nrmse for the HIGH SPENDERS using random forest trained on smote data: ', '%.4f'%rf_nrmse_smote_hi)
print('nrmse for the HIGH SPENDERS using XGB trained on oversampled data: ', '%.4f'%xgb_nrmse_os_hi)
print('nrmse for the HIGH SPENDERS using XGB trained on smote data: ', '%.4f'%xgb_nrmse_smote_hi)

nrmse for the HIGH SPENDERS using random forest trained on oversampled data:  5.9245
nrmse for the HIGH SPENDERS using random forest trained on smote data:  5.8797
nrmse for the HIGH SPENDERS using XGB trained on oversampled data:  6.3903
nrmse for the HIGH SPENDERS using XGB trained on smote data:  6.2445
