In [1]:
# importing required libraries
import matplotlib
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import sys
from IPython.display import display
from IPython.display import Image
import warnings
import matplotlib.pyplot as plt
import datetime
import pycountry_convert as pc
import pycountry
import folium
import joblib
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import make_scorer

from sklearn.metrics import cohen_kappa_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.tree import export_graphviz
import pydot
import seaborn as sns

import time

# library options
%matplotlib inline
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

## Read in the Train and Test data

In [2]:
# Column of the y_* parquet files that is used as the regression target.
target_column = 'sum_payments_package_key_ltv'

# Feature matrices, converted from DataFrames to float NumPy arrays.
X_train = np.array(pd.read_parquet('../data/final/X_train.parquet')).astype(float)
X_test = np.array(pd.read_parquet('../data/final/X_test.parquet')).astype(float)

# Matching targets, also as float arrays.
y_train_reg = np.array(
    pd.read_parquet('../data/final/y_train.parquet')[target_column]
).astype(float)
y_test_reg = np.array(
    pd.read_parquet('../data/final/y_test.parquet')[target_column]
).astype(float)

### Extra: cross-validation without the transactional features

In [3]:
# Reload the splits, this time keeping the features as DataFrames
# (X1 = train, X2 = test) and the targets as plain arrays (y1 = train, y2 = test).
ltv_col = 'sum_payments_package_key_ltv'

X1, X2 = (
    pd.read_parquet(path)
    for path in ('../data/final/X_train.parquet', '../data/final/X_test.parquet')
)
y1, y2 = (
    np.array(pd.read_parquet(path)[ltv_col])
    for path in ('../data/final/y_train.parquet', '../data/final/y_test.parquet')
)

In [4]:
# Feature columns removed from both splits for this experiment.
transactional_cols = ['sum_day_1', 'sum_day_2', 'sum_day_3', 'gradient', 'clumpiness']

# NOTE: X1/X2 are rebound, so re-running this cell without reloading them first
# raises KeyError (the columns are already gone).
X1 = X1.drop(columns=transactional_cols)
X2 = X2.drop(columns=transactional_cols)

# Stack the train rows on top of the test rows to get one dataset for cross-validation.
X = np.concatenate([X1, X2]).astype(float)
y = np.concatenate([y1, y2])

In [5]:
# 150-tree forest with a fixed seed so repeated runs build the same trees.
rfr = RandomForestRegressor(random_state=42, n_estimators=150)

# Error metrics computed on every fold; cross_validate reports each one
# under the key 'test_<name>'.
scoring = {
    'mse': make_scorer(mean_squared_error),
    'mae': make_scorer(mean_absolute_error),
    'mape': make_scorer(mean_absolute_percentage_error),
    'msl': make_scorer(mean_squared_log_error),
}

# 10-fold cross-validation over X / y.
score = cross_validate(rfr, X, y, scoring=scoring, cv=10)

In [6]:
# Report the mean and standard deviation of each metric across the folds.
for label, key in (
    ('MSE', 'test_mse'),
    ('MAE', 'test_mae'),
    ('MAPE', 'test_mape'),
    ('MSL', 'test_msl'),
):
    fold_values = score.get(key)
    print(f"{label} = {fold_values.mean()!s} and SD = {fold_values.std()!s}")

MSE = 775.3283863346481 and SD = 20.578586556220188
MAE = 16.014040083031173 and SD = 0.16251135985753837
MAPE = 1.7899690183334331 and SD = 0.018154792530813157
MSL = 0.8510783736651467 and SD = 0.009517261138386907


## Random Forest (RF)

### Regression

In [7]:
# Baseline forest: 100 trees, fixed seed.
rfr = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split; left as the last expression so the notebook
# displays the fitted estimator.
rfr.fit(X=X_train, y=y_train_reg)

RandomForestRegressor(random_state=42)

In [4]:
# Predictions of the fitted forest for the test features.
y_pred_r = rfr.predict(X=X_test)

In [5]:
# Regression quality of the predictions y_pred_r against the test targets.
# The values printed are the raw (positive) errors returned by sklearn.metrics,
# not the negated "neg_*" scorer values, so the labels carry no "neg_" prefix.
print("explained_variance:", metrics.explained_variance_score(y_test_reg, y_pred_r))
print("mean_absolute_error:", metrics.mean_absolute_error(y_test_reg, y_pred_r))
print("mean_squared_error:", metrics.mean_squared_error(y_test_reg, y_pred_r))
print("mean_absolute_percentage_error:", metrics.mean_absolute_percentage_error(y_test_reg, y_pred_r))

explained_variance: 0.5573175753297954
neg_mean_absolute_error: 15.654522781594391
neg_mean_squared_error: 772.5040054627561
neg_mean_absolute_percentage_error: 1.753230528837545


### Tuning

In [21]:
# Sweep the number of trees and print the test MSE and elapsed time per setting.
# NOTE(review): every candidate is scored on X_test / y_test_reg, so a value chosen
# from this table is tuned against the test split - consider a separate validation
# split (or cross-validation on the training data) for the selection.
for i in [20, 40, 60, 80, 100, 120, 150, 200]:
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments during a long fit.
    start_time = time.perf_counter()
    rfr = RandomForestRegressor(n_estimators=i, random_state=42)
    rfr.fit(X_train, y_train_reg)
    y_pred_r = rfr.predict(X_test)
    mse = metrics.mean_squared_error(y_test_reg, y_pred_r)
    # Elapsed time covers fit, predict and the MSE computation.
    elapsed = time.perf_counter() - start_time
    print("MSE for", i, "trees:", mse, "in", round(elapsed, 2), "seconds")

MSE for 20 trees: 807.8581347795013 in 90.99 seconds
MSE for 40 trees: 786.8596779399688 in 181.63 seconds
MSE for 60 trees: 777.7796969068288 in 272.65 seconds
MSE for 80 trees: 774.4439932176117 in 369.93 seconds
MSE for 100 trees: 772.5040054627561 in 458.76 seconds
MSE for 120 trees: 771.4112489911414 in 548.47 seconds
MSE for 150 trees: 770.7465984683188 in 687.45 seconds
MSE for 200 trees: 769.5381403207202 in 910.28 seconds


In [22]:
# Forest with 150 trees and a fixed seed.
rfr = RandomForestRegressor(random_state=42, n_estimators=150)

# Merge the train and test splits (rows stacked, train first) into one dataset.
inputs = np.concatenate([X_train, X_test])
targets = np.concatenate([y_train_reg, y_test_reg])

# Wrap each error function as a scorer; results appear under 'test_<name>'.
metric_functions = {
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'mape': mean_absolute_percentage_error,
    'msl': mean_squared_log_error,
}
scoring = {name: make_scorer(fn) for name, fn in metric_functions.items()}

# 10-fold cross-validation on the merged data.
score = cross_validate(rfr, inputs, targets, scoring=scoring, cv=10)

In [23]:
# Display the raw cross_validate result: per-fold fit/score times and test metrics.
score

{'fit_time': array([716.82550693, 720.37387466, 715.73007488, 710.82220292,
        708.45105624, 711.28887415, 708.96021366, 720.07514334,
        717.27077556, 716.44385862]),
 'score_time': array([0.53448606, 0.53448963, 0.56253314, 0.53048253, 0.53233743,
        0.54361725, 0.52313161, 0.52804971, 0.5236876 , 0.54185104]),
 'test_mse': array([743.1586161 , 747.34154653, 729.6891178 , 718.28790592,
        788.31727494, 765.52877331, 781.68085742, 768.06773947,
        783.09749033, 762.86894701]),
 'test_mae': array([15.52109636, 15.56679304, 15.35829953, 15.38959276, 15.9304411 ,
        15.52461362, 15.85865869, 15.68655851, 15.73548572, 15.63977027]),
 'test_mape': array([1.75914805, 1.72617684, 1.76255951, 1.7510513 , 1.73409886,
        1.75068091, 1.78597342, 1.77589573, 1.76280022, 1.75483113]),
 'test_msl': array([0.84395918, 0.82342354, 0.82215028, 0.82619226, 0.82981001,
        0.82953917, 0.84643864, 0.83785139, 0.85319617, 0.83893637])}

In [24]:
# Mean and standard deviation over the folds for every reported metric.
summary_keys = {'MSE': 'test_mse', 'MAE': 'test_mae', 'MAPE': 'test_mape', 'MSL': 'test_msl'}
for label, key in summary_keys.items():
    per_fold = score.get(key)
    line = label + ' = ' + str(per_fold.mean())
    line += ' and SD = ' + str(per_fold.std())
    print(line)

MSE = 758.8038268823369 and STD = 22.355127011543807
MAE = 15.621130959665123 and STD = 0.17741556150223817
MAPE = 1.7563215966940287 and STD = 0.016779815184257186
MSL = 0.8351497017730972 and STD = 0.009995369072203724
