In [1]:
# importing required libraries
import matplotlib
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import sys
from IPython.display import display
from IPython.display import Image
import warnings
import matplotlib.pyplot as plt
import datetime
import pycountry_convert as pc
import pycountry
import folium
import joblib
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim

from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.metrics import make_scorer

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error


from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.tree import export_graphviz
import pydot
import seaborn as sns

# Library options applied notebook-wide.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Remove the column display cap so pandas shows every column of wide frames.
pd.options.display.max_columns = None
# NOTE(review): no category is given, so this silences *all* warnings for the
# rest of the session (including deprecation warnings raised by the libraries
# used below) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Read in the Train and Test data

In [2]:
# Feature matrices: each parquet file is read and converted to a float ndarray.
X_train, X_test = (
    np.array(pd.read_parquet(path)).astype('float')
    for path in ('../data/final/X_train.parquet', '../data/final/X_test.parquet')
)

# Regression targets: only the 'sum_payments_package_key_ltv' column of each
# target file is kept, again as a float ndarray.
y_train_reg, y_test_reg = (
    np.array(pd.read_parquet(path)['sum_payments_package_key_ltv']).astype('float')
    for path in ('../data/final/y_train.parquet', '../data/final/y_test.parquet')
)

### Extra analysis without transactional data

In [7]:
# Keep the features as DataFrames here (not arrays) so that columns can still
# be dropped by name in the next step.
X1, X2 = map(
    pd.read_parquet,
    ('../data/final/X_train.parquet', '../data/final/X_test.parquet'),
)

# Targets: the 'sum_payments_package_key_ltv' column of each target file as an ndarray.
y1, y2 = (
    np.array(pd.read_parquet(path)['sum_payments_package_key_ltv'])
    for path in ('../data/final/y_train.parquet', '../data/final/y_test.parquet')
)

In [8]:
# Feature columns removed for this variant of the analysis — presumably the
# transactional features referred to by the section heading (TODO confirm).
transactional_cols = ['sum_day_1', 'sum_day_2', 'sum_day_3', 'gradient', 'clumpiness']

# Drop by `columns=` only: pandas ignores `axis` when `columns` is given, so
# also passing `axis=0` would be misleading.
# NOTE: X1/X2 are overwritten, so re-running this cell without re-running the
# load cell raises KeyError (the columns are already gone).
X1 = X1.drop(columns=transactional_cols)
X2 = X2.drop(columns=transactional_cols)

# Stack the two splits row-wise into one feature matrix / target vector for
# cross-validation over all rows.
X = np.concatenate((X1, X2), axis=0).astype('float')
y = np.concatenate((y1, y2), axis=0)

In [9]:
# Decision tree with a fixed seed; all other hyper-parameters are left at their defaults.
dtr = DecisionTreeRegressor(random_state=42)

# One scorer per error metric, keyed by the short name that cross_validate
# prefixes with 'test_' in its result dict.
# NOTE(review): the scorers wrap the raw metric functions, so the reported
# values are plain (positive) errors — fine for reporting, but confirm the sign
# convention before reusing this dict for model selection.
scoring = {
    name: make_scorer(metric)
    for name, metric in (
        ('mse', mean_squared_error),
        ('mae', mean_absolute_error),
        ('mape', mean_absolute_percentage_error),
        ('msl', mean_squared_log_error),
    )
}

# 10-fold cross-validation over the pooled rows (X, y).
score = cross_validate(dtr, X, y, scoring=scoring, cv=10)

In [10]:
# Report each cross-validation metric as its mean and standard deviation over the folds.
for label, key in (('MSE', 'test_mse'), ('MAE', 'test_mae'), ('MAPE', 'test_mape'), ('MSL', 'test_msl')):
    fold_scores = score.get(key)
    print(f'{label} = {fold_scores.mean()!s} and SD = {fold_scores.std()!s}')

MSE = 1612.5302454370358 and SD = 29.887403568029892
MAE = 19.001310135673236 and SD = 0.21896852827557853
MAPE = 1.90785823759589 and SD = 0.0857137323923946
MSL = 1.0732467582900969 and SD = 0.021139608421549543


### Regression

In [7]:
# Baseline regressor: a decision tree with a fixed seed and otherwise default settings.
dtr = DecisionTreeRegressor(random_state=42)

# Fit on the training split only; the test split is used afterwards for prediction.
dtr.fit(X=X_train, y=y_train_reg)

DecisionTreeRegressor(random_state=42)

In [8]:
# Predict the target for every row of the test feature matrix.
y_pred_r = dtr.predict(X=X_test)

In [9]:
# Test-set regression metrics for the fitted tree.
# The three error metrics are the raw values returned by sklearn.metrics
# (positive, lower is better). The "neg_" prefix belongs to scikit-learn's
# negated scorer names and would mislabel these numbers, so the labels use the
# plain metric names.
print("explained_variance:", metrics.explained_variance_score(y_test_reg, y_pred_r))
print("mean_absolute_error:", metrics.mean_absolute_error(y_test_reg, y_pred_r))
print("mean_squared_error:", metrics.mean_squared_error(y_test_reg, y_pred_r))
print("mean_absolute_percentage_error:", metrics.mean_absolute_percentage_error(y_test_reg, y_pred_r))

explained_variance: 0.07658172975859778
neg_mean_absolute_error: 18.722827751538574
neg_mean_squared_error: 1610.1351273829966
neg_mean_absolute_percentage_error: 1.9736449312818134


In [10]:
# Pool the train and test splits row-wise so cross-validation covers every row.
inputs = np.concatenate([X_train, X_test])
targets = np.concatenate([y_train_reg, y_test_reg])

# One scorer per error metric, keyed by the short name that cross_validate
# prefixes with 'test_' in its result dict.
scoring = {
    name: make_scorer(metric)
    for name, metric in (
        ('mse', mean_squared_error),
        ('mae', mean_absolute_error),
        ('mape', mean_absolute_percentage_error),
        ('msl', mean_squared_log_error),
    )
}

# 10-fold cross-validation of the decision tree regressor on the pooled data.
score = cross_validate(dtr, inputs, targets, scoring=scoring, cv=10)

In [11]:
# Display the raw cross_validate result: per-fold fit/score times and the per-fold value of each test metric.
score

{'fit_time': array([8.22232819, 8.38334489, 8.17297506, 8.25596595, 8.38779187,
        8.17417622, 8.10618043, 7.99035001, 8.17844176, 8.09929371]),
 'score_time': array([0.01000905, 0.00800753, 0.00900817, 0.00900769, 0.00900769,
        0.00799704, 0.00900817, 0.00900841, 0.00900149, 0.00900817]),
 'test_mse': array([1582.36933441, 1546.86612715, 1523.54351103, 1528.47590277,
        1611.09150666, 1543.73538265, 1593.79448678, 1570.36143917,
        1563.77963191, 1565.73587142]),
 'test_mae': array([18.61379347, 18.43987835, 18.27263418, 18.18608773, 18.8707242 ,
        18.34258265, 18.72652068, 18.46512452, 18.45814298, 18.52913333]),
 'test_mape': array([1.95824218, 1.93256101, 1.92402477, 1.78447975, 1.76288844,
        1.80829647, 1.98335546, 1.84288162, 1.86366373, 1.86806721]),
 'test_msl': array([1.08599611, 1.0765764 , 1.0644259 , 1.01258113, 1.05120694,
        1.05309267, 1.06930617, 1.04486613, 1.07407234, 1.05076448])}

In [12]:
# Report each cross-validation metric as its mean and standard deviation over the folds.
for label, key in (('MSE', 'test_mse'), ('MAE', 'test_mae'), ('MAPE', 'test_mape'), ('MSL', 'test_msl')):
    fold_scores = score.get(key)
    print(f'{label} = {fold_scores.mean()!s} and SD = {fold_scores.std()!s}')

MSE = 1562.97531939367 and SD = 26.60582117491754
MAE = 18.490462209341125 and SD = 0.1957639740683085
MAPE = 1.872846065215755 and SD = 0.07124669435471406
MSL = 1.0582888267711517 and SD = 0.019782378496614968
