#### Import modules, declare constants, export data

In [66]:
from bqWrapper.bq import bqWrapper
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler, Normalizer, PCA
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import seaborn as sns, matplotlib.pyplot as plt, json

In [2]:
bqw = bqWrapper()
spark = bqw.connection
df_train = bqw.create_bigquery_connection(connection=spark, table='training_data')
df_val = bqw.create_bigquery_connection(connection=spark, table='validation_data')

In [3]:
COHORTS_DIMENSIONS = ['first_touch_date', 'traffic_source', 'os', 'country']
TARGET_VAR = 'cohort_ltv_avg_lifetime'
evaluator = RegressionEvaluator(labelCol=TARGET_VAR, predictionCol='prediction', metricName='rmse')

#### Generate train, test data

In [4]:
feature_list = [col for col in df_train.columns if (col not in COHORTS_DIMENSIONS) and (col != TARGET_VAR)]
assembler = VectorAssembler(inputCols=feature_list, outputCol='features')

In [5]:
# Base models
df_train_trans = assembler.transform(df_train)
(X_train, X_test) = df_train_trans.randomSplit([0.8, 0.2], seed=143)

In [6]:
# Cross-validation models, adjusted for Pipeline syntax
X_train_cv = X_train.drop(*(COHORTS_DIMENSIONS+['features']))
X_test_cv = X_test.drop(*(COHORTS_DIMENSIONS+['features']))

#### Dummy baseline average-based model

In [7]:
target_avg = X_test.select(mean(df_train.cohort_ltv_avg_lifetime)).collect()[0][f'avg({TARGET_VAR})']
df_avg = X_test.select(TARGET_VAR)
df_avg = df_avg.withColumn('prediction', lit(target_avg))

In [8]:
avg_rmse = evaluator.evaluate(df_avg)
avg_rmse

9.99172185623135

#### Decision Tree

In [20]:
# Default parameters for all types of regression
FEATURES_COL = 'features'
SEED = 511

##### Default Decision Tree

In [21]:
tree = DecisionTreeRegressor(
    labelCol=TARGET_VAR,
    featuresCol=FEATURES_COL,
    seed=SEED
)

In [22]:
tree_default = tree.fit(X_train)
tree_preds = tree_default.transform(X_test)

In [23]:
tree_rmse = evaluator.evaluate(tree_preds)
tree_rmse

8.683845256505068

In [24]:
# print params to see defaults
{i.name: tree_default.extractParamMap()[i] for i in tree_default.extractParamMap()}

{'cacheNodeIds': False,
 'checkpointInterval': 10,
 'featuresCol': 'features',
 'impurity': 'variance',
 'labelCol': 'cohort_ltv_avg_lifetime',
 'maxBins': 32,
 'maxDepth': 5,
 'maxMemoryInMB': 256,
 'minInfoGain': 0.0,
 'minInstancesPerNode': 1,
 'predictionCol': 'prediction',
 'seed': 511}

##### Tuned Decision Tree

In [25]:
pipeline = Pipeline(stages=[assembler, tree])
paramGrid = ParamGridBuilder() \
    .addGrid(tree.maxBins, [4, 8, 16, 32]) \
    .addGrid(tree.maxDepth, [i for i in range(2, 10)]) \
    .addGrid(tree.minInstancesPerNode, [1, 10, 50, 100]) \
    .build()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(labelCol=TARGET_VAR, predictionCol="prediction", metricName="rmse"),
    numFolds=10,
    seed=SEED
)

In [27]:
tree_cv = crossval.fit(X_train_cv)
tree_cv_best = tree_cv.bestModel

In [34]:
tree_cv_best_params = tree_cv_best.stages[-1].extractParamMap()
tree_cv_best_params_dict = {param.name: tree_cv_best_params[param] for param in tree_cv_best_params}

In [37]:
tree_cv_best_params_dict['labelCol'] = TARGET_VAR
tree_cv_best_params_dict['featuresCol'] = FEATURES_COL
tree_cv_best_params_dict['seed'] = SEED

In [38]:
tree_tuned = DecisionTreeRegressor(
    **tree_cv_best_params_dict
)

In [40]:
tree_tuned_model = tree_tuned.fit(X_train)
tree_tuned_preds = tree_tuned_model.transform(X_test)

In [41]:
tree_tuned_rmse = evaluator.evaluate(tree_tuned_preds)
tree_tuned_rmse

8.312011342106903

In [42]:
df_val_trans = assembler.transform(df_val)
df_val_cv = df_val.drop(*(COHORTS_DIMENSIONS+['features']))

In [44]:
tree_cv_preds_val = tree_tuned_model.transform(df_val_trans)
tree_cv_rmse_val = evaluator.evaluate(tree_cv_preds_val)
tree_cv_rmse_val

2.5888947600459935

In [46]:
tree_tuned_model.save('models/decision_tree')

#### Random Forest

In [48]:
rf = RandomForestRegressor(
    labelCol=TARGET_VAR,
    featuresCol=FEATURES_COL,
    seed=SEED
)

##### Default Random Forest

In [49]:
rf_default = rf.fit(X_train)
rf_preds = rf_default.transform(X_test)

In [50]:
rf_rmse = evaluator.evaluate(rf_preds)
rf_rmse

8.557255582745627

In [52]:
# print params to see defaults
{i.name: rf_default.extractParamMap()[i] for i in rf_default.extractParamMap()}

{'cacheNodeIds': False,
 'checkpointInterval': 10,
 'featureSubsetStrategy': 'auto',
 'featuresCol': 'features',
 'impurity': 'variance',
 'labelCol': 'cohort_ltv_avg_lifetime',
 'maxBins': 32,
 'maxDepth': 5,
 'maxMemoryInMB': 256,
 'minInfoGain': 0.0,
 'minInstancesPerNode': 1,
 'numTrees': 20,
 'predictionCol': 'prediction',
 'seed': 511,
 'subsamplingRate': 1.0}

##### Tuned Random Forest

In [53]:
pipeline = Pipeline(stages=[assembler, rf])
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxBins, [8, 16, 32]) \
    .addGrid(rf.maxDepth, [i for i in range(2, 10)]) \
    .addGrid(rf.minInstancesPerNode, [1, 10, 50, 100]) \
    .addGrid(rf.numTrees, [20, 50, 100]) \
    .build()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(labelCol=TARGET_VAR, predictionCol="prediction", metricName="rmse"),
    numFolds=10,
    seed=SEED
)

In [54]:
rf_cv = crossval.fit(X_train_cv)
rf_cv_best = rf_cv.bestModel

In [55]:
rf_cv_best_params = rf_cv_best.stages[-1].extractParamMap()
rf_cv_best_params_dict = {param.name: rf_cv_best_params[param] for param in rf_cv_best_params}

In [56]:
rf_cv_best_params_dict['labelCol'] = TARGET_VAR
rf_cv_best_params_dict['featuresCol'] = FEATURES_COL
rf_cv_best_params_dict['seed'] = SEED

In [57]:
rf_tuned = RandomForestRegressor(
    **rf_cv_best_params_dict
)

In [65]:
rf_cv_best_params_dict

{'cacheNodeIds': False,
 'checkpointInterval': 10,
 'featureSubsetStrategy': 'auto',
 'featuresCol': 'features',
 'impurity': 'variance',
 'labelCol': 'cohort_ltv_avg_lifetime',
 'maxBins': 32,
 'maxDepth': 7,
 'maxMemoryInMB': 256,
 'minInfoGain': 0.0,
 'minInstancesPerNode': 10,
 'numTrees': 100,
 'predictionCol': 'prediction',
 'seed': 511,
 'subsamplingRate': 1.0}

In [None]:
tree_cv = crossval.fit(X_train_cv)
tree_cv_best = tree_cv.bestModel

In [59]:
rf_tuned_model = rf_tuned.fit(X_train)
rf_tuned_preds = rf_tuned_model.transform(X_test)

In [63]:
rf_tuned_rmse = evaluator.evaluate(rf_tuned_preds)
rf_tuned_rmse

8.110840700950956

In [64]:
rf_cv_preds_val = rf_tuned_model.transform(df_val_trans)
rf_cv_rmse_val = evaluator.evaluate(rf_cv_preds_val)
rf_cv_rmse_val

2.3205526127076106

In [69]:
with open('models_params/random_forest_params.json', 'w') as f:
    json.dump(rf_cv_best_params_dict, f, indent=4, ensure_ascii=False)

##### Dim red didn't help

Some improvement is achieved. However, we can note the d_* features are quite sparse. We can try to reduce dimensionality by adding PCA to the pipeline

In [10]:
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)

In [39]:
STATIC_COLS = ['first_touch_date', 'traffic_source', 'os', 'country', 'cohort_ltv_avg_lifetime', 'avg_session_duration_sec', 'avg_sessions_count']
feature_list = [col for col in df_train.columns if (col not in COHORTS_DIMENSIONS) and (col != TARGET_VAR) and (col not in STATIC_COLS)]
assembler = VectorAssembler(inputCols=feature_list, outputCol='features')
df_train_trans_norm = assembler.transform(df_train)
df_train_trans_norm = normalizer.transform(df_train_trans_norm)

In [40]:
pca = PCA(k=4, inputCol='normFeatures', outputCol='pca_features')
pca_model = pca.fit(df_train_trans_norm)
pca_model.explainedVariance

DenseVector([0.5616, 0.2025, 0.1179, 0.1107])

In [41]:
df_train_trans_norm = pca_model.transform(df_train_trans_norm)

In [42]:
def ith_(v, i):
   return float(v[i])
ith = udf(ith_, DoubleType())

In [43]:
df_train_trans_norm = df_train_trans_norm.withColumn('pca_1', (ith(df_train_trans_norm.pca_features, lit(0)))) \
                                         .withColumn('pca_2', (ith(df_train_trans_norm.pca_features, lit(1)))) \
                                         .withColumn('pca_3', (ith(df_train_trans_norm.pca_features, lit(2)))) \
                                         .withColumn('pca_4', (ith(df_train_trans_norm.pca_features, lit(3))))

In [52]:
feature_list = [col for col in df_train_trans_norm.columns if (col not in COHORTS_DIMENSIONS) and (col != TARGET_VAR) and ('_d_' not in col) and ('ature') not in col]
assembler = VectorAssembler(inputCols=feature_list, outputCol='reduced_features')
df_train_trans_pca = assembler.transform(df_train_trans_norm)

In [53]:
(X_train_pca, X_test_pca) = df_train_trans_pca.randomSplit([0.8, 0.2], seed=143)
X_train_cv_pca = X_train_pca.drop(*(COHORTS_DIMENSIONS+['features']))
X_test_cv_pca = X_test_pca.drop(*(COHORTS_DIMENSIONS+['features']))

In [62]:
tree_pca = DecisionTreeRegressor(
    labelCol=TARGET_VAR,
    featuresCol='reduced_features'
)

In [63]:
tree_default_pca = tree_pca.fit(X_train_pca)
tree_preds_pca = tree_default_pca.transform(X_test_pca)

In [64]:
tree_pca_rmse = evaluator.evaluate(tree_preds_pca)
tree_pca_rmse

9.215573985681894

In [66]:
df_train_trans_pca.select('reduced_features').limit(3).show()

+--------------------+
|    reduced_features|
+--------------------+
|[111.0,1.0,-0.890...|
|(6,[0,1],[353.0,3...|
|[87.5,3.5,0.20026...|
+--------------------+

