# \[07\] Auto-Experiments

In [23]:
# Input event log. The commented line is the alternative "mini" file.
EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/sparkify_event_data.json"
# EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json"

# Derived data sets keep the input file name and live under output/ with a step prefix.
_INPUT_DIR = "/sparkify/"
_OUTPUT_DIR = "/sparkify/output/"
CLEAN_DATA_URL = EVENT_DATA_URL.replace(_INPUT_DIR, _OUTPUT_DIR + "02-cleaned-")
WEEK_AGGREGATED_DATA_URL = EVENT_DATA_URL.replace(_INPUT_DIR, _OUTPUT_DIR + "04-week-aggregated-")

# Executor sizing passed to the Spark session builder.
EXECUTOR_INSTANCES = 2
EXECUTOR_MEM = "6g"

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from cryptography.fernet import Fernet
import base64
import socket

!./install-s3-jars.sh

def decrypt(encrypted_text):
    """
    Decrypt a Fernet token with the master seed stored in the file ".seed.txt".

    The stripped seed is repeated, cut to 32 characters and base64-encoded
    to form the Fernet key.

    Input: encrypted_text - the encrypted token as an ASCII string

    Output: the decrypted text as a string. If the token was not encrypted
            with the same seed, an exception is raised.
    """
    with open('.seed.txt') as seed_file:
        seed = seed_file.read().strip()
    # Stretch (or cut) the seed to exactly 32 characters of key material.
    key_material = (seed * 32)[:32].encode('ascii')
    fernet_key = base64.b64encode(key_material).decode('ascii')
    cipher = Fernet(fernet_key)
    token = encrypted_text.encode('ascii')
    return cipher.decrypt(token).decode('ascii')

# NOTE(review): the access key id is committed in clear text; consider reading it
# from the environment or a secret store instead.
AWS_ACCESS_KEY_ID='V6ge1JcQpvyYGJjb'
# The secret key is stored as an encrypted token and decrypted with the seed from ".seed.txt".
AWS_SECRET_ACCESS_KEY = decrypt('gAAAAABkDFI6865LaVJVgtTYo0aMx9-JTPbTo6cwOUjg5eNNPsZhBDoHbRZ8xuXQT0ImNfvqcecZuoJd1VzYQEpBaxyCnKvosii8O1KeqoL2NwKdKtL_AUfT4eW4dvJVP--VjEvc0gB4')
# IP address this host name resolves to; handed to Spark as "spark.driver.host" below.
OWN_IP=socket.gethostbyname(socket.gethostname())
APP_NAME = "Sparkify"
SPARK_MASTER = "spark://bit-spark-master-svc.spark.svc.cluster.local:7077"
# Host used as the S3A endpoint (the service name suggests MinIO - confirm).
S3_HOST = "minio-api-service.minio.svc"

# Build (or reuse) the Spark session: S3A jars, driver host, S3A endpoint without SSL and
# with path-style access, the credentials from above, and the executor sizing constants.
print(f'### SETUP SPARK SESSION "{APP_NAME}"')
spark = SparkSession.builder \
    .master(SPARK_MASTER) \
    .config("spark.jars","/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar,/home/jovyan/jars/hadoop-aws-3.3.2.jar") \
    .config("spark.driver.host", OWN_IP) \
    .config("spark.hadoop.fs.s3a.endpoint", S3_HOST) \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.executor.instances", EXECUTOR_INSTANCES) \
    .config("spark.executor.memory", EXECUTOR_MEM) \
    .appName(APP_NAME).getOrCreate()
print(f"Spark version: {spark.version}")
sc = spark.sparkContext
# Only log warnings and errors from here on.
sc.setLogLevel("WARN")



### SETUP SPARK SESSION "Sparkify"
Spark version: 3.3.2


In [24]:
# Read the per-user, per-week aggregates (JSON) and mark them for caching:
# aggregate_week_data() and create_test_data() filter this DataFrame on every call.
print(f"### LOAD DATA {WEEK_AGGREGATED_DATA_URL}")
df_userweek = spark.read.json(WEEK_AGGREGATED_DATA_URL)
print(f"### PERSIST")
# Keep a separate handle on the persisted DataFrame next to the working name.
df_userweek_persist = df_userweek.persist()
df_userweek = df_userweek_persist


### LOAD DATA s3a://udacity-dsnd/sparkify/output/04-week-aggregated-sparkify_event_data.json
### PERSIST


In [57]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel, DecisionTreeClassifier, DecisionTreeClassificationModel, LinearSVC
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler, MaxAbsScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime

# Durations in milliseconds, matching "ts" timestamps given in milliseconds.
one_hour = 60 * 60 * 1000   #     3,600,000
one_day = 24 * one_hour     #    86,400,000
one_week = 7 * one_day      #   604,800,000
one_month = 28 * one_day    # 2,419,200,000 (a "month" is 28 days here)

def logresult(text):
    """
    Report one result line: print it and append it to the file "result.log".

    Input: text - the line to report; a newline is added in the log file
    """
    print(text)
    with open("result.log", "a") as log_file:
        log_file.write(text + "\n")

def oversample(df_train):
    """
    Balance the labels by appending copies of the label-1 rows.

    The label-1 rows are unioned onto the data until their total count
    exceeds the number of label-0 rows, so the result may overshoot slightly.

    Input: df_train - DataFrame with a "label" column holding 0 or 1

    Output: df_train plus zero or more extra copies of its label-1 rows.
            If there are no label-1 rows, df_train is returned unchanged.
    """
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    train1cnt = df_lab1.count()
    if train1cnt == 0:
        # Nothing to replicate; without this guard the loop below never terminates.
        print(f"oversampling skipped: no label-1 rows (label-0: {train0cnt})")
        return df_train
    oversampled_train = df_train
    sum1cnt = train1cnt
    while sum1cnt <= train0cnt:
        sum1cnt = sum1cnt+train1cnt
        print(f"oversampling to: {sum1cnt}/{train0cnt}")
        oversampled_train = oversampled_train.union(df_lab1)
    return oversampled_train

def downsample(df_train, factor):
    """
    Balance the labels by keeping only a random sample of the label-0 rows.

    Input:
      df_train - DataFrame with a "label" column holding 0 or 1
      factor   - non-negative number; about (label-1 count / factor) label-0
                 rows are kept, so factor=1 gives roughly equal class sizes

    Output: the sampled label-0 rows followed by all label-1 rows.

    Raises: ValueError if factor is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor}")
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    print(f"orig-label-0: {train0cnt}")
    train1cnt = df_lab1.count()
    print(f"orig-label-1: {train1cnt}")
    # The +1 keeps the division defined when there are no label-0 rows or factor is 0.
    # sample() without replacement needs a fraction in [0, 1], so cap it: when label-1
    # already outnumbers the requested share, all label-0 rows are kept.
    frac = min(1.0, train1cnt/(factor*train0cnt+1))
    df_downsampled = df_lab0.sample(fraction = frac, seed=42)
    df_downsampled = df_downsampled.union(df_lab1)
    print(f"downsampled label-1 = {train1cnt}, label-0 ~ {train0cnt*frac}")
    return df_downsampled

def add_weight_col(df_train):
    """
    Append a "weight" column that counterbalances the label frequencies.

    Each row is weighted with the share of the *other* label, so rows of the
    rarer label get the larger weight.

    Input: df_train - DataFrame with a numeric "label" column holding 0 or 1

    Output: df_train with the additional column "weight"
    """
    totals = df_train.agg(
        F.sum(F.col("label")).alias("l1"),
        F.sum(1-F.col("label")).alias("l0"),
    ).collect()[0]
    negatives, positives = totals.l0, totals.l1

    weight_positive = negatives / (negatives + positives)
    weight_negative = positives / (negatives + positives)

    print(f"label 0: {negatives}, label 1: {positives}")
    is_positive = F.col("label") == 1
    weight = F.when(is_positive, F.lit(weight_positive)).otherwise(F.lit(weight_negative))
    return df_train.withColumn("weight", weight)
    

def prefix_columns(df_orig, prefix, do_not_change_cols):
    """
    Rename the columns of df_orig by putting prefix in front of each name.

    Input:
      df_orig            - DataFrame whose columns are renamed
      prefix             - string prepended to the column names
      do_not_change_cols - column names that keep their original name

    Output: DataFrame with the same data and the new column names
    """
    renamed = []
    for name in df_orig.columns:
        if name in do_not_change_cols:
            renamed.append(name)
        else:
            renamed.append(prefix + name)
    return df_orig.toDF(*renamed)

def aggregate_week_data(from_week, to_week):
    """
    Sum the weekly user data of the global df_userweek over a range of weeks.

    Input: from_week, to_week - first and last week id ("wid"), both inclusive
    Output: one row per userId with the summed columns; the columns
            "paid", "usermale", "userregistration" and "wid" are left out
    """
    dropcols = ["paid", "usermale", "userregistration", "wid"]
    week_id = F.col("wid")
    df_weeks = df_userweek.where((week_id >= from_week) & (week_id <= to_week))
    if from_week != to_week:
        excluded = ["userId", *dropcols]
        sums = [F.sum(F.col(name)).alias(name) for name in df_weeks.columns if name not in excluded]
        return df_weeks.groupBy("userId").agg(*sums)
    # a single week needs no aggregation, only the column selection
    return df_weeks.drop(*dropcols)


def create_test_data(CF, current_week):
    """
    Build one labeled feature row per user for the reference week current_week.

    Week ids ("wid") are used as follows (all ranges inclusive):
      label weeks:   current_week - FUTURE_LOOKAHEAD_WEEKS .. current_week - 1
      near history:  current_week .. current_week + PAST_NEAR_HISTORY_WEEKS - 1
      old history:   the PAST_OLD_HISTORY_WEEKS week ids after the near history
    NOTE(review): the label weeks have the smaller ids, so ids presumably count
    backwards in time - confirm against the week-aggregation step.

    Input:
      CF           - dict with the keys "CHURN" ("cancel", "down" or "canceldown"),
                     "FUTURE_LOOKAHEAD_WEEKS", "PAST_NEAR_HISTORY_WEEKS" and
                     "PAST_OLD_HISTORY_WEEKS"
      current_week - week id of the first near-history week

    Output: DataFrame with userId, wid, paid, usermale, userregistration, label and
            the feature families
              nh_*  / oh_*   summed near / old history values
              nhn_* / ohn_*  the same values per session hour (session_hours and
                             session_start are copied unchanged)
              r_*            nhn / max(0.01, ohn)
              d_*            nhn - ohn
            Only users present in all three week ranges are returned (inner joins).

    Raises: ValueError if CF["CHURN"] is not one of the supported values.
    """
    # Pick the event count that defines churn first, so a bad config fails early.
    churn_kind = CF["CHURN"]
    if churn_kind == "cancel":
        churn_events = F.col("pg_cancellation_confirmation")
    elif churn_kind == "down":
        churn_events = F.col("pg_submit_downgrade")
    elif churn_kind == "canceldown":
        churn_events = F.col("pg_cancellation_confirmation") + F.col("pg_submit_downgrade")
    else:
        raise ValueError(f"invalid value for CHURN {churn_kind}")

    label_week_min = current_week - CF["FUTURE_LOOKAHEAD_WEEKS"]
    label_week_max = current_week - 1

    newhistory_week_min = current_week
    newhistory_week_max = newhistory_week_min + CF["PAST_NEAR_HISTORY_WEEKS"] - 1

    oldhistory_week_min = newhistory_week_max + 1
    oldhistory_week_max = oldhistory_week_min + CF["PAST_OLD_HISTORY_WEEKS"] - 1

    df_user = df_userweek.where(F.col("wid") == newhistory_week_min).select("userId", "wid", "paid", "usermale", "userregistration")
    # Shift userregistration by 7 per week id (presumably a value in days - confirm).
    df_user = df_user.withColumn("userregistration", F.col("userregistration") - 7 * newhistory_week_min)

    df_label = aggregate_week_data(label_week_min, label_week_max)
    df_newhistory = aggregate_week_data(newhistory_week_min, newhistory_week_max)
    df_oldhistory = aggregate_week_data(oldhistory_week_min, oldhistory_week_max)

    # label = 1 if at least one churn event happened in the label weeks
    label = F.when(churn_events > 0, F.lit(1)).otherwise(F.lit(0)).alias("label")
    df_user = df_user.join(df_label.select("userId", label), "userId")

    df_user = df_user.join(prefix_columns(df_newhistory, "nh_", ["userId"]), "userId")
    df_user = df_user.join(prefix_columns(df_oldhistory, "oh_", ["userId"]), "userId")

    not_normalized = ("userId", "session_hours", "session_start")

    def per_session_hour(raw_prefix, norm_prefix, columns):
        """Return the normalized columns for one history window, in output order."""
        # never divide by less than 0.01 hours
        hours = F.greatest(F.col(raw_prefix + "session_hours"), F.lit(0.01))
        rates = [(F.col(raw_prefix + c) / hours).alias(norm_prefix + c)
                 for c in columns if c not in not_normalized]
        copied = [F.col(raw_prefix + c).alias(norm_prefix + c)
                  for c in ("session_hours", "session_start")]
        return rates + copied

    # Add all derived columns with two projections instead of one withColumn call per
    # column: repeated withColumn calls in a loop create a very large query plan.
    df_user = df_user.select(
        "*",
        *per_session_hour("oh_", "ohn_", df_oldhistory.columns),
        *per_session_hour("nh_", "nhn_", df_newhistory.columns),
    )

    compared = [c for c in df_newhistory.columns if c != "userId"]
    ratios = [(F.col("nhn_" + c) / F.greatest(F.lit(0.01), F.col("ohn_" + c))).alias("r_" + c)
              for c in compared]
    deltas = [(F.col("nhn_" + c) - F.col("ohn_" + c)).alias("d_" + c) for c in compared]
    return df_user.select("*", *ratios, *deltas)



def confuse(df_test_pred):
    """
    Log the confusion matrix and the derived metrics of a prediction result.

    Everything is reported through logresult(). The Matthews correlation
    coefficient is logged as -9 when it is undefined (zero denominator);
    the other metrics are 0 when their denominator is zero.

    Input: df_test_pred - DataFrame with the columns "label" and "prediction";
                          rows with other values than 0/1 are ignored

    Output: tuple (accuracy, precision, recall, f1)
    """
    label = F.col("label")
    prediction = F.col("prediction")

    def cell(label_value, prediction_value):
        """Aggregate expression counting one cell of the confusion matrix."""
        hit = (label == label_value) & (prediction == prediction_value)
        return F.sum(F.when(hit, 1).otherwise(0)).alias(f"n{label_value}{prediction_value}")

    # One Spark job for all four cells instead of four separate count() jobs,
    # each of which would evaluate the whole prediction pipeline again.
    cells = df_test_pred.agg(cell(0, 0), cell(0, 1), cell(1, 0), cell(1, 1)).collect()[0]
    # sum() returns NULL for an empty DataFrame
    n00 = cells["n00"] or 0
    n01 = cells["n01"] or 0
    n10 = cells["n10"] or 0
    n11 = cells["n11"] or 0
    s00 = "{:5d}".format(n00)
    s01 = "{:5d}".format(n01)
    s10 = "{:5d}".format(n10)
    s11 = "{:5d}".format(n11)
    logresult(f"                  ")
    logresult(f" Confusion Matrix: ")
    logresult(f"                  ")
    logresult(f"     | prediction| ")
    logresult(f"     |   0 |  1  | ")
    logresult(f" ----+-----+-----+ ")
    logresult(f" l 0 |{s00}|{s01}| ")
    logresult(f" b --+-----+-----+ ")
    logresult(f" l 1 |{s10}|{s11}| ")
    logresult(f" ----+-----+-----+ ")
    logresult(f"                   ")
    TP = n11
    TN = n00
    FP = n01
    FN = n10
    accuracy = 0
    if TP+TN+FP+FN!=0:
        accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = 0
    if TP+FP!=0:
        precision = TP/(TP+FP)
    recall = 0
    if TP+FN!=0:
        recall = TP/(TP+FN)
    f1 = 0
    if precision+recall!=0:
        f1 = 2*precision*recall/(precision+recall)
    logresult(f"  accuraccy: {accuracy}")
    logresult(f"  precision: {precision}")
    logresult(f"  recall:    {recall}")
    logresult(f"  f1:        {f1}")
    # https://towardsdatascience.com/matthews-correlation-coefficient-when-to-use-it-and-when-to-avoid-it-310b3c923f7e
    mcc = -9
    # denominator of the MCC formula (before the square root)
    nenn = (TN+FN)*(FP+TP)*(TN+FP)*(FN+TP)
    if nenn!=0:
        mcc = (TN*TP-FP*FN)/math.sqrt(nenn)
    logresult(f"  mcc:       {mcc}")
    return (accuracy, precision, recall, f1)
    
    
def hyper_tune_rf(config, num_tree_values, max_depth_values):   # 20,5
    """
    Grid-search a RandomForestClassifier and keep the model with the best f1.

    Trains on the global df_train and evaluates on the global df_test
    (column "features"). The "weight" column is used as weightCol when
    df_train has one.

    Input:
      config           - text identifying the experiment, logged in the banner
      num_tree_values  - list of numTrees values to try
      max_depth_values - list of maxDepth values to try

    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?")
            if no combination was tried
    """
    # Only pass weightCol if the training data carries the column, so the function
    # also works on down-/oversampled data without weights.
    extra_params = {"weightCol": "weight"} if "weight" in df_train.columns else {}
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for num_trees in num_tree_values:
        for max_depth in max_depth_values:
            model_name = f"rf_{num_trees}_{max_depth}"
            rf = RandomForestClassifier(featuresCol="features", numTrees=num_trees, maxDepth=max_depth, seed=42, **extra_params)
            rf_model = rf.fit(df_train)
            predict_test  = rf_model.transform(df_test)
            logresult(f"")
            logresult(f"------------------------------------")
            logresult(f"TRAINING {config} {model_name}")
            logresult(f"------------------------------------")
            accuracy, precision, recall, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = rf_model
                best_model_name = model_name
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)


def hyper_tune_lr(max_iters, reg_params, elastic_net_params, threshold=0.15): # 100, 0, 0
    """
    Grid-search a LinearRegression and keep the model with the lowest test RMSE.

    Trains on the global df_train and evaluates on the global df_test. The
    regression output is also turned into a 0/1 prediction to report the
    confusion matrix, but the model selection is based on the RMSE only.

    Input:
      max_iters          - list of maxIter values to try
      reg_params         - list of regParam values to try
      elastic_net_params - list of elasticNetParam values to try
      threshold          - regression outputs >= threshold count as prediction 1

    Output: tuple (best_model, best_err, best_model_name) where best_err is
            the RMSE of the best model
    """
    # https://towardsdatascience.com/beginners-guide-to-linear-regression-with-pyspark-bfc39b45a9e9
    evaluator = RegressionEvaluator(predictionCol="prediction_orig", labelCol="label", metricName="rmse")

    best_err = float("inf")
    best_f1 = None
    best_model = None
    best_model_name = "?"
    for max_iter in max_iters:
        for reg_param in reg_params:
            for elastic_net_param in elastic_net_params:
                model_name = f"lr_{max_iter}_{reg_param}_{elastic_net_param}"
                lr = LinearRegression(featuresCol="features", maxIter= max_iter, regParam=reg_param, elasticNetParam=elastic_net_param)
                model = lr.fit(df_train)
                predict_test  = model.transform(df_test)
                # keep the raw regression output; "prediction" becomes the 0/1 class below
                predict_test = predict_test.withColumnRenamed("prediction", "prediction_orig")
                err = evaluator.evaluate(predict_test)
                print(f"err: {err}")
                predict_test = predict_test.withColumn("prediction", F.when(F.col("prediction_orig")>=threshold,1).otherwise(0))
                accuracy, precision, recall, f1 = confuse(predict_test)
                print(f"  {model_name}: f1 {f1}")
                if err < best_err:
                    best_err = err
                    best_f1 = f1
                    best_model = model
                    best_model_name = model_name
    # report the values of the selected model, not those of the last iteration
    print(f"best rmse {best_err} (f1 {best_f1}) for {best_model_name}")
    return (best_model, best_err, best_model_name)


def hyper_tune_dt(max_depths, max_bins_list):
    """
    Grid-search a DecisionTreeClassifier and keep the model with the best f1.

    Trains on the global df_train and evaluates on the global df_test.

    Input:
      max_depths    - list of maxDepth values to try
      max_bins_list - list of maxBins values to try

    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?")
            if no combination was tried
    """
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for max_depth in max_depths:
        for max_bins in max_bins_list:
            model_name = f"dt_{max_depth}_{max_bins}"
            dt = DecisionTreeClassifier(featuresCol="features", maxDepth=max_depth, maxBins=max_bins)
            model = dt.fit(df_train)
            predict_test  = model.transform(df_test)
            accuracy, precision, recall, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_model_name = model_name
    # report the best score, not the score of the last combination
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)


def hyper_tune_sv(max_iters, reg_params):
    """
    Grid-search a LinearSVC and keep the model with the best f1.

    Trains on the global df_train and evaluates on the global df_test.

    Input:
      max_iters  - list of maxIter values to try
      reg_params - list of regParam values to try

    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?")
            if no combination was tried
    """
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for max_iter in max_iters:
        for reg_param in reg_params:
            model_name = f"svm_{max_iter}_{reg_param}"
            lsvc = LinearSVC(featuresCol="features", maxIter=max_iter, regParam=reg_param)
            model = lsvc.fit(df_train)
            predict_test  = model.transform(df_test)
            accuracy, precision, recall, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_model_name = model_name
    # report the best score, not the score of the last combination
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)
    

In [41]:
def create_train_test_data(CF):
    """
    Build the assembled feature table for the reference weeks 1 to 4.

    Input: CF - config dict as expected by create_test_data(); CF["FEATURE_COLS"]
                lists the column-name prefixes that become features

    Output: DataFrame with the columns userId, wid, label and features, where
            features holds "paid", "usermale", "userregistration" followed by
            all columns starting with one of the prefixes
    """
    df_testtrain = create_test_data(CF, 1)
    for reference_week in (2, 3, 4):
        df_testtrain = df_testtrain.union(create_test_data(CF, reference_week))

    featureCols = ["paid", "usermale", "userregistration"]
    for prefix in CF["FEATURE_COLS"]:
        featureCols = featureCols + [name for name in df_testtrain.columns if name.startswith(prefix)]

    assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
    assembled = assembler.transform(df_testtrain)
    return assembled.select("userId", "wid", "label","features")

In [42]:
# Experiment grids. Every value is a list of alternatives; the grid-search cell
# iterates over all combinations of the values of one dict.
# "MODEL" entries have the form {"rf": [num_tree_values, max_depth_values]}.
# train_configs_1: a single baseline combination, then a grid over
# FUTURE_LOOKAHEAD_WEEKS and PAST_NEAR_HISTORY_WEEKS with 99 old-history weeks.
# (Not referenced by the cells visible in this notebook.)
train_configs_1 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [1],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    },
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2, 1],
        "PAST_NEAR_HISTORY_WEEKS": [2, 1],
        "PAST_OLD_HISTORY_WEEKS":  [99],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# train_configs_2: varies only the DOWNSAMPLE factor (0.25, 0.5, 0.75).
train_configs_2 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [99],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [0.25, 0.5, 0.75],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# train_configs_3: varies only the number of old-history weeks (1 to 5).
train_configs_3 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1,2,3,4,5],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# train_configs_4: varies the churn definition, using all six feature families.
train_configs_4 = [
    {
        "CHURN":                   ["canceldown", "cancel", "down"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_", "oh_", "nhn_", "ohn_", "d_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]


# train_configs_5: tries every pair of the six feature families.
train_configs_5 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_"],
                                    ["r_", "oh_"], 
                                    ["r_", "nhn_"], 
                                    ["r_", "ohn_"], 
                                    ["r_", "d_"], 
                                    ["nh_", "oh_"], 
                                    ["nh_", "nhn_"], 
                                    ["nh_", "ohn_"], 
                                    ["nh_", "d_"], 
                                    ["oh_", "nhn_"], 
                                    ["oh_", "ohn_"], 
                                    ["oh_", "d_"], 
                                    ["nhn_", "ohn_"], 
                                    ["nhn_", "d_"], 
                                    ["ohn_", "d_"]
                                   ],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# train_configs_6: tries each of the six feature families on its own.
train_configs_6 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_"], ["nh_"], ["oh_"], ["nhn_"], ["ohn_"], ["d_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# train_configs: the grid that the grid-search cell actually iterates over.
# Every value is a list of alternatives; here each list has exactly one entry.
train_configs = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [4],
        "FEATURE_COLS":            [["nhn_", "ohn_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]


In [43]:
# Single experiment configuration (plain values, not lists of alternatives)
# for running the steps of one experiment cell by cell.
CF = {
        "CHURN":                   "canceldown",
        "FUTURE_LOOKAHEAD_WEEKS":  2,
        "PAST_NEAR_HISTORY_WEEKS": 1,
        "PAST_OLD_HISTORY_WEEKS":  4,
        "FEATURE_COLS":            ["nhn_", "ohn_"],
        "DOWNSAMPLE":              1,
        "MODEL":                   {"rf": [[10], [4]]}
    }



In [None]:
# Build the assembled feature table for CF and split it 70/30 into
# train and test set; the fixed seed keeps the split repeatable.
df_testtrain_vec = create_train_test_data(CF)
df_train, df_test = df_testtrain_vec.randomSplit([0.7, 0.3], seed=42)

In [61]:
# Join the feature prefixes into one tag by stripping the list punctuation
# from the list's string form, e.g. ["nhn_", "ohn_"] -> "nhn_ohn_".
prefixes = str(CF["FEATURE_COLS"]).replace("[","").replace("]","").replace("'","").replace(",","").replace(" ","")
# Experiment label used in the result log; the "-wc" suffix presumably marks
# the run with the weight column - confirm.
configstr = f'{CF["CHURN"]}-{CF["FUTURE_LOOKAHEAD_WEEKS"]}-{CF["PAST_NEAR_HISTORY_WEEKS"]}-{CF["PAST_OLD_HISTORY_WEEKS"]}-{prefixes}-{CF["DOWNSAMPLE"]}-wc'
print(f"{configstr}")

canceldown-2-1-4-nhn_ohn_-1-wc


In [58]:
# Add the class-balancing "weight" column to the training set.
df_train = add_weight_col(df_train)

label 0: 25132, label 1: 4431


In [45]:
# Cache the training set; it is reused by every fit of the tuning functions.
df_train = df_train.persist()

In [60]:
# Override the model grid: one random forest with numTrees=100 and maxDepth=5.
CF["MODEL"] = {"rf": [[100], [5]]}


In [62]:
# Train/evaluate the random forest grid; the two lists in CF["MODEL"]["rf"]
# are unpacked into num_tree_values and max_depth_values.
model, f1, model_name = hyper_tune_rf(configstr, *CF["MODEL"]["rf"])


------------------------------------
TRAINING canceldown-2-1-4-nhn_ohn_-1-wc rf_100_5
------------------------------------
                  
 Confusion Matrix: 
                  
     | prediction| 
     |   0 |  1  | 
 ----+-----+-----+ 
 l 0 | 7179| 3755| 
 b --+-----+-----+ 
 l 1 |  673| 1197| 
 ----+-----+-----+ 
                   
  accuraccy: 0.6541705716963448
  precision: 0.2417205169628433
  recall:    0.6401069518716578
  f1:        0.35092348284960423
  mcc:       0.21514051325250577
  rf_100_5: f1 0.35092348284960423
best f1 0.35092348284960423 for rf_100_5


In [56]:
# Peek at the first five rows of the training set.
df_train.show(5)

+-------+---+-----+--------------------+------------------+
| userId|wid|label|            features|            weight|
+-------+---+-----+--------------------+------------------+
|1030587|  1|    0|(57,[0,2,3,5,6,9,...|0.8501166999289652|
|1033297|  1|    0|(57,[1,2,3,5,6,11...|0.8501166999289652|
|1069552|  1|    0|(57,[2,3,15,17,24...|0.8501166999289652|
|1083324|  1|    0|(57,[1,2,3,5,6,12...|0.8501166999289652|
|1102913|  1|    0|(57,[0,1,2,3,5,6,...|0.8501166999289652|
+-------+---+-----+--------------------+------------------+
only showing top 5 rows



In [15]:
# Grid search: run one random forest experiment for every combination of the
# alternatives listed in each dict of train_configs.
import itertools

CF = {}
for train_config in train_configs:
    # product() iterates in the same order as nested loops over these lists would
    combinations = itertools.product(
        train_config["CHURN"],
        train_config["FUTURE_LOOKAHEAD_WEEKS"],
        train_config["PAST_NEAR_HISTORY_WEEKS"],
        train_config["PAST_OLD_HISTORY_WEEKS"],
        train_config["FEATURE_COLS"],
        train_config["DOWNSAMPLE"],
        train_config["MODEL"],
    )
    for (CHURN, FUTURE_LOOKAHEAD_WEEKS, PAST_NEAR_HISTORY_WEEKS, PAST_OLD_HISTORY_WEEKS,
         FEATURE_COLS, DOWNSAMPLE, MODEL) in combinations:
        CF["CHURN"] = CHURN
        CF["FUTURE_LOOKAHEAD_WEEKS"] = FUTURE_LOOKAHEAD_WEEKS
        CF["PAST_NEAR_HISTORY_WEEKS"] = PAST_NEAR_HISTORY_WEEKS
        CF["PAST_OLD_HISTORY_WEEKS"] = PAST_OLD_HISTORY_WEEKS
        CF["FEATURE_COLS"] = FEATURE_COLS
        CF["DOWNSAMPLE"] = DOWNSAMPLE
        CF["MODEL"] = MODEL
        # f-string: without the f prefix the placeholders ended up literally in the URL
        CF["MODEL_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-model-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}").replace(".json", "")
        CF["TESTTRAIN_DATA_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-testtrain-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}")
        # NOTE(review): both URLs have no separator between the config part and the
        # file name ("...-canceldownsparkify_event_data") - confirm whether intended.
        prefixes = str(FEATURE_COLS).replace("[","").replace("]","").replace("'","").replace(",","").replace(" ","")
        configstr = f"{CHURN}-{FUTURE_LOOKAHEAD_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{PAST_OLD_HISTORY_WEEKS}-{prefixes}-{DOWNSAMPLE}"
        print(f"{configstr}")
        df_testtrain_vec = create_train_test_data(CF)

        print(f"### TRAIN / TEST SPLIT")
        df_train, df_test = df_testtrain_vec.randomSplit([0.7, 0.3], seed=42)

        # balance the training set only; the test set keeps its original distribution
        df_train = downsample(df_train, DOWNSAMPLE)

        df_train = df_train.persist()
        try:
            model, f1, model_name = hyper_tune_rf(configstr, *CF["MODEL"]["rf"])
        finally:
            # release the cache even if training fails, so later runs have the memory
            df_train.unpersist()

print("FINISHED")

canceldown-2-1-1-r_-1
### TRAIN / TEST SPLIT
orig-label-0: 20127
orig-label-1: 3993
downsampled label-1 = 3993, label-0 ~ 3992.80161963434

------------------------------------
TRAINING canceldown-2-1-1-r_-1 rf_10_4
------------------------------------
                  
 Confusion Matrix: 
                  
     | prediction| 
     |   0 |  1  | 
 ----+-----+-----+ 
 l 0 | 5210| 3656| 
 b --+-----+-----+ 
 l 1 |  607| 1075| 
 ----+-----+-----+ 
                   
  accuraccy: 0.5958475540386803
  precision: 0.22722468822659056
  recall:    0.6391200951248514
  f1:        0.33525651021362857
  mcc:       0.16692215220863454
  rf_10_4: f1 0.33525651021362857
best f1 0.33525651021362857 for rf_10_4
canceldown-2-1-1-nh_-1
### TRAIN / TEST SPLIT
orig-label-0: 20127
orig-label-1: 3993
downsampled label-1 = 3993, label-0 ~ 3992.80161963434

------------------------------------
TRAINING canceldown-2-1-1-nh_-1 rf_10_4
------------------------------------
                  
 Confusion Matrix:

Py4JJavaError: An error occurred while calling o90947.count.
: java.lang.OutOfMemoryError: Not enough memory to build and broadcast the table to all worker nodes. As a workaround, you can either disable broadcast by setting spark.sql.autoBroadcastJoinThreshold to -1 or increase the spark driver memory by setting spark.driver.memory to a higher value.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.notEnoughMemoryToBuildAndBroadcastTableError(QueryExecutionErrors.scala:1838)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:183)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:191)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)


In [16]:
# -----------------

# Write the test/train table as JSON, with the churn definition in the file name.
# NOTE(review): the bare name TESTTRAIN_DATA_URL is not assigned in any visible cell
# (only CF["TESTTRAIN_DATA_URL"] is) and df_testtrain is not assigned before this
# cell - it seems to rely on state from an earlier notebook version; confirm.
ttd_url = TESTTRAIN_DATA_URL.replace(".json",f"-churn{CHURN}.json")
print(f"### SAVING TESTTRAIN DATA {ttd_url}")
df_testtrain.write.format('json').mode('overwrite').save(ttd_url)
print(f"finished")

### SAVING TESTTRAIN DATA s3a://udacity-dsnd/sparkify/output/06-testtrain-sparkify_event_data-churncanceldown.json
finished


In [43]:
# -----------------

# Read the saved test/train table back and mark it for caching.
# NOTE(review): the bare name TESTTRAIN_DATA_URL is not assigned in any visible cell
# (only CF["TESTTRAIN_DATA_URL"] is) - confirm where it is expected to come from.
ttd_url = TESTTRAIN_DATA_URL.replace(".json",f"-churn{CHURN}.json")
print(f"### LOAD TESTTRAIN DATA {ttd_url}")
df_testtrain = spark.read.json(ttd_url)
print(f"### PERSIST")
df_testtrain_persist = df_testtrain.persist()
df_testtrain = df_testtrain_persist

### LOAD TESTTRAIN DATA s3a://udacity-dsnd/sparkify/output/06-testtrain-sparkify_event_data-churncanceldown.json
### PERSIST


In [17]:
# Keep a reference to the table as loaded, before any resampling.
df_testtrain_orig = df_testtrain

In [21]:
# df_testtrain = oversample(df_testtrain)
#df_testtrain = downsample(df_testtrain, 0.5)

In [19]:


# Manual variant of create_train_test_data(): assemble the feature vector from
# the loaded df_testtrain and split it into train and test set.
print(f"### CREATE FEATURE COLUMN")

# user attributes that are always part of the features
featureCols = ["paid", "usermale", "userregistration"]    

# add the ratio ("r_") and near-history ("nh_") columns
featureCols = [*featureCols, *[col for col in df_testtrain.columns if col.startswith("r_")]]
featureCols = [*featureCols, *[col for col in df_testtrain.columns if col.startswith("nh_")]]

# alternative: use every column except the key and label columns
#featureCols = [col for col in df_testtrain.columns if not col in ["userId", "wid", "label"]]
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
df_testtrain_vec=assembler.transform(df_testtrain).select("userId", "wid", "label","features")


# cache the assembled table
df_testtrain_vec_persist = df_testtrain_vec.persist()
df_testtrain_vec = df_testtrain_vec_persist

# -----------------

# 70/30 split with a fixed seed
print(f"### TRAIN / TEST SPLIT")
df_train, df_test = df_testtrain_vec.randomSplit([0.7, 0.3], seed=42)

# disabled: report the label distribution of both sets
#print(f"train: {df_train.count()}")
#print(f"  l1: {df_train.where(df_train.label==1).count()}")
#print(f"  l0: {df_train.where(df_train.label==0).count()}")
#print(f"test: {df_test.count()}")
#print(f"  l1: {df_test.where(df_test.label==1).count()}")
#print(f"  l0: {df_test.where(df_test.label==0).count()}")

# disabled: scale the features with a scaler fitted on the training set only
## Fit scaler to train dataset
#scaler = MaxAbsScaler().setInputCol('features').setOutputCol('scaled_features')
#df_train = df_train.drop("scaled_features")
#scaler_model = scaler.fit(df_train)
## Scale train and test features
#df_train = scaler_model.transform(df_train)
#df_test = df_test.drop("scaled_features")
#df_test = scaler_model.transform(df_test)

# -----------------
# keep references to the unsampled sets
df_test_orig = df_test
df_train_orig = df_train


### CREATE FEATURE COLUMN
### TRAIN / TEST SPLIT
orig-label-0: 23793
orig-label-1: 2644
downsampled label-1 = 2644, label-0 ~ 5287.555536877495


In [20]:
#df_train = oversample(df_train, 0.5)

In [52]:
# Report the size and the label distribution of the train and the test set.
# Every count() runs its own Spark job.
print(f"train: {df_train.count()}")
print(f"  l1: {df_train.where(df_train.label==1).count()}")
print(f"  l0: {df_train.where(df_train.label==0).count()}")
print(f"test: {df_test.count()}")
print(f"  l1: {df_test.where(df_test.label==1).count()}")
print(f"  l0: {df_test.where(df_test.label==0).count()}")


train: 5173
  l1: 2585
  l0: 2588
test: 2274
  l1: 1176
  l0: 1098


In [33]:
# Replicate the label-1 rows of the test set until both labels are about equally frequent.
# NOTE(review): metrics computed on a resampled test set (precision in particular)
# no longer reflect the original label distribution - confirm this is intended.
df_test = oversample(df_test)

oversampling to: 1400/10233
oversampling to: 2100/10233
oversampling to: 2800/10233
oversampling to: 3500/10233
oversampling to: 4200/10233
oversampling to: 4900/10233
oversampling to: 5600/10233
oversampling to: 6300/10233
oversampling to: 7000/10233
oversampling to: 7700/10233
oversampling to: 8400/10233
oversampling to: 9100/10233
oversampling to: 9800/10233
oversampling to: 10500/10233


In [9]:
# Keep the original training set, then balance the labels by oversampling label 1.
df_train_orig = df_train
df_train = oversample(df_train)

oversampling to: 2230/25009
oversampling to: 3345/25009
oversampling to: 4460/25009
oversampling to: 5575/25009
oversampling to: 6690/25009
oversampling to: 7805/25009
oversampling to: 8920/25009
oversampling to: 10035/25009
oversampling to: 11150/25009
oversampling to: 12265/25009
oversampling to: 13380/25009
oversampling to: 14495/25009
oversampling to: 15610/25009
oversampling to: 16725/25009
oversampling to: 17840/25009
oversampling to: 18955/25009
oversampling to: 20070/25009
oversampling to: 21185/25009
oversampling to: 22300/25009
oversampling to: 23415/25009
oversampling to: 24530/25009
oversampling to: 25645/25009


In [None]:
# hyper_tune_rf() takes the experiment label as first argument; without it the
# call fails with a TypeError (missing positional argument).
model, f1, model_name = hyper_tune_rf(configstr, [20], [5])

In [None]:
# Linear regression with maxIter=100, regParam=0 and elasticNetParam=0.
# Note: the second returned value is the RMSE of the best model, although it is bound to f1 here.
model, f1, model_name = hyper_tune_lr([100], [0], [0])  

In [50]:
# Number of training rows per user, largest first.
df_train.groupBy("userId").count().sort(F.desc(F.col("count"))).show()

+-------+-----+
| userId|count|
+-------+-----+
|1554956|    4|
|1339528|    4|
|1538485|    3|
|1125943|    3|
|1831733|    3|
|1586895|    3|
|1392770|    3|
|1373602|    3|
|1178026|    3|
|1602181|    3|
|1390064|    3|
|1591353|    3|
|1812177|    3|
|1141231|    3|
|1888253|    3|
|1116029|    3|
|1386578|    3|
|1558736|    3|
|1996408|    3|
|1037209|    3|
+-------+-----+
only showing top 20 rows



In [26]:
# Number of training rows of one specific user.
df_train.where(F.col("userId")=="1655208").count()

70

In [51]:
# -----------------

# Apply the current model to the train and the test set and show a sample.
print(f"### PREDICT TRAIN")
predict_train = model.transform(df_train)
predict_train.select("label", "prediction").show(10)
print(f"### PREDICT TEST")
predict_test  = model.transform(df_test)
predict_test.select("label", "prediction").show(10)

# -----------------

print(f"### EVALUATE PREDICTION")
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# metricName is left at its default; the messages below report it as area under ROC
evaluator = BinaryClassificationEvaluator(rawPredictionCol ='rawPrediction', labelCol ='label')
predict_test.select("label", "rawPrediction", "prediction", "probability").show(5)
print("The area under ROC for train set is {}".format(evaluator.evaluate(predict_train)))
print("The area under ROC for test set is {}".format(evaluator.evaluate(predict_test)))

# Confusion matrix and metrics for both sets; only the test metrics are kept.
print(f"### EVAL TRAIN:")
confuse(predict_train)
print(f"### EVAL TEST:")
acc, prec, rec, f1 = confuse(predict_test)


### PREDICT TRAIN
+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 10 rows

### PREDICT TEST
+-----+----------+
|label|prediction|
+-----+----------+
|    0|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       1.0|
|    0|       1.0|
|    0|       1.0|
|    0|       0.0|
|    0|       1.0|
|    0|       0.0|
+-----+----------+
only showing top 10 rows

### EVALUATE PREDICTION
+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[8.59056034958328...|       1.0|[0.42952801747916...|
|    0|[12.1124409849263...|       0.0|[0.60562204924631...|
|    0|[13.7858852326191...|       0.0|[0.6892942616

In [197]:
# -----------------

# Save the model; the target name contains the model name and the rounded f1.
# NOTE(review): the bare name MODEL_URL is not assigned in any visible cell
# (only CF["MODEL_URL"] is) - this cell seems to rely on state from an earlier
# notebook version; confirm.
print(f"### SAVE MODEL {model_name} {f1*100}")
model_url = f'{MODEL_URL}_{model_name}_f1val{round(f1,3)}'
# overwrite() replaces a model already saved under the same name
model.write().overwrite().save(model_url)
print(f"model saved to {model_url}")



### SAVE MODEL rf_20_5 22.481265611990008
model saved to s3a://udacity-dsnd/sparkify/output/05-model-sparkify_event_data_rf_20_5_f1val0.225


In [198]:
# Pair every entry of the model's feature importances with the feature name at
# the same position in featureCols and list the pairs, most important first.
featimp = model.featureImportances
nameimp = {featureCols[i]: featimp[i] for i in range(len(featimp))}
sorted(nameimp.items(), key=lambda item: item[1], reverse=True)

[('paid', 0.09554455155152178),
 ('oh_session_hours', 0.09302707160037064),
 ('oh_pg_thumbs_up', 0.07582709757031283),
 ('nh_pg_home', 0.07340830594860079),
 ('oh_pg_home', 0.0725284688157008),
 ('oh_session_start', 0.06713461595629495),
 ('oh_pg_settings', 0.050296622196977205),
 ('oh_pg_nextsong', 0.03470888922561419),
 ('nh_pg_logout', 0.031418303669647894),
 ('oh_pg_logout', 0.029338603181943555),
 ('oh_pg_thumbs_down', 0.02669508164427074),
 ('oh_pg_downgrade', 0.0263905768134274),
 ('oh_pg_add_to_playlist', 0.02189388493915634),
 ('nh_pg_thumbs_up', 0.021718938628132543),
 ('nh_pg_downgrade', 0.021610587051586076),
 ('oh_pg_login', 0.021310639336320675),
 ('nh_pg_thumbs_down', 0.020975045098593534),
 ('nh_pg_nextsong', 0.01929290425786421),
 ('nh_pg_add_to_playlist', 0.01864976547102134),
 ('nh_status_307', 0.016846133774795467),
 ('nh_session_hours', 0.016745889155579934),
 ('nh_pg_add_friend', 0.014856832748766466),
 ('oh_pg_roll_advert', 0.014159788446922988),
 ('nh_pg_setting

In [63]:
# Stop the Spark session when the experiments are done.
print("### STOP SPARK SESSION")
spark.stop()  

### STOP SPARK SESSION
