# \[06\] Hyperparameter Search

In [1]:
# Input event log; swap in the commented line below to run against the "mini" file instead.
EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/sparkify_event_data.json"
# EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json"

# Results of earlier pipeline steps are stored below ".../sparkify/output/",
# named "<step>-<original file name>".
_OUTPUT_URL_TEMPLATE = EVENT_DATA_URL.replace("/sparkify/", "/sparkify/output/{step}-")
CLEAN_DATA_URL = _OUTPUT_URL_TEMPLATE.format(step="02-cleaned")
WEEK_AGGREGATED_DATA_URL = _OUTPUT_URL_TEMPLATE.format(step="04-week-aggregated")


# Spark executor sizing used when the session is created
EXECUTOR_INSTANCES = 2
EXECUTOR_MEM = '6g'

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from cryptography.fernet import Fernet
import base64
import socket

!./install-s3-jars.sh

def decrypt(encrypted_text):
    """
    Decrypt a Fernet token with the master password stored in the file ".seed.txt".

    The Fernet key is derived from the stripped file content: the content is
    repeated, cut to 32 characters and base64-encoded.

    Input: encrypted_text - the encrypted token as ASCII text

    Output: the decrypted text. Decryption fails with an exception raised by
            Fernet if the token was not encrypted with the same seed.
    """
    with open('.seed.txt') as seed_file:
        seed = seed_file.read().strip()
    # Stretch/cut the seed to exactly 32 characters before encoding it as key.
    key_material = (seed * 32)[:32].encode('ascii')
    fernet_key = base64.b64encode(key_material).decode('ascii')
    cipher = Fernet(fernet_key)
    plain_bytes = cipher.decrypt(encrypted_text.encode('ascii'))
    return plain_bytes.decode('ascii')

# --- connection settings -------------------------------------------------
# NOTE(review): the access key id is committed in clear text; only the secret
# key is stored encrypted. Consider loading both from a secret store.
AWS_ACCESS_KEY_ID = 'V6ge1JcQpvyYGJjb'
AWS_SECRET_ACCESS_KEY = decrypt(
    'gAAAAABkDFI6865LaVJVgtTYo0aMx9-JTPbTo6cwOUjg5eNNPsZhBDoHbRZ8xuXQT0ImNfvqcecZuoJd1VzYQEpBaxyCnKvosii8O1KeqoL2NwKdKtL_AUfT4eW4dvJVP--VjEvc0gB4'
)
# Address of this host, passed to Spark as the driver host.
OWN_IP = socket.gethostbyname(socket.gethostname())
APP_NAME = "Sparkify"
SPARK_MASTER = "spark://bit-spark-master-svc.spark.svc.cluster.local:7077"
S3_HOST = "minio-api-service.minio.svc"

print(f'### SETUP SPARK SESSION "{APP_NAME}"')
# All session options in one table; they are applied to the builder in this order.
spark_settings = {
    "spark.jars": "/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar,/home/jovyan/jars/hadoop-aws-3.3.2.jar",
    "spark.driver.host": OWN_IP,
    "spark.hadoop.fs.s3a.endpoint": S3_HOST,
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY_ID,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_ACCESS_KEY,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.executor.instances": EXECUTOR_INSTANCES,
    "spark.executor.memory": EXECUTOR_MEM,
}
session_builder = SparkSession.builder.master(SPARK_MASTER)
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.appName(APP_NAME).getOrCreate()
print(f"Spark version: {spark.version}")

sc = spark.sparkContext
sc.setLogLevel("WARN")



### SETUP SPARK SESSION "Sparkify"
Spark version: 3.3.2


In [2]:
# Read the week-aggregated data written by an earlier step and mark it for
# caching; every later aggregation starts from df_userweek.
print(f"### LOAD DATA {WEEK_AGGREGATED_DATA_URL}")
df_userweek = spark.read.json(WEEK_AGGREGATED_DATA_URL)
print("### PERSIST")
df_userweek = df_userweek_persist = df_userweek.persist()


### LOAD DATA s3a://udacity-dsnd/sparkify/output/04-week-aggregated-sparkify_event_data.json
### PERSIST


In [34]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel, DecisionTreeClassifier, DecisionTreeClassificationModel, LinearSVC
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler, MaxAbsScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime

# Durations in milliseconds (the unit of the event timestamp "ts"),
# each one built from the next smaller unit.
one_hour = 60 * 60 * 1000   #   3,600,000
one_day = 24 * one_hour     #  86,400,000
one_week = 7 * one_day      # 604,800,000

def logresult(text):
    """
    Echo a line of text and keep a copy of it on disk.

    Input: text - the line to report (without trailing newline)

    The text is printed to the console and appended, followed by a newline,
    to the file "result.log" in the current working directory, so that the
    output of long or repeated training runs is not lost.
    """
    print(text)
    with open("result.log", mode="a") as log_file:
        log_line = text + "\n"
        log_file.write(log_line)

def oversample(df_train):
    """
    Balance a binary training set by repeating its label-1 rows.

    Input:  df_train - DataFrame with a column "label" containing 0 or 1
    Output: df_train with the label-1 rows appended again and again until the
            number of label-1 rows exceeds the number of label-0 rows.
            df_train is returned unchanged if it contains no label-1 rows or
            if the label-1 rows already outnumber the label-0 rows.
    """
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    train1cnt = df_lab1.count()
    if train1cnt == 0:
        # Nothing to replicate: adding zero rows per round would never reach
        # train0cnt, so the loop below would not terminate.
        return df_train
    oversampled_train = df_train
    sum1cnt = train1cnt
    while sum1cnt <= train0cnt:
        sum1cnt = sum1cnt+train1cnt
        print(f"oversampling to: {sum1cnt}/{train0cnt}")
        oversampled_train = oversampled_train.union(df_lab1)
    return oversampled_train

def downsample(df_train, factor):
    """
    Balance a binary training set by randomly dropping label-0 rows.

    Input:  df_train - DataFrame with a column "label" containing 0 or 1
            factor   - controls how many label-0 rows are kept: the sampling
                       fraction is train1cnt/(factor*train0cnt+1), i.e. roughly
                       train1cnt/factor label-0 rows remain (factor 1 keeps about
                       as many label-0 rows as there are label-1 rows)
    Output: DataFrame with the sampled label-0 rows followed by all label-1 rows
    """
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    print(f"orig-label-0: {train0cnt}")
    train1cnt = df_lab1.count()
    print(f"orig-label-1: {train1cnt}")
    # The "+1" keeps the denominator non-zero when there are no label-0 rows.
    # Sampling without replacement only accepts fractions up to 1.0, so the
    # fraction is capped: with a small factor (or few label-0 rows) all
    # label-0 rows are kept instead of failing.
    frac = min(1.0, train1cnt/(factor*train0cnt+1))
    df_downsampled = df_lab0.sample(fraction = frac, seed=42)
    df_downsampled = df_downsampled.union(df_lab1)
    print(f"downsampled label-1 = {train1cnt}, label-0 ~ {train0cnt*frac}")
    return df_downsampled

def add_weight_col(df_train):
    """
    Add a class-balancing "weight" column to a binary training set.

    Input:  df_train - DataFrame with a column "label" containing 1 or 0
    Output: new DataFrame with an additional column "weight". Rows with label 1
            get the share of label-0 rows as weight and vice versa, so the
            weights of both classes add up to the same total.
    """
    label = F.col("label")
    totals = df_train.agg(
        F.sum(label).alias("l1"),
        F.sum(1-label).alias("l0"),
    ).collect()[0]
    l0, l1 = totals.l0, totals.l1
    n_rows = l0+l1
    # Each class is weighted with the relative frequency of the other class.
    w1 = l0 / n_rows
    w0 = l1 / n_rows
    print(f"label 0: {l0}, label 1: {l1}")
    weight = F.when(F.col("label")==1, F.lit(w1)).otherwise(F.lit(w0))
    return df_train.withColumn("weight", weight)
    
def prefix_columns(df_orig, prefix, do_not_change_cols):
    """
    Rename the columns of a DataFrame by putting a prefix in front of them.

    Input:  df_orig - original DataFrame
            prefix - string to put in front of the column names
            do_not_change_cols - names of the columns which keep their name
    Output: new DataFrame with the same columns in the same order; every column
            name starts with the prefix, except those listed in do_not_change_cols
    """
    renamed = []
    for name in df_orig.columns:
        if name in do_not_change_cols:
            renamed.append(name)
        else:
            renamed.append(prefix + name)
    return df_orig.toDF(*renamed)

def aggregate_week_data(from_week, to_week):
    """
    Sum up the weekly user data of a range of weeks.

    Input:  from_week, to_week - first and last week id ("wid"), both inclusive
    Output: one row per userId with the per-column sums over the selected weeks
            of df_userweek. The columns "paid", "usermale", "userregistration"
            and "wid" are not part of the result.
    """
    dropcols = ["paid", "usermale", "userregistration", "wid"]
    in_range = (F.col("wid")>=from_week)&(F.col("wid")<=to_week)
    df_weeks = df_userweek.where(in_range)
    if from_week != to_week:
        not_summed = ["userId", *dropcols]
        sums = []
        for name in df_weeks.columns:
            if name not in not_summed:
                sums.append(F.sum(F.col(name)).alias(name))
        return df_weeks.groupBy("userId").agg(*sums)
    # A single week needs no aggregation; only the extra columns are removed.
    return df_weeks.drop(*dropcols)


def create_test_data(CF, current_week):
    """
    Build one labelled feature row per user for a single reference week.

    Input:  CF - configuration to be used for aggregations; the keys
                 "FUTURE_LOOKAHEAD_WEEKS", "PAST_NEAR_HISTORY_WEEKS",
                 "PAST_OLD_HISTORY_WEEKS" and "CHURN" are read
            current_week - week id ("wid") of the first week of the near history

    The weeks are split into three consecutive timeslots by their wid:
      * label ("future"): current_week-FUTURE_LOOKAHEAD_WEEKS .. current_week-1
      * near history:     current_week .. current_week+PAST_NEAR_HISTORY_WEEKS-1
      * old history:      the PAST_OLD_HISTORY_WEEKS week ids after the near history
    NOTE(review): smaller wids are treated as the future of larger ones, so wid
    presumably counts weeks backwards from the end of the data - confirm against
    the step that creates the week-aggregated data.

    Output: DataFrame with the user columns ("userId", "wid", "paid", "usermale",
            "userregistration"), the column "label" and these feature groups:
              nh_  / oh_   sums over the near / old history
              nhn_ / ohn_  the sums per session hour (session_hours and
                           session_start: per week of the timeslot)
              r_           ratio nhn_ / ohn_
              d_           difference nhn_ - ohn_
            The timeslots are joined on userId with the default join type, so a
            user needs rows in every timeslot to appear in the result.
    Raises: Exception if CF["CHURN"] is not "canceldown", "cancel" or "down"
    """

    # week id ranges (inclusive) of the three timeslots
    label_week_min = current_week-CF["FUTURE_LOOKAHEAD_WEEKS"]
    label_week_max = current_week-1

    newhistory_week_min = current_week
    newhistory_week_max = newhistory_week_min+CF["PAST_NEAR_HISTORY_WEEKS"]-1

    oldhistory_week_min = newhistory_week_max+1
    oldhistory_week_max = oldhistory_week_min+CF["PAST_OLD_HISTORY_WEEKS"]-1
    
    # user attributes are taken from the row of the reference week
    df_user = df_userweek.where(F.col("wid") == newhistory_week_min).select("userId", "wid", "paid", "usermale", "userregistration")
    # shift userregistration by 7 per week of offset
    # (presumably a value in days that is made relative to the reference week - TODO confirm)
    df_user = df_user.withColumn("userregistration", F.col("userregistration")-7*newhistory_week_min)

    df_label = aggregate_week_data(label_week_min, label_week_max)
    df_newhistory = aggregate_week_data(newhistory_week_min, newhistory_week_max)
    df_oldhistory = aggregate_week_data(oldhistory_week_min, oldhistory_week_max)

    # label is 1 if the configured churn event was counted at least once in the label weeks
    if CF["CHURN"]=="canceldown":
        df_label = df_label.withColumn("label", F.when(F.col("pg_cancellation_confirmation")+F.col("pg_submit_downgrade")>0, F.lit(1)).otherwise(F.lit(0))).select("userid", "label")
    elif CF["CHURN"]=="cancel":
        df_label = df_label.withColumn("label", F.when(F.col("pg_cancellation_confirmation")>0, F.lit(1)).otherwise(F.lit(0))).select("userid", "label")
    elif CF["CHURN"]=="down":
        df_label = df_label.withColumn("label", F.when(F.col("pg_submit_downgrade")>0, F.lit(1)).otherwise(F.lit(0))).select("userid", "label")
    else: 
        raise Exception(f'invalid value for CHURN {CF["CHURN"]}')
    df_user = df_user.join(df_label, "userId")

    # raw history sums, prefixed to keep both timeslots apart
    df_user = df_user.join(prefix_columns(df_newhistory, "nh_", ["userId"]), "userId")
    df_user = df_user.join(prefix_columns(df_oldhistory, "oh_", ["userId"]), "userId")

    # near history normalized per session hour; the divisor has a floor of 0.01
    for c in df_newhistory.columns:
        if not c in ["userId", "session_hours", "session_start"]:
            df_user = df_user.withColumn("nhn_"+c, F.col("nh_"+c)/F.greatest(F.col("nh_session_hours"), F.lit(0.01)))
    # the session columns themselves are normalized per week of the timeslot
    df_user = df_user.withColumn("nhn_session_hours", F.col("nh_session_hours")/CF["PAST_NEAR_HISTORY_WEEKS"])
    df_user = df_user.withColumn("nhn_session_start", F.col("nh_session_start")/CF["PAST_NEAR_HISTORY_WEEKS"])

    # same normalization for the old history
    for c in df_oldhistory.columns:
        if not c in ["userId", "session_hours", "session_start"]:
            df_user = df_user.withColumn("ohn_"+c, F.col("oh_"+c)/F.greatest(F.col("oh_session_hours"), F.lit(0.01)))
    df_user = df_user.withColumn("ohn_session_hours", F.col("oh_session_hours")/CF["PAST_OLD_HISTORY_WEEKS"])
    df_user = df_user.withColumn("ohn_session_start", F.col("oh_session_start")/CF["PAST_OLD_HISTORY_WEEKS"])
    
    # change between old and near history: as ratio (divisor floor 0.01) ...
    for c in df_newhistory.columns:
        if not c in ["userId"]:
            df_user = df_user.withColumn("r_"+c, F.col("nhn_"+c)/F.greatest(F.lit(0.01), F.col("ohn_"+c)))
    # ... and as difference
    for c in df_newhistory.columns:
        if not c in ["userId"]:
            df_user = df_user.withColumn("d_"+c, F.col("nhn_"+c)-F.col("ohn_"+c))
    
    return df_user



def confuse(df_test_pred):
    """
    Log the confusion matrix and the derived metrics of a binary prediction.

    Input:  df_test_pred - DataFrame with the numeric columns "label" and
                           "prediction"; only rows where both are 0 or 1 are counted
    Output: tuple (accuracy, precision, recall, f1). A metric whose denominator
            is zero is reported as 0.
    The Matthews correlation coefficient is logged as well (not returned);
    it is logged as -9 if it is undefined.
    All output goes through logresult().
    """
    # One grouped count instead of four filtered counts: the prediction
    # pipeline behind df_test_pred is evaluated once instead of four times.
    counts = {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 0}
    for row in df_test_pred.groupBy("label", "prediction").count().collect():
        cell = (row["label"], row["prediction"])
        # 1 == 1.0 (and hashes alike) in Python, so integer labels as well as
        # double predictions match the integer keys; anything else is skipped.
        if cell in counts:
            counts[cell] += row["count"]
    n00 = counts[(0, 0)]
    n01 = counts[(0, 1)]
    n10 = counts[(1, 0)]
    n11 = counts[(1, 1)]
    s00 = "{:5d}".format(n00)
    s01 = "{:5d}".format(n01)
    s10 = "{:5d}".format(n10)
    s11 = "{:5d}".format(n11)
    logresult(f"                  ")
    logresult(f" Confusion Matrix: ")
    logresult(f"                  ")
    logresult(f"     | prediction| ")
    logresult(f"     |   0 |  1  | ")
    logresult(f" ----+-----+-----+ ")
    logresult(f" l 0 |{s00}|{s01}| ")
    logresult(f" b --+-----+-----+ ")
    logresult(f" l 1 |{s10}|{s11}| ")
    logresult(f" ----+-----+-----+ ")
    logresult(f"                   ")
    TP = n11
    TN = n00
    FP = n01
    FN = n10
    total = TP+TN+FP+FN
    accuracy = (TP+TN)/total if total != 0 else 0
    precision = TP/(TP+FP) if TP+FP != 0 else 0
    recall = TP/(TP+FN) if TP+FN != 0 else 0
    f1 = 0
    if precision+recall!=0:
        f1 = 2*precision*recall/(precision+recall)
    logresult(f"  accuracy:  {accuracy}")
    logresult(f"  precision: {precision}")
    logresult(f"  recall:    {recall}")
    logresult(f"  f1:        {f1}")
    # https://towardsdatascience.com/matthews-correlation-coefficient-when-to-use-it-and-when-to-avoid-it-310b3c923f7e
    mcc = -9  # marker value: mcc is undefined if any row/column sum of the matrix is zero
    nenn = (TN+FN)*(FP+TP)*(TN+FP)*(FN+TP)
    if nenn!=0:
        mcc = (TN*TP-FP*FN)/math.sqrt(nenn)
    logresult(f"  mcc:       {mcc}")
    return (accuracy, precision, recall, f1)
    
    
def hyper_tune_rf(df_train, df_test, configstr, num_tree_values, max_depth_values):   # 20, 5
    """
    Grid search for a random forest classifier.

    Input:  df_train - training data with the columns "features", "label" and "weight"
            df_test  - test data used to score every candidate
            configstr - label of the data configuration, only used in the log output
            num_tree_values  - values to try for numTrees
            max_depth_values - values to try for maxDepth
    Output: tuple (best_model, best_f1, best_model_name) of the candidate with the
            highest f1 on df_test; the first one wins on ties.
            (None, -1, "?") if there are no candidates.
    """
    rule = "------------------------------------"
    best = (None, -1, "?")
    grid = ((trees, depth) for trees in num_tree_values for depth in max_depth_values)
    for num_trees, max_depth in grid:
        model_name = f"rf_{num_trees}_{max_depth}"
        for line in ("", rule, f"TRAINING {configstr} {model_name}", rule):
            logresult(line)
        classifier = RandomForestClassifier(featuresCol="features", numTrees=num_trees, maxDepth=max_depth, weightCol="weight", seed=42)
        fitted = classifier.fit(df_train)
        accuracy, precision, recall, f1 = confuse(fitted.transform(df_test))
        print(f"  {model_name}: f1 {f1}")
        if f1 > best[1]:
            best = (fitted, f1, model_name)
    print(f"best f1 {best[1]} for {best[2]}")
    return best


def hyper_tune_dt(df_train, df_test, configstr, max_depths, max_bins_list):  # 5, 32
    """
    Grid search for a decision tree classifier.

    Input:  df_train - training data with the columns "features", "label" and "weight"
            df_test  - test data used to score every candidate
            configstr - label of the data configuration, only used in the log output
            max_depths    - values to try for maxDepth
            max_bins_list - values to try for maxBins
    Output: tuple (best_model, best_f1, best_model_name) of the candidate with the
            highest f1 on df_test; the first one wins on ties.
            (None, -1, "?") if there are no candidates.
    """
    rule = "------------------------------------"
    best = (None, -1, "?")
    grid = ((depth, bins) for depth in max_depths for bins in max_bins_list)
    for max_depth, max_bins in grid:
        model_name = f"dt_{max_depth}_{max_bins}"
        for line in ("", rule, f"TRAINING {configstr} {model_name}", rule):
            logresult(line)
        classifier = DecisionTreeClassifier(featuresCol="features", maxDepth=max_depth, maxBins=max_bins, weightCol="weight", seed=42)
        fitted = classifier.fit(df_train)
        accuracy, precision, recall, f1 = confuse(fitted.transform(df_test))
        print(f"  {model_name}: f1 {f1}")
        if f1 > best[1]:
            print("new best f1")
            best = (fitted, f1, model_name)
    print(f"best f1 {best[1]} for {best[2]}")
    return best




def hyper_tune_lr(df_train, df_test, configstr, max_iters, reg_params, elastic_net_params): # 100, 0, 0
    """
    Grid search for a linear regression which is used as binary classifier.

    The regression output is turned into a prediction of 1 if it is at least
    0.5, otherwise 0. The RMSE of the raw regression output is logged as "err".

    Input:  df_train - training data with the columns "features", "label" and "weight"
            df_test  - test data used to score every candidate
            configstr - label of the data configuration, only used in the log output
            max_iters          - values to try for maxIter
            reg_params         - values to try for regParam
            elastic_net_params - values to try for elasticNetParam
    Output: tuple (best_model, best_f1, best_model_name) of the candidate with the
            highest f1 on df_test; the first one wins on ties.
            (None, -1, "?") if there are no candidates.
    """
    # https://towardsdatascience.com/beginners-guide-to-linear-regression-with-pyspark-bfc39b45a9e9
    rmse_evaluator = RegressionEvaluator(predictionCol="prediction_orig", labelCol="label", metricName="rmse")

    rule = "------------------------------------"
    best = (None, -1, "?")
    grid = (
        (iters, reg, net)
        for iters in max_iters
        for reg in reg_params
        for net in elastic_net_params
    )
    for max_iter, reg_param, elastic_net_param in grid:
        model_name = f"lr_{max_iter}_{reg_param}_{elastic_net_param}"
        for line in ("", rule, f"TRAINING {configstr} {model_name}", rule):
            logresult(line)
        regressor = LinearRegression(featuresCol="features", maxIter=max_iter, regParam=reg_param, elasticNetParam=elastic_net_param, weightCol="weight")
        fitted = regressor.fit(df_train)
        # keep the raw regression output under its own name ...
        scored = fitted.transform(df_test).withColumnRenamed("prediction", "prediction_orig")
        err = rmse_evaluator.evaluate(scored)
        logresult(f"err: {err}")
        # ... and derive the 0/1 prediction from it
        thr = 0.5
        scored = scored.withColumn("prediction", F.when(F.col("prediction_orig")>=thr,1).otherwise(0))
        accuracy, precision, recall, f1 = confuse(scored)
        print(f"  {model_name}: f1 {f1}")
        if f1 > best[1]:
            print("new best f1")
            best = (fitted, f1, model_name)
    print(f"best f1 {best[1]} for {best[2]}")
    return best


def hyper_tune_sv(df_train, df_test, configstr, max_iters, reg_params):   # 100, 0
    """
    Grid search for a linear support vector classifier.

    Input:  df_train - training data with the columns "features", "label" and "weight"
            df_test  - test data used to score every candidate
            configstr - label of the data configuration, only used in the log output
            max_iters  - values to try for maxIter
            reg_params - values to try for regParam
    Output: tuple (best_model, best_f1, best_model_name) of the candidate with the
            highest f1 on df_test; the first one wins on ties.
            (None, -1, "?") if there are no candidates.
    """
    rule = "------------------------------------"
    best = (None, -1, "?")
    grid = ((iters, reg) for iters in max_iters for reg in reg_params)
    for max_iter, reg_param in grid:
        model_name = f"svm_{max_iter}_{reg_param}"
        for line in ("", rule, f"TRAINING {configstr} {model_name}", rule):
            logresult(line)
        classifier = LinearSVC(featuresCol="features", maxIter=max_iter, regParam=reg_param, weightCol="weight")
        fitted = classifier.fit(df_train)
        accuracy, precision, recall, f1 = confuse(fitted.transform(df_test))
        print(f"  {model_name}: f1 {f1}")
        if f1 > best[1]:
            print("new best f1")
            best = (fitted, f1, model_name)
    print(f"best f1 {best[1]} for {best[2]}")
    return best
    

def hyper_tune(df_trainw, df_traind, df_test, configstr, model_config):
    """
    Run the hyperparameter search for every model type named in model_config.

    Input:  df_trainw - weighted training data, passed on to the model specific search
            df_traind - downsampled training data; kept in the signature for the
                        callers, currently not used
            df_test   - test data used to score the candidates
            configstr - label of the data configuration, only used in the log output
            model_config - dict which maps a model type ("rf", "dt", "lr" or "sv") to
                           the list of hyperparameter value lists for the matching
                           hyper_tune_* function, e.g. {"rf": [[10, 20], [4, 5]]}
    Output: tuple (model, f1, model_name) of the candidate with the highest f1 over
            all requested model types (the first one wins on ties)
    Raises: ValueError if model_config names none of the supported model types
    """
    # Searches run in this fixed order, independent of the order in model_config.
    tuners = (
        ("rf", hyper_tune_rf),
        ("dt", hyper_tune_dt),
        ("lr", hyper_tune_lr),
        ("sv", hyper_tune_sv),
    )
    best = None
    for model_type, tuner in tuners:
        if model_type not in model_config:
            continue
        candidate = tuner(df_trainw, df_test, configstr, *(model_config[model_type]))
        # candidate is (model, f1, model_name); keep the one with the highest f1
        if best is None or candidate[1] > best[1]:
            best = candidate
    if best is None:
        supported = [model_type for model_type, _ in tuners]
        raise ValueError(f"model_config {model_config!r} contains none of the supported model types {supported}")
    return best

# Exclusive upper bound for the week ids: additional reference weeks are only
# used while the last week of their old history stays below this value.
MAX_WID = 8

def create_train_test_data(CF):
    """
    Build the labelled feature rows for all usable reference weeks.

    Input:  CF - configuration; "FUTURE_LOOKAHEAD_WEEKS", "PAST_NEAR_HISTORY_WEEKS"
                 and "PAST_OLD_HISTORY_WEEKS" determine the reference weeks
                 (integers expected)
    Output: union of create_test_data(CF, week) for the reference weeks
            FUTURE_LOOKAHEAD_WEEKS .. MAX_WID - (near + old history weeks).
            The first reference week is always included.
    """
    first_week = CF["FUTURE_LOOKAHEAD_WEEKS"]
    history_weeks = CF["PAST_NEAR_HISTORY_WEEKS"]+CF["PAST_OLD_HISTORY_WEEKS"]
    last_week = MAX_WID - history_weeks
    df_testtrain = create_test_data(CF, first_week)
    for week in range(first_week + 1, last_week + 1):
        df_testtrain = df_testtrain.union(create_test_data(CF, week))
    return df_testtrain

def create_train_test_vector(CF, df_testtrain):
    """
    Assemble the configured feature columns into a single "features" vector column.

    Input:  CF - configuration; CF["FEATURE_COLS"] lists the column name prefixes
                 which select the feature columns
            df_testtrain - DataFrame created by create_train_test_data
                           (optionally with a "weight" column)
    Output: DataFrame with the columns "userId", "wid", "label", "features" and,
            if df_testtrain has one, "weight". The vector starts with "paid",
            "usermale" and "userregistration", followed by the matching columns
            of each prefix in the order of CF["FEATURE_COLS"].
    """
    feature_names = ["paid", "usermale", "userregistration"]
    for prefix in CF["FEATURE_COLS"]:
        feature_names.extend(name for name in df_testtrain.columns if name.startswith(prefix))
    output_names = ["userId", "wid", "label", "features"]
    if "weight" in df_testtrain.columns:
        output_names.append("weight")
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    return assembler.transform(df_testtrain).select(*output_names)

In [6]:
# Create the train/test data for every configuration, store it and run the
# hyperparameter search on it.
# Every value of a configuration is a list; all combinations of the list
# entries are processed.
train_configs = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [4],
        "FEATURE_COLS":            [["nhn_", "ohn_"]],
        "MODEL":                   [{"dt": [[13,14,15], [1024,2048]]}]
    }
]

CF = {}
for train_config in train_configs:
 for CHURN in train_config["CHURN"]:
  for FUTURE_LOOKAHEAD_WEEKS in train_config["FUTURE_LOOKAHEAD_WEEKS"]:
   for PAST_NEAR_HISTORY_WEEKS in train_config["PAST_NEAR_HISTORY_WEEKS"]:
    for PAST_OLD_HISTORY_WEEKS in train_config["PAST_OLD_HISTORY_WEEKS"]:
     for FEATURE_COLS in train_config["FEATURE_COLS"]:
        CF["CHURN"] = CHURN
        CF["FUTURE_LOOKAHEAD_WEEKS"] = FUTURE_LOOKAHEAD_WEEKS
        CF["PAST_NEAR_HISTORY_WEEKS"] = PAST_NEAR_HISTORY_WEEKS
        CF["PAST_OLD_HISTORY_WEEKS"] = PAST_OLD_HISTORY_WEEKS
        CF["FEATURE_COLS"] = FEATURE_COLS
        # The replacement has to be an f-string: without the "f" the placeholders
        # ended up literally in the URL instead of the configuration values.
        # NOTE(review): unlike TESTTRAIN_DATA_URL there is no "-" between the churn
        # type and the file name - confirm the intended model path.
        CF["MODEL_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-model-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}").replace(".json", "")
        CF["TESTTRAIN_DATA_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-testtrain-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}-")
        # short label of the configuration for the log output, e.g. "canceldown-2-1-4-nhn_ohn_"
        prefixes = str(FEATURE_COLS).replace("[","").replace("]","").replace("'","").replace(",","").replace(" ","")
        configstr = f"{CHURN}-{FUTURE_LOOKAHEAD_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{PAST_OLD_HISTORY_WEEKS}-{prefixes}"
        print(f"{configstr}")
        df_testtrain = create_train_test_data(CF)

        # Split the users, not the rows: a user contributes one row per reference
        # week and all of them have to end up on the same side of the split.
        print(f"### TRAIN / TEST SPLIT based on userId, to avoid overlapping test and train data")
        df_testtrain_userids = df_testtrain.select("userId").dropDuplicates()
        df_train_userids, df_test_userids = df_testtrain_userids.randomSplit([0.7, 0.3], seed=42)

        df_train_feat = df_testtrain.join(df_train_userids, "userId", "inner")
        df_test_feat = df_testtrain.join(df_test_userids, "userId", "inner")
        
        test_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-test-")
        traind_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-traind-")
        trainw_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-trainw-")

        print(f"### SAVING TEST DATA {test_url}")
        df_test_feat.write.format('json').mode('overwrite').save(test_url)

        # two variants of the training data: downsampled ("traind") and weighted ("trainw")
        df_traind_feat = downsample(df_train_feat, 1)
        print(f"### SAVING TRAIND DATA {traind_url}")
        df_traind_feat.write.format('json').mode('overwrite').save(traind_url)

        df_trainw_feat = add_weight_col(df_train_feat)
        print(f"### SAVING TRAINW DATA {trainw_url}")
        df_trainw_feat.write.format('json').mode('overwrite').save(trainw_url)

        df_test = create_train_test_vector(CF, df_test_feat)
        df_traind = create_train_test_vector(CF, df_traind_feat)
        df_trainw = create_train_test_vector(CF, df_trainw_feat)

        # NOTE(review): unpersist() on data that was never persisted has no effect
        # here; if the data should be cached during the search, this was
        # presumably meant to be persist() - confirm before changing.
        df_test = df_test.unpersist()
        df_traind = df_traind.unpersist()
        df_trainw = df_trainw.unpersist()
        
        for MODEL in train_config["MODEL"]:
            model, f1, model_name = hyper_tune(df_trainw, df_traind, df_test, configstr, MODEL)
        
        df_test.unpersist()
        df_traind.unpersist()
        df_trainw.unpersist()
        
print("FINISHED")

canceldown-2-1-4-nhn_ohn_
### TRAIN / TEST SPLIT based on userId, to avoid overlapping test and train data
### SAVING TEST DATA s3a://udacity-dsnd/sparkify/output/07-test-4-1-2-canceldown-sparkify_event_data.json
orig-label-0: 13036
orig-label-1: 2448
downsampled label-1 = 2448, label-0 ~ 2447.8122267392805
### SAVING TRAIND DATA s3a://udacity-dsnd/sparkify/output/07-traind-4-1-2-canceldown-sparkify_event_data.json
label 0: 13036, label 1: 2448
### SAVING TRAINW DATA s3a://udacity-dsnd/sparkify/output/07-trainw-4-1-2-canceldown-sparkify_event_data.json

------------------------------------
TRAINING canceldown-2-1-4-nhn_ohn_ dt_13_1024
------------------------------------
                  
 Confusion Matrix: 
                  
     | prediction| 
     |   0 |  1  | 
 ----+-----+-----+ 
 l 0 | 3422| 1934| 
 b --+-----+-----+ 
 l 1 |  524|  540| 
 ----+-----+-----+ 
                   
  accuraccy: 0.6171339563862929
  precision: 0.21827000808407437
  recall:    0.5075187969924813
  f1:

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/jovyan/.condaenvs/py38/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [35]:
# Run the hyperparameter search on train/test data which was stored by a
# previous run for the same configuration (nothing is recreated here).
# Every value of a configuration is a list; all combinations of the list
# entries are processed.
train_configs = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [4],
        "FEATURE_COLS":            [["nhn_", "ohn_"]],
        "MODEL":                   [
#                                     {"rf": [[10,20,40], [4,5,6]]},
#                                     {"dt": [[4,5,6], [16,32,64]]},
#                                     {"sv": [[50,100,200], [0,0.01,0.1]]},
                                     {"lr": [[50,100,200,1000], [0], [0]]},
#                                     {"rf": [[20], [5]]},
#                                     {"dt": [[5], [32]]},
#                                     {"sv": [[100], [0]]},
#                                     {"lr": [[100], [0], [0]]},
                                   ]
    }
]

CF = {}
for train_config in train_configs:
 for CHURN in train_config["CHURN"]:
  for FUTURE_LOOKAHEAD_WEEKS in train_config["FUTURE_LOOKAHEAD_WEEKS"]:
   for PAST_NEAR_HISTORY_WEEKS in train_config["PAST_NEAR_HISTORY_WEEKS"]:
    for PAST_OLD_HISTORY_WEEKS in train_config["PAST_OLD_HISTORY_WEEKS"]:
     for FEATURE_COLS in train_config["FEATURE_COLS"]:
        CF["CHURN"] = CHURN
        CF["FUTURE_LOOKAHEAD_WEEKS"] = FUTURE_LOOKAHEAD_WEEKS
        CF["PAST_NEAR_HISTORY_WEEKS"] = PAST_NEAR_HISTORY_WEEKS
        CF["PAST_OLD_HISTORY_WEEKS"] = PAST_OLD_HISTORY_WEEKS
        CF["FEATURE_COLS"] = FEATURE_COLS
        # The replacement has to be an f-string: without the "f" the placeholders
        # ended up literally in the URL instead of the configuration values.
        # NOTE(review): unlike TESTTRAIN_DATA_URL there is no "-" between the churn
        # type and the file name - confirm the intended model path.
        CF["MODEL_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-model-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}").replace(".json", "")
        CF["TESTTRAIN_DATA_URL"] = EVENT_DATA_URL.replace("/sparkify/", f"/sparkify/output/07-testtrain-{PAST_OLD_HISTORY_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{FUTURE_LOOKAHEAD_WEEKS}-{CHURN}-")
        # short label of the configuration for the log output, e.g. "canceldown-2-1-4-nhn_ohn_"
        prefixes = str(FEATURE_COLS).replace("[","").replace("]","").replace("'","").replace(",","").replace(" ","")
        configstr = f"{CHURN}-{FUTURE_LOOKAHEAD_WEEKS}-{PAST_NEAR_HISTORY_WEEKS}-{PAST_OLD_HISTORY_WEEKS}-{prefixes}"
        print(f"{configstr}")

        test_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-test-")
        trainw_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-trainw-")
        traind_url = CF["TESTTRAIN_DATA_URL"].replace("-testtrain-", "-traind-")

        print(f"### LOADING TEST DATA {test_url}")
        df_test_feat = spark.read.json(test_url)

        print(f"### LOADING TRAIND DATA {traind_url}")
        df_traind_feat = spark.read.json(traind_url)

        print(f"### LOADING TRAINW DATA {trainw_url}")
        df_trainw_feat = spark.read.json(trainw_url)
        
        df_test = create_train_test_vector(CF, df_test_feat)
        df_traind = create_train_test_vector(CF, df_traind_feat)
        df_trainw = create_train_test_vector(CF, df_trainw_feat)

        # NOTE(review): unpersist() on data that was never persisted has no effect
        # here; if the data should be cached during the search, this was
        # presumably meant to be persist() - confirm before changing.
        df_test = df_test.unpersist()
        df_traind = df_traind.unpersist()
        df_trainw = df_trainw.unpersist()

        for MODEL in train_config["MODEL"]:
            model, f1, model_name = hyper_tune(df_trainw, df_traind, df_test, configstr, MODEL)
        
        df_test.unpersist()
        df_traind.unpersist()
        df_trainw.unpersist()
        
print("FINISHED")

canceldown-2-1-4-nhn_ohn_
### LOADING TEST DATA s3a://udacity-dsnd/sparkify/output/07-test-4-1-2-canceldown-sparkify_event_data.json
### LOADING TRAIND DATA s3a://udacity-dsnd/sparkify/output/07-traind-4-1-2-canceldown-sparkify_event_data.json
### LOADING TRAINW DATA s3a://udacity-dsnd/sparkify/output/07-trainw-4-1-2-canceldown-sparkify_event_data.json

------------------------------------
TRAINING canceldown-2-1-4-nhn_ohn_ lr_100_0_0
------------------------------------
err: 16.461213930997054
                  
 Confusion Matrix: 
                  
     | prediction| 
     |   0 |  1  | 
 ----+-----+-----+ 
 l 0 | 3536| 1820| 
 b --+-----+-----+ 
 l 1 |  417|  647| 
 ----+-----+-----+ 
                   
  accuraccy: 0.6515576323987539
  precision: 0.2622618565058776
  recall:    0.6080827067669173
  f1:        0.3664684225431889
  mcc:       0.2050814435083378
  lr_100_0_0: f1 0.3664684225431889
new best f1

------------------------------------
TRAINING canceldown-2-1-4-nhn_ohn_ l

In [14]:
# Collection of search configurations. Each list has the same structure as
# "train_configs": every value is a list of alternatives to be combined.
# NOTE(review): the driver loops visible in this notebook do not read the key
# "DOWNSAMPLE" (they call downsample(..., 1)); presumably it belongs to an
# earlier version of the loop - confirm before relying on it.

# 1: one week of old history vs. an old history of 99 weeks, the latter
#    combined with 2 or 1 weeks of lookahead and near history
train_configs_1 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [1],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    },
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2, 1],
        "PAST_NEAR_HISTORY_WEEKS": [2, 1],
        "PAST_OLD_HISTORY_WEEKS":  [99],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# 2: different downsample factors
train_configs_2 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [99],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [0.25, 0.5, 0.75],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# 3: different lengths of the old history (1 to 5 weeks)
train_configs_3 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1,2,3,4,5],
        "FEATURE_COLS":            [["r_", "nh_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# 4: the three churn definitions, each with all feature groups
train_configs_4 = [
    {
        "CHURN":                   ["canceldown", "cancel", "down"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_", "oh_", "nhn_", "ohn_", "d_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]


# 5: all pairs of feature groups
train_configs_5 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_", "nh_"],
                                    ["r_", "oh_"], 
                                    ["r_", "nhn_"], 
                                    ["r_", "ohn_"], 
                                    ["r_", "d_"], 
                                    ["nh_", "oh_"], 
                                    ["nh_", "nhn_"], 
                                    ["nh_", "ohn_"], 
                                    ["nh_", "d_"], 
                                    ["oh_", "nhn_"], 
                                    ["oh_", "ohn_"], 
                                    ["oh_", "d_"], 
                                    ["nhn_", "ohn_"], 
                                    ["nhn_", "d_"], 
                                    ["ohn_", "d_"]
                                   ],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# 6: every feature group on its own
train_configs_6 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [1],
        "FEATURE_COLS":            [["r_"], ["nh_"], ["oh_"], ["nhn_"], ["ohn_"], ["d_"]],
        "DOWNSAMPLE":              [1],
        "MODEL":                   [{"rf": [[10], [4]]}]
    }
]

# 7: one week lookahead, random forest with different numbers of trees / depths
train_configs_7 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [1],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [4],
        "FEATURE_COLS":            [["nhn_", "ohn_"]],
        "MODEL":                   [{"rf": [[10], [4]]},{"rf": [[100,20], [5]]}]
    }
]

# 8: decision tree and random forest on the same data
train_configs_8 = [
    {
        "CHURN":                   ["canceldown"],
        "FUTURE_LOOKAHEAD_WEEKS":  [2],
        "PAST_NEAR_HISTORY_WEEKS": [1],
        "PAST_OLD_HISTORY_WEEKS":  [4],
        "FEATURE_COLS":            [["nhn_", "ohn_"]],
        "MODEL":                   [{"dt": [[5], [32]]},{"rf": [[10], [4]]}]
    }
]


In [36]:
# Stop the Spark session at the end of the notebook.
print("### STOP SPARK SESSION")
spark.stop()  

### STOP SPARK SESSION
