# \[06\] Experiment Models

## Setup Spark Session

For a detailed description of what is done here, see [01-setup-spark-session.ipynb](01-setup-spark-session.ipynb).


In [40]:
# Source event log; switch to the commented-out "mini" URL for the small data set.
EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/sparkify_event_data.json"
# EVENT_DATA_URL = "s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json"

# All derived locations are built from EVENT_DATA_URL by inserting an "output/NN-<stage>-" prefix
# in front of the file name, so switching the data set above switches every path below.
CLEAN_DATA_URL = EVENT_DATA_URL.replace("/sparkify/", "/sparkify/output/02-cleaned-")
WEEK_AGGREGATED_DATA_URL = EVENT_DATA_URL.replace("/sparkify/", "/sparkify/output/04-week-aggregated-")
MODEL_URL = EVENT_DATA_URL.replace("/sparkify/", "/sparkify/output/05-model-").replace(".json", "")
TESTTRAIN_DATA_URL = EVENT_DATA_URL.replace("/sparkify/", "/sparkify/output/06-testtrain-")

# Churn definition used for the label (see create_test_data):
#   "cancel"     -> cancellation confirmation only
#   "down"       -> submitted downgrade only
#   anything else (here "canceldown") -> cancellation OR downgrade
# CHURN="cancel"
# CHURN="down"
CHURN="canceldown"

# Spark executor sizing passed to the session builder below.
EXECUTOR_INSTANCES = 2
EXECUTOR_MEM = '6g'

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from cryptography.fernet import Fernet
import base64
import socket

!./install-s3-jars.sh

def decrypt(encrypted_text):
    """
    Decrypts a Fernet-encrypted text. The seed (master password) for decryption
    is read from the file ".seed.txt" in the current working directory.

    The Fernet key is derived by repeating the seed and cutting it to exactly
    32 ASCII characters, then encoding those bytes as url-safe base64 (the
    alphabet Fernet specifies for its keys).

    Input: encrypted_text -- the Fernet token as an ASCII string

    Output: the decrypted text. If the text was not encrypted with the same seed,
            an exception is raised by Fernet.decrypt.

    Raises: ValueError if the seed file is empty (no key can be derived);
            UnicodeEncodeError if the seed or the token is not pure ASCII.
    """
    with open('.seed.txt') as f:
        seed = f.read().strip()
    if not seed:
        # An empty seed would produce an empty key and an obscure Fernet error.
        raise ValueError('seed file ".seed.txt" is empty; cannot derive the decryption key')
    # Repeat the seed so that even a 1-character seed yields 32 key bytes.
    key_material = (seed*32)[:32].encode('ascii')
    # urlsafe_b64encode decodes to the same bytes as b64encode but uses the
    # alphabet required for Fernet keys ('-' and '_' instead of '+' and '/').
    key = base64.urlsafe_b64encode(key_material).decode('ascii')
    return Fernet(key).decrypt(encrypted_text.encode('ascii')).decode('ascii')

# NOTE(review): the access key id is hardcoded in the notebook and the secret is
# only protected by the local ".seed.txt" master password. Consider supplying
# both through the environment or a secrets manager instead.
AWS_ACCESS_KEY_ID='V6ge1JcQpvyYGJjb'
AWS_SECRET_ACCESS_KEY = decrypt('gAAAAABkDFI6865LaVJVgtTYo0aMx9-JTPbTo6cwOUjg5eNNPsZhBDoHbRZ8xuXQT0ImNfvqcecZuoJd1VzYQEpBaxyCnKvosii8O1KeqoL2NwKdKtL_AUfT4eW4dvJVP--VjEvc0gB4')
# IP of the notebook host, passed to Spark as the driver host below.
OWN_IP=socket.gethostbyname(socket.gethostname())
APP_NAME = "Sparkify"
SPARK_MASTER = "spark://bit-spark-master-svc.spark.svc.cluster.local:7077"
S3_HOST = "minio-api-service.minio.svc"

print(f'### SETUP SPARK SESSION "{APP_NAME}"')
# Same builder calls in the same order as before, written as one parenthesised chain.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER)
    .config("spark.jars","/home/jovyan/jars/aws-java-sdk-bundle-1.11.1026.jar,/home/jovyan/jars/hadoop-aws-3.3.2.jar")
    .config("spark.driver.host", OWN_IP)
    # S3A access to the object store: custom endpoint, no SSL, path-style URLs
    .config("spark.hadoop.fs.s3a.endpoint", S3_HOST)
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # executor sizing from the constants at the top of this cell
    .config("spark.executor.instances", EXECUTOR_INSTANCES)
    .config("spark.executor.memory", EXECUTOR_MEM)
    .appName(APP_NAME)
    .getOrCreate()
)
print(f"Spark version: {spark.version}")
sc = spark.sparkContext
sc.setLogLevel("WARN")



### SETUP SPARK SESSION "Sparkify"
Spark version: 3.3.2


In [10]:
# Load the per-user/per-week aggregates (path: WEEK_AGGREGATED_DATA_URL) and mark
# them for caching; every call of aggregate_week_data() filters this frame again.
print(f"### LOAD DATA {WEEK_AGGREGATED_DATA_URL}")
df_userweek = spark.read.json(WEEK_AGGREGATED_DATA_URL)
print(f"### PERSIST")
# Keep a separate handle to the persisted frame so it can still be unpersisted
# after df_userweek has been reassigned.
df_userweek_persist = df_userweek.persist()
df_userweek = df_userweek_persist


### LOAD DATA s3a://udacity-dsnd/sparkify/output/04-week-aggregated-sparkify_event_data.json
### PERSIST


## Settings

In [41]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel, DecisionTreeClassifier, DecisionTreeClassificationModel, LinearSVC
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler, MaxAbsScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime

# timestamp constants for ts in milliseconds
# (not referenced by the cells visible in this notebook; note one_month is 28 days, i.e. 4 weeks)
one_hour =        60*60*1000  #     3.600.000
one_day =      24*60*60*1000  #    86.400.000
one_week =   7*24*60*60*1000  #   604.800.000
one_month = 28*24*60*60*1000  # 2.419.200.000


# The three window sizes below are read by create_test_data() to derive the
# week-id ranges for the label, the near history and the old history.
# weeks to look into the future from the predict-timestamp for label
FUTURE_LOOKAHEAD_WEEKS = 1
# weeks to look into the past from the predict-timestamp for new history
PAST_NEAR_HISTORY_WEEKS = 1
# weeks to look into the past from the predict-timestamp for old history
# (the old history covers the weeks after the near history up to this many weeks back)
PAST_OLD_HISTORY_WEEKS = 4



In [42]:
def prefix_columns(df_orig, prefix, do_not_change_cols):
    """
    Returns df_orig with `prefix` prepended to every column name, except for the
    columns listed in `do_not_change_cols`, which keep their name.

    Input: df_orig -- frame exposing `.columns` and `.toDF(*names)`
           prefix -- string put in front of each renamed column
           do_not_change_cols -- collection of column names to leave untouched
    Output: the renamed frame (column order unchanged)
    """
    renamed = []
    for name in df_orig.columns:
        if name in do_not_change_cols:
            renamed.append(name)
        else:
            renamed.append(prefix + name)
    return df_orig.toDF(*renamed)

def aggregate_week_data(from_week, to_week):
    """
    Sums the weekly per-user activity columns over a range of week ids.

    Reads the notebook-global `df_userweek`. The user attributes "paid",
    "usermale", "userregistration" and the week id "wid" are not part of the result.

    Input: from_week, to_week -- week ids ("wid"), both bounds inclusive
    Output: one row per userId with the summed data for the weeks
            from_week..to_week; for a single week the filtered rows are
            returned as they are (no aggregation)
    """
    non_activity_cols = ["paid", "usermale", "userregistration", "wid"]
    week_id = F.col("wid")
    df_range = df_userweek.where((week_id >= from_week) & (week_id <= to_week))

    if from_week != to_week:
        # several weeks: add up every activity column per user
        summed = [
            F.sum(F.col(name)).alias(name)
            for name in df_range.columns
            if name not in ("userId", *non_activity_cols)
        ]
        return df_range.groupBy("userId").agg(*summed)

    # exactly one week: nothing to add up, just strip the non-activity columns
    return df_range.drop(*non_activity_cols)


def create_test_data(current_week):
    """
    Builds the labelled feature frame for one prediction week.

    Week ranges (all derived from `current_week` and the window constants):
      * label:       current_week-FUTURE_LOOKAHEAD_WEEKS .. current_week-1
      * new history: current_week .. current_week+PAST_NEAR_HISTORY_WEEKS-1
      * old history: the weeks after the new history up to
                     current_week+PAST_OLD_HISTORY_WEEKS-1
    (presumably "wid" counts weeks backwards in time, so smaller ids are the
    future -- confirm against the week aggregation notebook.)

    Result columns per user:
      * userId, wid, paid, usermale, userregistration (shifted by 7*current_week)
      * label: 1 if the churn event selected by the global CHURN occurred in the
        label weeks, else 0
      * nh_* / oh_*: summed activity of new / old history
      * nhn_* / ohn_*: the same per session hour (session_hours and
        session_start are copied unscaled)
      * r_*: ratio nhn_* / ohn_* (denominator floored at 0.01)

    All joins are inner joins on userId, so users missing in the label weeks or
    in either history are dropped. Reads the globals df_userweek and CHURN.

    Input: current_week -- week id of the prediction week
    Output: the persisted feature frame
    """
    label_week_min = current_week-FUTURE_LOOKAHEAD_WEEKS
    label_week_max = current_week-1

    newhistory_week_min = current_week
    newhistory_week_max = current_week+PAST_NEAR_HISTORY_WEEKS-1

    oldhistory_week_min = newhistory_week_max+1
    oldhistory_week_max = current_week+PAST_OLD_HISTORY_WEEKS-1

    df_user = df_userweek.where(F.col("wid") == newhistory_week_min).select("userId", "wid", "paid", "usermale", "userregistration")
    # presumably makes userregistration relative to the prediction week (7 days per week) -- TODO confirm units
    df_user = df_user.withColumn("userregistration", F.col("userregistration")-7*newhistory_week_min)

    df_label = aggregate_week_data(label_week_min, label_week_max)
    df_newhistory = aggregate_week_data(newhistory_week_min, newhistory_week_max)
    df_oldhistory = aggregate_week_data(oldhistory_week_min, oldhistory_week_max)

    # Pick the churn event counter; any CHURN value other than "cancel"/"down"
    # means "cancellation or downgrade".
    if CHURN=="cancel":
        print(f"CHURNCANCEL")
        churn_events = F.col("pg_cancellation_confirmation")
    elif CHURN=="down":
        print(f"CHURNDOWN")
        churn_events = F.col("pg_submit_downgrade")
    else:
        print(f"CHURNCANCELDOWN")
        churn_events = F.col("pg_cancellation_confirmation")+F.col("pg_submit_downgrade")
    df_label = df_label.withColumn("label", F.when(churn_events>0, F.lit(1)).otherwise(F.lit(0))).select("userId", "label")
    df_user = df_user.join(df_label, "userId")

    df_user = df_user.join(prefix_columns(df_newhistory, "nh_", ["userId"]), "userId")
    df_user = df_user.join(prefix_columns(df_oldhistory, "oh_", ["userId"]), "userId")

    unscaled = ("userId", "session_hours", "session_start")

    def per_session_hour(prefix, columns):
        # Expressions "<prefix>n_<col>" = "<prefix>_<col>" per session hour
        # (hours floored at 0.01 to avoid division by zero), followed by
        # unscaled copies of session_hours and session_start.
        hours = F.greatest(F.col(prefix+"_session_hours"), F.lit(0.01))
        exprs = [(F.col(prefix+"_"+c)/hours).alias(prefix+"n_"+c) for c in columns if not c in unscaled]
        exprs.append(F.col(prefix+"_session_hours").alias(prefix+"n_session_hours"))
        exprs.append(F.col(prefix+"_session_start").alias(prefix+"n_session_start"))
        return exprs

    # Add all derived columns with two projections instead of one withColumn per
    # column; long withColumn chains produce very large query plans.
    df_user = df_user.select(
        "*",
        *per_session_hour("oh", df_oldhistory.columns),
        *per_session_hour("nh", df_newhistory.columns),
    )
    # Ratios need the normalised columns from the projection above.
    ratios = [
        (F.col("nhn_"+c)/F.greatest(F.lit(0.01), F.col("ohn_"+c))).alias("r_"+c)
        for c in df_newhistory.columns
        if not c in ["userId"]
    ]
    df_user = df_user.select("*", *ratios)

    df_user = df_user.persist()
    return df_user


def oversample(df_train):
    """
    Balances the classes by appending copies of the label-1 rows.

    Whole copies of the label-1 subset are unioned onto `df_train` until the
    number of label-1 rows exceeds the number of label-0 rows. If label 1 is
    already the larger class, or there are no label-1 rows at all, the input is
    returned unchanged.

    Input: df_train -- frame with a numeric "label" column (0/1)
    Output: the oversampled frame (contains duplicated label-1 rows)
    """
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    train1cnt = df_lab1.count()
    if train1cnt == 0:
        # Nothing to replicate; without this guard the loop below never ends
        # because sum1cnt would stay at 0.
        print(f"oversampling skipped: no label-1 rows")
        return df_train
    oversampled_train = df_train
    sum1cnt = train1cnt
    while sum1cnt <= train0cnt:
        sum1cnt = sum1cnt+train1cnt
        print(f"oversampling to: {sum1cnt}/{train0cnt}")
        oversampled_train = oversampled_train.union(df_lab1)
    return oversampled_train

def downsample(df_train):
    """
    Balances the classes by randomly dropping label-0 rows.

    The label-0 rows are sampled (without replacement, seed 42) with a fraction
    chosen so that about as many label-0 rows remain as there are label-1 rows;
    all label-1 rows are kept. If label 1 is already the larger class, all
    label-0 rows are kept.

    Input: df_train -- frame with a numeric "label" column (0/1)
    Output: union of the sampled label-0 rows and all label-1 rows; the
            resulting label-0 count is only approximately the label-1 count
    """
    df_lab0 = df_train.where(F.col("label") == 0)
    df_lab1 = df_train.where(F.col("label") == 1)
    train0cnt = df_lab0.count()
    print(f"orig-label-0: {train0cnt}")
    train1cnt = df_lab1.count()
    print(f"orig-label-1: {train1cnt}")
    # +1 avoids a division by zero when there are no label-0 rows. Sampling
    # without replacement needs a fraction within [0, 1], so cap it at 1.0.
    fraction = min(1.0, train1cnt/(train0cnt+1))
    df_downsampled = df_lab0.sample(fraction = fraction, seed=42)
    df_downsampled = df_downsampled.union(df_lab1)
    # Expected label-0 rows after sampling; can never exceed what is there.
    expected0cnt = min(train0cnt, train0cnt*train1cnt/(train0cnt+1))
    print(f"downsampled label-1 = {train1cnt}, label-0 ~ {expected0cnt}")
    return df_downsampled


def confuse(df_test_pred):
    """
    Prints the confusion matrix and the derived metrics for binary predictions.

    The four cells are obtained with a single grouped count over
    ("label", "prediction") instead of one filtered count per cell. Rows whose
    label or prediction is neither 0 nor 1 are ignored.

    Printed: confusion matrix, accuracy, precision, recall, f1 and the
    Matthews correlation coefficient (mcc). A metric whose denominator is zero
    is reported as 0; mcc is reported as -9 in that case.

    Input: df_test_pred -- frame with numeric columns "label" and "prediction"
    Output: tuple (accuracy, precision, recall, f1)
    """
    cell_counts = {}
    for row in df_test_pred.groupBy("label", "prediction").count().collect():
        # (0, 0.0) and (0, 0) are equal and hash alike, so integer labels and
        # float predictions end up in the same cell as the lookups below.
        cell = (row["label"], row["prediction"])
        cell_counts[cell] = cell_counts.get(cell, 0) + row["count"]
    n00 = cell_counts.get((0, 0), 0)
    n01 = cell_counts.get((0, 1), 0)
    n10 = cell_counts.get((1, 0), 0)
    n11 = cell_counts.get((1, 1), 0)
    s00 = "{:5d}".format(n00)
    s01 = "{:5d}".format(n01)
    s10 = "{:5d}".format(n10)
    s11 = "{:5d}".format(n11)
    print(f"                  ")
    print(f" Confusion Matrix: ")
    print(f"                  ")
    print(f"     | prediction| ")
    print(f"     |   0 |  1  | ")
    print(f" ----+-----+-----+ ")
    print(f" l 0 |{s00}|{s01}| ")
    print(f" b --+-----+-----+ ")
    print(f" l 1 |{s10}|{s11}| ")
    print(f" ----+-----+-----+ ")
    print(f"                   ")
    TP = n11
    TN = n00
    FP = n01
    FN = n10
    # Every metric falls back to 0 when its denominator is zero.
    accuracy = 0
    if TP+TN+FP+FN!=0:
        accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = 0
    if TP+FP!=0:
        precision = TP/(TP+FP)
    recall = 0
    if TP+FN!=0:
        recall = TP/(TP+FN)
    f1 = 0
    if precision+recall!=0:
        f1 = 2*precision*recall/(precision+recall)
    print(f"CALC")
    print(f"  accuraccy: {accuracy}")
    print(f"  precision: {precision}")
    print(f"  recall:    {recall}")
    print(f"  f1:        {f1}")
    # https://towardsdatascience.com/matthews-correlation-coefficient-when-to-use-it-and-when-to-avoid-it-310b3c923f7e
    # -9 is a sentinel outside the valid mcc range [-1, 1] meaning "undefined".
    mcc = -9
    nenn = (TN+FN)*(FP+TP)*(TN+FP)*(FN+TP)
    if nenn!=0:   
        mcc = (TN*TP-FP*FN)/math.sqrt(nenn)
    print(f"  mcc:       {mcc}")
    return (accuracy, precision, recall, f1)
    
    
def hyper_tune_rf(num_tree_values, max_depth_values):
    """
    Grid search for a RandomForestClassifier over numTrees x maxDepth.

    Each model is fitted on the notebook-global `df_train` and scored with
    confuse() on the notebook-global `df_test`; the model with the highest f1
    wins. NOTE(review): the test split is used for model selection, so the
    reported best f1 is optimistic.

    Input: num_tree_values -- iterable of numTrees values
           max_depth_values -- iterable of maxDepth values
    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?") if
            the grid is empty
    """
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for num_trees in num_tree_values:
        for max_depth in max_depth_values:
            model_name = f"rf_{num_trees}_{max_depth}"
            rf = RandomForestClassifier(featuresCol="features", numTrees=num_trees, maxDepth=max_depth, seed=42)
            rf_model = rf.fit(df_train)
            predict_test  = rf_model.transform(df_test)
            _, _, _, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = rf_model
                best_model_name = model_name
    # Report the winning score, not the score of the last model tried.
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)


def hyper_tune_lr(max_iters, reg_params, elastic_net_params, thr=0.15):
    """
    Grid search for a LinearRegression used as a classifier.

    Each model is fitted on the notebook-global `df_train` and applied to the
    notebook-global `df_test`. The raw regression output is scored with RMSE
    against the label; the model with the lowest RMSE wins. For information,
    the output is also thresholded at `thr` into a 0/1 prediction and passed to
    confuse(). NOTE(review): the test split is used for model selection.

    Input: max_iters, reg_params, elastic_net_params -- iterables of the
               maxIter / regParam / elasticNetParam values to combine
           thr -- regression output >= thr is counted as prediction 1
    Output: tuple (best_model, best_err, best_model_name); (None, 9999, "?")
            if the grid is empty or no model has an RMSE below 9999
    """
    # https://towardsdatascience.com/beginners-guide-to-linear-regression-with-pyspark-bfc39b45a9e9
    evaluator = RegressionEvaluator(predictionCol="prediction_orig", labelCol="label", metricName="rmse") 
    
    best_err = 9999
    best_f1 = None
    best_model = None
    best_model_name = "?"
    for  max_iter in  max_iters:
        for reg_param in reg_params:
            for elastic_net_param in elastic_net_params:
                model_name = f"lr_{max_iter}_{reg_param}_{elastic_net_param}"
                lr = LinearRegression(featuresCol="features", maxIter= max_iter, regParam=reg_param, elasticNetParam=elastic_net_param)
                model = lr.fit(df_train)
                predict_test  = model.transform(df_test)
                # keep the continuous output for the RMSE, then derive a 0/1 "prediction"
                predict_test = predict_test.withColumnRenamed("prediction", "prediction_orig")
                err = evaluator.evaluate(predict_test)
                print(f"err: {err}")
                predict_test = predict_test.withColumn("prediction", F.when(F.col("prediction_orig")>=thr,1).otherwise(0))
                _, _, _, f1 = confuse(predict_test)
                print(f"  {model_name}: f1 {f1}")
                if err < best_err:
                    best_err = err
                    best_f1 = f1
                    best_model = model
                    best_model_name = model_name
    # Selection is by RMSE, so report the winner's error together with its f1
    # (not the f1 of the last model tried).
    print(f"best err {best_err} (f1 {best_f1}) for {best_model_name}")
    return (best_model, best_err, best_model_name)


def hyper_tune_dt(max_depths, max_bins_list):
    """
    Grid search for a DecisionTreeClassifier over maxDepth x maxBins.

    Each model is fitted on the notebook-global `df_train` and scored with
    confuse() on the notebook-global `df_test`; the model with the highest f1
    wins. NOTE(review): the test split is used for model selection.

    Input: max_depths -- iterable of maxDepth values
           max_bins_list -- iterable of maxBins values
    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?") if
            the grid is empty
    """
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for  max_depth in max_depths:
        for max_bins in max_bins_list:
            model_name = f"dt_{max_depth}_{max_bins}"
            dt = DecisionTreeClassifier(featuresCol="features", maxDepth=max_depth, maxBins=max_bins)
            model = dt.fit(df_train)
            predict_test  = model.transform(df_test)
            _, _, _, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_model_name = model_name
    # Report the winning score, not the score of the last model tried.
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)


def hyper_tune_sv(max_iters, reg_params):
    """
    Grid search for a LinearSVC over maxIter x regParam.

    Each model is fitted on the notebook-global `df_train` and scored with
    confuse() on the notebook-global `df_test`; the model with the highest f1
    wins. NOTE(review): the test split is used for model selection.

    Input: max_iters -- iterable of maxIter values
           reg_params -- iterable of regParam values
    Output: tuple (best_model, best_f1, best_model_name); (None, -1, "?") if
            the grid is empty
    """
    best_f1 = -1
    best_model = None
    best_model_name = "?"
    for  max_iter in max_iters:
        for reg_param in reg_params:
            model_name = f"svm_{max_iter}_{reg_param}"
            lsvc = LinearSVC(featuresCol="features", maxIter=max_iter, regParam=reg_param)
            model = lsvc.fit(df_train)
            predict_test  = model.transform(df_test)
            _, _, _, f1 = confuse(predict_test)
            print(f"  {model_name}: f1 {f1}")
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_model_name = model_name
    # Report the winning score, not the score of the last model tried.
    print(f"best f1 {best_f1} for {best_model_name}")
    return (best_model, best_f1, best_model_name)
    

In [13]:
# Build the labelled feature frame for the prediction weeks 1..4 and stack them.
# union() matches columns by position; every frame comes from create_test_data().
df_testtrain = None
for week in range(1, 5):
    print(f"week{week}")
    df_week = create_test_data(week)
    df_testtrain = df_week if df_testtrain is None else df_testtrain.union(df_week)

week1
CHURNDOWN
week2
CHURNDOWN
week3
CHURNDOWN
week4
CHURNDOWN


In [14]:
# -----------------

# Persist the stacked feature frame as JSON; the churn definition is part of the
# file name, so each CHURN setting gets its own data set. An existing data set
# at that location is overwritten.
ttd_url = TESTTRAIN_DATA_URL.replace(".json",f"-churn{CHURN}.json")
print(f"### SAVING TESTTRAIN DATA {ttd_url}")
df_testtrain.write.format('json').mode('overwrite').save(ttd_url)
print(f"finished")

### SAVING TESTTRAIN DATA s3a://udacity-dsnd/sparkify/output/06-testtrain-sparkify_event_data-churndown.json
finished


In [43]:
# -----------------

# Reload the saved feature frame for the current CHURN setting. This lets the
# modelling cells below run without re-creating the features, provided the data
# set for this CHURN value has been written before.
ttd_url = TESTTRAIN_DATA_URL.replace(".json",f"-churn{CHURN}.json")
print(f"### LOAD TESTTRAIN DATA {ttd_url}")
df_testtrain = spark.read.json(ttd_url)
print(f"### PERSIST")
# Keep a separate handle to the persisted frame; df_testtrain is reassigned by later cells.
df_testtrain_persist = df_testtrain.persist()
df_testtrain = df_testtrain_persist

### LOAD TESTTRAIN DATA s3a://udacity-dsnd/sparkify/output/06-testtrain-sparkify_event_data-churncanceldown.json
### PERSIST


In [44]:
# Keep a handle to the unbalanced data before df_testtrain is resampled.
df_testtrain_orig = df_testtrain

In [46]:
# Balance the classes on the complete data set.
# NOTE(review): this runs before the train/test split, so the test split is
# balanced too and its metrics do not reflect the original class ratio. The cell
# is also not idempotent: re-running it samples the already reduced frame again
# (df_testtrain_orig holds the original).
# df_testtrain = oversample(df_testtrain)
df_testtrain = downsample(df_testtrain)

orig-label-0: 33310
orig-label-1: 3761
downsampled label-1 = 3761, label-0 ~ 3760.8870943532165


In [47]:


print(f"### CREATE FEATURE COLUMN")

# Features: the three user attributes, then every new/old ratio column ("r_*"),
# then every raw new-history column ("nh_*"). The order matters: the feature
# importance cell maps vector positions back to names via featureCols.
featureCols = (
    ["paid", "usermale", "userregistration"]
    + [name for name in df_testtrain.columns if name.startswith("r_")]
    + [name for name in df_testtrain.columns if name.startswith("nh_")]
)

# alternative: use every column except the identifiers and the label
#featureCols = [col for col in df_testtrain.columns if not col in ["userId", "wid", "label"]]
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

# Assemble the vector column, keep only what the models need, and cache it.
df_testtrain_vec_persist = (
    assembler
    .transform(df_testtrain)
    .select("userId", "wid", "label","features")
    .persist()
)
df_testtrain_vec = df_testtrain_vec_persist

# -----------------

print(f"### TRAIN / TEST SPLIT")
df_train, df_test = df_testtrain_vec.randomSplit([0.7, 0.3], seed=42)

# disabled experiment: balance only the training split
#df_train_orig = df_train
#df_train = downsample(df_train)

#print(f"train: {df_train.count()}")
#print(f"  l1: {df_train.where(df_train.label==1).count()}")
#print(f"  l0: {df_train.where(df_train.label==0).count()}")
#print(f"test: {df_test.count()}")
#print(f"  l1: {df_test.where(df_test.label==1).count()}")
#print(f"  l0: {df_test.where(df_test.label==0).count()}")

# disabled experiment: scale the features with a scaler fitted on the training split
## Fit scaler to train dataset
#scaler = MaxAbsScaler().setInputCol('features').setOutputCol('scaled_features')
#df_train = df_train.drop("scaled_features")
#scaler_model = scaler.fit(df_train)
## Scale train and test features
#df_train = scaler_model.transform(df_train)
#df_test = df_test.drop("scaled_features")
#df_test = scaler_model.transform(df_test)

# -----------------
# Handles to the untouched splits, in case df_train / df_test get resampled later.
df_test_orig = df_test
df_train_orig = df_train



### CREATE FEATURE COLUMN
### TRAIN / TEST SPLIT


In [31]:
# NOTE(review): this cell's execution count and output stem from an earlier run
# on data that was not downsampled; the counts printed by the following cell
# show it was not applied in the current run. Not idempotent: every execution
# oversamples the already oversampled df_train again.
df_train = oversample(df_train)

oversampling to: 3164/24556
oversampling to: 4746/24556
oversampling to: 6328/24556
oversampling to: 7910/24556
oversampling to: 9492/24556
oversampling to: 11074/24556
oversampling to: 12656/24556
oversampling to: 14238/24556
oversampling to: 15820/24556
oversampling to: 17402/24556
oversampling to: 18984/24556
oversampling to: 20566/24556
oversampling to: 22148/24556
oversampling to: 23730/24556
oversampling to: 25312/24556


In [48]:
# Size and class balance of both splits (three count jobs per split).
for split_name, split_df in (("train", df_train), ("test", df_test)):
    print(f"{split_name}: {split_df.count()}")
    print(f"  l1: {split_df.where(split_df.label==1).count()}")
    print(f"  l0: {split_df.where(split_df.label==0).count()}")


train: 5173
  l1: 2585
  l0: 2588
test: 2274
  l1: 1176
  l0: 1098


In [33]:
# NOTE(review): leftover from an earlier run (see execution count and output).
# Oversampling the test split duplicates label-1 rows in the evaluation data,
# and re-running the cell oversamples again.
df_test = oversample(df_test)

oversampling to: 1400/10233
oversampling to: 2100/10233
oversampling to: 2800/10233
oversampling to: 3500/10233
oversampling to: 4200/10233
oversampling to: 4900/10233
oversampling to: 5600/10233
oversampling to: 6300/10233
oversampling to: 7000/10233
oversampling to: 7700/10233
oversampling to: 8400/10233
oversampling to: 9100/10233
oversampling to: 9800/10233
oversampling to: 10500/10233


In [9]:
# NOTE(review): leftover from an earlier run (see execution count). It repeats
# the oversampling cell above; if that cell ran before, df_train_orig would
# capture the already oversampled frame here.
df_train_orig = df_train
df_train = oversample(df_train)

oversampling to: 2230/25009
oversampling to: 3345/25009
oversampling to: 4460/25009
oversampling to: 5575/25009
oversampling to: 6690/25009
oversampling to: 7805/25009
oversampling to: 8920/25009
oversampling to: 10035/25009
oversampling to: 11150/25009
oversampling to: 12265/25009
oversampling to: 13380/25009
oversampling to: 14495/25009
oversampling to: 15610/25009
oversampling to: 16725/25009
oversampling to: 17840/25009
oversampling to: 18955/25009
oversampling to: 20070/25009
oversampling to: 21185/25009
oversampling to: 22300/25009
oversampling to: 23415/25009
oversampling to: 24530/25009
oversampling to: 25645/25009


In [49]:
# Single-point "grid": one random forest with 20 trees and depth 5.
# model / f1 / model_name are consumed by the prediction and save cells below.
model, f1, model_name = hyper_tune_rf([20], [5])  

                  
 Confusion Matrix: 
                  
     | prediction| 
     |   0 |  1  | 
 ----+-----+-----+ 
 l 0 |  667|  431| 
 b --+-----+-----+ 
 l 1 |  413|  763| 
 ----+-----+-----+ 
                   
CALC
  accuraccy: 0.6288478452066842
  precision: 0.6390284757118928
  recall:    0.6488095238095238
  f1:        0.6438818565400843
  mcc:       0.2564493005019037
  rf_20_5: f1 0.6438818565400843
best f1 0.6438818565400843 for rf_20_5


In [50]:
# Rows per user in the training split, most frequent users first.
# A user can occur more than once because each prediction week adds its own row.
df_train.groupBy("userId").count().sort(F.desc(F.col("count"))).show()

+-------+-----+
| userId|count|
+-------+-----+
|1554956|    4|
|1339528|    4|
|1538485|    3|
|1125943|    3|
|1831733|    3|
|1586895|    3|
|1392770|    3|
|1373602|    3|
|1178026|    3|
|1602181|    3|
|1390064|    3|
|1591353|    3|
|1812177|    3|
|1141231|    3|
|1888253|    3|
|1116029|    3|
|1386578|    3|
|1558736|    3|
|1996408|    3|
|1037209|    3|
+-------+-----+
only showing top 20 rows



In [26]:
# NOTE(review): leftover check from an earlier run; the recorded result (70
# rows for one user) does not fit the per-user counts shown by the cell above
# and presumably came from an oversampled df_train -- confirm or remove.
df_train.where(F.col("userId")=="1655208").count()

70

In [51]:
# -----------------

# Apply the selected model to both splits and show a few label/prediction pairs.
print(f"### PREDICT TRAIN")
predict_train = model.transform(df_train)
predict_train.select("label", "prediction").show(10)
print(f"### PREDICT TEST")
predict_test  = model.transform(df_test)
predict_test.select("label", "prediction").show(10)

# -----------------

print(f"### EVALUATE PREDICTION")
# NOTE(review): import placed mid-notebook; it belongs to the import cell at the top.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# No metricName is passed, so the evaluator's default metric is used; the
# prints below label it as area under ROC.
evaluator = BinaryClassificationEvaluator(rawPredictionCol ='rawPrediction', labelCol ='label')
predict_test.select("label", "rawPrediction", "prediction", "probability").show(5)
print("The area under ROC for train set is {}".format(evaluator.evaluate(predict_train)))
print("The area under ROC for test set is {}".format(evaluator.evaluate(predict_test)))

# Comparing train and test metrics gives an impression of overfitting.
print(f"### EVAL TRAIN:")
confuse(predict_train)
print(f"### EVAL TEST:")
# Overwrites the global f1 with the test f1; the save cell below puts it into the model path.
acc, prec, rec, f1 = confuse(predict_test)


### PREDICT TRAIN
+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 10 rows

### PREDICT TEST
+-----+----------+
|label|prediction|
+-----+----------+
|    0|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       1.0|
|    0|       1.0|
|    0|       1.0|
|    0|       0.0|
|    0|       1.0|
|    0|       0.0|
+-----+----------+
only showing top 10 rows

### EVALUATE PREDICTION
+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[8.59056034958328...|       1.0|[0.42952801747916...|
|    0|[12.1124409849263...|       0.0|[0.60562204924631...|
|    0|[13.7858852326191...|       0.0|[0.6892942616

In [197]:
# -----------------

# Store the model next to the data; the path encodes the model name and the
# f1 value (rounded to 3 digits) currently held in the global f1. An existing
# model at the same path is overwritten.
print(f"### SAVE MODEL {model_name} {f1*100}")
model_url = f'{MODEL_URL}_{model_name}_f1val{round(f1,3)}'
model.write().overwrite().save(model_url)
print(f"model saved to {model_url}")



### SAVE MODEL rf_20_5 22.481265611990008
model saved to s3a://udacity-dsnd/sparkify/output/05-model-sparkify_event_data_rf_20_5_f1val0.225


In [198]:
# Map each position of the importance vector back to its feature name and list
# the features by descending importance. This relies on featureCols being the
# list the model's "features" vector was assembled from.
featimp = model.featureImportances
nameimp = {featureCols[i]: featimp[i] for i in range(len(featimp))}
sorted(nameimp.items(), key=lambda item: item[1], reverse=True)

[('paid', 0.09554455155152178),
 ('oh_session_hours', 0.09302707160037064),
 ('oh_pg_thumbs_up', 0.07582709757031283),
 ('nh_pg_home', 0.07340830594860079),
 ('oh_pg_home', 0.0725284688157008),
 ('oh_session_start', 0.06713461595629495),
 ('oh_pg_settings', 0.050296622196977205),
 ('oh_pg_nextsong', 0.03470888922561419),
 ('nh_pg_logout', 0.031418303669647894),
 ('oh_pg_logout', 0.029338603181943555),
 ('oh_pg_thumbs_down', 0.02669508164427074),
 ('oh_pg_downgrade', 0.0263905768134274),
 ('oh_pg_add_to_playlist', 0.02189388493915634),
 ('nh_pg_thumbs_up', 0.021718938628132543),
 ('nh_pg_downgrade', 0.021610587051586076),
 ('oh_pg_login', 0.021310639336320675),
 ('nh_pg_thumbs_down', 0.020975045098593534),
 ('nh_pg_nextsong', 0.01929290425786421),
 ('nh_pg_add_to_playlist', 0.01864976547102134),
 ('nh_status_307', 0.016846133774795467),
 ('nh_session_hours', 0.016745889155579934),
 ('nh_pg_add_friend', 0.014856832748766466),
 ('oh_pg_roll_advert', 0.014159788446922988),
 ('nh_pg_setting

In [39]:
# Stop the Spark session; any further use of `spark` or of the frames above
# requires re-running the setup cell.
print("### STOP SPARK SESSION")
spark.stop()  

### STOP SPARK SESSION
