# w261 Final Project - Clickthrough Rate Prediction


Team 2  
Danielle Adler, Craig Fujii, Conor Healy, YoungKoung Kim

Summer 2019, section [Your section numbers>]   

## Setup

In [1]:
# imports
import re
import ast
import time
import shutil
import os
import copy
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import display

from pyspark.sql import Window, Row
from pyspark.sql.functions import col, desc, mean, isnan, when, count, isnull, rank, sum, countDistinct, avg, stddev, round, lit, rand, broadcast, udf, log, monotonically_increasing_id
from pyspark.sql.types import LongType, IntegerType, StringType, DoubleType, ArrayType, FloatType
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, OneHotEncoderEstimator, VectorAssembler, MinMaxScaler, Imputer
from pyspark.ml.stat import Correlation
from pyspark.mllib.stat import Statistics

from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import warnings
warnings.filterwarnings('ignore')

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
# store path to notebook
PWD = !pwd
PWD = PWD[0]

In [4]:
# # start Spark Session
# from pyspark.sql import SparkSession
# app_name = "final_project"
# master = "local[*]"
# spark = SparkSession\
#         .builder\
#         .appName(app_name)\
#         .master(master)\
#         .getOrCreate()
# sc = spark.sparkContext
spark

In [5]:
sc = spark.sparkContext

In [6]:
master_start = time.time()

# __Section 4__ - Algorithm Implementation

## Functions

In [7]:
def metrics(col_name):
    """Calculates key metrics on a binary column of 0s and 1s
    Input: Column Name
    Output: TP, FP, TN, FP, accuracy, precision, recall, F1 score
    """
    
    TP = train_set.filter(col('_c0')==0).filter(col(col_name)==0).count()
    FP = train_set.filter(col('_c0')==1).filter(col(col_name)==0).count()
    TN = train_set.filter(col('_c0')==1).filter(col(col_name)==1).count()
    FN = train_set.filter(col('_c0')==0).filter(col(col_name)==1).count()
    accuracy = (TP + TN) / (TP + FP + FN + TN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    F1 = 2*(precision*recall)/(precision+recall)
    print("True Positives: {} \nFalse Positives: {} \nTrue Negatives: {} \nFalse Negatives: {} \nAccuracy: {} \nPrecision: {} \nRecall: {} \nF1 Score: {}".format(TP, FP, TN, FN, accuracy, precision, recall, F1))

In [8]:
def feature_prep (input_df):
    """Parses out types of features within a dataframe
    Input: Dataframe
    Output: Dataframe, Categorical Column List, Numeric Column List
    """
    
    # develop feature set
    dep_var = ['_c0']

    model_df = input_df.select(input_df.columns[1:])

    feature_df = model_df.select([column for column in model_df.columns 
                              if column not in dep_var])
    
    # numerical and categorical column split
    cat_cols = feature_df.columns[1:]
    num_cols = feature_df.columns[0]
    
    return model_df, cat_cols, num_cols

In [9]:
def column_vector_maker(df, columns, lookup_df, suffix):
    """One-hot encodes categorical variables
    Input: Dataframe, Categorical Column List, Lookup Dataframe, Suffix for New Columns
    Output: Dataframe, Categorical Column List, Numeric Column List
    """
    
    # Creating the lookup table vectors for each category
    # we are doing a map-side join
    for i in columns:
        new_col = str(i) + str(suffix)
        df = df.join(lookup_df, df[i]==lookup_df['id'], "left_outer" )
        df = df.withColumnRenamed("category", new_col) 
        lst = df.columns
        cols = [i for i in lst if i not in ['id']]
        df = df.select(cols)
        
    df = df.select([c for c in df.columns if c not in columns])
    
    cat_cols = df.columns[2:]
    num_cols = df.columns[0]
    
    return df, cat_cols, num_cols

In [10]:
def pipeline_prep (object_type):
    """Assembles a dataframe of label and feature fectors for machine learning algorithm
    Input: Dataframe, Categorical Column List, Numeric Column List
    Ouput: Dataframe
    """
    
    model_df = object_type[0]
    cat_cols = object_type[1]
    num_cols = object_type[2]

    # developing the string indexer and vector assembler of input and output columns
    label_stringIdx = StringIndexer(inputCol = '_c0', outputCol = 'label')

    assemblerInputs = cat_cols + [num_cols]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    
    # pipeline work for more setup
    pipeline = Pipeline(stages = [label_stringIdx, assembler])

    pipelineModel = pipeline.fit(model_df)
    model_df = pipelineModel.transform(model_df)

    selectedCols = ['label', 'features'] 
    model_df = model_df.select(selectedCols)
    return model_df

In [11]:
def lr_cross_validate (lr_model):
    """Hypertunes a logistic regression model based on a set of parameters
    Input: Logistic regression classification
    Output: Hypertuned logistic regression classification
    """

    # for elasticNetParam, 0 is L2 and 1 is L1
    paramGrid = ParamGridBuilder() \
        .addGrid(lr_model.elasticNetParam, [0.0, 0.25, 0.75, 1.0]) \
        .addGrid(lr_model.regParam, [0.0, 0.01, 0.1, 0.5, 0.75, 1.0, 1.25]) \
        .build()

    # optimizing for an f1 score
    crossval = CrossValidator(estimator = lr, estimatorParamMaps = paramGrid,
                              evaluator = MulticlassClassificationEvaluator(
                                  labelCol = "label", predictionCol="prediction", 
                                  metricName="f1"), numFolds=3)
    return crossval

In [12]:
def dt_cross_validate (dt_model):
    """Hypertunes a decision tree model based on a set of parameters
    Input: Decision tree classification
    Output: Hypertuned decision tree classification
    """

    paramGrid = ParamGridBuilder() \
        .addGrid(dt_model.maxBins, [28, 30, 32, 34, 36]) \
        .addGrid(dt_model.maxDepth, [3, 4, 5, 6, 7]) \
        .addGrid(dt_model.impurity, ['gini', 'entropy']) \
        .build()

    # optimizing for an f1 score
    crossval = CrossValidator(estimator = dt, estimatorParamMaps = paramGrid,
                              evaluator = MulticlassClassificationEvaluator(
                                  labelCol = "label", predictionCol="prediction", 
                                  metricName="f1"), numFolds=3)
    return crossval

In [13]:
def train_results (trainingSummary):
    """Showcases key metrics on logistic regression training summary
    Input: Logistic regression training model
    Output: Training set accuracy, precision, recall, F1 score
    """

    # predicting on the training set
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    accuracy = trainingSummary.accuracy
    precision = trainingSummary.weightedPrecision
    recall = trainingSummary.weightedRecall
    f1 = trainingSummary.weightedFMeasure()

    print("False Positive Rate: %s\nTrue Positive Rate: %s\nAccuracy: %s\nPrecision: %s\nRecall: %s\nF1 Score: %s"
          % (falsePositiveRate, truePositiveRate, accuracy, precision, recall, f1))

In [14]:
def test_results (predictions):
    """Showcases key metrics on all algorithm test datasets
    Input: Classification training model
    Output: Testing set accuracy, precision, recall, F1 score
    """
    
    evaluator = MulticlassClassificationEvaluator(labelCol = "label", predictionCol="prediction", metricName="accuracy")
    print("Accuracy:", evaluator.evaluate(predictions))

    evaluator = MulticlassClassificationEvaluator(labelCol = "label", predictionCol="prediction", metricName="weightedPrecision")
    print("Precision:", evaluator.evaluate(predictions))
    
    evaluator = MulticlassClassificationEvaluator(labelCol = "label", predictionCol="prediction", metricName="weightedRecall")
    print("Recall:", evaluator.evaluate(predictions))
    
    evaluator = MulticlassClassificationEvaluator(labelCol = "label", predictionCol="prediction", metricName="f1")
    print("F1 Score:", evaluator.evaluate(predictions))

In [15]:
def to_array(col):
    """Separating a vector into distinct columns
    Input: Vector column
    Output: Separate array columns of vector
    """
    
    # separating a vector into its array types
    def to_array_(v):
        return v.toArray().tolist()
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

In [34]:
def null_as_Missing(x):
    return when(col(x).isNotNull(), col(x)).otherwise('L')

<br>

## Reading in Model Files

In [57]:
# read parquet files of features
start = time.time()

folder_hl = 'data_good'
# mean_wgt_HL_df = spark.read.parquet("gs://gsod_23456/"+folder_hl+"/df_mean_wgt_HL.parquet")
# mean_wgt_HL_test_df = spark.read.parquet("gs://gsod_23456/"+folder_hl+"/test_df_mean_wgt_HL.parquet")

zero_wgt_HL_df = spark.read.parquet("gs://gsod_23456/"+folder_hl+"/df_zero_wgt_HL.parquet")
zero_wgt_HL_test_df = spark.read.parquet("gs://gsod_23456/"+folder_hl+"/test_df_zero_wgt_HL.parquet")


print(f'... completed job in {time.time() - start} seconds.')

... completed job in 1.5045206546783447 seconds.


In [58]:
# mean_wgt_HL_df.count(), mean_wgt_HL_test_df.count() , zero_wgt_HL_df.count(), zero_wgt_HL_test_df.count()
zero_wgt_HL_df.count(), zero_wgt_HL_test_df.count()

(36670740, 36671468)

In [59]:
# # see the recoded variables
# start = time.time()
# print("Counts for Each Category:")
# for i in zero_wgt_HL_df.columns[:1]: 
zero_wgt_HL_df.groupby('_c14_wgt').count().sort(desc('count')).show()
print(f'... completed job in {time.time() - start} seconds.')

+--------+--------+
|_c14_wgt|   count|
+--------+--------+
|    null|30152742|
|       H| 6516116|
|       L|    2610|
+--------+--------+

... completed job in 20.613758087158203 seconds.


In [60]:
# # see the recoded variables
# start = time.time()
# print("Counts for Each Category:")
# for i in zero_wgt_HL_df.columns[:1]: 
zero_wgt_HL_test_df.groupby('_c14_wgt').count().sort(desc('count')).show()
print(f'... completed job in {time.time() - start} seconds.')

+--------+-------+
|_c14_wgt|  count|
+--------+-------+
|    null|7539232|
|       H|1629274|
|       L|    643|
+--------+-------+

... completed job in 24.47908902168274 seconds.


In [61]:
# convert nulls to L for variable columns
col = ['_c14_wgt', '_c15_wgt', '_c16_wgt', '_c17_wgt', '_c18_wgt', '_c19_wgt', '_c20_wgt', '_c21_wgt', '_c23_wgt', '_c24_wgt', '_c25_wgt', '_c26_wgt', '_c27_wgt', '_c28_wgt', '_c29_wgt', '_c30_wgt', '_c31_wgt', '_c32_wgt', '_c33_wgt', '_c34_wgt', '_c35_wgt', '_c36_wgt', '_c37_wgt', '_c38_wgt', '_c39_wgt', '_c22_wgt']
zero_wgt_HL_df = zero_wgt_HL_df.fillna('L')
zero_wgt_HL_test_df = zero_wgt_HL_test_df.fillna('L')

In [62]:
# confirm 
zero_wgt_HL_df.groupby('_c14_wgt').count().sort(desc('count')).show()
print(f'... completed job in {time.time() - start} seconds.')
zero_wgt_HL_test_df.groupby('_c14_wgt').count().sort(desc('count')).show()
print(f'... completed job in {time.time() - start} seconds.')
# zero_wgt_HL_df.show(2)

+--------+--------+
|_c14_wgt|   count|
+--------+--------+
|       L|30155352|
|       H| 6516116|
+--------+--------+

... completed job in 207.72215056419373 seconds.
+--------+-------+
|_c14_wgt|  count|
+--------+-------+
|       L|7539875|
|       H|1629274|
+--------+-------+

... completed job in 212.78889298439026 seconds.


In [48]:
# broadcasting the lookup dataframe as we have to use it multiple times
lookup_df_HL = broadcast(spark.createDataFrame(
    [('H',  Vectors.dense(1.0)), 
     ('L', Vectors.dense(0.0))],
    ["id", "category"]))

In [20]:
# # broadcasting the lookup dataframe as we have to use it multiple times
# lookup_df_HLM = broadcast(spark.createDataFrame(
#     [('H6', Vectors.dense(1.0,0.0,0.0)), 
#      ('H1', Vectors.dense(0.0,1.0,0.0)),
#      ('L',  Vectors.dense(0.0,0.0,1.0)),
#      ('M',  Vectors.dense(0.0,0.0,0.0))],
#     ["id", "category"]))

In [65]:
f1_scores = {}
evaluator = MulticlassClassificationEvaluator(labelCol = "label", 
                                              predictionCol="prediction", 
                                              metricName="f1")

**<center>Algorithm Modeling Matrix</center>**

|	 3 Transformation Types	|	2 Imputing Methods	|	2 Algorithms	|	2 Model Runs	|
|	---	|	---	|	---	|	---	|
|	Weighted Value	|	Nulls => Mean	|	Logistic Regression	|	Default	|
|	Hi Low	|	Nulls => 0	|	Decision Tree	|	Hypertuned	|
|	Hi Mid Low Missing	|		|		|		|

<br>

## Logistic Regression

### Recategorized HL with Mean

In [None]:
# start = time.time()

# lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
# lrModel = lr.fit(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_df)[0], 
#                                   feature_prep(mean_wgt_HL_df)[1], lookup_df_HL, "HL")))
# trainingSummary = lrModel.summary
# print("Train Results:")
# train_results(trainingSummary)

# print("\nTest Results:")
# predictions = lrModel.transform(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_test_df)[0], 
#                                   feature_prep(mean_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
# test_results(predictions)

# f1_scores['lr_mean_wgt_HL'] = format(evaluator.evaluate(predictions), '.6f')

# print(f'... completed job in {time.time() - start} seconds.')

In [None]:
# # parameter tuned
# start = time.time()

# lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
# lrModel = lr_cross_validate(lr).fit(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_df)[0], 
#                                   feature_prep(mean_wgt_HL_df)[1], lookup_df_HL, "HL")))

# print("Test Results:")
# predictions = lrModel.transform(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_test_df)[0], 
#                                   feature_prep(mean_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
# test_results(predictions)

# f1_scores['lr_mean_wgt_HL_tuned'] = format(evaluator.evaluate(predictions), '.6f')

# print(f'... completed job in {time.time() - start} seconds.')

<br>

### Recategorized HL with Zero

In [68]:
start = time.time()

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_df)[0], 
                                  feature_prep(zero_wgt_HL_df)[1], lookup_df_HL, "HL")))
trainingSummary = lrModel.summary
print("Train Results:")
train_results(trainingSummary)

print("\nTest Results:")
predictions = lrModel.transform(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_test_df)[0], 
                                  feature_prep(zero_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
test_results(predictions)

f1_scores['lr_zero_wgt_HL'] = format(evaluator.evaluate(predictions), '.6f')

print(f'... completed job in {time.time() - start} seconds.')

Train Results:
False Positive Rate: 0.6180518768258426
True Positive Rate: 0.7566615549723834
Accuracy: 0.7566615549723834
Precision: 0.7228676693477389
Recall: 0.7566615549723834
F1 Score: 0.7067075343618696

Test Results:
Accuracy: 0.397585206653311
Precision: 0.5939620904393667
Recall: 0.397585206653311
F1 Score: 0.41578348580869606
... completed job in 663.4937987327576 seconds.


In [69]:
# parameter tuned
start = time.time()

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr_cross_validate(lr).fit(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_df)[0], 
                                  feature_prep(zero_wgt_HL_df)[1], lookup_df_HL, "HL")))

print("Test Results:")
predictions = lrModel.transform(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_test_df)[0], 
                                  feature_prep(zero_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
test_results(predictions)

f1_scores['lr_zero_wgt_HL_tuned'] = format(evaluator.evaluate(predictions), '.6f')

print(f'... completed job in {time.time() - start} seconds.')

Test Results:
Accuracy: 0.397585206653311
Precision: 0.5939620904393667
Recall: 0.397585206653311
F1 Score: 0.41578348580869606
... completed job in 1869.2647018432617 seconds.


<br>

## Decision Tree

### Recategorized with Mean HL 

In [70]:
# start = time.time()

# dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
# dtModel = dt.fit(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_df)[0], 
#                                   feature_prep(mean_wgt_HL_df)[1], lookup_df_HL, "HL")))

# print("Test Results:")
# predictions = dtModel.transform(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_test_df)[0], 
#                                   feature_prep(mean_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
# test_results(predictions)

# f1_scores['dt_mean_wgt_HL'] = format(evaluator.evaluate(predictions), '.6f')

# print(f'... completed job in {time.time() - start} seconds.')

In [71]:
# # parameter tuned
# start = time.time()

# dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
# dtModel = dt_cross_validate(dt).fit(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_df)[0], 
#                                   feature_prep(mean_wgt_HL_df)[1], lookup_df_HL, "HL")))

# print("Test Results:")
# predictions = dtModel.transform(pipeline_prep(column_vector_maker(feature_prep(mean_wgt_HL_test_df)[0], 
#                                   feature_prep(mean_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
# test_results(predictions)

# f1_scores['dt_mean_wgt_HL_tuned'] = format(evaluator.evaluate(predictions), '.6f')

# print(f'... completed job in {time.time() - start} seconds.')

<br>

<br>

### Recategorized with Zero HL & HLM

In [72]:
start = time.time()

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_df)[0], 
                                  feature_prep(zero_wgt_HL_df)[1], lookup_df_HL, "HL")))

print("Test Results:")
predictions = dtModel.transform(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_test_df)[0], 
                                  feature_prep(zero_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
test_results(predictions)

f1_scores['dt_zero_wgt_HL'] = format(evaluator.evaluate(predictions), '.6f')

print(f'... completed job in {time.time() - start} seconds.')

Test Results:
Accuracy: 0.7533489749157746
Precision: 0.7173419093907185
Recall: 0.7533489749157747
F1 Score: 0.7095760914695812
... completed job in 885.2794954776764 seconds.


In [73]:
# parameter tuned
start = time.time()

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt_cross_validate(dt).fit(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_df)[0], 
                                  feature_prep(zero_wgt_HL_df)[1], lookup_df_HL, "HL")))

print("Test Results:")
predictions = dtModel.transform(pipeline_prep(column_vector_maker(feature_prep(zero_wgt_HL_test_df)[0], 
                                  feature_prep(zero_wgt_HL_test_df)[1], lookup_df_HL, "HL")))
test_results(predictions)

f1_scores['dt_zero_wgt_HL_tuned'] = format(evaluator.evaluate(predictions), '.6f')

print(f'... completed job in {time.time() - start} seconds.')

Test Results:
Accuracy: 0.7639955463696795
Precision: 0.735735533899509
Recall: 0.7639955463696795
F1 Score: 0.723051933997398
... completed job in 5283.550374507904 seconds.


<br>

<br>

In [74]:
print(f'... master completed job in {time.time() - master_start} seconds.')

... master completed job in 11279.086911439896 seconds.


<br>