In [1]:
!pip install pyspark



In [2]:
# Mount Google Drive so the dataset stored under /content/drive is readable.
# This notebook is also meant to run on Dataproc, where the `google.colab`
# package does not exist, so a missing module is tolerated rather than
# aborting the whole run.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: there is no Drive to mount

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [4]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('sparkify-train')
    .getOrCreate()
)

In [5]:
# Load the feature table and derive the numeric `label` column from `is_churn`.

isOnColab = True # CHANGE THIS VARIABLE IF RUNNING ON DATAPROC

# Dataset location for the current runtime: Google Drive on Colab, GCS otherwise.
path = '/content/drive/MyDrive/datasets/dsnd-sparkify/ml_df.parquet' if isOnColab else 'gs://udacity-dsnd/ml_df.parquet'

# Read from `path` so that flipping `isOnColab` really switches the source;
# the read used to ignore it and always hit the Drive location.
df = spark.read.parquet(path)

# label is 1 where `is_churn` is true and 0 otherwise.
df = df.withColumn('label', F.when(F.col("is_churn"), 1).otherwise(0))

# Preview a few rows to sanity-check the load.
df.show(5)

+-------+-------------------+--------+----------+-------------+------------------+----------+-----------+---------+------------------+----------+-----+------+-----+-----+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+-----+--------+--------+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+
| userId|              up_ts|is_churn|song_count|subs_duration|         song_rate|n_playlist|thumbs_down|thumbs_up|      avg_sess_len|sess_count| ipad|iphone|linux|macos|windows|   AK|   A

In [6]:
# Assemble two candidate feature vectors so the grid search can compare them.

# Feature columns without the state one-hot encoding.
featureCols_1 = ["song_count", "subs_duration", "song_rate", "n_playlist", "thumbs_down",
               "thumbs_up", "avg_sess_len", "sess_count", "ipad", "iphone", "linux",
               "macos", "windows"]

# Feature columns including the state one-hot encoding: everything except the
# identifier/timestamp columns, the raw target and the derived label.
# Excluding by name (instead of slicing by position) keeps `label`/`is_churn`
# out of the features even if the column order of `df` ever changes.
nonFeatureCols = {"userId", "up_ts", "is_churn", "label"}
featureCols_2 = [c for c in df.columns if c not in nonFeatureCols]

assembler_1 = VectorAssembler(inputCols=featureCols_1, outputCol="features_1")
assembler_2 = VectorAssembler(inputCols=featureCols_2, outputCol="features_2")

# Append both vector columns, then keep only what training needs.
# NOTE: this overwrites `df`, so the cell is not re-runnable without
# re-running the load cell first.
df = assembler_1.transform(df)
df = assembler_2.transform(df)
df = df.select(["features_1", "features_2", "label"])

In [14]:
# Hold out roughly 10% of the rows as the final test set. The fixed seed makes
# the split repeatable, so the reported metric can be reproduced on re-run.
train_df, test_df = df.randomSplit([0.9, 0.1], seed=42)

In [13]:
# Gradient-boosted trees classifier; seeded so repeated runs are comparable.
gbt = GBTClassifier(seed=42)

# Search grid: tree depth x number of boosting iterations x which feature
# vector to use (with or without the state one-hot columns) = 18 candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [4,8,12]) \
    .addGrid(gbt.maxIter, [5,10,15]) \
    .addGrid(gbt.featuresCol, ["features_1", "features_2"]) \
    .build()

# Single train/validation split (75% / 25% of the training data) for model
# selection. The metric is spelled out (it is the evaluator's default) and the
# split is seeded so the chosen model does not vary between runs.
# NOTE(review): selection uses areaUnderROC while the test-set evaluation cell
# reports areaUnderPR; confirm this mismatch is intended.
tvs = TrainValidationSplit(estimator=gbt,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(metricName='areaUnderROC'),
                           trainRatio=0.75,
                           seed=42)

In [None]:
# Run the train/validation search over the parameter grid on the training
# split; `model1` is the resulting fitted model.
model1 = tvs.fit(dataset=train_df)

In [9]:
# Score the held-out test split with the fitted model.
preds_df = model1.transform(dataset=test_df)

In [16]:
# Peek at the first 20 scored rows: true label, class probabilities, prediction.
(
    preds_df
    .select("label", "probability", "prediction")
    .take(20)
)

[Row(label=0, probability=DenseVector([0.9602, 0.0398]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9697, 0.0303]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.7274, 0.2726]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.969, 0.031]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9688, 0.0312]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.969, 0.031]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.7938, 0.2062]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.1537, 0.8463]), prediction=1.0),
 Row(label=0, probability=DenseVector([0.9445, 0.0555]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9684, 0.0316]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.969, 0.031]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.5419, 0.4581]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.1585, 0.8415]), prediction=1.0),
 Row(label=0, probability=DenseVector([0.975

In [11]:
# Evaluate the test-set predictions by area under the precision-recall curve.
binary_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderPR',
)
res = binary_evaluator.evaluate(dataset=preds_df)
res

0.6102168316953208