In [1]:
!pip install pyspark



In [2]:
# Mount Google Drive when running on Colab. The `google.colab` package is
# not importable elsewhere (e.g. on Dataproc), so the import is guarded and
# the mount is skipped there instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [4]:
# Obtain the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('sparkify-train')
    .getOrCreate()
)

In [5]:
# Read the prepared parquet table and add an integer `label` column
# (1 when `is_churn` is true, 0 otherwise).

isOnColab = True # CHANGE THIS VARIABLE IF RUNNING ON DATAPROC

if isOnColab:
    path = '/content/drive/MyDrive/datasets/dsnd-sparkify/ml_df.parquet'
else:
    path = 'gs://udacity-dsnd/ml_df.parquet'

df = (
    spark.read.parquet(path)
    .withColumn('label', F.when(F.col("is_churn"), 1).otherwise(0))
)
df.show(5)

+-------+-------------------+--------+----------+-------------+------------------+----------+-----------+---------+------------------+----------+-----+------+-----+-----+-------+-------+------------+-------------+----------+------------+-----+
| userId|              up_ts|is_churn|song_count|subs_duration|         song_rate|n_playlist|thumbs_down|thumbs_up|      avg_sess_len|sess_count| ipad|iphone|linux|macos|windows|n_error|n_friend_add|n_cancel_page|n_unq_song|n_unq_artist|label|
+-------+-------------------+--------+----------+-------------+------------------+----------+-----------+---------+------------------+----------+-----+------+-----+-----+-------+-------+------------+-------------+----------+------------+-----+
|1071843|2018-11-08 13:16:59|   false|      1190|           22| 54.09090909090909|        39|          6|       51|27020.272727272728|        11|false| false|false| true|  false|      1|          32|            0|      1104|         902|    0|
|1120784|2018-10-11 14:1

In [6]:
# Feature columns without state one-hot encoding.
# Features are chosen by excluding the identifier, timestamp and target
# columns by name rather than by position (`df.columns[3:-1]`), so the
# feature set does not silently change if the parquet columns are reordered.
# Column order is preserved, so the vector layout matches the original slice.
# NOTE(review): `n_cancel_page` is among the features; if `is_churn` is
# derived from cancellation-page events this leaks the label -- confirm
# against the feature-engineering step.
NON_FEATURE_COLS = {"userId", "up_ts", "is_churn", "label"}
featureCols_1 = [c for c in df.columns if c not in NON_FEATURE_COLS]

# Pack the feature columns into a single vector column named "features".
assembler_1 = VectorAssembler(inputCols=featureCols_1, outputCol="features")

# Keep only what the classifier needs: the feature vector and the label.
df = assembler_1.transform(df)
df = df.select(["features", "label"])

In [7]:
# 90/10 train/test split. The seed is fixed (same value as the
# TrainValidationSplit seed) so the split, and therefore every metric
# reported below, is reproducible across kernel restarts.
train_df, test_df = df.randomSplit([0.9, 0.1], seed=42)

In [8]:
# Class balance of the training split.
train_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4473|
|    0| 9111|
+-----+-----+



In [9]:
# Class balance of the held-out test split.
test_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  477|
|    0| 1074|
+-----+-----+



In [10]:
# Gradient-boosted tree classifier with default settings; the two
# hyper-parameters below are tuned by the search.
gbt = GBTClassifier()

# 3 x 3 grid: tree depth x number of boosting iterations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [4, 5, 6])
    .addGrid(gbt.maxIter, [15, 20, 25])
    .build()
)

# Single train/validation split (75% train) scored with a
# BinaryClassificationEvaluator left at its defaults.
tvs = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    trainRatio=0.75,
    seed=42,
)

In [11]:
# Run the hyper-parameter search on the training split.
model = tvs.fit(dataset=train_df)

In [13]:
# Show the model selected by the train/validation search.
best_gbt_model = model.bestModel
print(best_gbt_model)

GBTClassificationModel: uid = GBTClassifier_83dce7e865b8, numTrees=25, numClasses=2, numFeatures=18


In [17]:
# Score the held-out test split with the fitted model.
preds_df = model.transform(dataset=test_df)

In [18]:
# Per-feature importances of the selected model; vector indices follow the
# order of `featureCols_1`.
selected_model = model.bestModel
selected_model.featureImportances

SparseVector(18, {0: 0.0034, 1: 0.3828, 2: 0.1185, 3: 0.0185, 4: 0.0392, 5: 0.0429, 6: 0.0842, 7: 0.1417, 8: 0.0018, 9: 0.0007, 10: 0.006, 11: 0.0021, 13: 0.012, 14: 0.0222, 15: 0.1174, 16: 0.0063, 17: 0.0004})

In [19]:
# Evaluate the test-set predictions with a BinaryClassificationEvaluator
# reading the `rawPrediction` and `label` columns.
binary_evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
)
res = binary_evaluator.evaluate(preds_df)
res

0.8720842166083023

In [20]:
# Persist the fitted TrainValidationSplit model.
# `write().overwrite()` replaces an existing directory, so re-running the
# notebook does not fail here because the path already exists from a
# previous run (plain `model.save(path)` raises in that case).
# NOTE(review): the target is a Colab Drive path regardless of `isOnColab`;
# confirm the intended destination when running on Dataproc.
model.write().overwrite().save('/content/drive/MyDrive/datasets/dsnd-sparkify/sparkify_model_87')