In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 74kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 20.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=011f4f2f6f5f72557d10cf89b78fb9f198fec0dcd25e60a66cb3575cf56881d4
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [2]:
# Mount Google Drive at /content/drive (the dataset and the saved model live
# under that path). The `google.colab` package is only importable on Colab;
# the data-loading cell also supports Dataproc, so when the import fails the
# mount is skipped instead of aborting a "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab not available - skipping Google Drive mount')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [4]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under this application name.
APP_NAME = 'sparkify-train'
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)

In [5]:
# Load the prepared feature table and derive an integer `label` column
# from `is_churn`.

isOnColab = True  # CHANGE THIS VARIABLE IF RUNNING ON DATAPROC

# Pick the dataset location for the current environment.
if isOnColab:
    path = '/content/drive/MyDrive/datasets/dsnd-sparkify/ml_df.parquet'
else:
    path = 'gs://udacity-dsnd/ml_df.parquet'

# label = 1 where `is_churn` evaluates true, 0 otherwise.
# NOTE(review): `is_churn` is presumably a boolean column - confirm against the schema.
churn_flag = F.col("is_churn")
df = spark.read.parquet(path).withColumn('label', F.when(churn_flag, 1).otherwise(0))

# Preview the first rows.
df.show(5)

+-------+-------------------+--------+----------+-------------+------------------+----------+-----------+---------+------------------+----------+-----+------+-----+-----+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+-----+--------+--------+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+
| userId|              up_ts|is_churn|song_count|subs_duration|         song_rate|n_playlist|thumbs_down|thumbs_up|      avg_sess_len|sess_count| ipad|iphone|linux|macos|windows|   AK|   A

In [6]:
# Feature columns WITHOUT the state one-hot encoding.
featureCols_1 = [
    "song_count", "subs_duration", "song_rate", "n_playlist",
    "thumbs_down", "thumbs_up", "avg_sess_len", "sess_count",
    "ipad", "iphone", "linux", "macos", "windows",
]

# Feature columns WITH the state one-hot encoding: everything after the first
# three columns (userId, up_ts, is_churn) and before the trailing `label`.
# NOTE: this slice is positional, so it depends on the column order of `df`.
featureCols_2 = df.columns[3:-1]

# One assembler per feature set, each writing its own vector column.
assembler_1 = VectorAssembler(outputCol="features_1", inputCols=featureCols_1)
assembler_2 = VectorAssembler(outputCol="features_2", inputCols=featureCols_2)

# Add both vector columns, then keep only the columns used for training.
df = (
    assembler_2.transform(assembler_1.transform(df))
    .select("features_1", "features_2", "label")
)

In [7]:
# Hold out roughly 10% of the rows as the final test set; the other ~90% are
# handed to TrainValidationSplit. A fixed seed (the same value passed to
# TrainValidationSplit) keeps the split, and therefore the reported test
# metric, repeatable across notebook runs.
train_df, test_df = df.randomSplit([0.9, 0.1], seed=42)

In [8]:
# Base estimator; the tuned parameters are supplied through the grid below.
gbt = GBTClassifier()

# Search grid: 3 tree depths x 3 iteration counts on a single feature column
# (9 parameter combinations in total).
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [4, 5, 6])
    .addGrid(gbt.maxIter, [10, 15, 20])
    # Use ["features_1", "features_2"] here to also try the feature set that
    # includes the state one-hot columns.
    .addGrid(gbt.featuresCol, ["features_1"])
    .build()
)

# Evaluator used to rank the candidate models (constructed with its defaults).
validation_evaluator = BinaryClassificationEvaluator()

# Single train/validation split search over the grid.
tvs = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=validation_evaluator,
    trainRatio=0.75,
    seed=42,
)

In [9]:
# Run the parameter search on the training portion. The fitted result exposes
# the selected model through `.bestModel`.
model1 = tvs.fit(dataset=train_df)

In [12]:
# Print the selected model's summary line.
best_model = model1.bestModel
print(best_model)

GBTClassificationModel: uid = GBTClassifier_42f86e405200, numTrees=20, numClasses=2, numFeatures=13


In [13]:
# Apply the fitted model to the held-out test rows to obtain predictions.
preds_df = model1.transform(dataset=test_df)

In [11]:
# Persist the fitted model. `write().overwrite()` is used instead of a plain
# `save()` because `save()` raises when the target path already exists, which
# made this cell fail on every re-run of the notebook.
# NOTE: an existing model at this path is replaced.
model1.write().overwrite().save('/content/drive/MyDrive/datasets/dsnd-sparkify/sparkify_model_4')

In [14]:
# Display the selected model's feature-importance vector.
# NOTE(review): indices presumably follow the order of the assembled
# feature columns - confirm before labelling them.
best_gbt = model1.bestModel
best_gbt.featureImportances

SparseVector(13, {0: 0.0271, 1: 0.4081, 2: 0.1499, 3: 0.0297, 4: 0.0543, 5: 0.0678, 6: 0.0657, 7: 0.1757, 8: 0.0042, 9: 0.0027, 10: 0.0072, 11: 0.0075, 12: 0.0})

In [15]:
# Inspect the first 20 test rows: true label next to the model's class
# probabilities and predicted class.
result_cols = ["label", "probability", "prediction"]
preds_df.select(*result_cols).take(20)

[Row(label=0, probability=DenseVector([0.9375, 0.0625]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9468, 0.0532]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9612, 0.0388]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.235, 0.765]), prediction=1.0),
 Row(label=0, probability=DenseVector([0.8399, 0.1601]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9538, 0.0462]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9465, 0.0535]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.9473, 0.0527]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.5693, 0.4307]), prediction=0.0),
 Row(label=0, probability=DenseVector([0.8739, 0.1261]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.3657, 0.6343]), prediction=1.0),
 Row(label=0, probability=DenseVector([0.9445, 0.0555]), prediction=0.0),
 Row(label=1, probability=DenseVector([0.5407, 0.4593]), prediction=0.0),
 Row(label=0, probability=DenseVector([0

In [16]:
# Score the test-set predictions. The evaluator reads the `rawPrediction`
# and `label` columns; the metric is left at the evaluator's default.
binary_evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
)
res = binary_evaluator.evaluate(preds_df)
res

0.8445504886773773