In [1]:
!pip install pyspark



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('sparkify-train').getOrCreate()

In [5]:
# load data and change is_churn column into label column
df = spark.read.parquet('/content/drive/MyDrive/datasets/dsnd-sparkify/ml_df.parquet')
df = df.withColumn('label', F.when(F.col("is_churn"), 1).otherwise(0))
df.show(5)

+-------+-------------------+--------+----------+-------------+------------------+----------+-----------+---------+------------------+----------+-----+------+-----+-----+-------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+-----+--------+--------+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----------+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------+-----+-----+-----+-----+-----+
| userId|              up_ts|is_churn|song_count|subs_duration|         song_rate|n_playlist|thumbs_down|thumbs_up|      avg_sess_len|sess_count| ipad|iphone|linux|macos|windows|   AK|   A

In [6]:
# split dataframe into train and test dataframe
train_df, test_df = df.randomSplit([0.7, 0.3], seed=15)

In [7]:
# features columns without state one-hot encoding
featureCols = ["song_count", "subs_duration", "song_rate", "n_playlist", "thumbs_down",
               "thumbs_up", "avg_sess_len", "sess_count", "ipad", "iphone", "linux",
               "macos", "windows"]

# features column with state one-hot encoding
featureCols = df.columns[3:]

assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)

In [8]:
# train model
model = GBTClassifier(maxDepth=4)
model1 = model.fit(train_df)

In [9]:
# predict on test dataframe
preds_df = model1.transform(test_df)

In [10]:
# show prediction result
preds_df.select(["features", "label", "rawPrediction", "probability", "prediction"]).show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(113,[0,1,2,3,4,5...|    0|[1.54350200272501...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,4,5...|    0|[1.54350200272501...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,4,5...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|(113,[0,1,2,3,4,5...|    0|[1.54350200272501...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,4,5...|    0|[1.54350200272501...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,5,6...|    0|[1.54350200272501...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,4,5...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|(113,[0,1,2,3,4,5...|    1|[-1.5435020027249...|[0.04364652142729...|       1.0|
|(113,[0,1,2,3,4,5...|    0|[1.54350200272500...|[0.95635347857270...|       0.0|
|(113,[0,1,2,3,4

In [11]:
# evaluation
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderPR')
res = binary_evaluator.evaluate(preds_df)
res

1.0