### Membuat Spark Context untuk aplikasi Spotiplay

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder \
    .appName("SpotifyStreaming") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
    .getOrCreate()

In [2]:
print(spark.sparkContext.getConf().get('spark.jars'))


file:///home/bigdata/.ivy2/jars/org.mongodb.spark_mongo-spark-connector_2.11-2.4.1.jar,file:///home/bigdata/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.11-2.4.5.jar,file:///home/bigdata/.ivy2/jars/org.mongodb_mongo-java-driver-3.10.2.jar,file:///home/bigdata/.ivy2/jars/org.apache.kafka_kafka-clients-2.0.0.jar,file:///home/bigdata/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar,file:///home/bigdata/.ivy2/jars/org.lz4_lz4-java-1.4.0.jar,file:///home/bigdata/.ivy2/jars/org.xerial.snappy_snappy-java-1.1.7.3.jar,file:///home/bigdata/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar


## Latest Played Streaming

### Get From parquet

In [3]:
df = spark.read.parquet("/home/bigdata/spotiplay/data")
df.createOrReplaceTempView("songs")
df = df.sort("timestamp", ascending=False)
df.show()

+--------------------+--------------------+-----------+--------------------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
|            track_id|          track_name|artist_name|         album_image|danceability|energy|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|           timestamp|
+--------------------+--------------------+-----------+--------------------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
|5wqGP4hnECDxaO6bs...|   bloom. [intro...]|       null|https://i.scdn.co...|       0.819|  0.32| -16.552| 0.0|     0.0998|       0.973|           0.804|   0.141|  0.781| 85.017|2023-03-12 06:52:...|
|6TYuszb6kOBCQm2Ar...|          Cloud Nine|       null|https://i.scdn.co...|       0.645| 0.429| -10.104| 1.0|      0.038|       0.919|           0.958|   0.117|  0.623| 87.005|2023-03-12 06:52:...|
|1wJy

In [4]:
mongo_uri = "mongodb://localhost:27017/spotiplay.tracks"
mongo_conf = {"spark.mongodb.input.uri": mongo_uri}
df_tracks = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
             .options(**mongo_conf) \
             .load()
             
df_tracks.createOrReplaceTempView("tracks")
df_tracks.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----------+--------+--------------+--------------------+--------------------+--------------------+--------+--------------------+----------+--------------------+------------+-----+--------------------+
|                 _id|               album|             artists|      audio_features|   available_markets|disc_number|duration_ms|explicit|  external_ids|       external_urls|                href|                  id|is_local|                name|popularity|         preview_url|track_number| type|                 uri|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-----------+--------+--------------+--------------------+--------------------+--------------------+--------+--------------------+----------+--------------------+------------+-----+--------------------+
|[640c321fb08f031d...|[single, single, .

In [27]:
from pyspark.sql.functions import col

df_features = df_tracks.select(col("name"), 
                               col("artists").getItem("name").alias("artist"),
                               col("audio_features").getItem("danceability").alias("danceability"), 
                               col("audio_features").getItem("energy").alias("energy"), 
                               col("audio_features").getItem("loudness").alias("loudness"), 
                               col("audio_features").getItem("speechiness").alias("speechiness"), 
                               col("audio_features").getItem("acousticness").alias("acousticness"), 
                               col("audio_features").getItem("instrumentalness").alias("instrumentalness"), 
                               col("audio_features").getItem("liveness").alias("liveness"), 
                               col("audio_features").getItem("valence").alias("valence"), 
                               col("audio_features").getItem("tempo").alias("tempo"), )

df_features.show()

+--------------------+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+
|                name|              artist|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+
|                 urs|              [NIKI]|        0.72| 0.415|  -9.527|      0.354|       0.834|         1.01E-6|  0.0893|  0.235|  94.88|
|High School in Ja...|              [NIKI]|       0.873| 0.488|  -8.285|     0.0413|       0.531|         1.32E-4|   0.153|  0.462|119.983|
|   Sick Little Games| [First and Forever]|       0.593| 0.901|  -3.695|     0.0792|     0.00492|         1.58E-5|  0.0805|  0.532| 96.564|
|          Drive Safe|        [Rich Brian]|       0.635| 0.312|  -8.134|     0.0265|       0.635|             0.0|   0.138|  0.414| 98.017|
|La La Lost You - ..

In [40]:
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=[
 'danceability',
 'energy',
 'loudness',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo'], outputCol='features')
assembled_data=assembler.setHandleInvalid("skip").transform(df_features)

In [41]:
from pyspark.ml.feature import StandardScaler
scale=StandardScaler(inputCol='features',outputCol='standardized')
data_scale=scale.fit(assembled_data)
df_scaled=data_scale.transform(assembled_data)

In [42]:
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')


KMeans_algo=KMeans(featuresCol='standardized', k=3)
    
KMeans_fit=KMeans_algo.fit(df_scaled)
    
output_df =KMeans_fit.transform(df_scaled)

pipeline = Pipeline(stages=[assembler, scale, KMeans_algo])

In [43]:
output_df.show()

+--------------------+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|                name|              artist|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|            features|        standardized|prediction|
+--------------------+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|                 urs|              [NIKI]|        0.72| 0.415|  -9.527|      0.354|       0.834|         1.01E-6|  0.0893|  0.235|  94.88|[0.72,0.415,-9.52...|[4.81108869035855...|         0|
|High School in Ja...|              [NIKI]|       0.873| 0.488|  -8.285|     0.0413|       0.531|         1.32E-4|   0.153|  0.462|119.983|[0.873,0.488,-8.2...|[5.83344503705974...|         0|
|   Sick Little Games| [First and F

In [18]:
df.show()

+--------------------+--------------------+-----------+--------------------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
|            track_id|          track_name|artist_name|         album_image|danceability|energy|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|           timestamp|
+--------------------+--------------------+-----------+--------------------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------------+
|5wqGP4hnECDxaO6bs...|   bloom. [intro...]|       null|https://i.scdn.co...|       0.819|  0.32| -16.552| 0.0|     0.0998|       0.973|           0.804|   0.141|  0.781| 85.017|2023-03-12 06:52:...|
|6TYuszb6kOBCQm2Ar...|          Cloud Nine|       null|https://i.scdn.co...|       0.645| 0.429| -10.104| 1.0|      0.038|       0.919|           0.958|   0.117|  0.623| 87.005|2023-03-12 06:52:...|
|1wJy

In [30]:
#rename df.track_name to df.name
df = df.withColumnRenamed("track_name", "name")
df = df.withColumnRenamed("artist_name", "artist")

cols = ['name', 'artist', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

cols_drop = ['track_id','album_image','timestamp','mode']

df_new = df.drop(*cols_drop)

df_new.show()

+--------------------+------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+
|                name|artist|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+
|   bloom. [intro...]|  null|       0.819|  0.32| -16.552|     0.0998|       0.973|           0.804|   0.141|  0.781| 85.017|
|          Cloud Nine|  null|       0.645| 0.429| -10.104|      0.038|       0.919|           0.958|   0.117|  0.623| 87.005|
|    Faithful Mission|  null|       0.677| 0.443| -13.612|      0.077|       0.682|           0.889|   0.228|  0.726| 73.963|
|           Afternoon|  null|       0.797| 0.429|  -8.913|       0.45|       0.759|         0.00478|   0.297|  0.345|165.072|
|      Lazy Afternoon|  null|       0.681| 0.232| -14.236|     0.0694|       0.971|            0.84|  0.0939| 0.0918| 

In [44]:
df_predict_scaled = pipeline.fit(df_new).transform(df_new)

df_predict = df_predict_scaled.select("name", "artist", "prediction")

df_predict.show()

+--------------------+------+----------+
|                name|artist|prediction|
+--------------------+------+----------+
|   bloom. [intro...]|  null|         1|
|          Cloud Nine|  null|         1|
|    Faithful Mission|  null|         1|
|           Afternoon|  null|         2|
|            nice day|  null|         1|
|      Lazy Afternoon|  null|         1|
|             Tanpopo|  null|         1|
|          In My Head|  null|         1|
|            Sidewalk|  null|         1|
|             2:23 AM|  null|         0|
|                lazy|  null|         1|
|            I'm Fine|  null|         1|
|             3:03 PM|  null|         1|
|          You and Me|  null|         1|
|Samishii obake to...|  null|         1|
|Santa wa chuo-sen...|  null|         0|
|Shumatsu Kyoto ge...|  null|         0|
|              Parade|  null|         0|
|Kamikakushi no sh...|  null|         1|
|             Morning|  null|         0|
+--------------------+------+----------+
only showing top

In [47]:
#show the songs in the same cluster
output_df.filter(output_df.prediction == 2).limit(5).show()

+--------------------+-------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|                name|             artist|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|            features|        standardized|prediction|
+--------------------+-------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|Let This Love Lie...|[First and Forever]|        0.49| 0.857|  -3.783|     0.0601|     6.15E-5|             0.0|   0.113|  0.249|107.527|[0.49,0.857,-3.78...|[3.27421313649401...|         2|
|    In Loving Memory|[First and Forever]|       0.371| 0.937|  -3.542|     0.0512|     4.43E-5|             0.0|   0.538|  0.628| 84.771|[0.371,0.937,-3.5...|[2.47904708905975...|         2|
|               VIVID| [Rich Brian, $NOT