In [87]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, 
    BooleanType, LongType, ArrayType
)

In [88]:
class SpotifyPlaylistDataLoader:
    def __init__(self, dataset_path):
        # Initialize Spark Session
        self.spark = SparkSession.builder \
            .appName("SpotifyPlaylistDataLoader") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "4g") \
            .getOrCreate()
        
        self.dataset_path = dataset_path

    def define_schema(self):
        """
        Define the schema based on the provided JSON structure
        """
        track_schema = StructType([
            StructField("pos", IntegerType(), True),
            StructField("artist_name", StringType(), True),
            StructField("track_uri", StringType(), True),
            StructField("artist_uri", StringType(), True),
            StructField("track_name", StringType(), True),
            StructField("album_uri", StringType(), True),
            StructField("duration_ms", LongType(), True),
            StructField("album_name", StringType(), True)
        ])

        playlist_schema = StructType([
            StructField("name", StringType(), True),
            StructField("collaborative", StringType(), True),
            StructField("pid", IntegerType(), True),
            StructField("modified_at", LongType(), True),
            StructField("num_tracks", IntegerType(), True),
            StructField("num_albums", IntegerType(), True),
            StructField("num_followers", IntegerType(), True),
            StructField("tracks", ArrayType(track_schema), True)
        ])

        root_schema = StructType([
            StructField("info", StructType([
                StructField("generated_on", StringType(), True),
                StructField("slice", StringType(), True),
                StructField("version", StringType(), True)
            ]), True),
            StructField("playlists", ArrayType(playlist_schema), True)
        ])

        return root_schema

    def load_spotify_dataset(self, sample_percentage=0.05, max_playlists=50000):
        """
        Load Spotify Playlist dataset using Spark
        """
        # Read the dataset with the defined schema
        df = self.spark.read \
            .schema(self.define_schema()) \
            .json(self.dataset_path)
        
        # Explode the playlists
        exploded_df = df.select(F.explode("playlists").alias("playlist"))
        
        # Extract playlist and track details
        processed_df = exploded_df.select(
            F.col("playlist.pid").alias("playlist_id"),
            F.col("playlist.name").alias("playlist_name"),
            F.col("playlist.collaborative").alias("is_collaborative"),
            F.col("playlist.num_tracks").alias("total_tracks"),
            F.col("playlist.num_followers").alias("followers"),
            F.explode("playlist.tracks").alias("track")
        )
        
        # Extract track details
        final_df = processed_df.select(
            "playlist_id",
            "playlist_name",
            "is_collaborative",
            "total_tracks",
            "followers",
            F.col("track.pos").alias("track_position"),
            F.col("track.artist_name").alias("artist_name"),
            F.col("track.track_uri").alias("track_uri"),
            F.col("track.track_name").alias("track_name"),
            F.col("track.album_name").alias("album_name"),
            F.col("track.duration_ms").alias("duration_ms")
        )
        
        # Sample and limit if needed
        sampled_df = final_df.sample(fraction=sample_percentage, seed=42)
        sampled_df = sampled_df.limit(max_playlists)
        
        return sampled_df

    def compute_dataset_stats(self, df):
        """
        Compute basic statistics about the dataset
        """
        stats = {
            "total_playlists": df.select("playlist_id").distinct().count(),
            "total_tracks": df.select("track_uri").distinct().count(),
            "total_artists": df.select("artist_name").distinct().count(),
            "avg_tracks_per_playlist": df.groupBy("playlist_id").count().select(F.mean("count")).first()[0],
            "most_common_artists": df.groupBy("artist_name").count().orderBy(F.col("count").desc()).limit(10).collect()
        }
        
        return stats

    def save_processed_data(self, df, output_path):
        """
        Save processed data to a specified location
        """
        df.write.mode("overwrite").parquet(output_path)

    def __del__(self):
        # Close Spark session when object is deleted
        if hasattr(self, 'spark'):
            self.spark.stop()


In [89]:
# Example usage
def main():
    # Path to your Spotify Playlist dataset
    dataset_path = "mpd.slice.0-999.json"
    
    # Initialize loader
    loader = SpotifyPlaylistDataLoader(dataset_path)
    
    try:
        # Load and process data
        processed_data = loader.load_spotify_dataset(
            sample_percentage=0.1,  # 10% sample
            max_playlists=50000
        )
        
        # Compute and print statistics
        stats = loader.compute_dataset_stats(processed_data)
        print("Dataset Statistics:")
        for stat, value in stats.items():
            print(f"{stat}: {value}")
        
        # Optional: Save processed data
        loader.save_processed_data(processed_data, "output/processed_playlists")
    
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()


In [90]:
if __name__ == "__main__":
    main()

Dataset Statistics:
total_playlists: 0
total_tracks: 0
total_artists: 0
avg_tracks_per_playlist: None
most_common_artists: []


In [103]:
import pandas as pd

pd.read_parquet('output/processed_playlists/part-00000-ec25451e-c88c-4141-9767-eb7979f1426e-c000.snappy.parquet', engine='fastparquet')

Unnamed: 0,playlist_name,playlist_id,num_tracks,num_albums,num_artists,playlist_duration,num_followers,modified_at,track_position,artist_name,track_uri,artist_uri,track_name,album_uri,track_duration,album_name
0,Throwbacks,0,52,47,37,11532414,1,1493424000,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook
1,Throwbacks,0,52,47,37,11532414,1,1493424000,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone
2,Throwbacks,0,52,47,37,11532414,1,1493424000,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit)
3,Throwbacks,0,52,47,37,11532414,1,1493424000,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified
4,Throwbacks,0,52,47,37,11532414,1,1493424000,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,korean,2,64,51,31,14039958,1,1505692800,4,BTS,spotify:track:0WNGsQ1oAuHzNTk8jivBKW,spotify:artist:3Nrfpe0tUJi4K4DXYWgMUX,Spring Day,spotify:album:7LF4N7lvyDhrPBuCJ1rplJ,274097,You Never Walk Alone
96,korean,2,64,51,31,14039958,1,1505692800,5,Lovelyz,spotify:track:24psBRmEw3kHjBGZfl1dmb,spotify:artist:3g34PW5oNmDBxMVUTzx2XK,Ah-Choo,spotify:album:5ZJuawNI3RvxURIBtsDHs0,218474,Lovelyz8
97,korean,2,64,51,31,14039958,1,1505692800,6,LEE HI,spotify:track:06L1apH8kLF47dbhZ4Zg9A,spotify:artist:7cVZApDoQZpS447nHTsNqu,BREATHE,spotify:album:1xnXVzinhfO4I9CzTocPfh,288992,SEOULITE
98,korean,2,64,51,31,14039958,1,1505692800,7,LEE HI,spotify:track:2qWgqPdW1OiAP8KSBH1b93,spotify:artist:7cVZApDoQZpS447nHTsNqu,FXXK WIT US,spotify:album:1xnXVzinhfO4I9CzTocPfh,217861,SEOULITE


In [93]:
spark = SparkSession.builder \
            .appName("SpotifyPlaylistDataLoader") \
            .config("spark.driver.memory", "16g") \
            .config("spark.executor.memory", "16g") \
            .getOrCreate()
        

In [97]:
data = spark.read \
.option("badRecordsPath", "mpd.slice.0-999.json") \
.json("mpd.slice.0-999.json")
data.printSchema()

root
 |-- _corrupt_record: string (nullable = true)



In [96]:
# data.show(5, truncate=False)

In [78]:
dataset_path = "mpd.slice.0-999.json"

# Define schema for the dataset (adjust based on actual dataset structure)
track_schema = StructType([
            StructField("pos", IntegerType(), True),
            StructField("artist_name", StringType(), True),
            StructField("track_uri", StringType(), True),
            StructField("artist_uri", StringType(), True),
            StructField("track_name", StringType(), True),
            StructField("album_uri", StringType(), True),
            StructField("duration_ms", LongType(), True),
            StructField("album_name", StringType(), True)
        ])
playlist_schema = StructType([
            StructField("name", StringType(), True),
            StructField("collaborative", StringType(), True),
            StructField("pid", IntegerType(), True),
            StructField("modified_at", LongType(), True),
            StructField("num_tracks", IntegerType(), True),
            StructField("num_albums", IntegerType(), True),
            StructField("num_followers", IntegerType(), True),
            StructField("tracks", ArrayType(track_schema), True)
        ])

root_schema = StructType([
            StructField("info", StructType([
                StructField("generated_on", StringType(), True),
                StructField("slice", StringType(), True),
                StructField("version", StringType(), True)
            ]), True),
            StructField("playlists", ArrayType(playlist_schema), True)
        ])


        # Read the dataset
df = spark.read.schema(root_schema) \
            .json(dataset_path)  # Assuming JSON format
        
# Explode the playlists
exploded_df = df.select(F.explode("playlists").alias("playlist"))
        
        # Extract playlist and track details
processed_df = exploded_df.select(
            F.col("playlist.pid").alias("playlist_id"),
            F.col("playlist.name").alias("playlist_name"),
            F.col("playlist.collaborative").alias("is_collaborative"),
            F.col("playlist.num_tracks").alias("total_tracks"),
            F.col("playlist.num_followers").alias("followers"),
            F.explode("playlist.tracks").alias("track")
        )
        
        # Extract track details
final_df = processed_df.select(
            "playlist_id",
            "playlist_name",
            "is_collaborative",
            "total_tracks",
            "followers",
            F.col("track.pos").alias("track_position"),
            F.col("track.artist_name").alias("artist_name"),
            F.col("track.track_uri").alias("track_uri"),
            F.col("track.track_name").alias("track_name"),
            F.col("track.album_name").alias("album_name"),
            F.col("track.duration_ms").alias("duration_ms")
        )
        
        # Sample and limit if needed
sampled_df = final_df.sample(fraction=0.05, seed=42)
sampled_df = sampled_df.limit(50000)

In [80]:
sampled_df 

DataFrame[playlist_id: int, playlist_name: string, is_collaborative: string, total_tracks: int, followers: int, track_position: int, artist_name: string, track_uri: string, track_name: string, album_name: string, duration_ms: bigint]

In [81]:
sampled_df.printSchema()

root
 |-- playlist_id: integer (nullable = true)
 |-- playlist_name: string (nullable = true)
 |-- is_collaborative: string (nullable = true)
 |-- total_tracks: integer (nullable = true)
 |-- followers: integer (nullable = true)
 |-- track_position: integer (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- track_uri: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- duration_ms: long (nullable = true)



In [82]:
sampled_df.show()

+-----------+-------------+----------------+------------+---------+--------------+-----------+---------+----------+----------+-----------+
|playlist_id|playlist_name|is_collaborative|total_tracks|followers|track_position|artist_name|track_uri|track_name|album_name|duration_ms|
+-----------+-------------+----------------+------------+---------+--------------+-----------+---------+----------+----------+-----------+
+-----------+-------------+----------------+------------+---------+--------------+-----------+---------+----------+----------+-----------+



In [84]:
stats = {
            "total_playlists": df.select("playlist_id").distinct().count(),
            "total_tracks": df.select("track_uri").distinct().count(),
            "total_artists": df.select("artist_name").distinct().count(),
            "avg_tracks_per_playlist": df.groupBy("playlist_id").count().select(F.mean("count")).first()[0],
            "most_common_artists": df.groupBy("artist_name").count().orderBy(F.col("count").desc()).limit(10).collect()
        }

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `track_uri` cannot be resolved. Did you mean one of the following? [`info`, `playlists`].;
'Project ['track_uri]
+- Relation [info#2771,playlists#2772] json


In [102]:
spark.stop()

In [86]:
df

DataFrame[info: struct<generated_on:string,slice:string,version:string>, playlists: array<struct<name:string,collaborative:string,pid:int,modified_at:bigint,num_tracks:int,num_albums:int,num_followers:int,tracks:array<struct<pos:int,artist_name:string,track_uri:string,artist_uri:string,track_name:string,album_uri:string,duration_ms:bigint,album_name:string>>>>]