In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, LongType

def load_and_process_data(filepath, n_samples=None):
    """
    Load and preprocess the Million Playlist Dataset using Spark with explicit schema.

    Args:
        filepath (str): Path to the JSON file.
        n_samples (int, optional): Number of playlists to sample. Load all if None.

    Returns:
        Spark DataFrame: A preprocessed DataFrame.
    """
    # Initialize SparkSession
    spark = SparkSession.builder \
        .appName("Million Playlist Dataset Processing") \
        .getOrCreate()

    # Define the schema explicitly
    track_schema = StructType([
        StructField("pos", IntegerType(), True),
        StructField("artist_name", StringType(), True),
        StructField("track_uri", StringType(), True),
        StructField("artist_uri", StringType(), True),
        StructField("track_name", StringType(), True),
        StructField("album_uri", StringType(), True),
        StructField("duration_ms", LongType(), True),
        StructField("album_name", StringType(), True)
    ])

    playlist_schema = StructType([
        StructField("name", StringType(), True),
        StructField("collaborative", StringType(), True),
        StructField("pid", IntegerType(), True),
        StructField("modified_at", LongType(), True),
        StructField("num_tracks", IntegerType(), True),
        StructField("num_albums", IntegerType(), True),
        StructField("num_followers", IntegerType(), True),
        StructField("tracks", ArrayType(track_schema), True),
        StructField("num_edits", IntegerType(), True),
        StructField("duration_ms", LongType(), True),
        StructField("num_artists", IntegerType(), True)
    ])

    schema = StructType([
        StructField("info", StructType([
            StructField("generated_on", StringType(), True),
            StructField("slice", StringType(), True),
            StructField("version", StringType(), True)
        ]), True),
        StructField("playlists", ArrayType(playlist_schema), True)
    ])

    # Load the JSON file with the schema
    df = spark.read.schema(schema).option("multiline", "true").json(filepath)

    # Flatten the playlists structure
    playlists_df = df.select(explode(col("playlists")).alias("playlist"))

    # Extract playlist-level metadata
    playlists_metadata = playlists_df.select(
        col("playlist.name").alias("playlist_name"),
        col("playlist.pid").alias("playlist_id"),
        col("playlist.num_tracks"),
        col("playlist.num_albums"),
        col("playlist.num_artists"),
        col("playlist.duration_ms").alias("playlist_duration"),
        col("playlist.num_followers"),
        col("playlist.modified_at"),
        explode(col("playlist.tracks")).alias("track")
    )

    # Extract track-level details
    tracks_df = playlists_metadata.select(
        col("playlist_name"),
        col("playlist_id"),
        col("num_tracks"),
        col("num_albums"),
        col("num_artists"),
        col("playlist_duration"),
        col("num_followers"),
        col("modified_at"),
        col("track.pos").alias("track_position"),
        col("track.artist_name").alias("artist_name"),
        col("track.track_uri").alias("track_uri"),
        col("track.artist_uri").alias("artist_uri"),
        col("track.track_name").alias("track_name"),
        col("track.album_uri").alias("album_uri"),
        col("track.duration_ms").alias("track_duration"),
        col("track.album_name").alias("album_name")
    )

    # Sample the data if n_samples is specified
    if n_samples:
        tracks_df = tracks_df.limit(n_samples)

    return tracks_df


if __name__ == "__main__":
    input_path = "mpd.slice.0-999.json"  # Replace with your JSON file path
    output_path = "output/processed_playlists"

    # Load and process the data
    tracks_df = load_and_process_data(input_path, n_samples=100)

    # Show a preview of the data
    tracks_df.show(5, truncate=False)


+-------------+-----------+----------+----------+-----------+-----------------+-------------+-----------+--------------+-----------------+------------------------------------+-------------------------------------+------------------------------------------+------------------------------------+--------------+--------------------------------------------+
|playlist_name|playlist_id|num_tracks|num_albums|num_artists|playlist_duration|num_followers|modified_at|track_position|artist_name      |track_uri                           |artist_uri                           |track_name                                |album_uri                           |track_duration|album_name                                  |
+-------------+-----------+----------+----------+-----------+-----------------+-------------+-----------+--------------+-----------------+------------------------------------+-------------------------------------+------------------------------------------+------------------------------------

In [32]:
def save_data(df, output_path):
    """
    Save the preprocessed DataFrame to a Parquet file.

    Args:
        df (DataFrame): The Spark DataFrame to save.
        output_path (str): The path to save the Parquet file.
    """
    df.write.parquet(output_path, mode="overwrite")
    print(f"Data saved to {output_path}")

In [33]:
  # Save the preprocessed data
save_data(tracks_df, output_path)

Data saved to output/processed_playlists


In [18]:
spark = SparkSession.builder \
            .appName("Million Playlist Dataset Processing") \
            .getOrCreate()
        

24/12/05 01:40:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [23]:
simple_schema = StructType([
            StructField("name", StringType(), True),
            StructField("age", StringType(), True),
            StructField("car", IntegerType(), True),
        ])

In [24]:
    # Load the JSON file with the schema
df = spark.read.schema(simple_schema).json("simple.json")
df

DataFrame[name: string, age: string, car: int]

In [25]:
df.show()

+----+---+----+
|name|age| car|
+----+---+----+
|John| 30|NULL|
+----+---+----+



In [28]:
# Read the file without a schema
df1 = spark.read.option("multiline", "true").json("sample.json")

# Show the schema
df1.printSchema()

root
 |-- info: struct (nullable = true)
 |    |-- generated_on: string (nullable = true)
 |    |-- slice: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- playlists: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- collaborative: string (nullable = true)
 |    |    |-- duration_ms: long (nullable = true)
 |    |    |-- modified_at: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- num_albums: long (nullable = true)
 |    |    |-- num_artists: long (nullable = true)
 |    |    |-- num_edits: long (nullable = true)
 |    |    |-- num_followers: long (nullable = true)
 |    |    |-- num_tracks: long (nullable = true)
 |    |    |-- pid: long (nullable = true)
 |    |    |-- tracks: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- album_name: string (nullable = true)
 |    |    |    |    |-- album_uri: string (nullable = true)
 |    |  

In [29]:
# Show the DataFrame
df1.show(truncate=False)

+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
spark.stop()

In [7]:
import os
import json

import pandas as pd

In [47]:
!mkdir df_data

mkdir: df_data: File exists


In [17]:
def create_df_data():
    
    path = '../data/raw/data'
    
    playlist_col = ['collaborative', 'duration_ms', 'modified_at', 
                'name', 'num_albums', 'num_artists', 'num_edits',
                'num_followers', 'num_tracks', 'pid']
    tracks_col = ['album_name', 'album_uri', 'artist_name', 'artist_uri', 
                  'duration_ms', 'track_name', 'track_uri'] 
    playlist_test_col = ['name', 'num_holdouts', 'num_samples', 'num_tracks', 'pid']
    
    filenames = os.listdir(path)
    
    data_playlists = []
    data_tracks = []
    playlists = []

    tracks = set()

    for filename in filenames:
        fullpath = os.sep.join((path, filename))
        f = open(fullpath)
        print(fullpath)
        js = f.read()
        f.close()

        mpd_slice = json.loads(js)

        for playlist in mpd_slice['playlists']:
            data_playlists.append([playlist[col] for col in playlist_col])
            for track in playlist['tracks']:
                playlists.append([playlist['pid'], track['track_uri'], track['pos']])
                if track['track_uri'] not in tracks:
                    data_tracks.append([track[col] for col in tracks_col])
                    tracks.add(track['track_uri'])
                    
    f = open('challenge_set.json')
    js = f.read()
    f.close()
    mpd_slice = json.loads(js)

    data_playlists_test = []
    playlists_test = []

    for playlist in mpd_slice['playlists']:
        data_playlists_test.append([playlist.get(col, '') for col in playlist_test_col])
        for track in playlist['tracks']:
            playlists_test.append([playlist['pid'], track['track_uri'], track['pos']])
            if track['track_uri'] not in tracks:
                data_tracks.append([track[col] for col in tracks_col])
                tracks.add(track['track_uri'])
                
    df_playlists_info = pd.DataFrame(data_playlists, columns=playlist_col)
    df_playlists_info['collaborative'] = df_playlists_info['collaborative'].map({'false': False, 'true': True})

    df_tracks = pd.DataFrame(data_tracks, columns=tracks_col)
    df_tracks['tid'] = df_tracks.index

    track_uri2tid = df_tracks.set_index('track_uri').tid

    df_playlists = pd.DataFrame(playlists, columns=['pid', 'tid', 'pos'])
    df_playlists.tid = df_playlists.tid.map(track_uri2tid)

    df_playlists_test_info = pd.DataFrame(data_playlists_test, columns=playlist_test_col)

    df_playlists_test = pd.DataFrame(playlists_test, columns=['pid', 'tid', 'pos'])
    df_playlists_test.tid = df_playlists_test.tid.map(track_uri2tid)

    df_tracks.to_parquet('../data/processed/df_tracks.parquet')
    df_playlists.to_parquet('../data/processed/df_playlists.parquet')
    df_playlists_info.to_parquet('../data/processed/df_playlists_info.parquet')
    df_playlists_test.to_parquet('../data/processed/df_playlists_test.parquet')
    df_playlists_test_info.to_parquet('../data/processed/df_playlists_test_info.parquet')

In [18]:
create_df_data()

../data/raw/data/mpd.slice.549000-549999.json
../data/raw/data/mpd.slice.613000-613999.json
../data/raw/data/mpd.slice.115000-115999.json
../data/raw/data/mpd.slice.778000-778999.json
../data/raw/data/mpd.slice.290000-290999.json
../data/raw/data/mpd.slice.596000-596999.json
../data/raw/data/mpd.slice.324000-324999.json
../data/raw/data/mpd.slice.422000-422999.json
../data/raw/data/mpd.slice.974000-974999.json
../data/raw/data/mpd.slice.679000-679999.json
../data/raw/data/mpd.slice.7000-7999.json
../data/raw/data/mpd.slice.391000-391999.json
../data/raw/data/mpd.slice.497000-497999.json
../data/raw/data/mpd.slice.225000-225999.json
../data/raw/data/mpd.slice.523000-523999.json
../data/raw/data/mpd.slice.875000-875999.json
../data/raw/data/mpd.slice.448000-448999.json
../data/raw/data/mpd.slice.712000-712999.json
../data/raw/data/mpd.slice.193000-193999.json
../data/raw/data/mpd.slice.38000-38999.json
../data/raw/data/mpd.slice.695000-695999.json
../data/raw/data/mpd.slice.899000-899999

../data/raw/data/mpd.slice.925000-925999.json
../data/raw/data/mpd.slice.375000-375999.json
../data/raw/data/mpd.slice.891000-891999.json
../data/raw/data/mpd.slice.729000-729999.json
../data/raw/data/mpd.slice.572000-572999.json
../data/raw/data/mpd.slice.824000-824999.json
../data/raw/data/mpd.slice.274000-274999.json
../data/raw/data/mpd.slice.990000-990999.json
../data/raw/data/mpd.slice.628000-628999.json
../data/raw/data/mpd.slice.743000-743999.json
../data/raw/data/mpd.slice.419000-419999.json
../data/raw/data/mpd.slice.65000-65999.json
../data/raw/data/mpd.slice.528000-528999.json
../data/raw/data/mpd.slice.174000-174999.json
../data/raw/data/mpd.slice.672000-672999.json
../data/raw/data/mpd.slice.719000-719999.json
../data/raw/data/mpd.slice.915000-915999.json
../data/raw/data/mpd.slice.443000-443999.json
../data/raw/data/mpd.slice.345000-345999.json
../data/raw/data/mpd.slice.618000-618999.json
../data/raw/data/mpd.slice.814000-814999.json
../data/raw/data/mpd.slice.542000-54

../data/raw/data/mpd.slice.438000-438999.json
../data/raw/data/mpd.slice.553000-553999.json
../data/raw/data/mpd.slice.805000-805999.json
../data/raw/data/mpd.slice.255000-255999.json
../data/raw/data/mpd.slice.609000-609999.json
../data/raw/data/mpd.slice.731000-731999.json
../data/raw/data/mpd.slice.889000-889999.json
../data/raw/data/mpd.slice.685000-685999.json
../data/raw/data/mpd.slice.183000-183999.json
../data/raw/data/mpd.slice.206000-206999.json
../data/raw/data/mpd.slice.856000-856999.json
../data/raw/data/mpd.slice.500000-500999.json
../data/raw/data/mpd.slice.307000-307999.json
../data/raw/data/mpd.slice.957000-957999.json
../data/raw/data/mpd.slice.401000-401999.json
../data/raw/data/mpd.slice.630000-630999.json
../data/raw/data/mpd.slice.136000-136999.json
../data/raw/data/mpd.slice.988000-988999.json
../data/raw/data/mpd.slice.21000-21999.json
../data/raw/data/mpd.slice.784000-784999.json
../data/raw/data/mpd.slice.105000-105999.json
../data/raw/data/mpd.slice.603000-60

../data/raw/data/mpd.slice.517000-517999.json
../data/raw/data/mpd.slice.692000-692999.json
../data/raw/data/mpd.slice.62000-62999.json
../data/raw/data/mpd.slice.194000-194999.json
../data/raw/data/mpd.slice.726000-726999.json
../data/raw/data/mpd.slice.919000-919999.json
../data/raw/data/mpd.slice.89000-89999.json
../data/raw/data/mpd.slice.349000-349999.json
../data/raw/data/mpd.slice.715000-715999.json
../data/raw/data/mpd.slice.178000-178999.json
../data/raw/data/mpd.slice.490000-490999.json
../data/raw/data/mpd.slice.46000-46999.json
../data/raw/data/mpd.slice.396000-396999.json
../data/raw/data/mpd.slice.872000-872999.json
../data/raw/data/mpd.slice.524000-524999.json
../data/raw/data/mpd.slice.222000-222999.json
../data/raw/data/mpd.slice.591000-591999.json
../data/raw/data/mpd.slice.297000-297999.json
../data/raw/data/mpd.slice.973000-973999.json
../data/raw/data/mpd.slice.425000-425999.json
../data/raw/data/mpd.slice.323000-323999.json
../data/raw/data/mpd.slice.818000-818999

../data/raw/data/mpd.slice.913000-913999.json
../data/raw/data/mpd.slice.99000-99999.json
../data/raw/data/mpd.slice.343000-343999.json
../data/raw/data/mpd.slice.140000-140999.json
../data/raw/data/mpd.slice.646000-646999.json
../data/raw/data/mpd.slice.895000-895999.json
../data/raw/data/mpd.slice.477000-477999.json
../data/raw/data/mpd.slice.921000-921999.json
../data/raw/data/mpd.slice.371000-371999.json
../data/raw/data/mpd.slice.699000-699999.json
../data/raw/data/mpd.slice.994000-994999.json
../data/raw/data/mpd.slice.576000-576999.json
../data/raw/data/mpd.slice.820000-820999.json
../data/raw/data/mpd.slice.270000-270999.json
../data/raw/data/mpd.slice.798000-798999.json
../data/raw/data/mpd.slice.747000-747999.json
../data/raw/data/mpd.slice.328000-328999.json
../data/raw/data/mpd.slice.978000-978999.json
../data/raw/data/mpd.slice.774000-774999.json
../data/raw/data/mpd.slice.119000-119999.json
../data/raw/data/mpd.slice.243000-243999.json
../data/raw/data/mpd.slice.545000-54

../data/raw/data/mpd.slice.200000-200999.json
../data/raw/data/mpd.slice.506000-506999.json
../data/raw/data/mpd.slice.850000-850999.json
../data/raw/data/mpd.slice.705000-705999.json
../data/raw/data/mpd.slice.359000-359999.json
../data/raw/data/mpd.slice.909000-909999.json
../data/raw/data/mpd.slice.232000-232999.json
../data/raw/data/mpd.slice.534000-534999.json
../data/raw/data/mpd.slice.862000-862999.json
../data/raw/data/mpd.slice.386000-386999.json
../data/raw/data/mpd.slice.45000-45999.json
../data/raw/data/mpd.slice.168000-168999.json
../data/raw/data/mpd.slice.480000-480999.json
../data/raw/data/mpd.slice.333000-333999.json
../data/raw/data/mpd.slice.435000-435999.json
../data/raw/data/mpd.slice.963000-963999.json
../data/raw/data/mpd.slice.287000-287999.json
../data/raw/data/mpd.slice.581000-581999.json
../data/raw/data/mpd.slice.604000-604999.json
../data/raw/data/mpd.slice.102000-102999.json
../data/raw/data/mpd.slice.258000-258999.json
../data/raw/data/mpd.slice.808000-80

In [50]:
dft = pd.read_csv('df_data/df_tracks.csv')
dft

Unnamed: 0.1,Unnamed: 0,album_name,album_uri,artist_name,artist_uri,duration_ms,track_name,track_uri,tid
0,0,ABC,spotify:album:4GuzZh2dtsOjG3sMkx52eR,The Jackson 5,spotify:artist:2iE18Oxc8YSumAU232n4rW,174866,ABC,spotify:track:6cb0HzFQPN4BGADOmSzPCw,0
1,1,Everything Goes Numb,spotify:album:3phH2ZoACvpLVcPtyIk8jp,Streetlight Manifesto,spotify:artist:1OKOTYGoCE2buxTYMegJp7,327920,Point/Counterpoint,spotify:track:0HBvwy7XVhrkQljkCONgsq,1
2,2,Thriller 25 Super Deluxe Edition,spotify:album:1C2h7mLntPSeVYciMRTF4a,Michael Jackson,spotify:artist:3fMbdgg4jU18AjLCKBhRSm,293826,Billie Jean,spotify:track:5ChkMS8OtdzJeqyybCc9R5,2
3,3,Dookie,spotify:album:4uG8q3GPuWHQlRbswMIRS6,Green Day,spotify:artist:7oPftvlwr6VrsViSDV7fJY,181533,Basket Case,spotify:track:6L89mwZXSOwYl76YXfX13s,3
4,4,Elephant,spotify:album:4teFaDSeFHYXZjZJaZGrAO,The White Stripes,spotify:artist:4F84IBURUo98rz4r61KF70,231800,Seven Nation Army,spotify:track:1jNOi6m3Hn8nLEeHCp5Msr,4
...,...,...,...,...,...,...,...,...,...
110711,110711,Peliculas De Cine Vol. 2,spotify:album:63Ayga8d1117BO6ECavJgA,The Royal Movie Band,spotify:artist:1YMADOVqFTlVjZwbCOSp6c,223173,Starman,spotify:track:3p5vcN5Q9JawdzBbh5ceVr,110711
110712,110712,Sports Weekend,spotify:album:0S4qTNMIciayRUU4TXCYGR,2 LIVE CREW,spotify:artist:58Dx4HPzeOO3dbpD9YYEes,257640,Pop That P--sy,spotify:track:2PBTwMH2mzfLigdMyPzOcp,110712
110713,110713,Wild World,spotify:album:0gtU58GVllHFpZzIgLd5lV,Bastille,spotify:artist:7EQ0qTo7fWT7DPxmxtSYEc,183559,Campus,spotify:track:5JugcqxQihVYdvCSPzmP1H,110713
110714,110714,Better Off Alone,spotify:album:4Bbi9dcZgtJiJqEkiclbpK,Alice Deejay,spotify:artist:0aNCjE72yyrhKQB1qfPBpi,412426,Better Off Alone - Vocal Club RMX,spotify:track:1cKRBp7hrBVD4eP3W9x2AI,110714


In [27]:
pd.read_csv('df_data/df_tracks_spark/part-00000-2ebb2f34-fa55-4871-8b78-d8d172f72b0d-c000.csv', on_bad_lines='skip')

Unnamed: 0,track_uri1,album_name,album_uri,artist_name,artist_uri,duration_ms,track_name,track_uri,tid
0,spotify:track:7tIJDktakabGoHjwTTa35W,Take Me,spotify:album:7Cr3sCsROrVfclDb7d9z5z,Miso,spotify:artist:1ukccHTdm6Xjmag0QLNJBx,257647,Take Me,spotify:track:7tIJDktakabGoHjwTTa35W,
1,spotify:track:3wKt77fHCmjWRXkmkYTxrl,Out of the Shadow,spotify:album:6HqU5Tckb8sF6Gj2CRPla2,Rogue Wave,spotify:artist:2JSc53B5cQ31m0xTB7JFpG,260000,Endgame,spotify:track:3wKt77fHCmjWRXkmkYTxrl,
2,spotify:track:4wBzcbZPAkMC6gpKsr84a9,Lover,spotify:album:4e35UuRLPKEMnd2UJedbx1,George Maple,spotify:artist:19m3oZKjGSLzVW0OGIAcNg,220893,Hero,spotify:track:4wBzcbZPAkMC6gpKsr84a9,
3,spotify:track:3jS7bB0oXVOwGFZn3aE5NV,Jagged Little Pill (Remastered),spotify:album:5Ap3F8CxjjsQKZGASDcHNA,Alanis Morissette,spotify:artist:6ogn9necmbUdCppmNnGOdi,249493,You Oughta Know - 2015 Remastered,spotify:track:3jS7bB0oXVOwGFZn3aE5NV,
4,spotify:track:1J2TJKwwiNqrqAywQMBFNu,Origami,spotify:album:6lJWG2Z0Tw7tc9Jwf5rbTu,Vinyl Theatre,spotify:artist:7xSEWLsywYbocdtt3xsQsU,247653,My Fault,spotify:track:1J2TJKwwiNqrqAywQMBFNu,
...,...,...,...,...,...,...,...,...,...
9496,spotify:track:7veYmTIhhDuVIpCW6UeoxJ,Location,spotify:album:5XpjnV9EuzkH60PtpwHn1E,Khalid,spotify:artist:6LuN9FCkKOj5PcnpouEgny,245045,Location,spotify:track:7veYmTIhhDuVIpCW6UeoxJ,
9497,spotify:track:7gszEHRkBhgWxz4ehSe0lr,Lady and the Tramp (1955 Film Score),spotify:album:2bDCjSVK1wjEaPT5YpgtyR,"Peggy Lee (performer), Oliver Wallace (comduct...",spotify:artist:3MHlIt9IAlysdZJz2xvkj6,157126,The Siamese Cat Song / What's Going On Down There,spotify:track:7gszEHRkBhgWxz4ehSe0lr,
9498,spotify:track:7j61qfTooymeOCTrQbmoMU,Cooleyhighharmony,spotify:album:3jknvlUSe6D9Oyn2E3JBLO,Boyz II Men,spotify:artist:6O74knDqdv3XaWtkII7Xjp,203440,This Is My Heart,spotify:track:7j61qfTooymeOCTrQbmoMU,
9499,spotify:track:0mf7oWY87iDpJpEElcUzFJ,Rubbish/Push It Up,spotify:album:0TTnmD67eV4VAL41K74gFW,Clipz,spotify:artist:0XX7H57SCtPTrFytCQ03go,304000,Rubbish,spotify:track:0mf7oWY87iDpJpEElcUzFJ,


In [61]:
dfp = pd.read_csv('df_data/df_playlists.csv')
dfp

Unnamed: 0.1,Unnamed: 0,pid,tid,pos
0,0,2000,0,0
1,1,2000,1,1
2,2,2000,2,2
3,3,2000,3,3
4,4,2000,4,4
...,...,...,...,...
199869,199869,1999,8005,11
199870,199870,1999,76505,12
199871,199871,1999,43579,13
199872,199872,1999,31250,14


In [30]:
pd.read_csv('df_data/df_playlists_test_spark/part-00000-57743597-332c-4982-8345-33f8eb65f346-c000.csv', on_bad_lines='skip')

Unnamed: 0,track_uri,pid,pos,track_uri1,album_name,album_uri,artist_name,artist_uri,duration_ms,track_name,tid
0,spotify:track:005X0FmdtkM1kiutosXLTR,1026437,18,spotify:track:005X0FmdtkM1kiutosXLTR,I Hear A Symphony: Expanded Edition,spotify:album:5K23ycKA0TKPspiOf9xf7X,The Supremes,spotify:artist:57bUPid8xztkieZfS7OlEV,164346.0,I Hear A Symphony - Mono Version,
1,spotify:track:006r3Kh3rjAW6WgkSPTMzC,1049210,66,,,,,,,,
2,spotify:track:00BnfL75e8vHSGCmwUWbEk,1009820,0,spotify:track:00BnfL75e8vHSGCmwUWbEk,Diego,spotify:album:7FbbHGEqovOBqxZBsbxFqX,Tory Lanez,spotify:artist:2jku7tDXc6XoB6MO2hFuqg,242346.0,Diego,
3,spotify:track:00BnfL75e8vHSGCmwUWbEk,1028810,37,spotify:track:00BnfL75e8vHSGCmwUWbEk,Diego,spotify:album:7FbbHGEqovOBqxZBsbxFqX,Tory Lanez,spotify:artist:2jku7tDXc6XoB6MO2hFuqg,242346.0,Diego,
4,spotify:track:00BnfL75e8vHSGCmwUWbEk,1042866,37,spotify:track:00BnfL75e8vHSGCmwUWbEk,Diego,spotify:album:7FbbHGEqovOBqxZBsbxFqX,Tory Lanez,spotify:artist:2jku7tDXc6XoB6MO2hFuqg,242346.0,Diego,
...,...,...,...,...,...,...,...,...,...,...,...
33702,spotify:track:7zsw78LtXUD7JfEwH64HK2,1029262,78,spotify:track:7zsw78LtXUD7JfEwH64HK2,Little Mermaid,spotify:album:4aAwvCRNJIqiUGVEjieWv6,Pat Carroll,spotify:artist:0Yy9u86cq66Se2pB9fYaiW,291693.0,Poor Unfortunate Souls - From \The Little Merm...,
33703,spotify:track:7zsw78LtXUD7JfEwH64HK2,1034047,18,spotify:track:7zsw78LtXUD7JfEwH64HK2,Little Mermaid,spotify:album:4aAwvCRNJIqiUGVEjieWv6,Pat Carroll,spotify:artist:0Yy9u86cq66Se2pB9fYaiW,291693.0,Poor Unfortunate Souls - From \The Little Merm...,
33704,spotify:track:7zv8Fey2BaVJe0PomkkA16,1012496,5,,,,,,,,
33705,spotify:track:7zv8Fey2BaVJe0PomkkA16,1000178,5,,,,,,,,


In [53]:
dfpi = pd.read_csv('df_data/df_playlists_info.csv')

dfpi

Unnamed: 0.1,Unnamed: 0,collaborative,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid
0,0,False,39413578,1446854400,party party,142,116,3,1,152,2000
1,1,False,2386978,1495584000,summer,10,9,4,1,11,2001
2,2,False,47356150,1509235200,Rap,128,84,65,4,221,2002
3,3,False,6630645,1477785600,#tb,33,26,7,1,35,2003
4,4,False,3716605,1500940800,Disney,19,19,3,1,20,2004
...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,False,29183178,1509062400,woo,68,24,13,2,146,1995
2996,2996,False,9354755,1420070400,NEW YEARS,37,34,3,1,38,1996
2997,2997,False,10940019,1508371200,JESUS,28,22,8,2,40,1997
2998,2998,False,7260645,1448236800,yep,27,20,3,1,29,1998


In [54]:
dfpt = pd.read_csv('df_data/df_playlists_test.csv')
dfpt

Unnamed: 0.1,Unnamed: 0,pid,tid,pos
0,0,1000000,76507,0
1,1,1000000,18548,1
2,2,1000000,20986,2
3,3,1000000,52862,3
4,4,1000000,3800,4
...,...,...,...,...
280995,280995,1006767,110715,0
280996,280996,1006771,4337,0
280997,280997,1006773,38222,0
280998,280998,1006775,12960,0


In [55]:
dfpti = pd.read_csv('df_data/df_playlists_test_info.csv')
dfpti 

Unnamed: 0.1,Unnamed: 0,name,num_holdouts,num_samples,num_tracks,pid
0,0,spanish playlist,11,0,11,1000002
1,1,Groovin,48,0,48,1000003
2,2,uplift,40,0,40,1000004
3,3,WUBZ,27,0,27,1000006
4,4,new,41,0,41,1000007
...,...,...,...,...,...,...
9995,9995,Playlist 2015,20,1,21,1006767
9996,9996,Workout,24,1,25,1006771
9997,9997,Girlz,16,1,17,1006773
9998,9998,let's get lost,35,1,36,1006775


### pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, lit, monotonically_increasing_id
from pyspark.sql.functions import broadcast

def create_spark_dataframes(data_path, challenge_file, output_path, partitions):
    """
    Process playlist and track data using Spark, and save the output as CSV files.

    Args:
        data_path (str): Path to the directory containing JSON files.
        challenge_file (str): Path to the challenge_set.json file.
        output_path (str): Path to save the resulting CSV files.

    Returns:
        None
    """

     # Initialize SparkSession with parallelism configurations
    spark = SparkSession.builder \
        .appName("Create Spark DataFrames for Playlist Dataset") \
        .config("spark.sql.shuffle.partitions", str(partitions)) \
        .config("spark.executor.memory", "4g") \
        .config("spark.executor.cores", "4") \
        .config("spark.driver.memory", "4g") \
        .getOrCreate()
    
    
    # Define column sets
    playlist_col = ['collaborative', 'duration_ms', 'modified_at', 
                    'name', 'num_albums', 'num_artists', 'num_edits',
                    'num_followers', 'num_tracks', 'pid']
    tracks_col = ['album_name', 'album_uri', 'artist_name', 'artist_uri', 
                  'duration_ms', 'track_name', 'track_uri'] 
    playlist_test_col = ['name', 'num_holdouts', 'num_samples', 'num_tracks', 'pid']

    # Read all JSON files in the directory
    df_raw = spark.read.option("multiline", "true").json(f"{data_path}/*.json").repartition(partitions)

    # Extract playlist data
    df_playlists = df_raw.select(explode(col("playlists")).alias("playlist"))

    # Extract playlist-level information
    df_playlists_info = df_playlists.select(*[col(f"playlist.{cols}").alias(cols) for cols in playlist_col])

    # Extract track-level information
    df_tracks = df_playlists.select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("track.track_uri").alias("track_uri1"),
        *[col(f"track.{cols}").alias(cols) for cols in tracks_col]
    ).drop_duplicates()

    
    
    # Add unique track ID (tid) in parallel
    df_tracks = df_tracks.withColumn("tid", monotonically_increasing_id())

    # Join playlist and track information to create a relationship DataFrame
    df_playlists_tracks = df_playlists.select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("pid"),
        col("track.track_uri").alias("track_uri"),
        col("track.pos").alias("pos")
    )
    
    df_playlists_tracks = df_playlists_tracks.join(broadcast(df_tracks), on="track_uri", how="left")
    
    # Join with track ID (tid)
#     df_playlists_tracks = df_playlists_tracks.join(df_tracks, on="track_uri", how="left")

    # Process challenge set
    df_challenge_raw = spark.read.option("multiline", "true").json(challenge_file).repartition(partitions)
    df_playlists_test_info = df_challenge_raw.select(
        explode(col("playlists")).alias("playlist")
    ).select(*[col(f"playlist.{cols}").alias(cols) for cols in playlist_test_col])

    df_playlists_test = df_challenge_raw.select(
        explode(col("playlists")).alias("playlist")
    ).select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("pid"),
        col("track.track_uri").alias("track_uri"),
        col("track.pos").alias("pos")
    ).join(broadcast(df_tracks), on="track_uri", how="left")

    # Save DataFrames as CSV files
    df_playlists_info.write.csv(f"{output_path}/df_playlists_info_spark", header=True, mode="overwrite")
    df_tracks.write.csv(f"{output_path}/df_tracks_spark", header=True, mode="overwrite")
    df_playlists_tracks.write.csv(f"{output_path}/df_playlists_spark", header=True, mode="overwrite")
    df_playlists_test_info.write.csv(f"{output_path}/df_playlists_test_info_spark", header=True, mode="overwrite")
    df_playlists_test.write.csv(f"{output_path}/df_playlists_test_spark", header=True, mode="overwrite")

    print("DataFrames successfully created and saved as CSV files.")



In [4]:
if __name__ == "__main__":
    # Define paths
    data_path = "data"  # Path to directory with JSON files
    challenge_file = "challenge_set.json"  # Challenge set file
    output_path = "df_data"  # Output directory for CSV files
     # Number of partitions for parallelism
    num_partitions = 200  # Adjust based on your system's resources

    # Run the function
    create_spark_dataframes(data_path, challenge_file, output_path, partitions=num_partitions)

    # Run the function
#     create_spark_dataframes(data_path, challenge_file, output_path)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/05 02:00:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

DataFrames successfully created and saved as CSV files.




In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, lit, monotonically_increasing_id

def create_spark_dataframes_parallel(data_path, challenge_file, output_path, partitions=200):
    """
    Process playlist and track data using Spark with parallelism, and save the output as CSV files.

    Args:
        data_path (str): Path to the directory containing JSON files.
        challenge_file (str): Path to the challenge_set.json file.
        output_path (str): Path to save the resulting CSV files.
        partitions (int): Number of partitions for parallelism. Default is 200.

    Returns:
        None
    """
    # Initialize SparkSession with parallelism configurations
    spark = SparkSession.builder \
        .appName("Create Spark DataFrames with Parallelism") \
        .config("spark.sql.shuffle.partitions", str(partitions)) \
        .config("spark.executor.memory", "4g") \
        .config("spark.executor.cores", "4") \
        .config("spark.driver.memory", "4g") \
        .getOrCreate()

    # Define column sets
    playlist_col = ['collaborative', 'duration_ms', 'modified_at', 
                    'name', 'num_albums', 'num_artists', 'num_edits',
                    'num_followers', 'num_tracks', 'pid']
    tracks_col = ['album_name', 'album_uri', 'artist_name', 'artist_uri', 
                  'duration_ms', 'track_name', 'track_uri'] 
    playlist_test_col = ['name', 'num_holdouts', 'num_samples', 'num_tracks', 'pid']

    # Read all JSON files in the directory with parallelism
    print(f"Reading files from: {data_path}")
    df_raw = spark.read.json(f"{data_path}/*.json").repartition(partitions)

    # Explode playlists to flatten the structure
    df_playlists = df_raw.select(explode(col("playlists")).alias("playlist"))

    # Extract playlist-level information
    df_playlists_info = df_playlists.select(*[col(f"playlist.{col}").alias(col) for col in playlist_col])

    # Extract track-level information
    df_tracks = df_playlists.select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("track.track_uri").alias("track_uri"),
        *[col(f"track.{col}").alias(col) for col in tracks_col]
    ).drop_duplicates()

    # Add unique track ID (tid) in parallel
    df_tracks = df_tracks.withColumn("tid", monotonically_increasing_id())

    # Join playlist and track information to create a relationship DataFrame
    df_playlists_tracks = df_playlists.select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("pid"),
        col("track.track_uri").alias("track_uri"),
        col("track.pos").alias("pos")
    )

    # Broadcast smaller DataFrame (df_tracks) for efficient join
    from pyspark.sql.functions import broadcast
    df_playlists_tracks = df_playlists_tracks.join(broadcast(df_tracks), on="track_uri", how="left")

    # Process challenge set with parallelism
    df_challenge_raw = spark.read.json(challenge_file).repartition(partitions)
    df_playlists_test_info = df_challenge_raw.select(
        explode(col("playlists")).alias("playlist")
    ).select(*[col(f"playlist.{col}").alias(col) for col in playlist_test_col])

    df_playlists_test = df_challenge_raw.select(
        explode(col("playlists")).alias("playlist")
    ).select(
        col("playlist.pid").alias("pid"),
        explode(col("playlist.tracks")).alias("track")
    ).select(
        col("pid"),
        col("track.track_uri").alias("track_uri"),
        col("track.pos").alias("pos")
    ).join(broadcast(df_tracks), on="track_uri", how="left")

    # Save DataFrames as CSV files in parallel
    print(f"Saving DataFrames to {output_path} with parallelism...")
    df_playlists_info.write.csv(f"{output_path}/df_playlists_info.csv", header=True, mode="overwrite")
    df_tracks.write.csv(f"{output_path}/df_tracks.csv", header=True, mode="overwrite")
    df_playlists_tracks.write.csv(f"{output_path}/df_playlists.csv", header=True, mode="overwrite")
    df_playlists_test_info.write.csv(f"{output_path}/df_playlists_test_info.csv", header=True, mode="overwrite")
    df_playlists_test.write.csv(f"{output_path}/df_playlists_test.csv", header=True, mode="overwrite")

    print("DataFrames successfully created and saved as CSV files with parallelism.")

if __name__ == "__main__":
    # Define paths
    data_path = "data"  # Path to directory with JSON files
    challenge_file = "challenge_set.json"  # Challenge set file
    output_path = "df_data"  # Output directory for CSV files

    # Number of partitions for parallelism
    num_partitions = 200  # Adjust based on your system's resources

    # Run the function
    create_spark_dataframes_parallel(data_path, challenge_file, output_path, partitions=num_partitions)
