In [6]:
import os
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
song_to_playlists = defaultdict(list)

size = 1000         # playlist number in every JSON file
range_index = 1000  # JSON file number
base_path = 'spotify_million_playlist_dataset/data'

for i in range(range_index):
    filename = os.path.join(base_path, f'mpd.slice.{i*size}-{(i+1)*size - 1}.json')
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    playlists = data['playlists']
    for playlist in playlists:
        pid = playlist['pid']  # playlist number
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            # extract track_id
            track_id = track_uri.split(':')[-1]
            song_to_playlists[track_id].append(pid)
    if i % 10 == 0:
        print(f"Processed {i+1} / {range_index} JSON files.")

Processed 1 / 1000 JSON files.
Processed 11 / 1000 JSON files.
Processed 21 / 1000 JSON files.
Processed 31 / 1000 JSON files.
Processed 41 / 1000 JSON files.
Processed 51 / 1000 JSON files.
Processed 61 / 1000 JSON files.
Processed 71 / 1000 JSON files.
Processed 81 / 1000 JSON files.
Processed 91 / 1000 JSON files.
Processed 101 / 1000 JSON files.
Processed 111 / 1000 JSON files.
Processed 121 / 1000 JSON files.
Processed 131 / 1000 JSON files.
Processed 141 / 1000 JSON files.
Processed 151 / 1000 JSON files.
Processed 161 / 1000 JSON files.
Processed 171 / 1000 JSON files.
Processed 181 / 1000 JSON files.
Processed 191 / 1000 JSON files.
Processed 201 / 1000 JSON files.
Processed 211 / 1000 JSON files.
Processed 221 / 1000 JSON files.
Processed 231 / 1000 JSON files.
Processed 241 / 1000 JSON files.
Processed 251 / 1000 JSON files.
Processed 261 / 1000 JSON files.
Processed 271 / 1000 JSON files.
Processed 281 / 1000 JSON files.
Processed 291 / 1000 JSON files.
Processed 301 / 1000 

In [3]:
num_playlist = 1000000
num_blocks = 100
block_size = num_playlist // num_blocks  # playlist number in every interval

df_features = pd.read_csv('scaled_songs.csv')
df_features['location'] = None

for idx, row in df_features.iterrows():
    song_id = row['id']
    # initial location vector
    aggregated = np.zeros(num_blocks, dtype=int)
    
    if song_id in song_to_playlists:
        pids = song_to_playlists[song_id]
        for pid in pids:
            block_idx = pid // block_size 
            if block_idx >= num_blocks:
                block_idx = num_blocks - 1
            aggregated[block_idx] += 1
    
    # store DataFrame
    df_features.at[idx, 'location'] = aggregated.tolist()
    
    if idx % 10 == 0:
        print(f"Processed {idx+1} / {len(df_features)} songs.")

print(df_features.head())

Processed 1 / 134712 songs.
Processed 11 / 134712 songs.
Processed 21 / 134712 songs.
Processed 31 / 134712 songs.
Processed 41 / 134712 songs.
Processed 51 / 134712 songs.
Processed 61 / 134712 songs.
Processed 71 / 134712 songs.
Processed 81 / 134712 songs.
Processed 91 / 134712 songs.
Processed 101 / 134712 songs.
Processed 111 / 134712 songs.
Processed 121 / 134712 songs.
Processed 131 / 134712 songs.
Processed 141 / 134712 songs.
Processed 151 / 134712 songs.
Processed 161 / 134712 songs.
Processed 171 / 134712 songs.
Processed 181 / 134712 songs.
Processed 191 / 134712 songs.
Processed 201 / 134712 songs.
Processed 211 / 134712 songs.
Processed 221 / 134712 songs.
Processed 231 / 134712 songs.
Processed 241 / 134712 songs.
Processed 251 / 134712 songs.
Processed 261 / 134712 songs.
Processed 271 / 134712 songs.
Processed 281 / 134712 songs.
Processed 291 / 134712 songs.
Processed 301 / 134712 songs.
Processed 311 / 134712 songs.
Processed 321 / 134712 songs.
Processed 331 / 13471

In [7]:
from sklearn.preprocessing import StandardScaler
# 1. extract location matrix
location_matrix = np.array(df_features['location'].tolist())
print("Origin location shape:", location_matrix.shape)

# 2. standarlize
scaler = StandardScaler()
location_matrix_scaled = scaler.fit_transform(location_matrix)

# 3. PCA
pca = PCA(n_components=10, random_state=42)
location_reduced = pca.fit_transform(location_matrix_scaled)
print("After PCA shape:", location_reduced.shape)

# 4. transfer to DataFrame
location_pca_df = pd.DataFrame(location_reduced, index=df_features.index,
                               columns=[f'loc_pca_{i}' for i in range(10)])

# 5. drop origin location info
df_features = pd.concat([df_features.drop(columns=['location']), location_pca_df], axis=1)

print(df_features.head())

Origin location shape: (134712, 100)
After PCA shape: (134712, 10)
                       id  danceability    energy       key  loudness  \
0  7lmeHLHBe4nmXzuXc0HDjk     -0.239643  1.545439  0.504262  0.855004   
1  1wsRitfRRtWyEapl0q22o8      0.448997  1.470214  1.632828  0.794662   
2  1hR0fIFK2qRG3f3RF70pb7     -1.067078  1.516782  0.504262  0.850871   
3  2lbASgTSoDO7MTuLAXlTW0     -0.399791  1.506035  1.632828  0.783751   
4  1MQTmpYOZ6fcMQc56Hdo7T     -0.474528  1.369913 -0.906446  0.635127   

   speechiness  acousticness  instrumentalness  liveness   valence  ...  \
0    -0.084450     -1.018675         -0.663029  0.731281  0.184434  ...   
1     0.998881     -1.054604         -0.662862 -0.286777  0.132294  ...   
2     3.770628     -1.026024         -0.663054 -0.453920 -0.310892  ...   
3     1.459273     -0.646043         -0.663049 -0.458985  0.448856  ...   
4    -0.108879     -1.085308         -0.369701 -0.672220  0.318507  ...   

   loc_pca_0  loc_pca_1  loc_pca_2  loc_pca

In [8]:
df_features.head()

Unnamed: 0,id,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,loc_pca_0,loc_pca_1,loc_pca_2,loc_pca_3,loc_pca_4,loc_pca_5,loc_pca_6,loc_pca_7,loc_pca_8,loc_pca_9
0,7lmeHLHBe4nmXzuXc0HDjk,-0.239643,1.545439,0.504262,0.855004,-0.08445,-1.018675,-0.663029,0.731281,0.184434,...,24.203403,-0.15892,0.247126,0.789665,-0.060023,-0.640468,1.330385,-0.857548,-0.423787,0.039703
1,1wsRitfRRtWyEapl0q22o8,0.448997,1.470214,1.632828,0.794662,0.998881,-1.054604,-0.662862,-0.286777,0.132294,...,28.493807,0.228536,0.646449,1.706311,-0.004733,-1.077771,0.702044,-1.451287,0.96321,0.337379
2,1hR0fIFK2qRG3f3RF70pb7,-1.067078,1.516782,0.504262,0.850871,3.770628,-1.026024,-0.663054,-0.45392,-0.310892,...,4.036218,-0.282505,0.266417,0.224058,-0.022911,-1.097135,-0.07522,-0.147339,0.320747,-0.147518
3,2lbASgTSoDO7MTuLAXlTW0,-0.399791,1.506035,1.632828,0.783751,1.459273,-0.646043,-0.663049,-0.458985,0.448856,...,0.141121,-0.180057,0.039945,-0.117471,0.191789,-0.015862,0.012023,-0.025625,0.02263,-0.076836
4,1MQTmpYOZ6fcMQc56Hdo7T,-0.474528,1.369913,-0.906446,0.635127,-0.108879,-1.085308,-0.369701,-0.67222,0.318507,...,11.897067,-0.646398,0.500255,-0.252342,0.111723,-0.587308,0.765197,-0.737422,-0.186457,-0.062984


In [9]:
df_features.to_csv('new_features.csv', index=False)