Example of playlist similarity with Dask

In [None]:
import dask.array as da
import numpy as np
from dask import delayed
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import jaccard

# Original playlists data
playlists = [
    [1, 0, 1, 0, 1, 0, 0, 1, 0, 0],  # Playlist 1
    [0, 1, 1, 0, 1, 1, 0, 0, 0, 0],  # Playlist 2
    [1, 1, 0, 1, 0, 0, 1, 0, 1, 0],  # Playlist 3
    [1, 0, 0, 0, 1, 0, 1, 1, 0, 1],  # Playlist 4
    [0, 1, 1, 1, 0, 0, 0, 1, 0, 1]   # Playlist 5
]

# Convert playlists to a Dask array
dask_playlists = da.from_array(playlists, chunks=(1, len(playlists[0])))

# Function to calculate pairwise Jaccard distances for all pairs
def calculate_jaccard_matrix(arr):
    n = arr.shape[0]
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            distance = jaccard(arr[i], arr[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix

# Compute pairwise distance matrix using Dask
distance_matrix = da.from_delayed(
    delayed(calculate_jaccard_matrix)(dask_playlists.compute()),
    shape=(len(playlists), len(playlists)),
    dtype=float
).compute()

# Perform KNN on the computed distance matrix
knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
knn_model.fit(distance_matrix)

# Query KNN for the first playlist
distances, indices = knn_model.kneighbors([distance_matrix[0]])
print("Nearest playlists:", indices)
print("Distances to nearest playlists:", distances)

Nearest playlists: [[0 3 1]]
Distances to nearest playlists: [[0.         0.5        0.66666667]]


Example of playlist similarity with dask for 5000 playlists with 100 songs each

In [None]:
import dask.array as da
import numpy as np
from dask import delayed
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import jaccard

# Generate a larger dataset of 50 playlists with 100 songs each
np.random.seed(42)  # For reproducibility
playlists = np.random.randint(2, size=(5000, 100)).tolist()  # Random 0/1 matrix of 50x100

# Convert playlists to a Dask array
dask_playlists = da.from_array(playlists, chunks=(1, len(playlists[0])))

# Function to calculate pairwise Jaccard distances for all pairs
def calculate_jaccard_matrix(arr):
    n = arr.shape[0]
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            distance = jaccard(arr[i], arr[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix

# Compute pairwise distance matrix using Dask
distance_matrix = da.from_delayed(
    delayed(calculate_jaccard_matrix)(dask_playlists.compute()),
    shape=(len(playlists), len(playlists)),
    dtype=float
).compute()

# Perform KNN on the computed distance matrix
knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
knn_model.fit(distance_matrix)

# Query KNN for the first playlist
distances, indices = knn_model.kneighbors([distance_matrix[0]])
print("Nearest playlists:", indices)
print("Distances to nearest playlists:", distances)

Nearest playlists: [[   0 3678 2602]]
Distances to nearest playlists: [[0.         0.46575342 0.47297297]]


Example of song similarity with dask for few songs and playlists

In [9]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, squareform

# Original playlists data (Rows = Playlists, Columns = Songs)
playlists = [
    [1, 0, 1, 0, 1],  # Playlist 1
    [0, 1, 1, 0, 1],  # Playlist 2
    [1, 1, 0, 1, 0],  # Playlist 3
    [1, 0, 0, 0, 1],  # Playlist 4
    [0, 1, 1, 1, 0]   # Playlist 5
]

# Transpose the matrix to make rows = songs, columns = playlists
songs = np.transpose(playlists)

# Calculate Jaccard distances between songs
distance_matrix = squareform(pdist(songs, metric="jaccard"))

# Apply KNN to find similar songs
knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
knn_model.fit(distance_matrix)

# Query KNN for the first song
distances, indices = knn_model.kneighbors([distance_matrix[0]])
print("Nearest songs:", indices)
print("Distances to nearest songs:", distances)


Nearest songs: [[0 4 3]]
Distances to nearest songs: [[0.   0.5  0.75]]


Example of song similarity with dask for nxm songs and playlists

In [None]:
import dask.array as da
import numpy as np
from dask import delayed
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import jaccard

# Generate a larger dataset of n playlists with m songs each
np.random.seed(42)  # For reproducibility
n = 1000  # Number of playlists
m = 25   # Number of songs
playlists = np.random.randint(2, size=(n, m))  # Random 0/1 matrix of size n x m

# Convert playlists to a Dask array (to handle larger data)
dask_playlists = da.from_array(playlists, chunks=(1, m))  # Use chunking to efficiently process large datasets

# Function to calculate pairwise Jaccard distances for all pairs of playlists
def calculate_jaccard_matrix(arr):
    n = arr.shape[0]
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            # Calculate Jaccard distance between playlists i and j
            distance = jaccard(arr[i], arr[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix

# Compute pairwise distance matrix using Dask
distance_matrix = da.from_delayed(
    delayed(calculate_jaccard_matrix)(dask_playlists.compute()),  # Compute the Jaccard matrix
    shape=(n, n),  # Shape of the distance matrix (n x n)
    dtype=float
).compute()  # Convert from delayed Dask array to a regular numpy array

# Perform KNN on the computed distance matrix
knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
knn_model.fit(distance_matrix)

# Query KNN for the first playlist
distances, indices = knn_model.kneighbors([distance_matrix[0]])

# Output the results
print("Nearest playlists:", indices)
print("Distances to nearest playlists:", distances)

Nearest playlists: [[  0 248  68]]
Distances to nearest playlists: [[0.         0.33333333 0.35714286]]


Playlist Recommandations through the file, working!!

In [None]:
import dask.array as da
import numpy as np
import json
import os
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import jaccard
from scipy.sparse import lil_matrix, csr_matrix
from dask import delayed

# Path to the data folder
data_folder = 'data'

# Get the first 3 .json files in the data folder
json_files = sorted([f for f in os.listdir(data_folder) if f.endswith('.json')])[:3]

# Initialize the list to store all playlists and the set of all unique song names
all_playlists = []
all_songs = set()

# Iterate over the first 3 .json files and collect the playlists and songs
for filename in json_files:
    filepath = os.path.join(data_folder, filename)
    with open(filepath, 'r') as file:
        data = json.load(file)
        playlists = data['playlists']
        all_playlists.extend(playlists)
        for playlist in playlists:
            all_songs.update(track['track_name'] for track in playlist['tracks'])

# Create a song-to-index mapping
song_to_index = {song: idx for idx, song in enumerate(all_songs)}

# Initialize a sparse binary matrix (Rows: playlists, Columns: unique songs)
num_playlists = len(all_playlists)
num_songs = len(all_songs)
playlist_matrix = lil_matrix((num_playlists, num_songs), dtype=int)

# Populate the matrix with song data
for i, playlist in enumerate(all_playlists):
    for track in playlist['tracks']:
        song_idx = song_to_index[track['track_name']]
        playlist_matrix[i, song_idx] = 1

# Convert to a compressed sparse row format for efficient row-based calculations
playlist_matrix_csr = csr_matrix(playlist_matrix)

# Function to calculate pairwise Jaccard similarities based on song overlap
@delayed
def calculate_jaccard_matrix(arr):
    n = arr.shape[0]
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            # Calculate Jaccard distance for playlist-song vectors
            distance = jaccard(arr[i].toarray()[0], arr[j].toarray()[0])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    return distance_matrix

# Compute the Jaccard distance matrix using Dask
distance_matrix = da.from_delayed(
    calculate_jaccard_matrix(playlist_matrix_csr),
    shape=(num_playlists, num_playlists),
    dtype=float
).compute()

# Perform KNN on the computed distance matrix
knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
knn_model.fit(distance_matrix)

# Query KNN for the first playlist
distances, indices = knn_model.kneighbors([distance_matrix[0]])
print("Nearest playlists:", indices)
print("Distances to nearest playlists:", distances)

Nearest playlists: [[   0 2376  262]]
Distances to nearest playlists: [[0.         0.87417219 0.87647059]]


trying to be faster than above

In [4]:
from dask.distributed import Client, LocalCluster
import dask.array as da
import numpy as np
import json
import os
from scipy.spatial.distance import jaccard
from scipy.sparse import lil_matrix, csr_matrix
from dask import delayed
from sklearn.neighbors import NearestNeighbors

# Set up a Dask cluster
cluster = LocalCluster()
client = Client(cluster)

# Path to the data folder
data_folder = 'data'

# Get all .json files in the data folder
json_files = sorted([f for f in os.listdir(data_folder) if f.endswith('.json')])[:1]

# Function to load playlists from a chunk of JSON files
def load_playlists(file_chunk):
    all_playlists = []
    all_songs = set()
    for filename in file_chunk:
        filepath = os.path.join(data_folder, filename)
        with open(filepath, 'r') as file:
            data = json.load(file)
            playlists = data['playlists']
            
            # Filter playlists with more than 15 tracks
            filtered_playlists = [playlist for playlist in playlists if len(playlist['tracks']) > 0]
            
            all_playlists.extend(filtered_playlists)
            for playlist in filtered_playlists:
                all_songs.update(track['track_name'] for track in playlist['tracks'])
    
    return all_playlists, all_songs
# Chunk the JSON files into groups of 100,000 playlists
chunk_size = 100000
chunks = [json_files[i:i+chunk_size] for i in range(0, len(json_files), chunk_size)]

# Function to create playlist matrix for a chunk of playlists
@delayed
def create_playlist_matrix(playlists, all_songs):
    song_to_index = {song: idx for idx, song in enumerate(all_songs)}
    num_playlists = len(playlists)
    num_songs = len(all_songs)
    playlist_matrix = lil_matrix((num_playlists, num_songs), dtype=int)
    
    # Populate the matrix
    for i, playlist in enumerate(playlists):
        for track in playlist['tracks']:
            song_idx = song_to_index[track['track_name']]
            playlist_matrix[i, song_idx] = 1
    
    return csr_matrix(playlist_matrix)

# Function to calculate pairwise Jaccard distances for a chunk of playlists
@delayed
def calculate_jaccard_distances(matrix):
    n = matrix.shape[0]
    distance_matrix = np.zeros((n, n))
    
    # Calculate pairwise Jaccard distances
    for i in range(n):
        for j in range(i + 1, n):
            distance = jaccard(matrix[i].toarray()[0], matrix[j].toarray()[0])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    
    return distance_matrix

# Perform KNN on the computed distance matrix
def perform_knn(distance_matrix):
    knn_model = NearestNeighbors(n_neighbors=3, metric="precomputed")
    knn_model.fit(distance_matrix)
    
    # Query KNN for the first playlist (example)
    distances, indices = knn_model.kneighbors([distance_matrix[0]])
    return distances, indices

# Process each chunk in parallel
tasks = []
for chunk in chunks:
    # Load playlists and songs
    tasks.append(delayed(load_playlists)(chunk))

# Compute the results lazily and gather them
results = [task.compute() for task in tasks]

# After chunking and computing, you can calculate Jaccard similarities for each chunk
all_distances = []
all_indices = []

for chunk_playlists, chunk_songs in results:
    # Create the playlist matrix for this chunk
    playlist_matrix = create_playlist_matrix(chunk_playlists, chunk_songs).compute()
    
    # Calculate pairwise Jaccard distances
    distance_matrix = calculate_jaccard_distances(playlist_matrix).compute()
    
    # Perform KNN for the chunk
    distances, indices = perform_knn(distance_matrix)
    all_distances.append(distances)
    all_indices.append(indices)

# After processing all chunks, you can further analyze the results if necessary
print("All nearest playlist distances:", all_distances)
print("All nearest playlist indices:", all_indices)

# Optionally, close the Dask client when done
client.close()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 49263 instead
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


All nearest playlist distances: [array([[0.        , 0.87647059, 0.89830508]])]
All nearest playlist indices: [array([[  0, 262, 747]])]


In [9]:
import numpy as np
from dask.distributed import Client, LocalCluster
from dask import delayed
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix
import os
import json

# Set up a Dask cluster
cluster = LocalCluster()
client = Client(cluster)

# Path to the data folder
data_folder = "data"

# Get all .json files in the data folder
json_files = sorted([f for f in os.listdir(data_folder) if f.endswith(".json")])[:3]

# Function to load playlists and unique songs from a chunk of JSON files
@delayed
def load_playlists(file_chunk):
    all_playlists = []
    all_songs = set()
    for filename in file_chunk:
        filepath = os.path.join(data_folder, filename)
        with open(filepath, "r") as file:
            data = json.load(file)
            playlists = data["playlists"]

            # Remove filtering of playlists (all playlists are now included)
            all_playlists.extend(playlists)
            for playlist in playlists:
                all_songs.update(track["track_name"] for track in playlist["tracks"])
    return all_playlists, all_songs

# Function to create the playlist matrix
@delayed
def create_playlist_matrix(playlists, all_songs):
    song_to_index = {song: idx for idx, song in enumerate(all_songs)}
    num_playlists = len(playlists)
    num_songs = len(all_songs)

    # Sparse playlist matrix
    playlist_matrix = csr_matrix((num_playlists, num_songs), dtype=int)

    for i, playlist in enumerate(playlists):
        indices = [song_to_index[track["track_name"]] for track in playlist["tracks"]]
        playlist_matrix[i, indices] = 1

    return playlist_matrix

# Function to calculate Jaccard distances and find 4 nearest playlists
@delayed
def find_nearest_playlists(matrix):
    # Convert sparse matrix to dense format and then to a numpy array
    matrix_dense = np.asarray(matrix.todense())  # Convert to numpy array
    
    # Calculate Jaccard distance
    distance_matrix = pairwise_distances(matrix_dense, metric="jaccard")
    
    # Store nearest 4 playlists for each playlist
    nearest_playlists = []
    for i, distances in enumerate(distance_matrix):
        # Get indices and distances for the 4 nearest playlists (excluding itself)
        nearest_indices = np.argsort(distances)[1:5]
        nearest_distances = distances[nearest_indices]
        nearest_playlists.append((i, nearest_indices, nearest_distances))
    
    return nearest_playlists

# Chunk the JSON files
chunk_size = 1  # Adjust the chunk size as needed
chunks = [json_files[i:i+chunk_size] for i in range(0, len(json_files), chunk_size)]

# Process each chunk and create delayed tasks
tasks = []
for chunk in chunks:
    playlists_all_songs = load_playlists(chunk)
    playlists = playlists_all_songs[0]  # This is the delayed list of playlists
    all_songs = playlists_all_songs[1]  # This is the delayed set of songs
    playlist_matrix = create_playlist_matrix(playlists, all_songs)
    nearest_playlists = find_nearest_playlists(playlist_matrix)
    tasks.append(nearest_playlists)

# Compute results
results = client.compute(tasks)

# Wait for and retrieve results
results = client.gather(results)

# Print the 4 nearest playlists and their distances
for chunk_id, nearest in enumerate(results):
    print(f"\nChunk {chunk_id}: Nearest playlists and distances")
    for playlist_id, indices, distances in nearest:
        print(f"Playlist {playlist_id}:")
        for i, (idx, dist) in enumerate(zip(indices, distances)):
            print(f"  {i+1}. Playlist {idx} with Jaccard distance {dist:.4f}")

# Close the client
client.close()


Perhaps you already have a cluster running?
Hosting the HTTP server on port 50536 instead



Chunk 0: Nearest playlists and distances
Playlist 0:
  1. Playlist 262 with Jaccard distance 0.8765
  2. Playlist 747 with Jaccard distance 0.8983
  3. Playlist 717 with Jaccard distance 0.9072
  4. Playlist 721 with Jaccard distance 0.9157
Playlist 1:
  1. Playlist 580 with Jaccard distance 0.9259
  2. Playlist 888 with Jaccard distance 0.9286
  3. Playlist 343 with Jaccard distance 0.9394
  4. Playlist 637 with Jaccard distance 0.9435
Playlist 2:
  1. Playlist 88 with Jaccard distance 0.9344
  2. Playlist 238 with Jaccard distance 0.9882
  3. Playlist 323 with Jaccard distance 0.9890
  4. Playlist 696 with Jaccard distance 0.9895
Playlist 3:
  1. Playlist 391 with Jaccard distance 0.9842
  2. Playlist 382 with Jaccard distance 0.9928
  3. Playlist 599 with Jaccard distance 0.9929
  4. Playlist 78 with Jaccard distance 0.9929
Playlist 4:
  1. Playlist 236 with Jaccard distance 0.9167
  2. Playlist 307 with Jaccard distance 0.9211
  3. Playlist 485 with Jaccard distance 0.9429
  4. Pl

Calculate the Jaccard Distance between playlist

In [25]:
import json
import os

# Path to the data folder
data_folder = 'data'

# Define playlists to inspect
playlist_ids_to_check = [0, 2376, 262]

# Function to search for specified playlist IDs and return all tracks
def get_all_tracks(playlist_id, data_folder):
    for filename in sorted(os.listdir(data_folder)):
        filepath = os.path.join(data_folder, filename)
        with open(filepath, 'r') as f:
            data = json.load(f)
            for playlist in data['playlists']:
                if playlist['pid'] == playlist_id:
                    return [track['track_name'] for track in playlist['tracks']]  # All tracks
    return None

# Retrieve and print all tracks for each playlist in playlist_ids_to_check
for pid in playlist_ids_to_check:
    all_tracks = get_all_tracks(pid, data_folder)
    if all_tracks:
        print(f"All tracks for playlist {pid}:")
        for track in all_tracks:
            print(track)
        print("\n")  # Add a newline for separation between playlists
    else:
        print(f"Playlist {pid} not found.")


All tracks for playlist 0:
Lose Control (feat. Ciara & Fat Man Scoop)
Toxic
Crazy In Love
Rock Your Body
It Wasn't Me
Yeah!
My Boo
Buttons
Say My Name
Hey Ya! - Radio Mix / Club Mix
Promiscuous
Right Where You Want Me - Radio Edit Version
Beautiful Soul
Leavin'
Me & U
Ice Box
Sk8er Boi
Run It!
Check On It - feat. Bun B and Slim Thug
Jumpin', Jumpin'
Soak Up The Sun
Where Is The Love?
Stacy's Mom
Just The Girl
Yo (Excuse Me Miss)
Year 3000
Lip Gloss
Everytime We Touch - Radio Edit
Whatcha Say
Miss Independent
Party In The U.S.A.
The Great Escape
Replay
Forever
Your Love Is My Drug
Closer
One Less Lonely Girl
Paper Planes
Mr. Brightside
All The Small Things
Beep
Somebody To Love
Dirty Little Secret
Baby
A Thousand Miles
Livin on Sunday
See You Again
How Do You Sleep? - Featuring Ludacris
This Is Me
My Happy Ending
Check Yes Juliet
The Great Escape


All tracks for playlist 2376:
Veni Veni Emmanuel
Christmas Day
Christmastime
Veni Veni
Medley
Gloria
Overture/O Come All Ye Faithful


All t

In [5]:
import os
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11.0.1"


In [2]:
import dask.bag as db
import os
import json
from collections import defaultdict

# Path to your dataset folder
DATA_FOLDER = "data"
MAX_FILES = 10  # Limit the number of files to process

# Load JSON files into a Dask Bag
def load_json_files(data_folder, max_files):
    # Get all JSON files and limit to the first `max_files`
    all_files = sorted([f for f in os.listdir(data_folder) if f.endswith(".json")])[:max_files]
    files_pattern = [os.path.join(data_folder, f) for f in all_files]

    # Read the limited files into a Dask Bag
    def safe_parse(line, filename):
        try:
            return json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON in file '{filename}': {e}")
            return None  # Skip invalid JSON lines

    return db.read_text(files_pattern).map(lambda line: safe_parse(line, os.path.basename(line))).filter(lambda x: x is not None)

# Extract playlists and tracks as a dictionary
def extract_playlists_to_dict(data_bag):
    playlists = data_bag.pluck("playlists").flatten()

    # Flatten and store as a dictionary
    tracks_dict = defaultdict(set)
    for playlist in playlists:
        pid = playlist["pid"]
        for track in playlist["tracks"]:
            tracks_dict[pid].add(track["track_uri"])
    return dict(tracks_dict)

# Compute Jaccard similarity
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Find top N similar playlists
def find_top_similar_playlists(tracks_dict, top_n=4):
    playlist_ids = list(tracks_dict.keys())
    results = []

    for pid1 in playlist_ids:
        set1 = tracks_dict[pid1]
        similarities = []
        for pid2 in playlist_ids:
            if pid1 != pid2:
                set2 = tracks_dict[pid2]
                similarity = jaccard_similarity(set1, set2)
                similarities.append((pid2, similarity))
        # Sort and take top N
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]
        results.append((pid1, similarities))
    return results

# Main function
def main():
    # Load JSON files
    print(f"Loading up to {MAX_FILES} files from '{DATA_FOLDER}'...")
    data_bag = load_json_files(DATA_FOLDER, MAX_FILES)

    # Extract playlists and tracks as a dictionary
    print("Extracting playlists...")
    try:
        tracks_dict = extract_playlists_to_dict(data_bag.compute())
    except Exception as e:
        print(f"Error during extraction: {e}")
        return

    # Compute top similar playlists
    print("Computing Jaccard similarities...")
    try:
        top_similarities = find_top_similar_playlists(tracks_dict, top_n=4)
    except Exception as e:
        print(f"Error during similarity computation: {e}")
        return

    # Print results
    print("Results:")
    for pid, similarities in top_similarities:
        print(f"\nPlaylist {pid}:")
        for similar_pid, score in similarities:
            print(f"  Similar Playlist {similar_pid} - Jaccard Similarity: {score:.4f}")

if __name__ == "__main__":
    main()



Loading up to 10 files from 'data'...
Extracting playlists...
Error during extraction: 'list' object has no attribute 'pluck'


In [1]:
import json
import os
import re

# Path to your dataset folder
DATA_FOLDER = "data"

# Helper function to sort files numerically
def numerical_sort(value):
    # Extract the starting number from the filename
    match = re.search(r"mpd\.slice\.(\d+)-", value)
    return int(match.group(1)) if match else 0

# Load playlists from JSON file
def load_playlists(filename):
    with open(os.path.join(DATA_FOLDER, filename), 'r') as file:
        data = json.load(file)
    return data["playlists"]

# Compute Jaccard similarity between two playlists
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Find the top N similar playlists for each playlist
def find_top_similar_playlists(playlists, attribute="track_uri", top_n=4):
    # Create a mapping of playlist ID to the set of attributes
    playlist_sets = {p["pid"]: set(track[attribute] for track in p["tracks"]) for p in playlists}

    # Dictionary to store top N similar playlists for each playlist
    top_similarities = {}

    # Calculate similarities for each playlist
    for pid1 in playlist_sets:
        similarities = []
        for pid2 in playlist_sets:
            if pid1 != pid2:
                score = jaccard_similarity(playlist_sets[pid1], playlist_sets[pid2])
                similarities.append((pid2, score))
        
        # Sort by similarity score in descending order and take the top N
        similarities.sort(key=lambda x: x[1], reverse=True)
        top_similarities[pid1] = similarities[:top_n]

    return top_similarities

# Main function to process files and calculate similarities
def main():
    # Number of files to process (adjust as needed)
    NUM_FILES_TO_PROCESS = 10

    # Get the list of all JSON files in the dataset folder
    all_files = sorted([file for file in os.listdir(DATA_FOLDER) if file.endswith(".json")], key=numerical_sort)

    if NUM_FILES_TO_PROCESS > len(all_files):
        print(f"There are only {len(all_files)} files available. Adjusting to process all available files.")
        num_files = len(all_files)
    else:
        num_files = NUM_FILES_TO_PROCESS

    # Load the specified number of JSON files
    files_to_process = all_files[:num_files]
    playlists = []
    for file in files_to_process:
        playlists.extend(load_playlists(file))
    
    # Calculate the top 4 similar playlists for each playlist
    top_similarities = find_top_similar_playlists(playlists, attribute="track_uri", top_n=4)
    
    # Display the results
    print(f"Top 4 Similar Playlists for Each Playlist (Processed {num_files} files):")
    for pid, similarities in top_similarities.items():
        print(f"\nPlaylist {pid}:")
        for similar_pid, score in similarities:
            print(f"  Similar Playlist {similar_pid} - Jaccard Similarity: {score:.4f}")

# Run the main function
if __name__ == "__main__":
    main()


Top 4 Similar Playlists for Each Playlist (Processed 10 files):

Playlist 0:
  Similar Playlist 6323 - Jaccard Similarity: 0.1603
  Similar Playlist 9938 - Jaccard Similarity: 0.1489
  Similar Playlist 4266 - Jaccard Similarity: 0.1385
  Similar Playlist 3978 - Jaccard Similarity: 0.1383

Playlist 1:
  Similar Playlist 1089 - Jaccard Similarity: 0.1053
  Similar Playlist 6614 - Jaccard Similarity: 0.1042
  Similar Playlist 6267 - Jaccard Similarity: 0.1034
  Similar Playlist 4851 - Jaccard Similarity: 0.0984

Playlist 2:
  Similar Playlist 2585 - Jaccard Similarity: 0.0852
  Similar Playlist 88 - Jaccard Similarity: 0.0656
  Similar Playlist 2807 - Jaccard Similarity: 0.0634
  Similar Playlist 3260 - Jaccard Similarity: 0.0433

Playlist 3:
  Similar Playlist 9499 - Jaccard Similarity: 0.0182
  Similar Playlist 391 - Jaccard Similarity: 0.0158
  Similar Playlist 9582 - Jaccard Similarity: 0.0153
  Similar Playlist 1650 - Jaccard Similarity: 0.0150

Playlist 4:
  Similar Playlist 8454 - 

In [4]:
import dask.bag as db
import json
import os
import re

# Path to your dataset folder
DATA_FOLDER = "data"

# Helper function to sort files numerically based on naming structure
def numerical_sort(value):
    match = re.search(r"mpd\.slice\.(\d+)-", value)
    return int(match.group(1)) if match else float('inf')

# Load playlists from a single JSON file
def load_playlists_from_file(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data["playlists"]

# Compute Jaccard similarity between two playlists
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Find the top N similar playlists for a given playlist ID
def find_top_similar_playlists(target_pid, all_playlists, top_n=4):
    # Get the target playlist's track URI set
    target_playlist = dict(all_playlists).get(target_pid)
    if not target_playlist:
        raise ValueError(f"Playlist ID {target_pid} not found in the dataset.")

    similarities = []
    for pid, track_set in all_playlists:
        if pid != target_pid:
            score = jaccard_similarity(target_playlist, track_set)
            similarities.append((pid, score))

    # Sort by similarity score in descending order and return the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# Main function
def main():
    # Number of files to process
    NUM_FILES_TO_PROCESS = 200

    # Ensure the dataset folder exists
    if not os.path.exists(DATA_FOLDER):
        print(f"Dataset folder '{DATA_FOLDER}' does not exist. Please ensure the folder is set correctly.")
        return

    # Get the list of JSON files in the dataset folder
    all_files = sorted([file for file in os.listdir(DATA_FOLDER) if file.endswith(".json")], key=numerical_sort)

    if NUM_FILES_TO_PROCESS > len(all_files):
        print(f"There are only {len(all_files)} files available. Adjusting to process all available files.")
        num_files = len(all_files)
    else:
        num_files = NUM_FILES_TO_PROCESS

    # Limit the files to the first NUM_FILES_TO_PROCESS
    files_to_process = all_files[:num_files]

    # Create a Dask Bag to process the JSON files in parallel
    file_paths = [os.path.join(DATA_FOLDER, file) for file in files_to_process]
    data_bag = db.from_sequence(file_paths).map(load_playlists_from_file).flatten()

    # Map playlists to (pid, track_uri set) pairs
    print("Processing playlists into track_uri sets...")
    playlist_sets = data_bag.map(lambda p: (p["pid"], set(track["track_uri"] for track in p["tracks"])))

    # Convert playlist sets to a list for pairwise comparison
    all_playlists = playlist_sets.compute()

    # Ask the user for a playlist ID
    try:
        target_pid = int(input("Enter the Playlist ID to find similar playlists: "))
    except ValueError:
        print("Invalid Playlist ID. Please enter an integer.")
        return

    # Find the top 4 similar playlists for the given playlist ID
    print(f"Finding top 4 similar playlists for Playlist ID {target_pid}...")
    try:
        top_similarities = find_top_similar_playlists(target_pid, all_playlists, top_n=4)
    except ValueError as e:
        print(e)
        return

    # Display the results
    print(f"\nTop 4 Similar Playlists for Playlist ID {target_pid}:")
    for similar_pid, score in top_similarities:
        print(f"  Similar Playlist {similar_pid} - Jaccard Similarity: {score:.4f}")

if __name__ == "__main__":
    main()


Processing playlists into track_uri sets...
Finding top 4 similar playlists for Playlist ID 5...

Top 4 Similar Playlists for Playlist ID 5:
  Similar Playlist 121485 - Jaccard Similarity: 0.1862
  Similar Playlist 85708 - Jaccard Similarity: 0.1692
  Similar Playlist 60498 - Jaccard Similarity: 0.1468
  Similar Playlist 112947 - Jaccard Similarity: 0.1420


In [1]:
import dask.bag as db
import json
import os
import re

# Path to your dataset folder
DATA_FOLDER = "data"

# Helper function to sort files numerically based on naming structure
def numerical_sort(value):
    match = re.search(r"mpd\.slice\.(\d+)-", value)
    return int(match.group(1)) if match else float('inf')

# Load playlists from a single JSON file
def load_playlists_from_file(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data["playlists"]

# Compute Dice coefficient
def dice_coefficient(set1, set2):
    intersection = len(set1 & set2)
    return (2 * intersection) / (len(set1) + len(set2)) if (len(set1) + len(set2)) > 0 else 0

# Find the top N similar playlists for a given playlist ID
def find_top_similar_playlists(target_pid, all_playlists, top_n=4):
    # Get the target playlist's track URI set
    target_playlist = dict(all_playlists).get(target_pid)
    if not target_playlist:
        raise ValueError(f"Playlist ID {target_pid} not found in the dataset.")

    similarities = []
    for pid, track_set in all_playlists:
        if pid != target_pid:
            score = dice_coefficient(target_playlist, track_set)
            similarities.append((pid, score))

    # Sort by similarity score in descending order and return the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# Main function
def main():
    # Number of files to process
    NUM_FILES_TO_PROCESS = 200

    # Ensure the dataset folder exists
    if not os.path.exists(DATA_FOLDER):
        print(f"Dataset folder '{DATA_FOLDER}' does not exist. Please ensure the folder is set correctly.")
        return

    # Get the list of JSON files in the dataset folder
    all_files = sorted([file for file in os.listdir(DATA_FOLDER) if file.endswith(".json")], key=numerical_sort)

    if NUM_FILES_TO_PROCESS > len(all_files):
        print(f"There are only {len(all_files)} files available. Adjusting to process all available files.")
        num_files = len(all_files)
    else:
        num_files = NUM_FILES_TO_PROCESS

    # Limit the files to the first NUM_FILES_TO_PROCESS
    files_to_process = all_files[:num_files]

    # Create a Dask Bag to process the JSON files in parallel
    file_paths = [os.path.join(DATA_FOLDER, file) for file in files_to_process]
    data_bag = db.from_sequence(file_paths).map(load_playlists_from_file).flatten()

    # Map playlists to (pid, track_uri set) pairs
    print("Processing playlists into track_uri sets...")
    playlist_sets = data_bag.map(lambda p: (p["pid"], set(track["track_uri"] for track in p["tracks"])))

    # Convert playlist sets to a list for pairwise comparison
    all_playlists = playlist_sets.compute()

    # Ask the user for a playlist ID
    try:
        target_pid = int(input("Enter the Playlist ID to find similar playlists: "))
    except ValueError:
        print("Invalid Playlist ID. Please enter an integer.")
        return

    # Find the top 4 similar playlists for the given playlist ID
    print(f"Finding top 4 similar playlists for Playlist ID {target_pid}...")
    try:
        top_similarities = find_top_similar_playlists(target_pid, all_playlists, top_n=4)
    except ValueError as e:
        print(e)
        return

    # Display the results
    print(f"\nTop 4 Similar Playlists for Playlist ID {target_pid}:")
    for similar_pid, score in top_similarities:
        print(f"  Similar Playlist {similar_pid} - Dice Coefficient: {score:.4f}")

if __name__ == "__main__":
    main()


Processing playlists into track_uri sets...
Finding top 4 similar playlists for Playlist ID 5...

Top 4 Similar Playlists for Playlist ID 5:
  Similar Playlist 121485 - Dice Coefficient: 0.3140
  Similar Playlist 85708 - Dice Coefficient: 0.2895
  Similar Playlist 60498 - Dice Coefficient: 0.2560
  Similar Playlist 112947 - Dice Coefficient: 0.2488


In [6]:
import dask.bag as db
import json
import os
import re

# Path to your dataset folder
DATA_FOLDER = "data"

# Helper function to sort files numerically based on naming structure
def numerical_sort(value):
    match = re.search(r"mpd\.slice\.(\d+)-", value)
    return int(match.group(1)) if match else float('inf')

# Load playlists from a single JSON file
def load_playlists_from_file(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data["playlists"]

# Compute Jaccard similarity
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Extract artist sets for each playlist
def extract_artists_from_playlists(playlist):
    pid = playlist["pid"]
    # Extract unique artists from tracks
    artist_set = set(track["artist_uri"] for track in playlist["tracks"])
    return pid, artist_set

# Find the top N similar playlists for a given playlist ID
def find_top_similar_playlists(target_pid, all_playlists, top_n=4):
    # Convert list to dictionary for fast lookup
    playlists_dict = dict(all_playlists)

    # Get the target playlist's artist set
    target_artists = playlists_dict.get(target_pid)
    if not target_artists:
        raise ValueError(f"Playlist ID {target_pid} not found in the dataset.")

    similarities = []
    for pid, artist_set in all_playlists:
        if pid != target_pid:
            score = jaccard_similarity(target_artists, artist_set)
            similarities.append((pid, score))

    # Sort by similarity score in descending order and return the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# Main function
def main():
    # Number of files to process
    NUM_FILES_TO_PROCESS = 200

    # Ensure the dataset folder exists
    if not os.path.exists(DATA_FOLDER):
        print(f"Dataset folder '{DATA_FOLDER}' does not exist. Please ensure the folder is set correctly.")
        return

    # Get the list of JSON files in the dataset folder
    all_files = sorted([file for file in os.listdir(DATA_FOLDER) if file.endswith(".json")], key=numerical_sort)

    if NUM_FILES_TO_PROCESS > len(all_files):
        print(f"There are only {len(all_files)} files available. Adjusting to process all available files.")
        num_files = len(all_files)
    else:
        num_files = NUM_FILES_TO_PROCESS

    # Limit the files to the first NUM_FILES_TO_PROCESS
    files_to_process = all_files[:num_files]

    # Create a Dask Bag to process the JSON files in parallel
    file_paths = [os.path.join(DATA_FOLDER, file) for file in files_to_process]
    data_bag = db.from_sequence(file_paths).map(load_playlists_from_file).flatten()

    # Map playlists to (pid, artist set) pairs
    print("Extracting artists from playlists...")
    artist_sets = data_bag.map(extract_artists_from_playlists)

    # Convert artist sets to a list for pairwise comparison
    all_playlists = artist_sets.compute()

    # Ask the user for a playlist ID
    try:
        target_pid = int(input("Enter the Playlist ID to find similar playlists: "))
    except ValueError:
        print("Invalid Playlist ID. Please enter an integer.")
        return

    # Find the top 4 similar playlists for the given playlist ID
    print(f"Finding top 4 similar playlists for Playlist ID {target_pid}...")
    try:
        top_similarities = find_top_similar_playlists(target_pid, all_playlists, top_n=4)
    except ValueError as e:
        print(e)
        return

    # Display the results
    print(f"\nTop 4 Similar Playlists for Playlist ID {target_pid}:")
    for similar_pid, score in top_similarities:
        print(f"  Similar Playlist {similar_pid} - Jaccard Similarity: {score:.4f}")

if __name__ == "__main__":
    main()


Extracting artists from playlists...
Finding top 4 similar playlists for Playlist ID 5...

Top 4 Similar Playlists for Playlist ID 5:
  Similar Playlist 121485 - Jaccard Similarity: 0.2718
  Similar Playlist 78975 - Jaccard Similarity: 0.2299
  Similar Playlist 85708 - Jaccard Similarity: 0.2292
  Similar Playlist 178560 - Jaccard Similarity: 0.2277
