1er essai, ça marche

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

# Load dataset
playlists_df = pd.read_csv("processed_first_50_files.csv")

# Encode playlists and tracks
playlists_df['playlist_idx'] = playlists_df['playlist_id'].astype('category').cat.codes
playlists_df['track_idx'] = playlists_df['track_uri'].astype('category').cat.codes

# Create playlist-track interaction matrix
interaction_matrix = csr_matrix(
    (np.ones(len(playlists_df)), 
     (playlists_df['playlist_idx'], playlists_df['track_idx']))
)


In [45]:
# Split playlists into train and test sets
playlist_ids = playlists_df['playlist_id'].unique()
train_ids, test_ids = train_test_split(playlist_ids, test_size=0.1, random_state=42)

# Create train and test datasets
train_df = playlists_df[playlists_df['playlist_id'].isin(train_ids)]
test_df = playlists_df[playlists_df['playlist_id'].isin(test_ids)]

In [46]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Train Matrix Factorization Model (SVD)
svd = TruncatedSVD(n_components=50, random_state=42)
playlist_embeddings = svd.fit_transform(interaction_matrix)

# Predict missing tracks for playlists in the test set
def recommend_tracks(playlist_idx, top_n=500):
    # Compute cosine similarity
    similarities = cosine_similarity(playlist_embeddings[playlist_idx].reshape(1, -1), playlist_embeddings).flatten()
    
    # Rank tracks by similarity
    similar_playlists = np.argsort(-similarities)[1:]  # Exclude itself
    recommended_tracks = set()

    for idx in similar_playlists:
        if len(recommended_tracks) >= top_n:
            break
        recommended_tracks.update(train_df[train_df['playlist_idx'] == idx]['track_uri'])
    
    return list(recommended_tracks)[:top_n]


In [47]:
def evaluate_model(test_df, top_n=500):
    r_precision = []
    ndcg_scores = []

    for playlist_id in test_df['playlist_id'].unique()[:10]:  # Limit to 5 for debugging
        # Get ground truth and seed tracks
        ground_truth = set(test_df[test_df['playlist_id'] == playlist_id]['track_uri'])
        seed_tracks = list(ground_truth)[:max(1, len(ground_truth)//2)]  # Use 50% as seed
        holdout_tracks = ground_truth - set(seed_tracks)

        # Get recommendations
        playlist_idx = test_df[test_df['playlist_id'] == playlist_id]['playlist_idx'].iloc[0]
        recommended_tracks = recommend_tracks(playlist_idx, top_n)

        # Calculate R-Precision
        relevant_tracks = len(set(recommended_tracks) & holdout_tracks)
        r_precision.append(relevant_tracks / len(holdout_tracks) if holdout_tracks else 0)

        # Calculate NDCG
        dcg = sum([1 / np.log2(i + 2) if track in holdout_tracks else 0 for i, track in enumerate(recommended_tracks)])
        idcg = sum([1 / np.log2(i + 2) for i in range(len(holdout_tracks))])
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0)

    # Report metrics
    print("R-Precision:", np.mean(r_precision))
    print("NDCG:", np.mean(ndcg_scores))

# Run evaluation
evaluate_model(test_df)


R-Precision: 0.41030244417640205
NDCG: 0.17023055036950918


2e essai

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from collections import defaultdict, Counter
from scipy.spatial.distance import jaccard
from sklearn.metrics import roc_auc_score

# Ensure reproducibility
np.random.seed(42)

In [None]:
# Load dataset
data_path = "processed_first_50_files.csv"  # Replace with your dataset path
data = pd.read_csv(data_path)

# Display basic info about the dataset
data.head()


Unnamed: 0,playlist_id,playlist_name,track_position,track_name,artist_name,album_name,duration_ms,track_uri
0,0,Throwbacks,0,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,226863,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,Throwbacks,1,Toxic,Britney Spears,In The Zone,198800,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,Throwbacks,2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),235933,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,Throwbacks,3,Rock Your Body,Justin Timberlake,Justified,267266,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,Throwbacks,4,It Wasn't Me,Shaggy,Hot Shot,227600,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [3]:
def compute_r_precision(recommended_tracks, ground_truth):
    relevant_tracks = len(set(recommended_tracks) & set(ground_truth))
    return relevant_tracks / len(ground_truth) if ground_truth else 0

def compute_ndcg(recommended_tracks, ground_truth):
    dcg = sum([1 / np.log2(i + 2) if track in ground_truth else 0 for i, track in enumerate(recommended_tracks)])
    idcg = sum([1 / np.log2(i + 2) for i in range(len(ground_truth))])
    return dcg / idcg if idcg > 0 else 0

def compute_auc(recommended_scores, ground_truth_labels):
    return roc_auc_score(ground_truth_labels, recommended_scores)


In [4]:
def jaccard_similarity(playlist1, playlist2):
    set1, set2 = set(playlist1), set(playlist2)
    return len(set1 & set2) / len(set1 | set2) if set1 | set2 else 0

# Generate recommendations
def recommend_jaccard(playlist, all_playlists, top_n=500):
    similarities = []
    for pid, tracks in all_playlists.items():
        similarity = jaccard_similarity(playlist, tracks)
        similarities.append((pid, similarity))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    recommended_playlists = similarities[:top_n]
    recommended_tracks = {track for pid, _ in recommended_playlists for track in all_playlists[pid]}
    return list(recommended_tracks)[:top_n]

# Evaluate Jaccard Model
# Create a dictionary: playlist_id -> list of track_uri
all_playlists = data.groupby('playlist_id')['track_uri'].apply(list).to_dict()

# Example: Evaluate a single playlist
playlist_id = list(all_playlists.keys())[0]
ground_truth = set(all_playlists[playlist_id])
seed_tracks = list(ground_truth)[:max(1, len(ground_truth)//2)]
holdout_tracks = ground_truth - set(seed_tracks)

recommended_tracks = recommend_jaccard(seed_tracks, all_playlists)

# Calculate metrics
r_precision = compute_r_precision(recommended_tracks, holdout_tracks)
ndcg = compute_ndcg(recommended_tracks, holdout_tracks)
print(f"Jaccard Model - R-Precision: {r_precision}, NDCG: {ndcg}")


Jaccard Model - R-Precision: 0.038461538461538464, NDCG: 0.013957181036599698


In [5]:
# Prepare data for Item2Vec
sentences = data.groupby('playlist_id')['track_uri'].apply(list).tolist()

# Train Word2Vec Model
item2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, sg=1, workers=4)

def recommend_item2vec(seed_tracks, all_tracks, top_n=500):
    similarities = {}
    for track in all_tracks:
        if track not in seed_tracks:
            similarity = sum(item2vec_model.wv.similarity(seed_track, track) for seed_track in seed_tracks if track in item2vec_model.wv)
            similarities[track] = similarity
    recommended_tracks = sorted(similarities.keys(), key=lambda x: similarities[x], reverse=True)
    return recommended_tracks[:top_n]

# Evaluate Item2Vec Model
all_tracks = set(data['track_uri'])
recommended_tracks = recommend_item2vec(seed_tracks, all_tracks)

# Calculate metrics
r_precision = compute_r_precision(recommended_tracks, holdout_tracks)
ndcg = compute_ndcg(recommended_tracks, holdout_tracks)
print(f"Item2Vec Model - R-Precision: {r_precision}, NDCG: {ndcg}")


Item2Vec Model - R-Precision: 0.38461538461538464, NDCG: 0.17926785246016472


In [6]:
# Prepare interaction matrix
interaction_matrix = pd.pivot_table(data, index='playlist_id', columns='track_uri', aggfunc='size', fill_value=0)

# Train SVD Model
svd = TruncatedSVD(n_components=50, random_state=42)
playlist_embeddings = svd.fit_transform(interaction_matrix)

def recommend_fism(playlist_idx, interaction_matrix, top_n=500):
    similarities = cosine_similarity(playlist_embeddings[playlist_idx].reshape(1, -1), playlist_embeddings).flatten()
    similar_playlists = np.argsort(-similarities)[1:]  # Exclude itself
    recommended_tracks = set()
    for idx in similar_playlists:
        if len(recommended_tracks) >= top_n:
            break
        recommended_tracks.update(interaction_matrix.columns[interaction_matrix.iloc[idx] > 0])
    return list(recommended_tracks)[:top_n]

# Evaluate FISM Model
playlist_idx = list(interaction_matrix.index).index(playlist_id)
recommended_tracks = recommend_fism(playlist_idx, interaction_matrix)

# Calculate metrics
r_precision = compute_r_precision(recommended_tracks, holdout_tracks)
ndcg = compute_ndcg(recommended_tracks, holdout_tracks)
print(f"FISM Model - R-Precision: {r_precision}, NDCG: {ndcg}")


  num_cells = num_rows * num_columns


IndexError: index 1376000540 is out of bounds for axis 0 with size 1375963520

3e essai

In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import roc_auc_score

# Ensure reproducibility
np.random.seed(42)


In [20]:
# Load dataset
data_path = "processed_first_50_files.csv"  # Replace with your dataset path
data = pd.read_csv(data_path)

# Split into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Display basic info about the datasets
print("Training Data:")
print(train_df.head())
print("\nTesting Data:")
print(test_df.head())


Training Data:
         playlist_id playlist_name  track_position             track_name  \
3289146       142108      throwbax              17                     Oh   
3126781        14679          viva              30               Abrázame   
1203909       113981  random songs              67  Different Color Molly   
2698683       134146         Venus              13        Kid Charlemagne   
2796050       135659       Country              50        Californication   

                   artist_name       album_name  duration_ms  \
3289146                  Ciara          Goodies       256346   
3126781                 Camila      Todo Cambio       230720   
1203909             Smokepurpp         Deadstar       185298   
2698683             Steely Dan   The Royal Scam       278800   
2796050  Red Hot Chili Peppers  Californication       329733   

                                    track_uri  
3289146  spotify:track:7i7UIbm5E0DD7aSOYvwp2v  
3126781  spotify:track:6rCyXpDwlMf1GH2qlp

In [21]:
def compute_r_precision(recommended_tracks, ground_truth):
    relevant_tracks = len(set(recommended_tracks) & set(ground_truth))
    return relevant_tracks / len(ground_truth) if ground_truth else 0

def compute_ndcg(recommended_tracks, ground_truth):
    dcg = sum([1 / np.log2(i + 2) if track in ground_truth else 0 for i, track in enumerate(recommended_tracks)])
    idcg = sum([1 / np.log2(i + 2) for i in range(len(ground_truth))])
    return dcg / idcg if idcg > 0 else 0

def compute_auc(recommended_scores, ground_truth_labels):
    return roc_auc_score(ground_truth_labels, recommended_scores)


In [25]:
# Compute Jaccard similarity
def jaccard_similarity(playlist1, playlist2):
    set1, set2 = set(playlist1), set(playlist2)
    return len(set1 & set2) / len(set1 | set2) if set1 | set2 else 0

# Generate recommendations using the training set
def recommend_jaccard(playlist, train_playlists, top_n=500):
    similarities = []
    for pid, tracks in train_playlists.items():
        similarity = jaccard_similarity(playlist, tracks)
        similarities.append((pid, similarity))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    recommended_playlists = similarities[:top_n]
    recommended_tracks = {track for pid, _ in recommended_playlists for track in train_playlists[pid]}
    return list(recommended_tracks)[:top_n]

# Main function for evaluating a single playlist
def evaluate_single_playlist(train_df, test_df):
    # Prepare train and test playlists
    train_playlists = train_df.groupby('playlist_id')['track_uri'].apply(list).to_dict()
    test_playlists = test_df.groupby('playlist_id')['track_uri'].apply(list).to_dict()

    # Select one playlist for evaluation
    playlist_id = list(test_playlists.keys())[0]
    tracks = test_playlists[playlist_id]
    
    # Split tracks into seed and holdout
    seed_tracks = tracks[:max(1, len(tracks) // 2)]
    holdout_tracks = set(tracks) - set(seed_tracks)
    
    # Generate recommendations
    recommended_tracks = recommend_jaccard(seed_tracks, train_playlists)
    
    # Calculate metrics
    r_precision = len(set(recommended_tracks) & holdout_tracks) / len(holdout_tracks) if holdout_tracks else 0
    dcg = sum(1 / np.log2(i + 2) for i, track in enumerate(recommended_tracks) if track in holdout_tracks)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(holdout_tracks)))
    ndcg = dcg / idcg if idcg > 0 else 0

    # Print the results
    print(f"Jaccard Model - Playlist ID: {playlist_id}")
    print(f"R-Precision: {r_precision}")
    print(f"NDCG: {ndcg}")

# Call the function
evaluate_single_playlist(train_df, test_df)


Jaccard Model - Playlist ID: 0
R-Precision: 0.16666666666666666
NDCG: 0.034345378034907025


In [27]:
# Import necessary libraries for Item2Vec
from gensim.models import Word2Vec
import numpy as np

# Train an Item2Vec model

def train_item2vec(train_df, vector_size=50, window=5, min_count=1, epochs=10):
    """Train an Item2Vec model using tracks in playlists."""
    # Prepare sentences (playlists as lists of track URIs)
    playlists = train_df.groupby('playlist_id')['track_uri'].apply(list).tolist()
    
    # Train Word2Vec (Item2Vec)
    model = Word2Vec(sentences=playlists, vector_size=vector_size, window=window, min_count=min_count, sg=1, epochs=epochs)
    return model

# Generate recommendations using the trained Item2Vec model
def recommend_item2vec(seed_tracks, model, train_playlists, top_n=500):
    """Generate recommendations for a playlist using the Item2Vec model."""
    recommended_tracks = set()

    for track in seed_tracks:
        if track in model.wv:
            # Get most similar tracks for the seed track
            similar_tracks = model.wv.most_similar(track, topn=top_n)
            recommended_tracks.update([t[0] for t in similar_tracks])

    # Filter out seed tracks
    recommended_tracks -= set(seed_tracks)

    return list(recommended_tracks)[:top_n]

# Main function for evaluating a single playlist using Item2Vec
def evaluate_single_playlist_item2vec(train_df, test_df):
    # Train Item2Vec model
    print("Training Item2Vec model...")
    item2vec_model = train_item2vec(train_df)

    # Prepare train and test playlists
    train_playlists = train_df.groupby('playlist_id')['track_uri'].apply(list).to_dict()
    test_playlists = test_df.groupby('playlist_id')['track_uri'].apply(list).to_dict()

    # Select one playlist for evaluation
    playlist_id = list(test_playlists.keys())[0]
    tracks = test_playlists[playlist_id]

    # Split tracks into seed and holdout
    seed_tracks = tracks[:max(1, len(tracks) // 2)]
    holdout_tracks = set(tracks) - set(seed_tracks)

    # Generate recommendations
    recommended_tracks = recommend_item2vec(seed_tracks, item2vec_model, train_playlists)

    # Calculate metrics
    r_precision = len(set(recommended_tracks) & holdout_tracks) / len(holdout_tracks) if holdout_tracks else 0
    dcg = sum(1 / np.log2(i + 2) for i, track in enumerate(recommended_tracks) if track in holdout_tracks)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(holdout_tracks)))
    ndcg = dcg / idcg if idcg > 0 else 0

    # Print the results
    print(f"Item2Vec Model - Playlist ID: {playlist_id}")
    print(f"R-Precision: {r_precision}")
    print(f"NDCG: {ndcg}")

# Call the function
evaluate_single_playlist_item2vec(train_df, test_df)


Training Item2Vec model...
Item2Vec Model - Playlist ID: 0
R-Precision: 0.5
NDCG: 0.15014788133756604


In [33]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Generate FISM embeddings using sparse matrices
def generate_fism_embeddings_sparse(interaction_matrix, alpha=0.5):
    """
    Generate FISM embeddings for all playlists and tracks using a sparse matrix.

    Parameters:
        interaction_matrix (csr_matrix): A binary playlist-track interaction sparse matrix.
        alpha (float): Neighborhood agreement value in the range of [0, 1].

    Returns:
        csr_matrix: Track embeddings.
    """
    row_sum = np.array(interaction_matrix.sum(axis=1)).flatten()
    row_sum_alpha = np.power(row_sum, alpha)
    row_sum_alpha[row_sum_alpha == 0] = 1  # Avoid division by zero

    normalized_matrix = interaction_matrix.multiply(1 / row_sum_alpha[:, np.newaxis])
    track_embeddings = normalized_matrix.T @ interaction_matrix

    return track_embeddings

def evaluate_fism_sparse(train_df, test_df, alpha=0.5):
    # Debug: Print columns to verify
    print("Train DataFrame Columns:", train_df.columns)
    print("Test DataFrame Columns:", test_df.columns)

    # Ensure the columns are correctly named
    if 'playlist_id' not in train_df.columns or 'track_uri' not in train_df.columns:
        raise KeyError("Missing required columns: 'playlist_id' or 'track_uri'")

    # Proceed with existing implementation...


# Main evaluation function for FISM
def evaluate_fism_sparse(train_df, test_df, alpha=0.5):
    """
    Evaluate FISM model using sparse matrices.

    Parameters:
        train_df (pd.DataFrame): Training data.
        test_df (pd.DataFrame): Testing data.
        alpha (float): Neighborhood agreement value for FISM.

    Returns:
        None
    """
    # Create a mapping of playlist and track IDs to sequential indices
    playlist_ids = train_df['playlist_id'].unique()
    playlist_to_idx = {pid: idx for idx, pid in enumerate(playlist_ids)}
    idx_to_playlist = {idx: pid for pid, idx in playlist_to_idx.items()}

    all_tracks = list(set(train_df['track_uri']).union(set(test_df['track_uri'])))
    track_to_idx = {track: idx for idx, track in enumerate(all_tracks)}
    idx_to_track = {idx: track for track, idx in track_to_idx.items()}

    num_playlists = len(playlist_ids)
    num_tracks = len(all_tracks)

    # Create a sparse interaction matrix
    rows, cols, data = [], [], []
    for pid, tracks in train_df.groupby('playlist_id')['track_uri']:
        for track in tracks:
            rows.append(playlist_to_idx[pid])
            cols.append(track_to_idx[track])
            data.append(1)

    interaction_matrix = csr_matrix((data, (rows, cols)), shape=(num_playlists, num_tracks))

    # Generate FISM embeddings
    track_embeddings = generate_fism_embeddings_sparse(interaction_matrix, alpha)

    # Select a single test playlist
    test_playlists = test_df.groupby('playlist_id')['track_uri'].apply(list).to_dict()
    playlist_id = list(test_playlists.keys())[0]
    tracks = test_playlists[playlist_id]

    # Split tracks into seed and holdout
    seed_tracks = tracks[:max(1, len(tracks) // 2)]
    holdout_tracks = set(tracks) - set(seed_tracks)

    # Find seed indices
    seed_indices = [track_to_idx[track] for track in seed_tracks if track in track_to_idx]

    # Recommend tracks
    playlist_idx = playlist_to_idx[playlist_id]  # Map playlist_id to row index
    scores = track_embeddings[:, playlist_idx].toarray().flatten()
    recommended_indices = np.argsort(-scores)
    recommended_tracks = [idx_to_track[idx] for idx in recommended_indices if idx in idx_to_track]

    # Calculate metrics
    r_precision = len(set(recommended_tracks) & holdout_tracks) / len(holdout_tracks) if holdout_tracks else 0
    dcg = sum(1 / np.log2(i + 2) for i, track in enumerate(recommended_tracks) if track in holdout_tracks)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(holdout_tracks)))
    ndcg = dcg / idcg if idcg > 0 else 0

    # Print the results
    print(f"FISM Model - Playlist ID: {playlist_id}")
    print(f"R-Precision: {r_precision}")
    print(f"NDCG: {ndcg}")

# Example Usage
evaluate_fism_sparse(train_df, test_df)


KeyError: 'Column not found: 0'

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import dask.dataframe as dd

# Load dataset
data_path = "processed_first_50_files.csv"  # Replace with your dataset path
data = pd.read_csv(data_path)

# Split into training and testing sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Confirm types
print(type(train_df))  # Should be <class 'pandas.core.frame.DataFrame'>
print(type(test_df))  # Should be <class 'pandas.core.frame.DataFrame'>

# Convert to Dask DataFrames
train_df = dd.from_pandas(train_df, npartitions=4)
test_df = dd.from_pandas(test_df, npartitions=2)

# Confirm Dask DataFrame types
print(type(train_df))  # Should be <class 'dask.dataframe.core.DataFrame'>
print(type(test_df))  # Should be <class 'dask.dataframe.core.DataFrame'>


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'dask_expr._collection.DataFrame'>
<class 'dask_expr._collection.DataFrame'>


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import dask.dataframe as dd

# Load dataset
data_path = "processed_first_50_files.csv"  # Replace with your dataset path
data = pd.read_csv(data_path)

# Split into training and testing sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Confirm types
print(type(train_df))  # Should be <class 'pandas.core.frame.DataFrame'>
print(type(test_df))  # Should be <class 'pandas.core.frame.DataFrame'>

# Convert to Dask DataFrames
train_df = dd.from_pandas(train_df, npartitions=4)
test_df = dd.from_pandas(test_df, npartitions=2)

# Confirm Dask DataFrame types
print(type(train_df))  # Should be <class 'dask.dataframe.core.DataFrame'>
print(type(test_df))  # Should be <class 'dask.dataframe.core.DataFrame'>


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'dask_expr._collection.DataFrame'>
<class 'dask_expr._collection.DataFrame'>


In [34]:
import numpy as np
import dask.dataframe as dd
from dask import delayed
from scipy.sparse import csr_matrix

# Generate FISM embeddings using sparse matrices
def generate_fism_embeddings_sparse(interaction_matrix, alpha=0.5):
    row_sum = np.array(interaction_matrix.sum(axis=1)).flatten()
    row_sum_alpha = np.power(row_sum, alpha)
    row_sum_alpha[row_sum_alpha == 0] = 1  # Avoid division by zero

    normalized_matrix = interaction_matrix.multiply(1 / row_sum_alpha[:, np.newaxis])
    track_embeddings = normalized_matrix.T @ interaction_matrix
    return track_embeddings

# Prepare the interaction matrix using Dask
def prepare_interaction_matrix_dask(train_df, track_to_idx, num_playlists, num_tracks):
    rows, cols, data = [], [], []

    @delayed
    def process_partition(partition):
        partition_rows, partition_cols, partition_data = [], [], []
        for pid, tracks in partition.groupby('playlist_id')['track_uri']:
            for track in tracks:
                partition_rows.append(pid)  # Playlist ID
                partition_cols.append(track_to_idx[track])  # Convert track URI to index
                partition_data.append(1)
        return partition_rows, partition_cols, partition_data

    partitions = [
        process_partition(partition)
        for partition in train_df.to_delayed()
    ]

    results = delayed(partitions).compute()
    for r, c, d in results:
        rows.extend(r)
        cols.extend(c)
        data.extend(d)

    interaction_matrix = csr_matrix((data, (rows, cols)), shape=(num_playlists, num_tracks))
    return interaction_matrix

# Evaluate FISM using Dask
def evaluate_fism_dask(train_df, test_df, alpha=0.5):
    # Convert track URIs to indices
    all_tracks = dd.concat([train_df['track_uri'], test_df['track_uri']]).unique().compute()
    track_to_idx = {track: idx for idx, track in enumerate(all_tracks)}
    idx_to_track = {idx: track for track, idx in track_to_idx.items()}

    num_playlists = train_df['playlist_id'].nunique().compute()
    num_tracks = len(all_tracks)

    # Prepare interaction matrix
    print("Preparing interaction matrix...")
    interaction_matrix = prepare_interaction_matrix_dask(train_df, track_to_idx, num_playlists, num_tracks)

    # Generate FISM embeddings
    print("Generating FISM embeddings...")
    track_embeddings = generate_fism_embeddings_sparse(interaction_matrix, alpha)

    # Select a single test playlist
    test_playlists = test_df.groupby('playlist_id')['track_uri'].apply(list).compute()
    playlist_id = list(test_playlists.keys())[0]
    tracks = test_playlists[playlist_id]

    # Split tracks into seed and holdout
    seed_tracks = tracks[:max(1, len(tracks) // 2)]
    holdout_tracks = set(tracks) - set(seed_tracks)

    # Find seed indices
    seed_indices = [track_to_idx[track] for track in seed_tracks if track in track_to_idx]

    # Recommend tracks
    playlist_idx = playlist_id  # Assume playlist_id matches interaction matrix row
    scores = track_embeddings[:, playlist_idx].toarray().flatten()
    recommended_indices = np.argsort(-scores)
    recommended_tracks = [idx_to_track[idx] for idx in recommended_indices if idx in idx_to_track]

    # Calculate metrics
    r_precision = len(set(recommended_tracks) & holdout_tracks) / len(holdout_tracks) if holdout_tracks else 0
    dcg = sum(1 / np.log2(i + 2) for i, track in enumerate(recommended_tracks) if track in holdout_tracks)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(holdout_tracks)))
    ndcg = dcg / idcg if idcg > 0 else 0

    # Print the results
    print(f"FISM Model - Playlist ID: {playlist_id}")
    print(f"R-Precision: {r_precision}")
    print(f"NDCG: {ndcg}")

# Load Dask DataFrames
train_df = dd.from_pandas(train_df, npartitions=4)
test_df = dd.from_pandas(test_df, npartitions=2)

# Example Usage
evaluate_fism_dask(train_df, test_df)


TypeError: Input must be a pandas DataFrame or Series.

In [None]:
# Summarize Results
results = {
    "Model": ["Jaccard", "Item2Vec", "FISM"],
    "R-Precision": [r_precision_jaccard, r_precision_item2vec, r_precision_fism],
    "NDCG": [ndcg_jaccard, ndcg_item2vec, ndcg_fism]
}
results_df = pd.DataFrame(results)
results_df
