# Task 1: Fundamentals

Baseline, text-based similarity and retrieval, simple UI, accuracy metrics

- Input (query) to the IR system is the title and artist of a song (track) from the dataset

- Output of the IR system is a list of songs from the dataset (title and artist) of length N that
are similar to the query song

In [2]:
# Imports
import random
import pandas as pd
import ast  
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
import hashlib, os, json
from tqdm import tqdm
from ipywidgets import Combobox, Button, VBox, Output
from IPython.display import display

## Baseline System

Create a simple baseline system that randomly (regardless of the query) selects N items
from the catalog (excluding the query item); Make sure that the system produces new
results for each query!

In [33]:
# Load and process datasets 
def load_dataset_with_info(genre_file, info_file):
    """
    Load the genre, metadata, and information datasets.
    Merge them into one and parse the 'genre' column.
    """
    # Load genres and information
    genres = pd.read_csv(genre_file, sep='\t')
    info = pd.read_csv(info_file, sep='\t')
    
    # Ensure column names are consistent
    genres.columns = genres.columns.str.strip()
    info.columns = info.columns.str.strip()
    
    # Parse 'genre' column into lists
    genres['genre'] = genres['genre'].apply(ast.literal_eval)
    
    # Merge all datasets on the 'id' column
    dataset = pd.merge(genres, info, on='id')
    return dataset

# Define file paths
genre_file = "dataset/id_genres_mmsr.tsv"
info_file = "dataset/id_information_mmsr.tsv"

# Load the dataset
dataset = load_dataset_with_info(genre_file, info_file)

# Display a sample to ensure data is loaded correctly
dataset.head()



Unnamed: 0,id,genre,artist,song,album_name
0,01rMxQv6vhyE1oQX,"[rock, pop punk]",Against the Current,Chasing Ghosts,In Our Bones
1,02ZnlCGZEbkfCDxo,"[pop, italian pop, latin, europop, ambient, po...",Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te
2,04OjszRi9rC5BlHC,"[experimental, folk, lo fi, freak folk, indie ...",Grizzly Bear,Knife,Yellow House
3,04iitW3ffa0mhpx3,"[pop, r b, hip hop, soul, rhythm and blues, si...",Ne-Yo,Miss Independent,Year Of The Gentleman (Bonus Track Edition)
4,04xUDjAYC14jsHyH,"[punk, emo, post hardcore, post punk, melodic ...",Jawbreaker,Jinx Removing,24 Hour Revenge Therapy (Remastered)


In [34]:
# Random Baseline System (Adjusted for Song and Artist)
def random_baseline(query_song, dataset, N=10):
    """
    Generate a random list of N songs excluding the query song.
    Internally keep 'id', but exclude it in the output display.
    """
    filtered_dataset = dataset[dataset['id'] != query_song['id']].sample(frac=1, random_state=random.randint(0, 10000))
    return filtered_dataset.head(N)


# Test the adjusted random baseline
query_song = dataset.iloc[0]  # Example query song
print("Query Song:", query_song['song'], "by", query_song['artist'])

random_output = random_baseline(query_song, dataset, N=10)
print("\nRandom Output (Song and Artist):")
print(random_output[['song', 'artist']])  # Display only song and artist


Query Song: Chasing Ghosts by Against the Current

Random Output (Song and Artist):
                                    song                     artist
251                      Endless Endings  The Dillinger Escape Plan
2366  Save Yourself, I'll Hold Them Back        My Chemical Romance
2907            Too Much Of A Good Thing                   The Sons
3759                           Pensacola       Manchester Orchestra
1453                   Me In My Own Head                  Beartooth
3252                    One In A Million              Pet Shop Boys
4755                               Crawl                Chris Brown
846             That's Just What You Are                 Aimee Mann
7            The Devil Is in the Details           Boards of Canada
2679                             Que Lio               Willie Colón


## Accuracy Metrics

Implement and compute accuracy metrics for your baseline (Precision@10, Recall@10,
NDCG@10, MRR), averaged over all queries; Use genre as relevance criterion (i.e., a
retrieved song is considered relevant if its top genre matches the top genre of the query)

In [35]:
# Compute Precision@K
def precision_at_k(retrieved, relevant, k=10):
    retrieved_relevant = [song for song in retrieved[:k] if song in relevant]
    return len(retrieved_relevant) / k

# Compute Recall@K
def recall_at_k(retrieved, relevant, k=10):
    retrieved_relevant = [song for song in retrieved[:k] if song in relevant]
    return len(retrieved_relevant) / len(relevant) if relevant else 0

# Compute Mean Reciprocal Rank (MRR)
def mean_reciprocal_rank(retrieved, relevant):
    for idx, song in enumerate(retrieved):
        if song in relevant:
            return 1 / (idx + 1)
    return 0

# Compute NDCG@K
def ndcg_at_k(retrieved, relevant, k=10):
    y_true = [1 if song in relevant else 0 for song in retrieved[:k]]
    y_score = [1] * len(y_true)  # Assume all retrieved items have the same score
    return ndcg_score([y_true], [y_score], k=k)


In [36]:
# Evaluate Baseline System
def evaluate_baseline_with_info(dataset, N=10):
    """
    Evaluate the random baseline system on all queries.
    Only show 'song' and 'artist' in the output, but use 'id' for evaluations.
    """
    precision_list, recall_list, ndcg_list, mrr_list = [], [], [], []
    for _, query_song in dataset.iterrows():
        # Define query
        query_genre = query_song['genre'][0]  # Use the first genre as the primary genre
        relevant_songs = dataset[dataset['genre'].apply(lambda genres: query_genre in genres)]['id'].tolist()
        
        # Random baseline
        retrieved_songs = random_baseline(query_song, dataset, N=N)['id'].tolist()  # Use 'id' for metrics
        
        # Compute metrics
        precision_list.append(precision_at_k(retrieved_songs, relevant_songs, k=N))
        recall_list.append(recall_at_k(retrieved_songs, relevant_songs, k=N))
        ndcg_list.append(ndcg_at_k(retrieved_songs, relevant_songs, k=N))
        mrr_list.append(mean_reciprocal_rank(retrieved_songs, relevant_songs))
    
    # Compute averages
    return {
        "Precision@10": sum(precision_list) / len(precision_list),
        "Recall@10": sum(recall_list) / len(recall_list),
        "NDCG@10": sum(ndcg_list) / len(ndcg_list),
        "MRR": sum(mrr_list) / len(mrr_list),
    }

# Run evaluation on the baseline system
metrics = evaluate_baseline_with_info(dataset, N=10)

# Display the results
print("Evaluation Metrics for Random Baseline System:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")



Evaluation Metrics for Random Baseline System:
Precision@10: 0.1693
Recall@10: 0.0018
NDCG@10: 0.3595
MRR: 0.2938


## Retrieval System

In [37]:
def add_lyrics_to_dataset(dataset, lyrics_file):
    """
    Add lyrics to dataset and extract column values for text feature representation method.

    Args
    ----
        dataset : pd.DataFrame
            Pre-made dataset with song names, authors etc.
        lyrics_file : str
            Location of the file with precomputed lyrics features.

    Returns
    -------
        tuple
            Merged dataset, text feature representation method cols.
    """
    lyrics_data = pd.read_csv(lyrics_file, sep='\t')

    # Ensuring column name consistency
    dataset.columns = dataset.columns.str.strip()
    lyrics_data.columns = lyrics_data.columns.str.strip()

    # Excluding song col because it causes collision of the data and pandas processes it incorrectly
    lyrics_data = lyrics_data.drop(columns=['song'])

    merged_data = pd.merge(dataset, lyrics_data, on='id')

    # Excluding id
    feature_columns = lyrics_data.columns[1:]
    return merged_data, feature_columns

In [38]:
lyrics_tf_idf_file = 'dataset/id_lyrics_tf-idf_mmsr.tsv'

# Adding precomputed td-idf features to our dataset
dataset_with_lyrics, tfidf_columns = add_lyrics_to_dataset(dataset, lyrics_tf_idf_file)  

In [39]:
dataset_with_lyrics.head()

Unnamed: 0,id,genre,artist,song,album_name,abl,accept,across,act,addict,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,01rMxQv6vhyE1oQX,"[rock, pop punk]",Against the Current,Chasing Ghosts,In Our Bones,0.0,0.0,0.0,0.0,0.0,...,0.0,0.079754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,02ZnlCGZEbkfCDxo,"[pop, italian pop, latin, europop, ambient, po...",Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,04OjszRi9rC5BlHC,"[experimental, folk, lo fi, freak folk, indie ...",Grizzly Bear,Knife,Yellow House,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,04iitW3ffa0mhpx3,"[pop, r b, hip hop, soul, rhythm and blues, si...",Ne-Yo,Miss Independent,Year Of The Gentleman (Bonus Track Edition),0.0,0.0,0.0,0.0,0.0,...,0.0,0.792131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,04xUDjAYC14jsHyH,"[punk, emo, post hardcore, post punk, melodic ...",Jawbreaker,Jinx Removing,24 Hour Revenge Therapy (Remastered),0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
def compute_precomputed_embeddings_similarity(query_song, dataset, feature_columns, N=10):
    """
    Compute similarity using precomputed text embedding vectors from the dataset.

    Args
    ----
        query_song
            The query song (row) from the dataset.
        dataset
            Full dataset containing precomputed features vectors.
        feature_columns
            List of columns corresponding to feature representation method features.
        N
            Number of top results to retrieve.

    Returns
    -------
        pd.DataFrame
            A DataFrame of the top N similar songs.
    """
    # Extract the TF-IDF vector for the query song
    query_vector = query_song[feature_columns].values.reshape(1, -1)

    # Compute cosine similarity with all songs
    embed_matrix = dataset[feature_columns].values
    similarities = cosine_similarity(query_vector, embed_matrix).flatten()

    # Add similarity scores to dataset and exclude query song
    dataset['similarity'] = similarities
    results = dataset[dataset['id'] != query_song['id']].sort_values(by='similarity', ascending=False)

    return results.head(N)

In [None]:
def evaluate_precomputed_embeddings_retrieval(IR_system_name: str, dataset, feature_columns, N=10):
    """
    Evaluate the text-based retrieval system using the same metrics.
    
    Args
    ----
        IR_system_name
            Name of the IR system, must be one of "TF-IDF", "system2(TBD)", or "system3(TBD)".
        dataset
            Full dataset containing genres and precomputed text embedding vectors.
        feature_columns
            List of columns corresponding to text embedding features.
        N
            Number of top results to retrieve for evaluation.
    
    Returns
    -------
        dict
            A dictionary containing the evaluation metrics (Precision@10, Recall@10, etc.).
    """
    if IR_system_name not in ["TF-IDF", "system2(TBD)", "system3(TBD)"]:
        raise ValueError("IR_system_name must be one of 'TF-IDF', 'system2(TBD)', or 'system3(TBD)'")

    # Ensure the public directory exists
    os.makedirs("public", exist_ok=True)

    # Create a hash based on input parameters
    hash_input = IR_system_name + str(len(dataset)) + str(feature_columns) + str(N)
    params_hash = hashlib.md5(hash_input.encode()).hexdigest()
    filename = f"{params_hash[:5]}.json"
    filepath = os.path.join("public", filename)

    # Check if the JSON cache file exists
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            cached_data = json.load(f)
        # Extract metrics from cached file if present
        metadata = cached_data.get("metadata", {})
        print(f"The results are already cached. Metadata: {metadata}")
        return {
            "Precision@10": metadata.get("Precision@10", 0),
            "Recall@10": metadata.get("Recall@10", 0),
            "NDCG@10": metadata.get("NDCG@10", 0),
            "MRR": metadata.get("MRR", 0),
        }

    # No cache found, proceed with computation
    precision_list, recall_list, ndcg_list, mrr_list = [], [], [], []
    recommendations_dict = {}

    for _, query_song in tqdm(dataset.iterrows(), total=len(dataset), desc="Evaluating retrieval"):
        query_genres = set(query_song['genre'])  # Convert query genres to a set
        relevant_songs = dataset[dataset['genre'].apply(lambda genres: len(query_genres & set(genres)) > 0)]['id'].tolist()

        retrieved_songs = compute_precomputed_embeddings_similarity(query_song, dataset, feature_columns, N=N)['id'].tolist()

        # Store recommendations
        recommendations_dict[query_song['id']] = retrieved_songs

        # Compute metrics
        precision_list.append(precision_at_k(retrieved_songs, relevant_songs, k=N))
        recall_list.append(recall_at_k(retrieved_songs, relevant_songs, k=N))
        ndcg_list.append(ndcg_at_k(retrieved_songs, relevant_songs, k=N))
        mrr_list.append(mean_reciprocal_rank(retrieved_songs, relevant_songs))

    # Compute averages
    metrics = {
        "Precision@10": sum(precision_list) / len(precision_list),
        "Recall@10": sum(recall_list) / len(recall_list),
        "NDCG@10": sum(ndcg_list) / len(ndcg_list),
        "MRR": sum(mrr_list) / len(mrr_list),
    }

    # Save recommendations and metrics to JSON file
    output_data = {
        "metadata": {
            "IR_system_name": IR_system_name,
            "N": N,
            "feature_columns": list(feature_columns),
            "Precision@10": metrics["Precision@10"],
            "Recall@10": metrics["Recall@10"],
            "NDCG@10": metrics["NDCG@10"],
            "MRR": metrics["MRR"]
        },
        "content": recommendations_dict
    }

    with open(filepath, 'w') as f:
        json.dump(output_data, f, indent=4)

    return metrics


## Precompute recommendations

In [42]:
# Use 10% of the dataset for faster execution
# small_subset = dataset_with_lyrics.sample(frac=0.1, random_state=42)

# Run evaluation
tfidf_metrics = evaluate_precomputed_embeddings_retrieval("TF-IDF", dataset_with_lyrics, tfidf_columns, N=100)

# Display the results
print("TF-IDF Retrieval Evaluation Metrics:")
for metric, value in tfidf_metrics.items():
    print(f"{metric}: {value:.4f}")

Evaluating retrieval: 100%|██████████| 5148/5148 [06:01<00:00, 14.24it/s]


TF-IDF Retrieval Evaluation Metrics:
Precision@10: 0.2066
Recall@10: 0.0289
NDCG@10: 0.5146
MRR: 0.3663


In [43]:
def get_recommendations_from_json(artist: str, song: str, filename: str, dataset):
    # Load the cached JSON data
    with open(filename, 'r') as f:
        data = json.load(f)

    # Find the query song ID
    query_id_series = dataset.loc[(dataset['artist'] == artist) & (dataset['song'] == song), 'id']
    if query_id_series.empty:
        raise ValueError("Song by the given artist not found in the dataset.")
    query_id = query_id_series.values[0]

    # Retrieve recommended song IDs from JSON
    recommended_ids = data['content'].get(query_id, [])
    if not recommended_ids:
        return f"No recommendations found for {song} by {artist}."

    # Return a dataframe of recommended songs and their artists
    recommended_songs = dataset[dataset['id'].isin(recommended_ids)][['song', 'artist']]
    return recommended_songs


## Notebok UI

In [None]:
filename = "public/fb5d6.json"  # Example filename from the previous run

dataset = small_subset  # Use the small subset for the widget

# Extract unique artists
artists = sorted(dataset['artist'].unique())

# Create artist selection widget with autocomplete
artist_widget = Combobox(
    placeholder='Type or select an artist',
    options=artists,
    description='Artist:'
)

# Create song selection widget with autocomplete
song_widget = Combobox(
    placeholder='Type or select a song',
    options=[],
    description='Song:'
)

# Update the song widget options based on the selected artist
def update_songs(change):
    if change['name'] == 'value' and change['new']:
        selected_artist = change['new']
        # Filter songs by the chosen artist
        songs_by_artist = dataset[dataset['artist'] == selected_artist]['song'].unique()
        song_widget.options = sorted(songs_by_artist) if len(songs_by_artist) > 0 else []

artist_widget.observe(update_songs, 'value')

# Button to get recommendations
button = Button(description='Get Recommendations')
output = Output()

def on_button_click(b):
    with output:
        output.clear_output()
        if artist_widget.value and song_widget.value:
            # Call the previously defined function
            recommendations = get_recommendations_from_json(artist_widget.value, song_widget.value, filename, dataset)
            display(recommendations)
        else:
            print("Please select both artist and song")

button.on_click(on_button_click)

# Arrange the widgets in a vertical box layout
ui = VBox([artist_widget, song_widget, button, output])
display(ui)


VBox(children=(Combobox(value='', description='Artist:', options=('10,000 Maniacs', '2 Chainz', '2Pac', '3OH!3…

In [None]:
# List all your input TSV files
files = [
    "dataset/id_genres_mmsr.tsv",
    "dataset/id_metadata_mmsr.tsv",
    "dataset/id_information_mmsr.tsv",
    "dataset/id_url_mmsr.tsv",
    "dataset/id_total_listens.tsv"
]

# Read all the files into a list of DataFrames
dataframes = [pd.read_csv(f, sep='\t') for f in files]

# Merge them all by 'id'
merged_df = dataframes[0]
for df in dataframes[1:]:
    merged_df = pd.merge(merged_df, df, on='id', how='outer')

# Write the merged data to a new TSV file
merged_df.to_csv("public/data/merged_dataset.tsv", sep='\t', index=False)
print("Merging completed. 'merged_dataset.tsv' created.")

Merging completed. 'merged_dataset.tsv' created.


In [1]:
import os
import json

# Directory containing the JSON files
input_dir = '../public/data/precomputed_systems'
output_file = os.path.join(input_dir, 'metadata_by_file.json')

# Function to read and extract metadata from a JSON file
def extract_metadata(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        return data.get('metadata', {})

# Dictionary to hold metadata with filenames as keys
metadata_dict = {}

# Iterate over all files in the input directory
for filename in os.listdir(input_dir):
    file_path = os.path.join(input_dir, filename)
    if filename.endswith('.json') and file_path != output_file:
        metadata = extract_metadata(file_path)
        metadata_dict[filename] = metadata

# Write metadata dictionary to the output file
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(metadata_dict, file, indent=2)

print(f'Metadata by file written to {output_file}')

Metadata by file written to ../public/data/precomputed_systems\metadata_by_file.json
