In [None]:
!pip install pandas
import numpy as np
import pandas as pd
import t2_utils as t2
import time 
from ast import literal_eval
import os

In [None]:
# Directory that contains the dataset .tsv files -- change it here.
# An empty string means the files are looked up in the current working directory.
file_path_base = r""

# Paths are built with os.path.join so the base directory works with or without
# a trailing path separator (plain string concatenation silently produced a
# wrong path such as "dataid_information_mmsr.tsv" for a base of "data").

# Song metadata and lyrics-based feature files
file_path_info = os.path.join(file_path_base, "id_information_mmsr.tsv")
file_path_word2vec = os.path.join(file_path_base, "id_lyrics_word2vec_mmsr.tsv")
file_path_tfidf = os.path.join(file_path_base, "id_lyrics_tf-idf_mmsr.tsv")
file_path_bert = os.path.join(file_path_base, "id_lyrics_bert_mmsr.tsv")

# Genre annotations and audio-based feature files
file_path_genre = os.path.join(file_path_base, "id_genres_mmsr.tsv")
file_path_musicnn = os.path.join(file_path_base, "id_musicnn_mmsr.tsv")
file_path_mfcc_bow = os.path.join(file_path_base, "id_mfcc_bow_mmsr.tsv")
file_path_ivec_256 = os.path.join(file_path_base, "id_ivec256_mmsr.tsv")
file_path_logfluc = os.path.join(file_path_base, "id_blf_logfluc_mmsr.tsv")

# Load every tab-separated file into its own DataFrame
df_info = pd.read_table(file_path_info)
df_word2vec = pd.read_table(file_path_word2vec)
df_tfidf = pd.read_table(file_path_tfidf)
df_bert = pd.read_table(file_path_bert)

df_genre = pd.read_table(file_path_genre)
df_musicnn = pd.read_table(file_path_musicnn)
df_mfcc_bow = pd.read_table(file_path_mfcc_bow)
df_ivec_256 = pd.read_table(file_path_ivec_256)
df_logfluc = pd.read_table(file_path_logfluc)

# The genre column is read as text; parse each entry back into a Python object
# (presumably a list of genre names -- it is exploded further down).
df_genre["genre"] = df_genre["genre"].apply(literal_eval)

In [None]:
# Flatten the per-song genre entries into one array, then keep the distinct genres
genres = df_genre["genre"].explode().to_numpy()
unique_genres = np.unique(genres)
len(unique_genres)

In [None]:
# Audio-based retrieval (cos-sim) on the MFCC BoW features
# NOTE(review): 10 is presumably the number of results to return -- confirm in t2_utils
t2.song_retrieval(
    df_info,
    df_mfcc_bow,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the DNN (musicnn) features
# NOTE(review): 10 is presumably the number of results to return -- confirm in t2_utils
t2.song_retrieval(
    df_info,
    df_musicnn,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the ivec 256 features
# NOTE(review): 10 is presumably the number of results to return -- confirm in t2_utils
t2.song_retrieval(
    df_info,
    df_ivec_256,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the BLF logfluc features
# NOTE(review): 10 is presumably the number of results to return -- confirm in t2_utils
t2.song_retrieval(
    df_info,
    df_logfluc,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Store one t2.idcg result per row; every call receives the row and the full genre frame
df_genre["idcg_value"] = df_genre.apply(t2.idcg, axis=1, args=(df_genre,))
df_genre

In [None]:
# Combine song metadata with the genre frame, matching rows on the shared "id" column
df_song_data = df_info.merge(df_genre, on="id")
df_song_data

In [None]:
# Warning: running this can take a long time
start_time = time.time()

# One entry per retrieval system: (row label, feature DataFrame, similarity function)
retr_systems = [
    ("random baseline", df_song_data, t2.cos_sim),
    ("word2vec euc-sim", df_word2vec, t2.euc_sim),
    ("tfidf cos-sim", df_tfidf, t2.cos_sim),
    ("bert cos-sim", df_bert, t2.cos_sim),
    ("mfcc_bow cos-sim", df_mfcc_bow, t2.cos_sim),
    ("musicnn cos-sim", df_musicnn, t2.cos_sim),
    ("ivec_256 cos-sim", df_ivec_256, t2.cos_sim),
    ("logfluc cos-sim", df_logfluc, t2.cos_sim),
]

retr_systems_columns = ["Avg ndcg", "Avg genre diversity", "Genre coverage"]
retr_systems_index = [entry[0] for entry in retr_systems]

# Empty result table: one row per system, one column per metric
df_metrics = pd.DataFrame(index=retr_systems_index, columns=retr_systems_columns)

for system_name, features, similarity in retr_systems:
    # The last argument is True when the feature set has the same content as
    # df_song_data, as is the case for the "random baseline" entry.
    is_song_data = df_song_data.equals(features)
    df_metrics.loc[system_name] = t2.evaluation_pipeline(df_song_data, features, similarity, is_song_data)

print("Execution time(min):", (time.time() - start_time) / 60)
df_metrics

In [None]:
# The cells below run the evaluation pipeline for each retrieval system individually; they repeat the work of the loop above and can be skipped.

In [None]:
# Stand-alone evaluation of the random baseline: df_song_data is passed as the
# feature set and the last argument is True.
st = time.time()
metrics_random = t2.evaluation_pipeline(df_song_data, df_song_data, t2.cos_sim, True)
print("random")
# Report the metrics computed in this cell. The cell used to print
# metrics_word2vec, which is only assigned in a later cell (NameError on a
# fresh kernel, or the wrong system's numbers otherwise).
print("Avg ndcg:", metrics_random[0])
print("Avg genre diversity:", metrics_random[1])
print("genre coverage:", metrics_random[2])
print("Execution time(min):", (time.time()-st)/60)

In [None]:
# Stand-alone evaluation of the word2vec features with t2.euc_sim
start_time = time.time()
metrics_word2vec = t2.evaluation_pipeline(df_song_data, df_word2vec, t2.euc_sim)

print("word2vec")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_word2vec[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the tf-idf features with t2.cos_sim
start_time = time.time()
metrics_tfidf = t2.evaluation_pipeline(df_song_data, df_tfidf, t2.cos_sim)

print("tfidf")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_tfidf[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the BERT features with t2.cos_sim
start_time = time.time()
metrics_bert = t2.evaluation_pipeline(df_song_data, df_bert, t2.cos_sim)

print("bert")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_bert[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the MFCC BoW features with t2.cos_sim
start_time = time.time()
metrics_mfcc_bow = t2.evaluation_pipeline(df_song_data, df_mfcc_bow, t2.cos_sim)

print("mfcc_bow")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_mfcc_bow[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the musicnn features with t2.cos_sim
start_time = time.time()
metrics_musicnn = t2.evaluation_pipeline(df_song_data, df_musicnn, t2.cos_sim)

print("musicnn")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_musicnn[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the ivec 256 features with t2.cos_sim
start_time = time.time()
metrics_ivec_256 = t2.evaluation_pipeline(df_song_data, df_ivec_256, t2.cos_sim)

print("ivec_256")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_ivec_256[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# Stand-alone evaluation of the BLF logfluc features with t2.cos_sim
start_time = time.time()
metrics_logfluc = t2.evaluation_pipeline(df_song_data, df_logfluc, t2.cos_sim)

print("logfluc")
# Print the first three entries of the result next to their labels
for position, label in enumerate(("Avg ndcg:", "Avg genre diversity:", "genre coverage:")):
    print(label, metrics_logfluc[position])
print("Execution time(min):", (time.time() - start_time) / 60)

In [None]:
# TODO: when all RS finished -> Please ignore
# Unfinished draft: for each feature table, apply calc_ndcg to every row of
# df_info and store the mean of the results under the table's name.
# NOTE(review): calc_ndcg and idcg_matrix are not defined in the visible part of
# this notebook -- confirm where they come from before running this cell.
df_dict = {
    'df_word2vec': pd.read_table(file_path_word2vec),
    'df_tfidf': pd.read_table(file_path_tfidf)
}  # todo rest: df_bert, df_mfcc_bow, df_musicnn, df_ivec_256

# Maps each key of df_dict to the mean of its per-row calc_ndcg results
result = {}

for key, value in df_dict.items():
    print(f"DataFrame {key}:")
    # Each call receives (row of df_info, feature table, idcg_matrix)
    dcg_value_list = df_info.apply(calc_ndcg, axis=1, args=(value, idcg_matrix,))
    result[key] = np.mean(dcg_value_list)