In [None]:
!pip install pandas
import numpy as np
import pandas as pd
import t2_utils as t2
import time 
from ast import literal_eval
import os

In [None]:
# change the file path here
# Directory that holds the MMSR *.tsv files. os.path.join inserts the path
# separator, so a trailing slash is optional; an empty string means the
# current working directory.
file_path_base = r""

# track metadata and lyrics-derived feature files
file_path_info = os.path.join(file_path_base, "id_information_mmsr.tsv")
file_path_word2vec = os.path.join(file_path_base, "id_lyrics_word2vec_mmsr.tsv")
file_path_tfidf = os.path.join(file_path_base, "id_lyrics_tf-idf_mmsr.tsv")
file_path_bert = os.path.join(file_path_base, "id_lyrics_bert_mmsr.tsv")

# genre annotations and audio-derived feature files
file_path_genre = os.path.join(file_path_base, "id_genres_mmsr.tsv")
file_path_musicnn = os.path.join(file_path_base, "id_musicnn_mmsr.tsv")
file_path_mfcc_bow = os.path.join(file_path_base, "id_mfcc_bow_mmsr.tsv")
file_path_ivec_256 = os.path.join(file_path_base, "id_ivec256_mmsr.tsv")
file_path_logfluc = os.path.join(file_path_base, "id_blf_logfluc_mmsr.tsv")

df_info = pd.read_table(file_path_info)
df_word2vec = pd.read_table(file_path_word2vec)
df_tfidf = pd.read_table(file_path_tfidf)
df_bert = pd.read_table(file_path_bert)

df_genre = pd.read_table(file_path_genre)
df_musicnn = pd.read_table(file_path_musicnn)
df_mfcc_bow = pd.read_table(file_path_mfcc_bow)
df_ivec_256 = pd.read_table(file_path_ivec_256)
df_logfluc = pd.read_table(file_path_logfluc)

# The genre column is stored as the text of a Python list; parse it into a
# real list so later cells can explode it and build sets from it.
df_genre["genre"] = df_genre["genre"].apply(literal_eval)

In [None]:
# Flatten the per-track genre lists into one long array, then reduce it to
# the distinct genre labels.
genres = df_genre["genre"].explode().values
unique_genres = np.unique(genres)
unique_genres

In [None]:
# Audio-based retrieval (cos-sim) on the MFCC bag-of-words features
t2.song_retrieval(
    df_info,
    df_mfcc_bow,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the DNN (musicnn) features
t2.song_retrieval(
    df_info,
    df_musicnn,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the ivec 256 features
t2.song_retrieval(
    df_info,
    df_ivec_256,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# Audio-based retrieval (cos-sim) on the BLF logfluc features
t2.song_retrieval(
    df_info,
    df_logfluc,
    "As Long as You Love Me",
    "Justin Bieber",
    10,
    filter=["id", "artist", "song", "sim"],
)

In [None]:
# TODO: select combinations of retrieval systems based on their metric performance (grid search)

In [None]:
def calc_idcg_dataframe(df_genre) -> pd.DataFrame:
    """Return the ideal DCG (top ten) of every track, cached in ``idcg.tsv``.

    If ``idcg.tsv`` exists in the working directory it is loaded and returned
    as-is; the values are NOT recomputed, so delete the file whenever
    ``df_genre`` changes. Otherwise the ideal DCG is computed for each row of
    ``df_genre``, written to ``idcg.tsv`` and returned.

    Args:
        df_genre: frame with an ``id`` column and a ``genre`` column holding
            an iterable of genre labels per track.

    Returns:
        DataFrame with the columns ``id`` and ``idcg_value``.
    """
    cache_path = 'idcg.tsv'
    if os.path.exists(cache_path):
        return pd.read_table(cache_path)

    idcg_value_list = np.empty(len(df_genre))

    # Store results by position: the index labels of df_genre are not
    # guaranteed to be 0..n-1 (e.g. after filtering rows).
    for position, (_, row) in enumerate(df_genre.iterrows()):
        if position % 100 == 0:
            print(position, end=" ")  # coarse progress indicator

        genre = row["genre"]
        # Every track except the query itself is a retrieval candidate.
        df_new_genre = df_genre[df_genre["id"] != row["id"]]
        result_array = df_new_genre["genre"].apply(lambda x: calc_rel(x, genre)).values

        # Ideal ranking: the ten highest relevance scores, best first.
        top_ten_results = np.sort(result_array)[::-1][:10]
        idcg_value_list[position] = calc_dcg(top_ten_results) if len(top_ten_results) else 0.0

    idcgs = pd.DataFrame({
        "id": df_genre["id"].copy(),
        "idcg_value": idcg_value_list
    })
    # index=False keeps the cached file's columns identical to the frame
    # returned here (no extra "Unnamed: 0" column when it is read back).
    idcgs.to_csv(cache_path, sep='\t', encoding='utf-8', index=False)

    return idcgs
    
def calc_idcg(row, df_genre) -> float:
    """Compute the ideal DCG (top ten) for one track.

    The relevance of every other track in ``df_genre`` to the query track is
    computed with ``calc_rel``; the ideal DCG is the DCG of the ten highest
    relevance scores arranged best-first.

    Args:
        row: record of the query track; ``row["id"]`` and ``row["genre"]``
            are read.
        df_genre: frame with ``id`` and ``genre`` columns for all tracks.

    Returns:
        The ideal DCG, or ``0.0`` when there is no other track to rank.
    """
    genre = row["genre"]
    # The query track must not count as its own retrieval result.
    df_new_genre = df_genre[df_genre["id"] != row["id"]]
    result_array = df_new_genre["genre"].apply(lambda x: calc_rel(x, genre)).values

    # np.sort is ascending; reverse it so the most relevant score sits at
    # rank 1, where the DCG discount is smallest.
    top_ten_results = np.sort(result_array)[::-1][:10]
    if len(top_ten_results) == 0:
        return 0.0
    return calc_dcg(top_ten_results)
    
def calc_rel(i_genre, track_genre):  # todo: rename
    """Return the genre-overlap relevance of two tracks (Sorensen-Dice coefficient).

    Both arguments are turned into sets, so duplicate labels are ignored.

    Args:
        i_genre: iterable of genre labels of the candidate track.
        track_genre: iterable of genre labels of the query track.

    Returns:
        ``2 * |A & B| / (|A| + |B|)``, a float between 0 and 1. Two empty
        genre collections share nothing and yield ``0.0``.
    """
    t0_genres = set(track_genre)
    t1_genres = set(i_genre)
    total_size = len(t0_genres) + len(t1_genres)
    if total_size == 0:
        # Both tracks have no genres: avoid dividing by zero.
        return 0.0
    return 2 * len(t0_genres & t1_genres) / total_size

# https://wikimedia.org/api/rest_v1/media/math/render/svg/3efe45491d555db398ed663107460f81d6ecaf1e
def calc_dcg(top_rel):
    """Return the discounted cumulative gain of a ranked list of relevances.

    Implements ``sum(rel_i / log2(i + 1))`` over the 1-based ranks ``i``; the
    first item is undiscounted because ``log2(2) == 1``.

    Args:
        top_rel: relevance scores in ranked order (list, NumPy array or
            pandas Series). Items are visited in iteration order, so a
            Series is handled by position regardless of its index labels.

    Returns:
        The DCG value; ``0.0`` for an empty ranking.
    """
    dcg = 0.0
    for rank, rel in enumerate(top_rel, start=1):
        dcg += rel / np.log2(rank + 1)
    return dcg

def calc_ndcg(row, feature, info): # todo: include df_genre as parameter
    """Compute the normalised DCG of the retrieval result for one query track.

    Retrieves results for the track via ``t2.song_retrieval`` (called with
    ``10``), scores each retrieved track against the query's genres with
    ``calc_rel`` and divides the resulting DCG by the precomputed ideal DCG.

    Args:
        row: record of the query track; the keys ``id``, ``artist``,
            ``song``, ``idcg_value`` and ``genre`` are read.
        feature: feature frame handed to ``t2.song_retrieval``.
        info: track information frame handed to ``t2.song_retrieval``; the
            retrieval result must expose a ``genre`` column.

    Returns:
        DCG of the retrieved tracks divided by ``row["idcg_value"]``.
    """
    # Read all required fields up front so a missing column fails before any
    # retrieval work is done.
    song_id, artist, song_title, song_idcg, song_genre = (
        row[key] for key in ("id", "artist", "song", "idcg_value", "genre")
    )

    # Crude progress marker tied to one hard-coded track id.
    if song_id == "XWfDJYP0AIVHgsrk":
        print("nearly half")

    retrieved_songs = t2.song_retrieval(info, feature, song_title, artist, 10)
    gains = retrieved_songs["genre"].apply(
        lambda retrieved_genre: calc_rel(retrieved_genre, song_genre)
    ).values  # todo: rename

    # NOTE(review): an ideal DCG of 0 makes this a division by zero -- confirm
    # how such tracks should be scored.
    return calc_dcg(gains) / song_idcg


def get_track_genres(track_id: str, genres: pd.DataFrame) -> [str]:
    """Return the ``genre`` entry stored for ``track_id``.

    Selects the rows of ``genres`` whose ``id`` column equals ``track_id``
    and returns the ``genre`` value of the first match. Raises
    ``IndexError`` when no row matches.
    """
    is_requested_track = genres['id'] == track_id
    matching_genres = genres.loc[is_requested_track, 'genre']
    return matching_genres.values[0]


In [None]:
# Attach the ideal DCG of every track as a new column; calc_idcg compares each
# row against all other rows of df_genre.
df_genre["idcg_value"] = df_genre.apply(calc_idcg, axis=1, args=(df_genre,))
df_genre

In [None]:
# Join track information with genres (and the idcg_value column, if present) on the track id.
df_song_data = df_info.merge(df_genre, on="id")
df_song_data

In [None]:
# NDCG of the word2vec lyrics features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_word2vec = df_song_data.apply(
    lambda row: calc_ndcg(row, df_word2vec, df_song_data),
    axis=1,
)
print('df_word2vec')
print("Avg ndcg:", np.mean(dcgs_word2vec))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# NDCG of the tf-idf lyrics features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_tfidf = df_song_data.apply(
    lambda row: calc_ndcg(row, df_tfidf, df_song_data),
    axis=1,
)
print('df_tfidf')
print("Avg ndcg:", np.mean(dcgs_tfidf))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# NDCG of the BERT lyrics features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_bert = df_song_data.apply(
    lambda row: calc_ndcg(row, df_bert, df_song_data),
    axis=1,
)
print('df_bert')
print("Avg ndcg:", np.mean(dcgs_bert))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# NDCG of the MFCC bag-of-words features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_mfcc_bow = df_song_data.apply(
    lambda row: calc_ndcg(row, df_mfcc_bow, df_song_data),
    axis=1,
)
print('df_mfcc_bow')
print("Avg ndcg:", np.mean(dcgs_mfcc_bow))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# NDCG of the musicnn features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_musicnn = df_song_data.apply(
    lambda row: calc_ndcg(row, df_musicnn, df_song_data),
    axis=1,
)
print('df_musicnn')
print("Avg ndcg:", np.mean(dcgs_musicnn))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# NDCG of the ivec 256 features: one score per row of df_song_data,
# then the average and the wall-clock runtime.
st = time.time()
dcgs_ivec_256 = df_song_data.apply(
    lambda row: calc_ndcg(row, df_ivec_256, df_song_data),
    axis=1,
)
print('df_ivec_256')
print("Avg ndcg:", np.mean(dcgs_ivec_256))
print("Execution time(min):", (time.time() - st) / 60)

In [None]:
# TODO: when all RS finished -> Please ignore
# Work-in-progress loop: evaluate several feature sets in one pass and collect
# the mean NDCG per feature set in `result`.
df_dict = {
    'df_word2vec': pd.read_table(file_path_word2vec),
    'df_tfidf': pd.read_table(file_path_tfidf)
}  # todo rest: df_bert, df_mfcc_bow, df_musicnn, df_ivec_256

result = {}

for key, value in df_dict.items():
    print(f"DataFrame {key}:")
    # calc_ndcg reads "idcg_value" and "genre" from each row and takes the
    # track information frame as its third argument, so iterate over (and
    # pass) df_song_data -- df_info alone lacks those columns.
    dcg_value_list = df_song_data.apply(calc_ndcg, axis=1, args=(value, df_song_data))
    result[key] = np.mean(dcg_value_list)