# FMA similarity evaluation

In [118]:
# IMPORTS
import pandas as pd
from IPython.display import Markdown

def md(text):
    display(Markdown(text))

In [135]:
# LOAD FILES
# Load the main evaluator data
main_evaluator_path = 'data/evaluation/FMA_similarity_evaluation_main_evaluator_N.tsv'
main_evaluation = pd.read_csv(main_evaluator_path, sep='\t')

# Load the consensus evaluation data
def preprocess_consensus_df(joined_evaluation):
    '''Preprocess df for consensus evaluation'''
    # Define relevant columns on the df
    relevant_cols = ['Type', 'Track_id', 'Main_id']

    # Pivot the table to have separate similarity columns for each evaluator
    joined_pivot = joined_evaluation.pivot_table(index=relevant_cols,
                                                columns='Evaluator',
                                                values='Similarity',
                                                aggfunc='first').reset_index()

    # Rename similarity columns to include evaluator numbers
    joined_pivot.columns = relevant_cols + [f'Evaluator{int(col+1)}' for col in range(3)]

    # Remove "main" rows
    joined_pivot = joined_pivot[joined_pivot['Type'] != 'Main'].reset_index(drop=True)

    # Sort by main track id
    joined_pivot = joined_pivot.sort_values(['Main_id', 'Type']).reset_index(drop=True)

    return joined_pivot

joined_evaluation_path = 'data/evaluation/FMA_similarity_evaluation_main16_joined.tsv'
joined_evaluation_raw = pd.read_csv(joined_evaluation_path, sep='\t')
joined_evaluation = preprocess_consensus_df(joined_evaluation_raw)

In [126]:
# BASIC EVALUATION
def print_basic_info(main_evaluation, joined_evaluation):
    '''Show basic info about what was evaluated'''
    md(f"1 evaluator evaluated {len(main_evaluation['Main_id'].unique())} tracks, one example for every 2 genres")
    md(f"3 evaluators evaluated the same {len(joined_evaluation['Main_id'].unique())} tracks")
    md("Example of the evaluation of 1 track:")
    # TODO - Score is not always present nor is genre. I should add them from FMA_similarity_dict.json.gz
    display(main_evaluation.head(6))
    md("Example of the consensus evaluation of 1 track:")
    display(joined_evaluation[5:10])

print_basic_info(main_evaluation, joined_evaluation)

1 evaluator evaluated 80 tracks, one example for every 2 genres

3 evaluators evaluated the same 13 tracks

Example of the evaluation of 1 track:

Unnamed: 0,Type,Track_id,Similarity,Link,Genre,Title,Artist,Album,Score,Main_id,Evaluator
0,Main,20433,Main,https://melody.dvstr.net/song/20433,International,Drifting,Midival Punditz,Hello Hello,1.0,20433,1
1,Similar_1,133066,S,,,Nuevos ritos,Melange,Melange,0.95,20433,1
2,Similar_2,109473,S,,,Music Industry Event,The Pink Tiles,Live on WFMU's Surface Noise with Joe McGasko:...,,20433,1
3,Similar_3,66326,N,,,E-Love,The Upsidedown,Dead Bees records label sampler #11,0.93,20433,1
4,Similar_4,148620,N,,,Decay of,2L8,The Answer,0.93,20433,1
5,Similar_5,64297,VS,,,The Fantastic Doctrine,The New Mystikal Troubadours,III,0.93,20433,1


Example of the consensus evaluation of 1 track:

Unnamed: 0,Type,Track_id,Score,Main_id,Evaluator1,Evaluator2,Evaluator3
5,Similar_1,133066,0.95,20433,S,VS,S
6,Similar_2,109473,0.93,20433,,,S
7,Similar_3,66326,0.93,20433,N,S,S
8,Similar_4,148620,0.93,20433,N,D,N
9,Similar_5,64297,0.93,20433,VS,S,S


In [137]:
# EVALUATE SIMILARITY ON MAIN EVALUATOR
def evaluate_similarity(main_evaluation):
    '''Evaluate similarity between the main evaluator on 80 tracks'''

    # Get value counts % percentages for each similarity type 
    similar_tracks = main_evaluation[main_evaluation['Type'].str.contains("Similar")]
    counts = similar_tracks['Similarity'].value_counts(dropna=False).reset_index()
    counts['%'] = counts['count'] / len(similar_tracks) * 100

    # Order counts by similarity type
    order = ["VS", "S", "N", "D"]
    counts['Similarity'] = pd.Categorical(counts['Similarity'], categories=order, ordered=True)
    counts = counts.sort_values('Similarity').reset_index(drop=True)

    # Get cumulative percentage
    counts['Cumulative %'] = counts['%'].cumsum()

    display(counts)

evaluate_similarity(main_evaluation)

Unnamed: 0,Similarity,count,%,Cumulative %
0,VS,59,14.75,14.75
1,S,196,49.0,63.75
2,N,66,16.5,80.25
3,D,79,19.75,100.0


In [159]:
from itertools import combinations
from sklearn.metrics import cohen_kappa_score
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def evaluate_agreement(joined_evaluation, group_vs_s=False):
    """
    Evaluate agreement between evaluators using Cohen's Kappa, Fleiss' Kappa, 
    and measure overall agreement percentage. 

    Parameters:
        joined_evaluation (pd.DataFrame): DataFrame containing similarity ratings from multiple evaluators.
        group_vs_s (bool): If True, "VS" and "S" are treated as the same label.

    Returns:
        None (prints results and displays disagreement table)
    """

    # Identify evaluator columns
    eval_cols = [col for col in joined_evaluation.columns if 'Evaluator' in col]

    # Define rating mapping
    if group_vs_s:
        # Merge VS and S into one category
        for col in eval_cols:
            joined_evaluation[col] = joined_evaluation[col].replace({"VS": "S"})
        rating_map = {"S": 2, "N": 1, "D": 0}  # Merging VS and S into one category
    else:
        rating_map = {"VS": 3, "S": 2, "N": 1, "D": 0}  # Keeping separate categories

    # Convert ratings to numerical values
    joined_evaluation_num = joined_evaluation.copy()
    joined_evaluation_num[eval_cols] = joined_evaluation_num[eval_cols].map(lambda x: rating_map.get(x, np.nan))

    ### 1. Compute Pairwise Agreement (Cohen’s Kappa)
    pairwise_kappas = {}
    for eval1, eval2 in combinations(eval_cols, 2):
        kappa = cohen_kappa_score(joined_evaluation_num[eval1], joined_evaluation_num[eval2])
        pairwise_kappas[f"{eval1} vs {eval2}"] = kappa

    print("\nPairwise Agreement (Cohen's Kappa):")
    for pair, score in pairwise_kappas.items():
        print(f"{pair}: {score:.3f}")

    ### 2. Compute Fleiss' Kappa
    def compute_fleiss_kappa(df):
        """
        Compute Fleiss' Kappa for multi-rater agreement.
        :param df: DataFrame where each row represents an item rated by multiple evaluators.
        :return: Fleiss' Kappa value.
        """
        num_categories = len(set(rating_map.values()))  # Determine number of categories
        rating_counts = np.zeros((len(df), num_categories))  

        for i, row in enumerate(df.values):
            for rating in row:
                if not np.isnan(rating):
                    rating_counts[i, int(rating)] += 1  

        # Compute proportions
        N, k = rating_counts.shape  
        P_i = (np.sum(rating_counts**2, axis=1) - df.shape[1]) / (df.shape[1] * (df.shape[1] - 1))
        P_bar = np.mean(P_i)
        P_e = np.sum((np.sum(rating_counts, axis=0) / (N * df.shape[1]))**2)

        return (P_bar - P_e) / (1 - P_e)

    fleiss_kappa = compute_fleiss_kappa(joined_evaluation_num[eval_cols])
    print(f"\nFleiss' Kappa (Overall Agreement for all evaluators): {fleiss_kappa:.3f}")

    ### 3. Identify Disagreements
    joined_evaluation["Agreement"] = joined_evaluation[eval_cols].nunique(axis=1) == 1  
    agreement_percentage = joined_evaluation["Agreement"].mean() * 100

    print(f"\nFull Agreement Percentage: {agreement_percentage:.1f}%")
    disagreements = joined_evaluation[joined_evaluation["Agreement"] == False]

    # Display the disagreements table
    md("<b>Disagreements Table</b>")
    # display(disagreements)

    ### 4. Contingency Matrix
    print("\nContingency Matrix (Evaluator 1 vs Evaluator 2):")
    contingency_matrix = pd.crosstab(joined_evaluation[eval_cols[0]], joined_evaluation[eval_cols[1]])
    display(contingency_matrix)

    # Chi-Square Test for Dependency
    chi2, p, _, _ = chi2_contingency(contingency_matrix)
    print(f"\nChi-Square Test Results: χ² = {chi2:.3f}, p-value = {p:.5f}")


# Example Usage
# evaluate_agreement(joined_evaluation, group_vs_s=False)  # Standard evaluation
evaluate_agreement(joined_evaluation, group_vs_s=True)   # Evaluation with VS and S grouped


Pairwise Agreement (Cohen's Kappa):
Evaluator1 vs Evaluator2: 0.418
Evaluator1 vs Evaluator3: 0.446
Evaluator2 vs Evaluator3: 0.381

Fleiss' Kappa (Overall Agreement for all evaluators): 0.414

Full Agreement Percentage: 56.9%


<b>Disagreements Table</b>


Contingency Matrix (Evaluator 1 vs Evaluator 2):


Evaluator2,D,N,S
Evaluator1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D,5,3,2
N,2,6,4
S,7,2,34



Chi-Square Test Results: χ² = 23.543, p-value = 0.00010
