# Data Cleaning & Augmentation

## Simplify chords

Simplifying chords down to 42: base note (A-G) + accidental + major/minor(dim).

In [1]:
import pandas as pd
import numpy as np
import json
import csv
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
### Simplifying chords down to set of 42 ###
notes = ["A", "B", "C", "D", "E", "F", "G"]
accs = ["b", "s", ""]
all_notes_list = [note + acc for note in notes for acc in accs]

def simplify_chord(chord: str) -> str:
    """
    Removes chord quality from a chord.
    """
    for note in all_notes_list:
        if not chord.startswith(note):
            continue

        suffix = chord.removeprefix(note)
        if suffix.startswith("min") or suffix.startswith("dim"):
            return note + "min"
        else:
            return note

    if chord == "sC":
        return "Cs"

    # print(chord)
    return ""

In [3]:
df = pd.read_csv("chordonomicon_v2.csv", usecols=["id", "chords", "main_genre"])
df["chords"] = (
    df["chords"]
    .str.split(" ")
    .map(lambda lst: [simplify_chord(x) for x in lst if not x.startswith("<")])
)
df.to_csv("chordonomicon_v2_simplified.csv", index=False)

  df = pd.read_csv("chordonomicon_v2.csv", usecols=["id", "chords", "main_genre"])


Optionally, also collaps both sharp and flat accents to sharp:

In [4]:
def simplify_and_standardize(chord: str) -> str:
    """
    Simplifies chord quality (min/dim) and standardizes flat prefixes.
    """
    # --- 1) Simplification (your original simplify_chord) ---
    simplified = ""
    for note in all_notes_list:
        if chord.startswith(note):
            suffix = chord[len(note):]
            if suffix.startswith("min") or suffix.startswith("dim"):
                simplified = note + "min"
            else:
                simplified = note
            break

    if chord == "sC":
        simplified = "Cs"

    if simplified == "":
        simplified = chord  # fallback to original if nothing matched

    # --- 2) Standardization (your standardize_chord_prefix) ---
    flat_to_sharp = {
        "Bb": "As",
        "Db": "Cs",
        "Eb": "Ds",
        "Gb": "Fs",
        "Ab": "Gs",
        "Cb": "B",
        "Fb": "E",
        "Bs": "C",
        "Es": "F"
    }

    for flat, sharp in flat_to_sharp.items():
        if simplified.startswith(flat):
            return sharp + simplified[len(flat):]

    return simplified


In [5]:
df = pd.read_csv("chordonomicon_v2.csv", usecols=["id", "chords", "main_genre"])
df["chords"] = (
    df["chords"]
    .str.split(" ")
    .map(lambda lst: [simplify_and_standardize(x) for x in lst if not x.startswith("<")])
)
df.to_csv("chordonomicon_v2_standardized.csv", index=False)

  df = pd.read_csv("chordonomicon_v2.csv", usecols=["id", "chords", "main_genre"])


## Transpose to all chords
Transpose each song to mitigate bias from the key of the song and to augment our data. Also Collapse both sharp and flat accents to sharp

In [6]:
### Transposing songs to all keys ###
keys_list = ["C", "Cs", "D", "Ds", "E", "F", "Fs", "G", "Gs", "A", "As", "B"]

def transpose_chord(chord: str, variation: int) -> str:
    # Identify base note and suffix robustly
    if len(chord) >= 2 and chord[1] == "s":
        base = chord[:2]
        suffix = chord[2:]
    else:
        base = chord[:1]
        suffix = chord[1:]
    if base in keys_list:
        idx = keys_list.index(base)
        new_base = keys_list[(idx + variation) % 12]
        return new_base + suffix
    return chord  # If not found, return as is

def augment_keys(df):
    augmented_rows = []
    for _, row in df.iterrows():
        for variation in range(12):
            new_row = row.copy()
            if variation == 0:
                new_row["original_key"] = True
            else:
                new_row["original_key"] = False
            new_row["added_semitones"] = variation
            new_row["chords"] = [transpose_chord(chord, variation) for chord in row["chords"]]
            augmented_rows.append(new_row)
    return pd.DataFrame(augmented_rows)

def standardize_chord_prefix(chord: str) -> str:
    # Map flat notes to their sharp equivalents
    flat_to_sharp = {
        "Bb": "As",
        "Db": "Cs",
        "Eb": "Ds",
        "Gb": "Fs",
        "Ab": "Gs",
        "Cb": "B",
        "Fb": "E",
        "Bs": "C",
        "Es": "F"
    }
    for flat, sharp in flat_to_sharp.items():
        if chord.startswith(flat):
            return sharp + chord[len(flat):]
    return chord

In [7]:
# Writing transposed dataset to csv
def get_pop_chords_df():
    df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv",usecols=["id", "chords", "main_genre"])
    pop_df = df[df["main_genre"] == "pop"][["id", "chords"]].copy()
    pop_df["chords"] = pop_df["chords"].str.split(" ")
    pop_df["chords"] = pop_df["chords"].map(
        lambda chords: [simplify_chord(standardize_chord_prefix(chord)) for chord in chords if not chord.startswith("<")]
    )

    pop_df["original_key"] = True
    pop_df["added_semitones"] = 0
    return pop_df

pop_chords_df = get_pop_chords_df()
augmented_df = augment_keys(pop_chords_df)
print(augmented_df.head(15))
augmented_df.to_csv("chordonomicon_v2_augmented.csv", index=False)

  df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv",usecols=["id", "chords", "main_genre"])


   id                                             chords  original_key  \
0   1  [C, F, C, E, Amin, C, F, C, G, C, F, C, E, Ami...          True   
0   1  [Cs, Fs, Cs, F, Asmin, Cs, Fs, Cs, Gs, Cs, Fs,...         False   
0   1  [D, G, D, Fs, Bmin, D, G, D, A, D, G, D, Fs, B...         False   
0   1  [Ds, Gs, Ds, G, Cmin, Ds, Gs, Ds, As, Ds, Gs, ...         False   
0   1  [E, A, E, Gs, Csmin, E, A, E, B, E, A, E, Gs, ...         False   
0   1  [F, As, F, A, Dmin, F, As, F, C, F, As, F, A, ...         False   
0   1  [Fs, B, Fs, As, Dsmin, Fs, B, Fs, Cs, Fs, B, F...         False   
0   1  [G, C, G, B, Emin, G, C, G, D, G, C, G, B, Emi...         False   
0   1  [Gs, Cs, Gs, C, Fmin, Gs, Cs, Gs, Ds, Gs, Cs, ...         False   
0   1  [A, D, A, Cs, Fsmin, A, D, A, E, A, D, A, Cs, ...         False   
0   1  [As, Ds, As, D, Gmin, As, Ds, As, F, As, Ds, A...         False   
0   1  [B, E, B, Ds, Gsmin, B, E, B, Fs, B, E, B, Ds,...         False   
4   5  [C, G, C, G, C, F, Dmin, G, Dmi

# N-gram Learning
Compute n-gram counts using CountVectorizer library (usually used for bag of n-grams)

In [8]:
def count_n_grams(data, n: int = 1) -> pd.DataFrame:
    word_vectorizer = CountVectorizer(
        ngram_range=(1, n),
        analyzer="word",
        token_pattern=r"(?u)\b\w+\b",
        lowercase=False,
    )

    sparse_matrix = word_vectorizer.fit_transform(
        data.map(lambda chords: " ".join(chords))
    )

    frequencies = sum(sparse_matrix).toarray()[0]

    df_all = pd.DataFrame(
        frequencies,
        index=word_vectorizer.get_feature_names_out(),
        columns=["count"],
    )

    return df_all.groupby(by=lambda chords: len(chords.split(" ")))

Import our processed dataset and compute n-gram counts.

In [9]:
import pandas as pd
import ast
import json

def load_chords_from_csv(path):
    df = pd.read_csv(path, usecols=['chords'])
    # If chords are stored as a list string, use ast.literal_eval
    try:
        chords = df['chords'].apply(ast.literal_eval)
    except Exception:
        chords = df['chords'].str.split(" ")
    return chords

results = {}

for name, path in [
    ("simplified", "chordonomicon_v2_simplified.csv"),
    ("standardized", "chordonomicon_v2_standardized.csv"),
    ("augmented", "chordonomicon_v2_augmented.csv")
]:
    chords = load_chords_from_csv(path)
    n_gram_counts = count_n_grams(chords, 3)
    results[name] = {}
    for key, _ in n_gram_counts:
        df = n_gram_counts.get_group(key).sort_values(by='count')
        results[name][f"{key}-gram"] = df.reset_index().to_dict(orient="records")
        if name == "simplified":
            print(f"\n--- {key}-gram counts for simplified ---")
            print(df)

with open("n_gram_counts_results.json", "w") as f:
    json.dump(results, f, indent=2)


--- 1-gram counts for simplified ---
         count
Gbmin    13921
Dbmin    25731
Bs       43297
Abmin    50386
Es       53824
Asmin    76742
Ebmin    98737
Gb      115516
Dsmin   136273
Bbmin   173213
Db      226807
Fmin    349631
As      361219
Gsmin   365534
Ds      370729
Gs      379959
Ab      415987
Cs      458960
Cmin    485114
Eb      583180
Gmin    656014
Csmin   700142
Fs      842932
Fsmin   991863
Bb     1147575
Dmin   1449889
Bmin   1633942
B      1760255
Emin   2989466
E      3172313
Amin   3277296
F      3718124
A      4527730
D      6005289
C      6627164
G      7709880

--- 2-gram counts for simplified ---
               count
Asmin Gbmin        1
Gsmin Gbmin        1
Dsmin Gbmin        1
Dbmin Dsmin        1
Dsmin Dbmin        2
...              ...
F C          1275024
D G          1604703
G D          1747895
G C          1889671
C G          2208974

[1293 rows x 1 columns]

--- 3-gram counts for simplified ---
                   count
Dsmin Ebmin Gs         1
Fsmi

Calculate transition matrix probabilities using counts.

In [20]:
import itertools

# base notes and shape of chord names
notes = ["A", "B", "C", "D", "E", "F", "G"]
# standardized/augmented: no flat spellings (only natural + sharp 's')
accs_std = ["", "s"]
# simplified: include flats 'b' in addition to sharps and naturals
accs_simpl = ["", "s", "b"]
third = ["", "min"]

# exclude weird/duplicate forms for standardized (same as you used)
excluded_chords = {"Bs", "Bsmin", "Es", "Esmin", "Cb", "Cbmin", "Fb", "Fbmin"}

# build standardized / augmented list
all_chords_std = [note + acc + t for note in notes for acc in accs_std for t in third]
all_chords_std = [c for c in all_chords_std if c not in excluded_chords]

# build simplified list (includes flats)
all_chords_simpl = [note + acc + t for note in notes for acc in accs_simpl for t in third]
all_chords_simpl = [c for c in all_chords_simpl if c not in excluded_chords]

# final dict
all_chords_dict = {
    "standardized": sorted(all_chords_std),
    "augmented": sorted(all_chords_std),   # same as standardized
    "simplified": sorted(all_chords_simpl),
}

print("standardized (count):", len(all_chords_dict["standardized"]))
print(all_chords_dict["standardized"])
print("simplified (count):", len(all_chords_dict["simplified"]))
print(all_chords_dict["simplified"])

standardized (count): 24
['A', 'Amin', 'As', 'Asmin', 'B', 'Bmin', 'C', 'Cmin', 'Cs', 'Csmin', 'D', 'Dmin', 'Ds', 'Dsmin', 'E', 'Emin', 'F', 'Fmin', 'Fs', 'Fsmin', 'G', 'Gmin', 'Gs', 'Gsmin']
simplified (count): 34
['A', 'Ab', 'Abmin', 'Amin', 'As', 'Asmin', 'B', 'Bb', 'Bbmin', 'Bmin', 'C', 'Cmin', 'Cs', 'Csmin', 'D', 'Db', 'Dbmin', 'Dmin', 'Ds', 'Dsmin', 'E', 'Eb', 'Ebmin', 'Emin', 'F', 'Fmin', 'Fs', 'Fsmin', 'G', 'Gb', 'Gbmin', 'Gmin', 'Gs', 'Gsmin']


In [11]:
# Calculate transition matrix probabilities
# alpha is additive smoothing

def compute_unigram_prob(n_gram_counts, alpha=1.0, input_data="simplified"):
    unigram = n_gram_counts.get_group(1)
    unigram = unigram.reindex(all_chords_dict[input_data], fill_value=0)
    vocab_size = len(all_chords_dict[input_data])
    total_count = unigram["count"].sum()

    probs = (unigram["count"] + alpha) / (total_count + alpha * vocab_size)
    df = pd.DataFrame([probs.values], 
                      index=[""],
                      columns=all_chords_dict[input_data])
    return df

In [12]:
def compute_unigram_prob_from_df(unigram_df: pd.DataFrame, alpha: float = 1.0, input_data: str = "simplified") -> pd.DataFrame:
    """
    Compute unigram probabilities with additive smoothing directly from a DataFrame.
    
    Parameters:
    - unigram_df: DataFrame containing columns ['index', 'count', ...] for 1-grams
    - alpha: additive smoothing constant
    - input_data: key to select chord vocabulary from all_chords_dict
    
    Returns:
    - DataFrame: 1-row, columns = chords, values = probability
    """
    # Set index to chord names
    unigram_df = unigram_df.set_index("index")

    # Reindex to include all possible chords, fill missing with 0
    vocab = all_chords_dict[input_data]
    unigram_df = unigram_df.reindex(vocab, fill_value=0)

    # Compute probabilities with additive smoothing
    total_count = unigram_df["count"].sum()
    probs = (unigram_df["count"] + alpha) / (total_count + alpha * len(vocab))

    # Return as a 1-row DataFrame with chords as columns
    return pd.DataFrame([probs.values], columns=vocab, index=[""])


In [13]:
# Extract unigrams from df_all
df_all = results["simplified"]["1-gram"]
df_all = pd.DataFrame(df_all)
df_all["ngram_length"] = 1  # since these are unigrams
unigram_df = df_all[df_all["ngram_length"] == 1].copy()

# Compute probabilities
unigram_probs = compute_unigram_prob_from_df(unigram_df, alpha=1.0, input_data="simplified")

print(unigram_probs)


         A        Ab     Abmin      Amin       As     Asmin         B  \
  0.087244  0.008016  0.000971  0.063149  0.00696  0.001479  0.033918   

        Bb     Bbmin      Bmin  ...         F      Fmin        Fs     Fsmin  \
  0.022112  0.003338  0.031484  ...  0.071644  0.006737  0.016242  0.019112   

        G        Gb     Gbmin      Gmin        Gs     Gsmin  
  0.14856  0.002226  0.000268  0.012641  0.007321  0.007043  

[1 rows x 34 columns]


In [14]:
import itertools
import pandas as pd

def compute_ngram_prob_from_df(ngram_df, n: int = 2, alpha=1.0, input_data="simplified"):
    """
    Compute n-gram probabilities from a reconstructed DataFrame.
    ngram_df: DataFrame with columns ['index', 'count', 'ngram_length'] for n-grams
    """
    ngram = ngram_df.copy()

    # use the 'index' column as the n-gram string
    ngram["evidence"] = ngram["index"].map(lambda s: " ".join(s.split()[:-1]))
    ngram["next"] = ngram["index"].map(lambda s: s.split()[-1])

    # generate all possible (n-1)-length sequences
    all_evidence_seq = [" ".join(evidence) for evidence in itertools.product(all_chords_dict[input_data], repeat=(n - 1))]
    full_index = pd.MultiIndex.from_product([all_evidence_seq, all_chords_dict[input_data]], names=["evidence", "next"])

    # reindex to include all possible n-grams
    ngram = ngram.set_index(["evidence", "next"])
    ngram = ngram.reindex(full_index, fill_value=0)

    # compute probabilities with additive smoothing
    evidence_counts = ngram["count"].groupby(level="evidence").transform("sum")
    vocab_size = len(all_chords_dict[input_data])
    ngram["prob"] = (ngram["count"] + alpha) / (evidence_counts + alpha * vocab_size)

    # return as a 2D DataFrame: rows=evidence, columns=next chord
    return ngram["prob"].unstack(fill_value=0.0)


In [15]:
bigram_df = df_all[df_all["ngram_length"] == 2].copy()
bigram_probs = compute_ngram_prob_from_df(bigram_df, n=2, alpha=1.0, input_data="simplified")
print(bigram_probs.shape)
bigram_probs


(34, 34)


next,A,Ab,Abmin,Amin,As,Asmin,B,Bb,Bbmin,Bmin,...,F,Fmin,Fs,Fsmin,G,Gb,Gbmin,Gmin,Gs,Gsmin
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Ab,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Abmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Amin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
As,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Asmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
B,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Bb,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Bbmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Bmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412


In [16]:
trigram_df = df_all[df_all["ngram_length"] == 3].copy()
trigram_probs = compute_ngram_prob_from_df(trigram_df, n=3, alpha=1.0, input_data="simplified")
print(trigram_probs.shape)
trigram_probs

(1156, 34)


next,A,Ab,Abmin,Amin,As,Asmin,B,Bb,Bbmin,Bmin,...,F,Fmin,Fs,Fsmin,G,Gb,Gbmin,Gmin,Gs,Gsmin
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A A,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
A Ab,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
A Abmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
A Amin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
A As,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gsmin Gb,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Gsmin Gbmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Gsmin Gmin,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412
Gsmin Gs,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,...,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412


# Inference
Deterministic and probabilistic methods, we are generating a 16 chord song simply from the log-probabilities from our models.

In [17]:
def deterministic_inference(evidence):
    # evidence: string of n-1 space-separated chords
    
    n = len(evidence.split()) + 1
    ngram_probs = unigram_probs if n == 1 else (bigram_probs if n == 2 else trigram_probs)

    if evidence not in ngram_probs.index:
        raise KeyError(f"Evidence '{evidence}' not found in {n}-gram table")
    
    row_probs = ngram_probs.loc[evidence]
    return row_probs.idxmax() # returns next chord w highest prob, if there are several, the first one in col order

def probabilistic_inference(evidence):
    # evidence: string of n-1 space-separated chords
    
    n = len(evidence.split()) + 1
    ngram_probs = unigram_probs if n == 1 else (bigram_probs if n == 2 else trigram_probs)

    if evidence not in ngram_probs.index:
        raise KeyError(f"Evidence '{evidence}' not found in {n}-gram table")
    
    row_probs = ngram_probs.loc[evidence]
    cdf = np.cumsum(row_probs.values) # create cumulative distribution over next possible chord

    # sample over dist
    seed = np.random.random()
    idx = np.searchsorted(cdf, seed)
    
    return row_probs.index[idx] # return probabilistically chosen next chord

In [18]:
### test inference for bigram ###
seq = []

for _ in range(16):
    if len(seq) == 0:
        evidence = ""
    elif len(seq) == 1:
        evidence = seq[-1]
    else:
        evidence = " ".join(seq[-2:])

    next_chord = deterministic_inference(evidence) # can change to deterministic_inference()
    seq.append(next_chord)

print(seq)

['G', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']


In [19]:
### test inference for bigram ###
seq = []

for _ in range(16):
    if len(seq) == 0:
        evidence = ""
    elif len(seq) == 1:
        evidence = seq[-1]
    else:
        evidence = " ".join(seq[-2:])

    next_chord = probabilistic_inference(evidence) # can change to deterministic_inference()
    seq.append(next_chord)

print(seq)

['G', 'Db', 'As', 'Dbmin', 'A', 'Bb', 'B', 'Dbmin', 'Ab', 'Fs', 'Bbmin', 'Gmin', 'Bb', 'Dmin', 'F', 'Emin']


# Evaluation
Evaluate log-likelihood of an n-gram given a song (in this case the first song of the dataset)

In [30]:
import numpy as np

def song_log_likelihood_ngram(song, n, ngram_probs):
    """
    song: list of chords for one song
    n:    order of n-gram (1, 2, 3, ...)
    ngram_probs:
        - n=1: pandas Series or 1-row DataFrame with index = chord, value = P(chord)
        - n>=2: pandas DataFrame with index = context string, columns = chord, cell = P(target | context)
    """
    ll = 0.0
    if len(song) < n:
        return 0.0

    for t in range(n - 1, len(song)):
        if n == 1:
            # Unigram: no context, just P(chord)
            target = song[t]
            # If unigram_probs is a Series:
            try:
                # Series: P(chord)
                p = float(ngram_probs[target])
            except KeyError:
                # If it's a 1-row DataFrame instead:
                try:
                    p = float(ngram_probs.loc[:, target].iloc[0])
                except Exception:
                    p = 1e-12
        else:
            # Bigram / trigram / ...
            context = " ".join(song[t - (n - 1):t])
            target = song[t]

            # Only access .loc if both labels exist
            if (context in ngram_probs.index) and (target in ngram_probs.columns):
                p = float(ngram_probs.loc[context, target])
            else:
                p = 1e-12

        if p <= 0:
            p = 1e-12

        ll += np.log(p)

    return ll


In [22]:
# test log-likelihood
song_simplified = ['C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'D', 'G', 'D', 'G', 'D', 'A', 'D', 'G', 'D', 'Fs', 'Bmin', 'D', 'G', 'A', 'D', 'G', 'A', 'D']

print(song_log_likelihood_ngram(song_simplified, 1, unigram_probs))
print(song_log_likelihood_ngram(song_simplified, 2, bigram_probs))
print(song_log_likelihood_ngram(song_simplified, 3, trigram_probs))

-155.76098994173117
-232.73979462466684
-229.21343410005068


In [23]:
dfsimplified = pd.read_csv("chordonomicon_v2_simplified.csv")


In [25]:
import ast
dfsimplified["chords"] = dfsimplified["chords"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
total_loglik_uni = 0.0
total_loglik_bi  = 0.0
total_loglik_tri = 0.0
song1 = dfsimplified["chords"].iloc[0]
for i in range(len(dfsimplified)):
    song = dfsimplified["chords"].iloc[i]
    ll_uni = song_log_likelihood_ngram(song, 1, unigram_probs)
    ll_bi  = song_log_likelihood_ngram(song, 2, bigram_probs)
    ll_tri = song_log_likelihood_ngram(song, 3, trigram_probs)
    total_loglik_uni += ll_uni
    total_loglik_bi  += ll_bi
    total_loglik_tri += ll_tri


  p = float(ngram_probs[target])


In [34]:
print("Total unigram log-likelihood: ", total_loglik_uni)
print("Total bigram log-likelihood:  ", total_loglik_bi)
print("Total trigram log-likelihood: ", total_loglik_tri)

Total unigram log-likelihood:  -149349540.74597985
Total bigram log-likelihood:   -185539817.00730315
Total trigram log-likelihood:  -184812626.0754259


In [35]:
avg_uni = total_loglik_uni / len(dfsimplified)
avg_bi  = total_loglik_bi  / len(dfsimplified)
avg_tri = total_loglik_tri / len(dfsimplified)
print("Average unigram log-likelihood per song: ", avg_uni)
print("Average bigram log-likelihood per song:  ", avg_bi)
print("Average trigram log-likelihood per song: ", avg_tri)

Average unigram log-likelihood per song:  -219.69403190314287
Average bigram log-likelihood per song:   -272.930136064064
Average trigram log-likelihood per song:  -271.8604340282255


Also compute the top-k accuracy with k = 1,3,5 for the given song

In [37]:
def top_k_accuracy_ngram(song, n, ngram_probs, k=5):
    # song: list of chords in song
    # n: order of the n-gram model
    # ngram_probs: DataFrame with index=evidence, columns=next chords
    # k: number of top predictions to consider
    
    correct = 0
    total = 0
    
    if len(song) < n:
        return 0.0
    
    for t in range(n-1, len(song)):
        if n == 1:
            context = ""
        else:
            context = " ".join(song[t-(n-1):t])
        
        target = song[t]
        
        try:
            prob_row = ngram_probs.loc[context]
            
            top_k_chords = prob_row.nlargest(k).index.tolist()
            
            if target in top_k_chords:
                correct += 1
            total += 1
            
        except KeyError:
            total += 1
    
    return correct / total if total > 0 else 0.0

In [38]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.1333
Top-5: 0.1333


In [39]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.1429
Top-5: 0.1429


## Inference and evaluation with standardized data
Repeat the inference and evaluation using data with only sharp (no flat-sharp duplicates)

In [40]:
# Extract unigrams from df_all
unigram_df = df_all[df_all["ngram_length"] == 1].copy()

# Compute probabilities
unigram_probs = compute_unigram_prob_from_df(unigram_df, alpha=1.0, input_data="standardized")

print(unigram_probs)

bigram_df = df_all[df_all["ngram_length"] == 2].copy()
bigram_probs = compute_ngram_prob_from_df(bigram_df, n=2, alpha=1.0, input_data="standardized")
print(bigram_probs.shape)

trigram_df = df_all[df_all["ngram_length"] == 3].copy()
trigram_probs = compute_ngram_prob_from_df(trigram_df, n=3, alpha=1.0, input_data="standardized")
print(trigram_probs.shape)


         A     Amin        As     Asmin        B      Bmin        C      Cmin  \
  0.092315  0.06682  0.007365  0.001565  0.03589  0.033314  0.13512  0.009891   

        Cs     Csmin  ...        E      Emin         F      Fmin        Fs  \
  0.009358  0.014275  ...  0.06468  0.060952  0.075808  0.007129  0.017186   

     Fsmin         G      Gmin        Gs     Gsmin  
  0.020223  0.157195  0.013375  0.007747  0.007453  

[1 rows x 24 columns]
(24, 24)
(576, 24)


In [41]:
### test inference for bigram ###
seq = []

for _ in range(16):
    if len(seq) == 0:
        evidence = ""
    elif len(seq) == 1:
        evidence = seq[-1]
    else:
        evidence = " ".join(seq[-2:])

    next_chord = probabilistic_inference(evidence) # can change to deterministic_inference()
    seq.append(next_chord)

print(seq)

['F', 'Emin', 'Fsmin', 'Fsmin', 'C', 'As', 'Dsmin', 'Fs', 'Gmin', 'Ds', 'Amin', 'Gsmin', 'Cs', 'Gmin', 'Cs', 'Gmin']


In [42]:
# test log-likelihood
song_standardized = ['C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'D', 'G', 'D', 'G', 'D', 'A', 'D', 'G', 'D', 'Fs', 'Bmin', 'D', 'G', 'A', 'D', 'G', 'A', 'D']
print(song_log_likelihood_ngram(song_standardized, 1, unigram_probs))
print(song_log_likelihood_ngram(song_standardized, 2, bigram_probs))
print(song_log_likelihood_ngram(song_standardized, 3, trigram_probs))

-151.97528736047866
-209.75155280296457
-206.57349897261662


  p = float(ngram_probs[target])


In [51]:
total_loglik_uni = 0.0
total_loglik_bi  = 0.0
total_loglik_tri = 0.0
dfstandardized = pd.read_csv("chordonomicon_v2_standardized.csv")
import ast
dfstandardized["chords"] = dfstandardized["chords"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)   
for i in range(len(dfstandardized)):
    song = dfstandardized["chords"].iloc[i]
    ll_uni = song_log_likelihood_ngram(song, 1, unigram_probs)
    ll_bi  = song_log_likelihood_ngram(song, 2, bigram_probs)
    ll_tri = song_log_likelihood_ngram(song, 3, trigram_probs)
    total_loglik_uni += ll_uni
    total_loglik_bi  += ll_bi
    total_loglik_tri += ll_tri


  p = float(ngram_probs[target])


In [52]:
print("Total unigram log-likelihood: ", total_loglik_uni)
print("Total bigram log-likelihood:  ", total_loglik_bi)
print("Total trigram log-likelihood: ", total_loglik_tri)
avg_uni = total_loglik_uni / len(dfstandardized)
avg_bi  = total_loglik_bi  / len(dfstandardized)
avg_tri = total_loglik_tri / len(dfstandardized)
print("Average unigram log-likelihood per song: ", avg_uni)
print("Average bigram log-likelihood per song:  ", avg_bi)
print("Average trigram log-likelihood per song: ", avg_tri)

Total unigram log-likelihood:  -145063013.37622666
Total bigram log-likelihood:   -163081282.50112063
Total trigram log-likelihood:  -160921219.69565123
Average unigram log-likelihood per song:  -213.38852553184458
Average bigram log-likelihood per song:   -239.8935028634901
Average trigram log-likelihood per song:  -236.71603807499957


In [53]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.0000
Top-5: 0.0000


In [54]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.0000
Top-5: 0.0000


## Inference and evaluation with augmented data
Repeat the inference and evaluation with the augmented data through transpositions

In [55]:
# Extract unigrams from df_all
unigram_df = df_all[df_all["ngram_length"] == 1].copy()

# Compute probabilities
unigram_probs = compute_unigram_prob_from_df(unigram_df, alpha=1.0, input_data="augmented")

print(unigram_probs)

bigram_df = df_all[df_all["ngram_length"] == 2].copy()
bigram_probs = compute_ngram_prob_from_df(bigram_df, n=2, alpha=1.0, input_data="augmented")
print(bigram_probs.shape)

trigram_df = df_all[df_all["ngram_length"] == 3].copy()
trigram_probs = compute_ngram_prob_from_df(trigram_df, n=3, alpha=1.0, input_data="augmented")
print(trigram_probs.shape)


         A     Amin        As     Asmin        B      Bmin        C      Cmin  \
  0.092315  0.06682  0.007365  0.001565  0.03589  0.033314  0.13512  0.009891   

        Cs     Csmin  ...        E      Emin         F      Fmin        Fs  \
  0.009358  0.014275  ...  0.06468  0.060952  0.075808  0.007129  0.017186   

     Fsmin         G      Gmin        Gs     Gsmin  
  0.020223  0.157195  0.013375  0.007747  0.007453  

[1 rows x 24 columns]
(24, 24)
(576, 24)


In [56]:
### test inference for bigram ###
seq = []

for _ in range(16):
    if len(seq) == 0:
        evidence = ""
    elif len(seq) == 1:
        evidence = seq[-1]
    else:
        evidence = " ".join(seq[-2:])

    next_chord = probabilistic_inference(evidence) # can change to deterministic_inference()
    seq.append(next_chord)

print(seq)

['B', 'Amin', 'Cs', 'D', 'Cmin', 'D', 'Dmin', 'Gsmin', 'C', 'D', 'Cs', 'Cmin', 'Bmin', 'Gmin', 'Asmin', 'Gmin']


In [57]:
# test log-likelihood
song_augmented = ['C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'F', 'C', 'F', 'C', 'G', 'C', 'F', 'C', 'E', 'Amin', 'C', 'F', 'G', 'C', 'D', 'G', 'D', 'G', 'D', 'A', 'D', 'G', 'D', 'Fs', 'Bmin', 'D', 'G', 'A', 'D', 'G', 'A', 'D']
print(song_log_likelihood_ngram(song_augmented, 1, unigram_probs))
print(song_log_likelihood_ngram(song_augmented, 2, bigram_probs))
print(song_log_likelihood_ngram(song_augmented, 3, trigram_probs))

-151.97528736047866
-209.75155280296457
-206.57349897261662


  p = float(ngram_probs[target])


In [58]:
total_loglik_uni = 0.0
total_loglik_bi  = 0.0
total_loglik_tri = 0.0
dfaugmented = pd.read_csv("chordonomicon_v2_augmented.csv")
import ast
dfaugmented["chords"] = dfaugmented["chords"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
for i in range(len(dfaugmented)):
    song = dfaugmented["chords"].iloc[i]
    ll_uni = song_log_likelihood_ngram(song, 1, unigram_probs)
    ll_bi  = song_log_likelihood_ngram(song, 2, bigram_probs)
    ll_tri = song_log_likelihood_ngram(song, 3, trigram_probs)
    total_loglik_uni += ll_uni
    total_loglik_bi  += ll_bi
    total_loglik_tri += ll_tri


  p = float(ngram_probs[target])


In [59]:
print("Total unigram log-likelihood: ", total_loglik_uni)
print("Total bigram log-likelihood:  ", total_loglik_bi)
print("Total trigram log-likelihood: ", total_loglik_tri)
avg_uni = total_loglik_uni / len(dfaugmented)
avg_bi  = total_loglik_bi  / len(dfaugmented)  
avg_tri = total_loglik_tri / len(dfaugmented)
print("Average unigram log-likelihood per song: ", avg_uni)
print("Average bigram log-likelihood per song:  ", avg_bi)
print("Average trigram log-likelihood per song: ", avg_tri)

Total unigram log-likelihood:  -295845678.4015553
Total bigram log-likelihood:   -254828038.64340723
Total trigram log-likelihood:  -251579749.82337487
Average unigram log-likelihood per song:  -289.41487977299926
Average bigram log-likelihood per song:   -249.28884060516054
Average trigram log-likelihood per song:  -246.1111598514751


In [60]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 2, bigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.0667
Top-5: 0.1333


In [61]:
# Test on top-k accuracy
print(f"Top-1: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=1):.4f}")
print(f"Top-3: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=3):.4f}")
print(f"Top-5: {top_k_accuracy_ngram(seq, 3, trigram_probs, k=5):.4f}")

Top-1: 0.0000
Top-3: 0.0000
Top-5: 0.0714
