In [None]:
!pip install safetensors



In [None]:
!pip install optuna torch pandas scikit-learn # run if optuna is not installed



In [None]:
import torch
import torch.nn as nn
import pandas as pd
import random
from torch.utils.data import Dataset, DataLoader
from fractions import Fraction

# Load the dataset from the CSV file (ensure output_FullDataset.csv is uploaded to
# your Colab environment / present in the working directory).
df = pd.read_csv('output_FullDataset.csv')

# Inspect the first few rows and columns to verify column names.
# The cells below read the columns 'Note', 'Octave', 'Duration', 'Type',
# 'Active Chord' and 'Chord Duration'.
print(df.head())
print("Columns:", df.columns)

# Build a vocabulary mapping from note tokens to unique integers (reserve 0 for padding)
def build_note_vocab(dataframe):
    """Map every distinct note token to a positive integer id.

    Only rows whose 'Type' column equals "Note" contribute. Each 'Note' cell is
    split on whitespace, and ids are handed out in sorted token order starting
    at 1, which leaves 0 unused (reserved for padding).
    """
    note_cells = dataframe.loc[dataframe['Type'] == "Note", 'Note']
    unique_tokens = {token for cell in note_cells for token in cell.split()}
    return {
        token: position
        for position, token in enumerate(sorted(unique_tokens), start=1)
    }

# Note-token -> integer id lookup; ids start at 1, so 0 stays free for padding.
note2idx = build_note_vocab(df)
print("Note Vocabulary mapping:", note2idx)

# Build a vocabulary mapping for chord labels from the 'Active Chord' column
def build_chord_vocab(dataframe):
    """Assign a zero-based class index to each distinct chord label.

    Looks at the 'Active Chord' column of rows whose 'Type' is "Note", drops
    missing values, converts the remaining distinct labels to strings and
    numbers them in sorted order.
    """
    is_note = dataframe['Type'] == "Note"
    distinct_chords = dataframe.loc[is_note, 'Active Chord'].dropna().unique()
    labels = sorted(map(str, distinct_chords))
    return dict(zip(labels, range(len(labels))))

# Chord-label -> class index lookup; len(chord2idx) is used later as the
# number of output classes of the chord classifier.
chord2idx = build_chord_vocab(df)
print("Chord mapping:", chord2idx)

                    File           Part  Measure          Note  Octave  \
0  1974%20Blues.musicxml  MusicXML Part        1  F2.A2.C3.E-3     NaN   
1  1974%20Blues.musicxml  MusicXML Part        1             D     3.0   
2  1974%20Blues.musicxml  MusicXML Part        1             D     3.0   
3  1974%20Blues.musicxml  MusicXML Part        1             E     3.0   
4  1974%20Blues.musicxml  MusicXML Part        1             B     2.0   

  Duration   Type Active Chord Chord Duration  
0      0.0  Chord           F7            2.0  
1      0.5   Note           F7            2.0  
2      0.5   Note           F7            2.0  
3      0.5   Note           F7            2.0  
4      0.5   Note           F7            2.0  
Columns: Index(['File', 'Part', 'Measure', 'Note', 'Octave', 'Duration', 'Type',
       'Active Chord', 'Chord Duration'],
      dtype='object')
Note Vocabulary mapping: {'A': 1, 'A#': 2, 'A-': 3, 'B': 4, 'B#': 5, 'B-': 6, 'B--': 7, 'C': 8, 'C#': 9, 'C-': 10, 'D': 11

In [None]:
# Enharmonic equivalents dictionary: here '-' is used to denote flats.
# The table is symmetric (each sharp maps to its flat spelling and back) and
# covers only the five "black key" pitch classes. Other spellings that appear
# in the note vocabulary printed above (e.g. 'B#', 'C-', 'B--') have no entry
# and are therefore never changed by augment_notes.
enharmonic_equivalents = {
    'C#': 'D-', 'D-': 'C#',
    'D#': 'E-', 'E-': 'D#',
    'F#': 'G-', 'G-': 'F#',
    'G#': 'A-', 'A-': 'G#',
    'A#': 'B-', 'B-': 'A#'
}

def augment_notes(note_str, p=0.5):
    """
    Randomly respell notes with their enharmonic equivalent.

    The string is split on whitespace; each token that has an entry in
    enharmonic_equivalents is swapped with probability p, every other token is
    kept. The tokens are re-joined with single spaces.
    """
    def maybe_swap(token):
        # A random number is drawn only for tokens that can actually be swapped.
        if token in enharmonic_equivalents and random.random() < p:
            return enharmonic_equivalents[token]
        return token

    return " ".join(maybe_swap(token) for token in note_str.split())

def tokenize_notes(note_str, mapping):
    """
    Translate a whitespace-separated note string into a list of integer ids.

    Tokens that are not keys of `mapping` are skipped silently.
    """
    ids = []
    for token in note_str.split():
        if token in mapping:
            ids.append(mapping[token])
    return ids

def process_octaves(octave_input, max_seq_length):
    """
    Convert octave information into a fixed-length list of ints.

    - A string is treated as space-separated octave values; both "3 4" and
      float-formatted text such as "3.0 4.0" are accepted.
    - Any other non-None value (int, float, NumPy scalar) is a single octave.
    - None yields no values; a NaN value (e.g. an empty CSV cell) becomes 0,
      the padding value, instead of raising.

    The result is right-padded with 0 or truncated so that it has exactly
    max_seq_length entries.

    Raises:
        ValueError: if a value cannot be interpreted as a number.
    """
    if octave_input is None:
        raw_values = []
    elif isinstance(octave_input, str):
        raw_values = octave_input.split()
    else:
        raw_values = [octave_input]

    tokens = []
    for raw in raw_values:
        # Parse via float first: int("3.0") would raise, float("3.0") does not.
        number = float(raw)
        # NaN is the only value that is not equal to itself. Keep its slot
        # (as padding) so positions stay aligned with the note sequence.
        tokens.append(0 if number != number else int(number))

    # Pad or truncate the list to max_seq_length
    if len(tokens) < max_seq_length:
        tokens = tokens + [0] * (max_seq_length - len(tokens))
    else:
        tokens = tokens[:max_seq_length]
    return tokens

def process_note_durations(duration_str, max_seq_length):
    """
    Convert duration information into a fixed-length list of floats.

    - A string is treated as space-separated durations; each token may be a
      decimal ("0.5") or a fraction ("1/3").
    - Any other non-None value (int, float, NumPy scalar) is a single
      duration. This matters when pandas parses the column, or part of it,
      as numeric instead of text.
    - None yields no values.

    A token that cannot be parsed, and any NaN value, becomes 0.0 so that a
    single bad cell cannot inject NaN into the model input.

    The result is right-padded with 0.0 or truncated so that it has exactly
    max_seq_length entries.
    """
    if duration_str is None:
        raw_values = []
    elif isinstance(duration_str, str):
        raw_values = duration_str.split()
    else:
        raw_values = [duration_str]

    tokens = []
    for raw in raw_values:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            try:
                # Handles fractional notation such as "1/3".
                value = float(Fraction(raw))
            except (TypeError, ValueError, ArithmeticError):
                # Unparseable text, or a zero denominator such as "1/0".
                value = 0.0
        # NaN is the only value that is not equal to itself.
        tokens.append(0.0 if value != value else value)

    if len(tokens) < max_seq_length:
        tokens = tokens + [0.0] * (max_seq_length - len(tokens))
    else:
        tokens = tokens[:max_seq_length]
    return tokens

In [None]:
from fractions import Fraction

def convert_to_float(value):
    """
    Return `value` as a float.

    Plain numeric input is handled by float(); if that raises ValueError
    (as it does for fraction text such as '2/3'), the value is parsed as a
    Fraction and then converted.
    """
    try:
        result = float(value)
    except ValueError:
        result = float(Fraction(value))
    return result

class MusicDataset(Dataset):
    """Dataset of note rows, each paired with a chord label and chord duration.

    Each item is ``((tokens, octaves, note_durations), (chord_label, chord_duration))``
    where the three inputs are 1-D tensors of length ``max_seq_length`` and the
    two targets are scalar tensors.
    """

    def __init__(self, dataframe, note_mapping, chord_mapping, max_seq_length=32, augment=False):
        # Keep only "Note" rows that carry a chord label; renumber them from 0.
        is_note = dataframe['Type'] == "Note"
        has_chord = dataframe['Active Chord'].notna()
        self.data = dataframe[is_note & has_chord].reset_index(drop=True)

        self.note_mapping = note_mapping
        self.chord_mapping = chord_mapping
        self.max_seq_length = max_seq_length
        self.augment = augment

    def __len__(self):
        """Number of usable rows after filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the input/target tensors for the row at position ``idx``."""
        row = self.data.iloc[idx]
        max_len = self.max_seq_length

        # Note ids, optionally after enharmonic augmentation, padded with 0
        # (or truncated) to max_len entries.
        note_str = augment_notes(row['Note']) if self.augment else row['Note']
        note_ids = tokenize_notes(note_str, self.note_mapping)
        note_ids = (note_ids + [0] * max_len)[:max_len]

        # Octave and per-note duration sequences (the helpers pad/truncate).
        octave_ids = process_octaves(row['Octave'], max_len)
        duration_values = process_note_durations(row['Duration'], max_len)

        # Targets: chord class index and chord duration.
        chord_index = self.chord_mapping[str(row['Active Chord'])]
        chord_length = convert_to_float(row['Chord Duration'])

        inputs = (
            torch.tensor(note_ids, dtype=torch.long),
            torch.tensor(octave_ids, dtype=torch.long),
            torch.tensor(duration_values, dtype=torch.float),
        )
        targets = (
            torch.tensor(chord_index, dtype=torch.long),
            torch.tensor(chord_length, dtype=torch.float),
        )
        # (inputs, (classification target, regression target))
        return inputs, targets

# Create the dataset and DataLoader; set augment=True to enable augmentation.
# max_seq_length=32 here must match the max_seq_length the model is built with.
dataset = MusicDataset(df, note2idx, chord2idx, max_seq_length=32, augment=True)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Test by retrieving one batch
# Each batch keeps the nested structure returned by MusicDataset.__getitem__:
# ((tokens, octaves, note_durations), (chord_labels, chord_durations)).
for batch in dataloader:
    inputs, targets = batch
    tokens, octaves, note_durations = inputs
    chord_labels, chord_durations = targets
    print("Tokens shape:", tokens.shape)
    print("Octaves shape:", octaves.shape)
    print("Note durations shape:", note_durations.shape)
    print("Chord labels shape:", chord_labels.shape)
    print("Chord durations shape:", chord_durations.shape)
    break  # one batch is enough for the shape check


Tokens shape: torch.Size([32, 32])
Octaves shape: torch.Size([32, 32])
Note durations shape: torch.Size([32, 32])
Chord labels shape: torch.Size([32])
Chord durations shape: torch.Size([32])


In [None]:
import torch.nn as nn

class ChordPredictor(nn.Module):
    """Transformer encoder that predicts a chord class and a chord duration.

    Notes, octaves and note durations are embedded separately, summed together
    with a learned positional embedding, passed through a Transformer encoder,
    and the representation at the first sequence position feeds two heads:
    a classifier over chord labels and a scalar duration regressor.

    Args:
        vocab_size: number of note ids, including the padding id 0.
        embed_dim: width of all embeddings (the Transformer d_model).
        num_heads: attention heads per encoder layer.
        hidden_dim: feed-forward width inside the encoder and of the first
            post-encoder layer (the second one is hidden_dim // 2 wide).
        num_layers: number of encoder layers.
        num_classes: number of chord classes.
        max_seq_length: longest sequence the positional embedding covers.
        num_octaves: size of the octave embedding table; octave ids must be
            in [0, num_octaves).
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_seq_length, num_octaves=10):
        super().__init__()

        # Embeddings
        self.note_embed = nn.Embedding(vocab_size, embed_dim)
        self.octave_embed = nn.Embedding(num_octaves, embed_dim)
        self.duration_linear = nn.Linear(1, embed_dim)

        # Learned positional encoding, one vector per position.
        self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_length, embed_dim))

        # Transformer Encoder (sequence-first layout, see forward()).
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Post-transformer processing
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)

        # Output heads
        self.fc_class = nn.Linear(hidden_dim // 2, num_classes)      # Chord classification
        self.fc_duration = nn.Linear(hidden_dim // 2, 1)             # Chord duration regression

    def forward(self, tokens, octaves, note_durations):
        """Run the model on a batch.

        Args:
            tokens: LongTensor [B, L] of note ids.
            octaves: LongTensor [B, L] of octave ids.
            note_durations: FloatTensor [B, L] of per-note durations.

        L may be any length up to max_seq_length.

        Returns:
            (chord_logits [B, num_classes], chord_duration [B]).
        """
        token_emb = self.note_embed(tokens)                               # [B, L, D]
        octave_emb = self.octave_embed(octaves)                           # [B, L, D]
        duration_emb = self.duration_linear(note_durations.unsqueeze(-1)) # [B, L, D]

        # Slice the positional table to the actual length so that inputs
        # shorter than max_seq_length broadcast correctly; for full-length
        # inputs the slice is the whole table.
        seq_len = tokens.size(1)
        positions = self.pos_embedding[:, :seq_len]                       # [1, L, D]

        x = token_emb + octave_emb + duration_emb + positions             # [B, L, D]
        x = x.permute(1, 0, 2)  # Transformer expects [L, B, D]

        # No padding mask is passed: padded positions are attended to like
        # any other position.
        x = self.transformer_encoder(x)
        pooled = x[0]  # Use first token's representation [B, D]

        x = self.fc1(pooled)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)

        chord_logits = self.fc_class(x)
        chord_duration = self.fc_duration(x).squeeze(-1)

        return chord_logits, chord_duration


In [None]:
from safetensors.torch import load_file as load_safetensors
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

# ----- Recreate the Model -----
# The hyperparameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict() fails on mismatched tensor shapes.
model_pred = ChordPredictor(
    vocab_size=len(note2idx) + 1,  # +1 for padding
    embed_dim=32,
    num_heads=4,
    hidden_dim=256,
    num_layers=4,
    num_classes=len(chord2idx),
    max_seq_length=32,
    num_octaves=10
)

# ----- Load weights from .safetensors -----
safetensors_path = "chord_predictor.safetensors"  # Replace with your actual path
state_dict = load_safetensors(safetensors_path)
model_pred.load_state_dict(state_dict)
model_pred.eval()  # disables dropout for evaluation

# ----- Prepare Dataset & Dataloader -----
# augment=False and shuffle=False keep the evaluation deterministic and keep
# predictions in the same order as the rows of the dataset.
# NOTE(review): this evaluates on the full `df`; whether that overlaps the
# data the checkpoint was trained on is not visible here, so confirm.
test_dataset = MusicDataset(df, note2idx, chord2idx, max_seq_length=32, augment=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# ----- Evaluation -----
all_true_labels = []
all_pred_labels = []
all_true_durations = []
all_pred_durations = []
all_logits = []  # per-batch logits, concatenated below for the top-k metrics

with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        tokens, octaves, durations = inputs
        chord_labels, chord_durations = targets

        chord_logits, predicted_durations = model_pred(tokens, octaves, durations)

        # Get predicted chord labels (classification)
        pred_labels = torch.argmax(chord_logits, dim=1)

        # Collect all values for evaluation
        all_true_labels.extend(chord_labels.tolist())
        all_pred_labels.extend(pred_labels.tolist())
        all_true_durations.extend(chord_durations.tolist())
        all_pred_durations.extend(predicted_durations.tolist())
        all_logits.append(chord_logits)

# Logits for every evaluated sample, shape [N, num_classes], in the same order
# as all_true_labels. The top-k accuracy cell further down reads this name.
chord_logits_all = (
    torch.cat(all_logits, dim=0) if all_logits else torch.empty(0, len(chord2idx))
)



In [None]:
# Chord Similarity
def parse_chord(chord_str):
    """
    Split a chord label into (root, quality).

    The root is the first character plus an optional single accidental
    ('#' for sharp, '-' for flat). The quality is derived from the rest of
    the label only, so the root's own flat sign is not mistaken for a minor
    marker (e.g. "B-7" is a dominant chord on B-flat, not a minor chord).

    Quality is the first match of:
        "maj" or "M"  -> "maj"
        "dim"         -> "dim"   (tested before "m", which "dim" contains)
        "m" or "-"    -> "min"
        "o"           -> "dim"
        "7" or "9"    -> "dom"
        anything else -> "other"

    An empty label returns ("", "other") instead of raising IndexError.
    """
    if not chord_str:
        return "", "other"

    has_accidental = len(chord_str) > 1 and chord_str[1] in ('#', '-')
    root = chord_str[:2] if has_accidental else chord_str[0]
    suffix = chord_str[len(root):]

    if "maj" in suffix or "M" in suffix:
        quality = "maj"
    elif "dim" in suffix:
        quality = "dim"
    elif "m" in suffix or "-" in suffix:
        quality = "min"
    elif "o" in suffix:
        quality = "dim"
    elif "7" in suffix or "9" in suffix:
        quality = "dom"
    else:
        quality = "other"
    return root, quality

def compute_chord_similarity(true_labels, pred_labels, idx2chord):
    """
    Fraction of predictions that are harmonically related to the true chord.

    A prediction counts as a match when it shares the root OR the quality
    (as returned by parse_chord) with the true chord. Identical indices always
    match. If either index has no entry in idx2chord (and the two differ), the
    pair counts as a miss rather than being parsed as an empty label.

    Args:
        true_labels: sequence of true chord indices.
        pred_labels: sequence of predicted chord indices, same length.
        idx2chord: dict mapping chord index -> chord label string.

    Returns:
        matches / len(true_labels), or 0.0 when there are no labels.

    Raises:
        ValueError: if the two label sequences differ in length.
    """
    total = len(true_labels)
    if total != len(pred_labels):
        raise ValueError(
            f"true_labels and pred_labels differ in length: {total} vs {len(pred_labels)}"
        )
    if total == 0:
        return 0.0

    matches = 0
    for t, p in zip(true_labels, pred_labels):
        if t == p:
            # Same chord: same root and quality by definition.
            matches += 1
            continue
        chord_t = idx2chord.get(t)
        chord_p = idx2chord.get(p)
        if not chord_t or not chord_p:
            continue  # unknown index: nothing to compare
        root_t, qual_t = parse_chord(chord_t)
        root_p, qual_p = parse_chord(chord_p)
        if root_t == root_p or qual_t == qual_p:
            matches += 1
    return matches / total

# Invert chord2idx (class index -> chord label) so predicted indices can be
# turned back into chord names for comparison.
idx2chord = {v: k for k, v in chord2idx.items()}
# all_true_labels / all_pred_labels are the lists filled by the evaluation loop.
chord_similarity_score = compute_chord_similarity(all_true_labels, all_pred_labels, idx2chord)
print(f"Harmonic Similarity Score: {chord_similarity_score:.2%}")

Harmonic Similarity Score: 54.45%


In [None]:
def compute_top_k_accuracies(chord_logits, true_labels, ks=(2, 5, 10, 15)):
    """
    Computes Top-K accuracies for multiple values of K.

    Args:
        chord_logits: tensor (or nested list) of shape [N, num_classes].
        true_labels: list or tensor of N true class indices.
        ks: iterable of K values. A K larger than num_classes is evaluated
            over all classes instead of raising.

    Returns:
        A dictionary of {k: accuracy}, keyed by the K values as passed in.
        Empty `ks` gives {}; with zero samples every accuracy is 0.0.

    Raises:
        ValueError: if the number of logit rows and labels differ.
    """
    ks = list(ks)
    if not ks:
        return {}

    logits = torch.as_tensor(chord_logits)
    labels = torch.as_tensor(true_labels, dtype=torch.long, device=logits.device).reshape(-1, 1)
    num_samples = labels.shape[0]

    if logits.shape[0] != num_samples:
        raise ValueError(
            f"chord_logits has {logits.shape[0]} rows but true_labels has {num_samples} entries"
        )
    if num_samples == 0:
        return {k: 0.0 for k in ks}

    # torch.topk raises if k exceeds the size of the dimension, so clamp it.
    max_k = min(max(ks), logits.shape[1])
    topk_preds = torch.topk(logits, k=max_k, dim=1).indices  # [N, max_k], best first

    # hits[i, j] is True when the j-th best guess for sample i is correct.
    hits = topk_preds == labels

    topk_results = {}
    for k in ks:
        correct = hits[:, :k].any(dim=1).sum().item()
        topk_results[k] = correct / num_samples

    return topk_results


In [None]:
# Ensure these are defined:
# chord_logits_all: Tensor of shape [N, num_classes]
# all_true_labels: List or Tensor of true class indices
# chord_logits_all must hold the logits for the same samples, in the same
# order, as all_true_labels (i.e. the per-batch chord_logits from the
# evaluation loop concatenated along dim 0).

topk_scores = compute_top_k_accuracies(chord_logits_all, all_true_labels, ks=[2, 5, 7, 10, 15 ,25])

# Print the results
for k, acc in topk_scores.items():
    print(f"Top-{k} Accuracy: {acc:.2%}")


Top-2 Accuracy: 19.78%
Top-5 Accuracy: 36.98%
Top-7 Accuracy: 45.55%
Top-10 Accuracy: 55.10%
Top-15 Accuracy: 65.73%
Top-25 Accuracy: 79.08%
