In [24]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import sys
sys.path.append("..") # Add parent directory to sys.path to access preprocessing module

from preprocessing.feature_extractor import FeatureExtractor

In [25]:
data_dir = "../../data/Composer_Dataset"
composers = ["Bach", "Beethoven", "Chopin", "Mozart"]
scalar_features, multidimensional_features = FeatureExtractor.extract_features_for_directory(data_dir, composers)

Loading existing features from ../../data/Composer_Dataset/extracted_features.pkl


In [26]:
scalar_features.head()

Unnamed: 0,max_independent_voices,avg_independent_voices,var_independent_voices,avg_simultaneity,var_simultaneity,note_density,avg_note_duration,var_note_duration,initial_tempo,time_signature_numerator,...,perfect_vertical_intervals,vertical_minor_seconds,vertical_thirds,vertical_fifths,vertical_tritones,vertical_octaves,avg_chord_duration,length,file_name,composer
0,4.0,3.899642,0.300477,3.72,1.020588,6.439026,0.621212,0.296121,143.000038,4.0,...,0.391195,0.0,0.327044,0.2,0.045283,0.122013,0.465,29.999992,042100b_.mid,Bach
1,4.0,3.986667,0.114698,3.6,1.2,7.394595,0.526316,0.250579,189.176471,4.0,...,0.400447,0.00522,0.307979,0.168531,0.042506,0.123788,0.321429,25.0,043100b_.mid,Bach
2,7.0,2.664012,1.617631,2.331998,1.816512,6.292553,0.371665,0.686803,181.905446,4.0,...,0.378734,0.006634,0.29159,0.120602,0.057594,0.158185,0.177695,923.701235,Bwv0564-Toccata-Adagio-and-Fugue.mid,Bach
3,4.0,3.970874,0.16816,3.662222,1.112213,8.775758,0.455801,0.212811,177.509434,4.0,...,0.375205,0.009852,0.312808,0.130542,0.030378,0.115764,0.321875,22.5,027400b_.mid,Bach
4,4.0,3.917323,0.275394,3.455782,1.371385,5.195127,0.713614,0.442007,153.000153,4.0,...,0.478439,0.002738,0.309377,0.206023,0.016427,0.140999,0.47037,29.333304,026400b_.mid,Bach


In [27]:
print(f"Piano Roll Shape: {multidimensional_features[0]['piano_roll'].shape}")
print(f"Chroma Piano Roll Shape: {multidimensional_features[0]['chroma_piano_roll'].shape}")
print(f"Pitch Class Histogram Shape: {multidimensional_features[0]['pitch_class_histogram'].shape}")
print(f"Pitch Class Transition Matrix Shape: {multidimensional_features[0]['pitch_class_transition_matrix'].shape}")

Piano Roll Shape: (128, 299)
Chroma Piano Roll Shape: (12, 299)
Pitch Class Histogram Shape: (12,)
Pitch Class Transition Matrix Shape: (12, 12)


## CNN Example

### Train Val Test Split

In [115]:
train_split = 0.8  # ratio of all data to use for training
val_test_split = 0.5  # ratio of holdout data to use for test set

x = multidimensional_features
x2 = scalar_features.drop(columns=['composer', 'file_name'], axis=1)
y = scalar_features['composer']

# Normalize x2 with max min scaling

x2 = (x2 - x2.min()) / (x2.max() - x2.min())

#x2 = (x2 - x2.mean()) / x2.std()

print(x2.head())

# Print the max from each column
print(x2.max())

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Add x2 as additional feature to x1 dictionary
for i in range(len(x)):
    x[i]['scalar_features'] = x2.iloc[i].values

x_train, x_holdout, y_train, y_holdout = train_test_split(x, y, test_size=(1 - train_split), random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_holdout, y_holdout, test_size=val_test_split, random_state=1)



   max_independent_voices  avg_independent_voices  var_independent_voices  \
0                0.166667                0.420559                0.082873   
1                0.166667                0.433181                0.031634   
2                0.333333                0.241345                0.446149   
3                0.166667                0.430890                0.046379   
4                0.166667                0.423123                0.075955   

   avg_simultaneity  var_simultaneity  note_density  avg_note_duration  \
0          0.175759          0.127265      0.079933           0.360989   
1          0.168435          0.149638      0.097153           0.295509   
2          0.091051          0.226516      0.077293           0.188798   
3          0.172233          0.138691      0.122042           0.246853   
4          0.159634          0.171009      0.057517           0.424747   

   var_note_duration  initial_tempo  time_signature_numerator  ...  \
0           0.051324  

### Chunk Into Sequences

In [116]:
def chunk_sequences(X_in, y_in, feature, sequence_length=100):
    X_out = []
    y_out = []

    # make sequences split along the time axis
    for i in range(len(X_in)):
        for j in range(0, len(X_in[i][feature][1]) - sequence_length, sequence_length):
            X_out.append((X_in[i][feature][:, j:j + sequence_length], X_in[i]['scalar_features']))
            y_out.append(y_in[i])

    return X_out, y_out

### Setup

### Make Dataloader

In [117]:
class PianoRollDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X1 = torch.tensor(self.X[idx][0], dtype=torch.float32) 
        X2 = torch.tensor(self.X[idx][1], dtype=torch.float32) 
        y = torch.tensor(self.y[idx], dtype=torch.long)
        return X1, X2, y

In [118]:
def get_dataloaders(use_chroma, seq_length=100, batch_size=8):


    # Set random seed for PyTorch
    torch.manual_seed(42) 
    
    feature_set = 'chroma_piano_roll' if use_chroma else 'piano_roll'

    X_train_seq, y_train_seq = chunk_sequences(x_train, y_train, feature_set, seq_length)
    X_val_seq, y_val_seq = chunk_sequences(x_val, y_val, feature_set, seq_length)
    X_test_seq, y_test_seq = chunk_sequences(x_test, y_test, feature_set, seq_length)

    train_dataset = PianoRollDataset(X_train_seq, y_train_seq)
    val_dataset = PianoRollDataset(X_val_seq, y_val_seq)
    test_dataset = PianoRollDataset(X_test_seq, y_test_seq)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

### Define CNN

In [119]:
class PianoRollCNN(nn.Module):
    def __init__(self, h_params):
        super(PianoRollCNN, self).__init__()
        # Conv1d layers: input channels = 12, height = 1, width = 50

        X2_DIM = 31  # Number of scalar features

        input_channels = 12 if h_params['use_chroma'] else 128

        self.conv1 = nn.Conv1d(input_channels, h_params['conv1_out_channels'], 
                               kernel_size=h_params['kernel_size'], stride=h_params['stride'], 
                               padding=h_params['kernel_size'] // 2)
        self.conv2 = nn.Conv1d(h_params['conv1_out_channels'], h_params['conv2_out_channels'], 
                               kernel_size=h_params['kernel_size'], stride=h_params['stride'],
                               padding=h_params['kernel_size'] // 2)
        self.pool = nn.MaxPool1d(kernel_size=h_params['pool_kernel_size'], stride=h_params['pool_stride'], padding=0)

        def conv_output_size(input_size, kernel_size, stride, padding):
            return (input_size - kernel_size + 2 * padding) // stride + 1
    
        padding = h_params['kernel_size'] // 2  # Assuming same padding

        # First conv layer output size
        conv1_output_width = conv_output_size(int(h_params['seq_length']), h_params['kernel_size'], h_params['stride'], padding)
        # First pooling layer output size
        pooled_width = conv_output_size(conv1_output_width, h_params['pool_kernel_size'], h_params['pool_stride'], 0)
        # Second conv layer output size
        conv2_output_width = conv_output_size(pooled_width, h_params['kernel_size'], h_params['stride'], padding)
        # Second pooling layer output size
        pooled_width = conv_output_size(conv2_output_width, h_params['pool_kernel_size'], h_params['pool_stride'], 0)

        self.fc1 = nn.Linear(h_params['conv2_out_channels'] * pooled_width, h_params['fc1_out'])
        self.dropout = nn.Dropout(h_params['dropout'])
        self.fc2 = nn.Linear(h_params['fc1_out'] + X2_DIM, 4)  # Output of 4 classes (composers)

    def forward(self, x1, x2):
        x = x1.squeeze(2) # This can potentially be removed if the input is reshaped correctly in the dataloader
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1) 
        x = self.dropout(F.relu(self.fc1(x)))
        x = torch.cat((x, x2), dim=1)
        print("X1: ", x)
        x = self.fc2(x)
        print("X2: ", x)
        return x

In [120]:
def train_cnn(model, train_loader, val_loader, num_epochs, criterion, optimizer):

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        report_interval = 1000
        for i, data in enumerate(train_loader):
            input1, input2, labels = data
            optimizer.zero_grad()
            outputs = model(input1, input2)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % report_interval == (report_interval - 1):
                print(f'[Epoch {epoch + 1}, Batch {i + 1}] loss: {running_loss / report_interval:.3f}')
                running_loss = 0.0  # Reset running loss

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for data in val_loader:
                input1, input2, labels = data
                outputs = model(input1, input2)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Validation loss: {val_loss / len(val_loader):.3f}, Accuracy: {100 * correct / total:.2f}%')


### Evaluate Performance

In [121]:
def evaluate_performance(model, data_loader):
    model.eval()

    all_labels = []
    all_predictions = []

    # Disable gradient computation for evaluation
    with torch.no_grad():
        for data in data_loader:
            input1, input2, labels = data
            outputs = model(input1, input2)
            _, predicted = torch.max(outputs.data, 1)

            # Store true and predicted labels
            all_labels.extend(labels.numpy())
            all_predictions.extend(predicted.numpy())

    # Compute the classification report
    report = classification_report(all_labels, all_predictions, target_names=label_encoder.classes_)
    print(report)

    return report, all_labels, all_predictions

### Implement One Training Run

In [122]:
def training_run(h_params):

    seq_length = int(h_params['seq_length'])
    batch_size = int(h_params['batch_size'])

    train_loader, val_loader, test_loader = get_dataloaders(h_params['use_chroma'], seq_length, batch_size)

    model = PianoRollCNN(h_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=h_params['lr'])
    train_cnn(model, train_loader, val_loader, h_params['num_epochs'], criterion, optimizer)

    # TODO: Add model save 
    ###### Save results #####
    return model, test_loader

### Process All Hyperparameter Sets

In [123]:
# Function to process the file and train models
def process_file(file_path):
    infile = pd.read_csv(file_path)

    outputs = ['accuracy', 'bach_f1', 'beethoven_f1', 'chopin_f1', 'mozart_f1']
    floating_point_columns = ['lr']

    for index, row in infile.iterrows():

        # Check that the row has not been processed
        if not pd.isnull(row['accuracy']):
            continue

        # Create hyperparameter dictionary dynamically
        hyperparams = {col: row[col] for col in infile.columns if col not in outputs}
        # Convert int columns to int
        for col in infile.columns:
            if col not in floating_point_columns and col not in outputs:
                hyperparams[col] = int(hyperparams[col])
            elif col in floating_point_columns:
                hyperparams[col] = float(hyperparams[col])

        # Train model
        model, test_loader = training_run(hyperparams)

        # Evaluate performance
        report, all_labels, all_predictions = evaluate_performance(model, test_loader)

        # Write accuracy to data then save
        infile.at[index, 'accuracy'] = float(report.split('\n')[7].split()[1])
        infile.at[index, 'bach_f1'] = float(report.split('\n')[2].split()[3])
        infile.at[index, 'beethoven_f1'] = float(report.split('\n')[3].split()[3])
        infile.at[index, 'chopin_f1'] = float(report.split('\n')[4].split()[3])
        infile.at[index, 'mozart_f1'] = float(report.split('\n')[5].split()[3])
        infile.to_csv(file_path, index=False)


In [124]:
process_file('cnn_optimization.csv')

## Single Training Run

In [125]:
test_params = {}
test_params['use_chroma'] = True
test_params['seq_length'] = 50
test_params['batch_size'] = 8
test_params['lr'] = 0.00001
test_params['num_epochs'] = 1

test_params['conv1_out_channels'] = 32
test_params['conv2_out_channels'] = 64
test_params['kernel_size'] = 3
test_params['stride'] = 1

test_params['pool_kernel_size'] = 2
test_params['pool_stride'] = 2

test_params['dropout'] = 0.1
test_params['fc1_out'] = 128

# Print out the first item in the test dataloader
for i, data in enumerate(test_dataloader):
    input1, input2, labels = data
    print(input1.shape)
    print(input2.shape)
    print(input2[0])
    print(labels)
    break

model, test_dataloader = training_run(test_params)



report, _, _ = evaluate_performance(model, test_dataloader)

torch.Size([8, 12, 50])
torch.Size([8, 31])
tensor([-0.4131, -0.0516,  0.2149,  0.1595, -0.0317, -0.5345,  0.8691, -0.1194,
        -0.7721,  0.0998, -0.2751, -0.9863, -1.0813, -1.1279,  1.9923,  0.0462,
        -1.2953, -1.4681, -1.2407,  0.9614, -0.1272, -1.7183,  0.5731,  0.5383,
        -0.8542,  0.4843,  0.6355, -0.3982, -0.4510,  1.0463, -0.6040])
tensor([0, 0, 0, 0, 0, 0, 0, 0])


X1:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.2531, 0.0347, 0.1587],
        [0.0000, 0.0000, 0.0000,  ..., 0.2584, 0.1201, 0.0455],
        [0.0000, 0.0000, 0.0000,  ..., 0.1113, 0.0625, 0.1323],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.1417, 0.0614, 0.1556],
        [4.3114, 0.0000, 0.0000,  ..., 0.2122, 0.0432, 0.2836],
        [0.0000, 0.0000, 0.0000,  ..., 0.2136, 0.0089, 0.1356]],
       grad_fn=<CatBackward0>)
X2:  tensor([[-16.2699,  21.3080, -12.1977,  -7.3534],
        [ -2.5088,   4.8339,  -2.7518,   0.4699],
        [ -4.2377,   4.9900,  -0.7207,  -1.3677],
        [ -3.7010,   6.7867,  -6.2523,  -0.5817],
        [ -2.4100,   5.0588,  -2.5580,  -0.1660],
        [ -1.2585,   1.8928,  -2.1856,   0.0651],
        [  0.0333,   2.2561,  -0.2857,  -1.5715],
        [ -2.4020,   2.5648,  -1.8023,  -1.2900]], grad_fn=<AddmmBackward0>)
X1:  tensor([[1.8210e+00, 1.5283e+00, 0.0000e+00,  ..., 2.6058e-01, 6.9226e-03,
         7.0796e-02],
        [9.0277e+00, 0.0000e+00,

KeyboardInterrupt: 

In [None]:
import optuna

def objective(trial):
    # Define the hyperparameter search space
    h_params = {
        'use_chroma': False,
        'seq_length': trial.suggest_int('seq_length', 50, 400),
        'batch_size': trial.suggest_int('batch_size', 2, 64),
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'num_epochs': 3,
        'conv1_out_channels': trial.suggest_int('conv1_out_channels', 16, 64),
        'conv2_out_channels': trial.suggest_int('conv2_out_channels', 32, 128),
        'kernel_size': trial.suggest_int('kernel_size', 3, 9),
        'stride': 1,
        'pool_kernel_size': 2,
        'pool_stride': 2,
        'dropout': trial.suggest_float('dropout', 0.0, 0.5),
        'fc1_out': trial.suggest_int('fc1_out', 64, 256)
    }

    model, test_loader = training_run(h_params)
    report, _, _ = evaluate_performance(model, test_loader)

    return float(report.split('\n')[7].split()[1])

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)

[I 2024-07-30 22:06:12,309] A new study created in memory with name: no-name-5065eeda-4302-4dee-8819-ccb0377c7f26


tensor([[ 2.9581,  1.7765,  0.3818,  ..., -0.5110, -0.2803,  0.1372],
        [ 0.0000,  1.5622,  0.0000,  ...,  0.3263, -0.8878,  0.4417],
        [ 0.4649,  2.1049,  0.0000,  ..., -0.6339, -0.3677, -0.3116],
        [ 0.0000,  8.0648,  0.0000,  ...,  1.5634, -0.2976,  1.9033],
        [ 2.0789,  0.0000,  0.0000,  ...,  1.4761, -0.6070,  0.4875],
        [ 1.3217,  2.4439,  0.0000,  ...,  2.1110, -0.9436,  0.4132]],
       grad_fn=<CatBackward0>)
tensor([[ 0.0000,  0.0000,  0.0000,  ..., -1.2601,  0.0342,  1.1296],
        [ 0.0000,  0.6512,  0.0000,  ..., -0.3525,  0.3807,  1.0229],
        [ 1.6255,  2.8174,  0.0000,  ...,  0.0753, -0.3277,  2.0623],
        [ 0.0000,  7.3436,  1.5993,  ...,  1.3202, -0.5291,  0.8962],
        [ 0.0000,  0.0000,  0.0328,  ..., -1.0697,  0.2608,  0.4912],
        [ 0.9162,  4.9058,  1.2602,  ..., -0.7182, -0.3152,  0.1925]],
       grad_fn=<CatBackward0>)
tensor([[ 0.4176,  0.0000,  0.0000,  ...,  0.9329, -0.7669,  1.3667],
        [ 1.0258,  1.1321,

[W 2024-07-30 22:06:52,921] Trial 0 failed with parameters: {'seq_length': 72, 'batch_size': 6, 'lr': 6.156278420713227e-05, 'conv1_out_channels': 18, 'conv2_out_channels': 61, 'kernel_size': 6, 'dropout': 0.20918843531811682, 'fc1_out': 154} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/kdevoe/Documents/CS/Masters/AAI511/ms-aai-511-final-project/511_final_env/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/v3/6g8j12qj0yd6zlbh6nr578p80000gn/T/ipykernel_73684/3811368885.py", line 21, in objective
    model, test_loader = training_run(h_params)
  File "/var/folders/v3/6g8j12qj0yd6zlbh6nr578p80000gn/T/ipykernel_73684/2227660758.py", line 12, in training_run
    train_cnn(model, train_loader, val_loader, h_params['num_epochs'], criterion, optimizer)
  File "/var/folders/v3/6g8j12qj0yd6zlbh6nr578p80000gn/T/ipykernel_73684/2591230134.py", line 12, i

tensor([[    nan,     nan,     nan,  ...,  1.3222,  0.0795,  0.6263],
        [    nan,     nan,     nan,  ...,  0.3622, -0.8980, -0.0570],
        [    nan,     nan,     nan,  ...,  0.7950, -0.8006, -0.5978],
        [    nan,     nan,     nan,  ...,  0.0033,  1.1839, -0.1858],
        [    nan,     nan,     nan,  ..., -0.1664, -0.7849,  2.9028],
        [    nan,     nan,     nan,  ..., -0.1084, -0.5163,  1.1108]],
       grad_fn=<CatBackward0>)
tensor([[    nan,     nan,     nan,  ..., -0.4805, -0.8548, -0.4611],
        [    nan,     nan,     nan,  ...,  0.5408, -0.8457,  2.0627],
        [    nan,     nan,     nan,  ..., -0.0984, -0.0495,  0.1341],
        [    nan,     nan,     nan,  ..., -0.4322, -0.4856,  2.5456],
        [    nan,     nan,     nan,  ..., -0.1764,  2.0414, -0.6619],
        [    nan,     nan,     nan,  ...,  1.4979, -0.8124,  0.9637]],
       grad_fn=<CatBackward0>)
tensor([[    nan,     nan,     nan,  ...,  1.0440,  0.1049,  1.1219],
        [    nan,     nan,

KeyboardInterrupt: 

In [None]:
# Get results from the optimization

print(study.best_params)
print(study.best_value)

{'seq_length': 355, 'batch_size': 3, 'lr': 0.00022470939685615568, 'conv1_out_channels': 24, 'conv2_out_channels': 47, 'kernel_size': 6, 'dropout': 0.36957547648537353, 'fc1_out': 203}
0.45
