In [2]:
import librosa
import librosa.display
import os
import numpy as np
from torch.utils.data.dataset import random_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

In [3]:
file_repo = '../CMR_subset_1.0/audio'

ss = StandardScaler()

# extract features from all the files in the repo
def extract_features(file_repo):
  features = []
  for file in os.listdir(file_repo):
    if file.endswith(".wav"):
      file_path = os.path.join(file_repo, file)
      y, sr = librosa.load(file_path)
      chroma = librosa.feature.chroma_stft(y=y, sr=sr)
      mfccs = librosa.feature.mfcc(y=y, sr=sr)
      mag_spec = np.abs(librosa.stft(y))
      mag_spec = mag_spec[:8, :]
      combined_features = np.vstack((chroma, mfccs, mag_spec)).T
      ss.fit(combined_features)
      features.append(combined_features)
  return features

extracted_features = extract_features(file_repo)

In [4]:
file_path = '../CMR_subset_1.0/CMRdataset.csv'

def extract_labels(file_path):
  labels = []
  with open(file_path, 'r') as f:
    # leave out the header
    next(f)
    for line in f:
      line = line.strip()
      line = line.split(',')
      labels.append(int(line[-1]))
  return labels

extracted_labels = extract_labels(file_path)

In [5]:
for i in range(len(extracted_features)):
  num_steps, num_features = extracted_features[i].shape
  if num_steps != 5168:
    for j in range(5168 - num_steps):
      extracted_features[i] = np.vstack((extracted_features[i], np.zeros(num_features)))
      
extracted_features = np.array(extracted_features).reshape(len(extracted_features), 1, 5168, 40)
print(extracted_features.shape)

(118, 1, 5168, 40)


In [6]:
label_encoder = LabelEncoder()

extracted_labels = label_encoder.fit_transform(extracted_labels)
print(extracted_labels.shape)

extracted_features = torch.from_numpy(extracted_features).float()
extracted_labels = torch.tensor(extracted_labels).long().squeeze()

(118,)


In [7]:
dataset = TensorDataset(extracted_features, extracted_labels)

total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = (total_size - train_size) // 2
test_size = total_size - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 82
Validation dataset size: 18
Test dataset size: 18


### CNN Model for Rhythmic beat classification

In [10]:
class Reshape(nn.Module):
  def __init__(self, shape):
    super(Reshape, self).__init__()
    self.shape = shape

  def forward(self, x):
    return x.view(self.shape)

class CNN(nn.Module):
  def __init__(
    self,
    kernel_size=3,
    stride=1,
    padding=1,
    dropout=0.2,
    learning_rate=0.001,
    batch_size=64,
    num_epochs=5
  ):
    super(CNN, self).__init__()
    
    self.conv1 = nn.Conv2d(
      in_channels=1, 
      out_channels=16, 
      kernel_size=kernel_size, 
      stride=stride,
      padding=padding
    )
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    
    self.conv2 = nn.Conv2d(
      in_channels=16, 
      out_channels=32,
      kernel_size=kernel_size,
      stride=stride,
      padding=padding
    )
    self.dim1 = 1292
    self.dim2 = 10
    
    self.adjust_shape = Reshape((-1, 32*self.dim1*self.dim2))
    
    self.dropout = nn.Dropout(p=dropout)
    self.fc = nn.Linear(32*self.dim1*self.dim2, 4)
    self.softmax = nn.Softmax(dim=1)
    
    self.layers = [
      self.conv1,         
      self.pool,          
      self.relu,          
      self.conv2,         
      self.pool,          
      self.relu,          
      self.dropout,       
      self.adjust_shape,  
      self.fc,            
      self.softmax        
    ]
    
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    self.num_epochs = num_epochs
    self.batch_size = batch_size

  def forward(self, x):
    x = self.conv1(x)
    x = self.pool(x)
    x = self.relu(x)
    
    x = self.conv2(x)
    x = self.pool(x)
    x = self.relu(x)
    
    x = self.dropout(x)
    x = self.adjust_shape(x)
    x = self.fc(x)
    x = self.softmax(x)
    return x
  
  def train_model(self, train_dataset, val_dataset):
    train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=True)
    
    train_loss, val_loss, accuracy, val_accuracy = 0,0,0,0
    
    for epoch in range(self.num_epochs):
      self.train()
      train_loss = 0.0
      correct = 0
      total = 0
      
      val_loss = 0.0
      val_correct = 0
      val_total = 0
      
      for inputs, labels in train_loader:
        self.optimizer.zero_grad()
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        loss.backward(retain_graph=True)
        self.optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

      self.eval()
      for inputs, labels in val_loader:
        self.optimizer.zero_grad()
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        loss.backward(retain_graph=True)
        self.optimizer.step()

        val_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        val_total += labels.size(0)
        val_correct += (predicted == labels).sum().item()

      val_accuracy = val_correct / val_total
      accuracy = correct / total

      print(
        f'Epoch {epoch+1}/{self.num_epochs}, '
        f'Loss(Train): {train_loss:.4f}, '
        f'Accuracy(Train): {accuracy:.2f}, '
        f'Loss(Val): {val_loss:.4f}, '
        f'Accuracy(Val): {val_accuracy:.2f}'
      )
    return train_loss, val_loss, accuracy, val_accuracy
  
  def predict(self, pred_dataset):
    pred_loader = DataLoader(pred_dataset, batch_size=self.batch_size, shuffle=False)
    self.eval()
    predictions = []
    labels_true = []
    for inputs, labels in pred_loader:
      outputs = self(inputs)
      _, predicted = torch.max(outputs.data, 1)
      predictions.extend(predicted.tolist())
      labels_true.extend(labels.numpy())
    return predictions, labels_true

In [11]:
# train the model
model = CNN()
train_loss, val_loss, accuracy, val_accuracy = model.train_model(train_dataset, val_dataset)

print(f'Final Accuracy(Train): {accuracy:.2f}, Final Accuracy(Val): {val_accuracy:.2f}')

Epoch 1/5, Loss(Train): 3.0066, Accuracy(Train): 0.22, Loss(Val): 1.5214, Accuracy(Val): 0.22
Epoch 2/5, Loss(Train): 3.0550, Accuracy(Train): 0.24, Loss(Val): 1.5214, Accuracy(Val): 0.22
Epoch 3/5, Loss(Train): 2.9353, Accuracy(Train): 0.24, Loss(Val): 1.5214, Accuracy(Val): 0.22
Epoch 4/5, Loss(Train): 3.0151, Accuracy(Train): 0.24, Loss(Val): 1.5214, Accuracy(Val): 0.22
Epoch 5/5, Loss(Train): 3.0151, Accuracy(Train): 0.24, Loss(Val): 1.5214, Accuracy(Val): 0.22
Final Accuracy(Train): 0.24, Final Accuracy(Val): 0.22


In [12]:
predictions_val, labels_true_val = model.predict(val_dataset)
predictions_val = label_encoder.inverse_transform(predictions_val)
labels_true_val = label_encoder.inverse_transform(labels_true_val)

print(f'predictions_val: {predictions_val}')
print(f'labels_true_val: {labels_true_val}')

print(accuracy_score(labels_true_val, predictions_val))
print(classification_report(labels_true_val, predictions_val, zero_division=0))

predictions_val: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
labels_true_val: [5 7 3 7 5 7 7 3 3 7 7 5 5 7 7 3 8 7]
0.2222222222222222
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           5       0.22      1.00      0.36         4
           7       0.00      0.00      0.00         9
           8       0.00      0.00      0.00         1

    accuracy                           0.22        18
   macro avg       0.06      0.25      0.09        18
weighted avg       0.05      0.22      0.08        18



In [13]:
### Test the model
predictions, labels_true = model.predict(test_dataset)
predictions = label_encoder.inverse_transform(predictions)
labels_true = label_encoder.inverse_transform(labels_true)

print(f'Predictions: {predictions}')
print(f'Labels: {labels_true}')

print(accuracy_score(labels_true, predictions))
print(classification_report(labels_true, predictions, zero_division=0))

Predictions: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
Labels: [7 7 5 8 3 7 7 5 3 7 8 5 3 8 5 3 3 8]
0.2222222222222222
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           5       0.22      1.00      0.36         4
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         4

    accuracy                           0.22        18
   macro avg       0.06      0.25      0.09        18
weighted avg       0.05      0.22      0.08        18



### Decision Tree Model for Rhythmic beat classification

In [19]:
# use sklearn decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

clf = DecisionTreeClassifier()

extracted_features_dt = extracted_features.reshape(len(extracted_features), -1)

X_train, X_test, y_train, y_test = train_test_split(extracted_features_dt, extracted_labels, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)
print(accuracy_score(y_train, clf.predict(X_train)))

y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(label_encoder.inverse_transform(y_pred))
print(label_encoder.inverse_transform(y_test))

1.0
0.2777777777777778
[5 8 5 3 5 3 3 3 8 3 7 8 5 8 5 8 5 7]
[5 7 3 8 3 3 3 3 7 8 5 5 8 7 7 8 3 5]


### Random Forest Model for Rhythmic beat classification

In [29]:
# Random forest from sklearn
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()

rf_clf.fit(X_train, y_train)
print(accuracy_score(y_train, rf_clf.predict(X_train)))

y_pred = rf_clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(label_encoder.inverse_transform(y_pred))
print(label_encoder.inverse_transform(y_test))

1.0
0.05555555555555555
[3 8 3 3 5 7 7 5 3 3 3 7 5 5 3 7 5 7]
[5 7 3 8 3 3 3 3 7 8 5 5 8 7 7 8 3 5]
