In [20]:
import pandas as pd
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import cv2
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

In [22]:
resnet50_model=torchvision.models.inception_v3(weights=torchvision.models.inception.Inception_V3_Weights)

In [23]:
resnet50_model

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [24]:
num_classes = 2
batch = 100
num_epochs = 3
learning_rate = 0.0001

In [25]:
def output_label(label):
    output_mapping = {
                 0: "Not Prog Rock",
                 1: "Prog rock"
                 }
    input = (label.item() if type(label) == torch.Tensor else label)
    return output_mapping[input]

In [26]:
data = np.load('mfcc.npz')['arr_0']
y = np.load('labels.npy')
x_dim = data.shape[1]
y_dim = data.shape[2]

In [31]:
data[0].shape

(20, 431)

In [27]:
total_samples = data.shape[0]
all_indices = [i for i in range(total_samples)]
np.random.shuffle(all_indices)
train_percentage=50
train_size = int(data.shape[0]/2)
X_train = []
y_train = []
X_test = []
y_test = []
for indice in all_indices[:train_size]:
    X_train.append(data[indice])
    y_train.append(y[indice])
for indice in all_indices[train_size:]:
    X_test.append(data[indice])
    y_test.append(y[indice])
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [28]:
tensor_X_train = torch.Tensor(X_train) 
tensor_y_train = torch.Tensor(y_train)
tensor_y_train=tensor_y_train.long()

traindataset = TensorDataset(tensor_X_train,tensor_y_train)
trainloader = DataLoader(traindataset,batch_size=batch)

tensor_X_test = torch.Tensor(X_test) 
tensor_y_test = torch.Tensor(y_test)
tensor_y_test=tensor_y_test.long()


testdataset = TensorDataset(tensor_X_test,tensor_y_test)
testloader = DataLoader(testdataset,batch_size=batch)

class MusicCNN(nn.Module):   
    def __init__(self):
        super(MusicCNN, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU()
        )
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU()
        )
        
        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=64*20*431, out_features=10),
            nn.ReLU()
        )

        self.fc2 = nn.Linear(in_features=10, out_features=2)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        #print("Before")
        #print(out.shape)
        out = out.view(out.size(0), -1)
        #print("After")
        #print(out.shape)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [29]:
class MusicCNN(nn.Module):   
    def __init__(self):
        super(MusicCNN, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, padding=1),
            resnet50_model
        )
        
        self.fc1 = nn.Linear(in_features=1000, out_features=2)
        
    def forward(self, x):
        out = self.layer1(x)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        return out

In [30]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device", device)
print("batch_size:",batch)
print("number of epochs:",num_epochs)
print("learning rate:",learning_rate)
print("optimizer:","Adam")
print("Loss function","Cross Entropy Loss")
model = MusicCNN()
model.to(device)
# model.fc1.register_forward_hook(get_activation('fc1'))
# model.fc2.register_forward_hook(get_activation('fc2'))
# model.layer4.register_forward_hook(get_activation('layer4'))

error = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)


for epoch in range(num_epochs):
    for images, labels in trainloader:
        # Transfering images and labels to GPU if available
        images, labels = images.to(device), labels.to(device)
        #print(labels)
    
        train = images.view(images.shape[0], 1, x_dim, y_dim)
        
        # Forward pass 
        outputs = model(train)
        loss = error(outputs, labels)
        
        # Initializing a gradient as 0 so there is no mixing of gradient among the batches
        optimizer.zero_grad()
        
        #Propagating the error backward
        loss.backward()
        
        # Optimizing the parameters
        optimizer.step()
        
    
    print("Epoch: {}, Loss: {:.7f}".format(epoch, loss.data))

Device mps
batch_size: 100
number of epochs: 3
learning rate: 0.0001
optimizer: Adam
Loss function Cross Entropy Loss
MusicCNN(
  (layer1): Sequential(
    (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Inception3(
      (Conv2d_1a_3x3): BasicConv2d(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (Conv2d_2a_3x3): BasicConv2d(
        (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (Conv2d_2b_3x3): BasicConv2d(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False

RuntimeError: Given input size: (192x1x104). Calculated output size: (192x0x51). Output size is too small

In [16]:
class_correct = [0. for _ in range(num_classes)]
total_correct = [0. for _ in range(num_classes)]

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        test = images.view(images.shape[0], 1, x_dim, y_dim)
        outputs = model(test)
        predicted = torch.max(outputs, 1)[1]
        c = (predicted == labels).squeeze()
        
        for i in range(images.shape[0]):
            label = labels[i]
            class_correct[label] += c[i].item()
            total_correct[label] += 1


print("Test Set Accuracy")      
for i in range(num_classes):
    print("Accuracy of {}: {:.2f}%".format(output_label(i), class_correct[i] * 100 / total_correct[i]))

Test Set Accuracy
Accuracy of Not Prog Rock: 0.00%
Accuracy of Prog rock: 100.00%


In [11]:
np.unique(y_train,return_counts=True)

(array([0, 1]), array([1514, 2469]))

In [12]:
np.unique(y_test,return_counts=True)

(array([0, 1]), array([1581, 2402]))

In [13]:
predicted

tensor([0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1], device='mps:0')

In [14]:
class_correct = [0. for _ in range(num_classes)]
total_correct = [0. for _ in range(num_classes)]

with torch.no_grad():
    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)
        test = images.view(images.shape[0], 1, x_dim, y_dim)
        outputs = model(test)
        predicted = torch.max(outputs, 1)[1]
        c = (predicted == labels).squeeze()
        
        for i in range(images.shape[0]):
            label = labels[i]
            class_correct[label] += c[i].item()
            total_correct[label] += 1


print("Train Set Accuracy")      
for i in range(num_classes):
    print("Accuracy of {}: {:.2f}%".format(output_label(i), class_correct[i] * 100 / total_correct[i]))

Train Set Accuracy
Accuracy of Not Prog Rock: 97.82%
Accuracy of Prog rock: 98.83%


# predictions on new test set

In [15]:
test_data = np.load('test_mfcc.npz')['arr_0']
test_y = np.load('test_labels.npy')

In [16]:
tensor_X_test = torch.Tensor(test_data) 
tensor_y_test = torch.Tensor(test_y)
tensor_y_test=tensor_y_test.long()


newtestdataset = TensorDataset(tensor_X_test,tensor_y_test)
newtestloader = DataLoader(newtestdataset,batch_size=batch)

In [17]:
class_correct = [0. for _ in range(num_classes)]
total_correct = [0. for _ in range(num_classes)]

with torch.no_grad():
    for images, labels in newtestloader:
        images, labels = images.to(device), labels.to(device)
        test = images.view(images.shape[0], 1, x_dim, y_dim)
        outputs = model(test)
        predicted = torch.max(outputs, 1)[1]
        c = (predicted == labels).squeeze()
        
        for i in range(images.shape[0]):
            label = labels[i]
            class_correct[label] += c[i].item()
            total_correct[label] += 1


print("New Test Set Accuracy")      
for i in range(num_classes):
    print("Accuracy of {}: {:.2f}%".format(output_label(i), class_correct[i] * 100 / total_correct[i]))

New Test Set Accuracy
Accuracy of Not Prog Rock: 40.75%
Accuracy of Prog rock: 73.47%


In [18]:
class_correct

[251.0, 1030.0]

In [19]:
total_correct

[616.0, 1402.0]

In [20]:
(282+970)/(616+1402)

0.6204162537165511