In [2]:
from scipy.io import wavfile
import numpy as np
import os

genres = ['blues','metal','country','hiphop','jazz','classical']

N=len(genres)

wavlist = []
labels = []

# blues=0,metal=1,country=2,hiphop=3
for i,genre in enumerate(genres):
    files = os.listdir('/content/drive/My Drive/dataset/genres/'+genre)
    for f in files:
        filename = '/content/drive/My Drive/dataset/genres/'+genre+'/'+f
        count,data = wavfile.read(filename)
        
        if i % 3 == 0:                  
            # Here, I am downsampling the data by a factor of 8, and keeping only the first 65536 features,
            # roughly 30 sec
            wavlist.append(data[:2**19:8]/2**15)
            labels.append(i)
       
y = np.array(labels)
X = np.vstack(wavlist)
print(X.shape)

(200, 65536)


In [3]:
from sklearn.model_selection import train_test_split

# Training/test set split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

X_train = X_train.reshape((*X_train.shape,1))
X_test = X_test.reshape((*X_test.shape,1))

print(X_train.shape,y_train.shape)

(160, 65536, 1) (160,)


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)


X_train = X_train.to(torch.float32)

#Pytorch expects channels first, so reshape
X_train = X_train.reshape(-1,1,2**16)
X_test = X_test.to(torch.float32)
X_test = X_test.reshape(-1,1,2**16)
y_train = y_train.to(torch.long)
y_test = y_test.to(torch.long)

device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")

X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)

from torch.utils.data import TensorDataset

training_data = TensorDataset(X_train,y_train)
test_data = TensorDataset(X_test,y_test)

batch_size = 32
train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                           batch_size=batch_size, 
                                           shuffle=True)

batch_size = 32
test_loader = torch.utils.data.DataLoader(dataset=test_data,
                                           batch_size=batch_size, 
                                           shuffle=False)

In [5]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

torch.Size([160, 1, 65536]) torch.Size([160])
torch.Size([40, 1, 65536]) torch.Size([40])


In [0]:
import torch.optim as optim

# class definition
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=6, num_layers=2):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # setup LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # setup output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        return (
            torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
            torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
        )

    def forward(self, input):
       
        lstm_out, hidden = self.lstm(input)
        logits = self.linear(lstm_out[-1])
        genre_scores = F.log_softmax(logits, dim=1)
        return genre_scores

In [0]:
# model = Net()
# model.to(device)

model = LSTM(input_dim=1, hidden_dim=100, batch_size=batch_size, output_dim=6, num_layers=2)
model.to(device)

criterion = nn.NLLLoss()  # expects ouputs from LogSoftmax
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# criterion = torch.nn.CrossEntropyLoss(reduction='mean')

# optimizer = torch.optim.Adam(model.parameters())

train_accs = []
test_accs = []

epochs = 50
# Loop over the data
for epoch in range(epochs):
    model.train()
    # Loop over each subset of data
    for d,t in train_loader:

        # Zero out the optimizer's gradient buffer
        optimizer.zero_grad()
        d = d.view((65536,-1,1))
        # Make a prediction based on the model
        outputs = model(d)
        
        # print("output_shape: ", outputs.shape)  
        # print("t: ", t.shape)

        # Compute the loss
        loss = criterion(outputs,t) 

        # Use backpropagation to compute the derivative of the loss with respect to the parameters
        loss.backward()
        
        # Use the derivative information to update the parameters
        optimizer.step()

    model.eval()
    # After every 10th epoch, compute the test set accuracy
    if epoch%10==0:
        total=0.
        correct=0.
        # Loop over all the test examples and accumulate the number of correct results in each batch
        for d,t in test_loader:
            
            d = d.view((65536,-1,1))
            outputs = model(d)
            _, predicted = torch.max(outputs.data,1)
            total += float(t.size(0))
            correct += float((predicted==t).sum())
        total_train = 0
        correct_train = 0
        for d,t in train_loader:

            d = d.view((65536,-1,1))

            outputs = model(d)
            _, predicted = torch.max(outputs.data,1)
            total_train += float(t.size(0))
            correct_train += float((predicted==t).sum())
        
        # Print the epoch, the training loss, and the test set accuracy.
        train_accs.append(100.*correct_train/total_train)
        test_accs.append(100.*correct/total)

        print(epoch,loss.item(),train_accs[-1],test_accs[-1])


model.eval()
X_test = X_test.view((65536,-1,1))
y_pred = np.argmax(model(X_test).cpu().detach(),axis=1)
y_test_cpu = y_test.cpu()



0 0.7012228965759277 51.875 42.5
10 0.6996627449989319 48.125 57.5
20 0.6780390739440918 48.125 57.5
30 0.7233601212501526 51.875 42.5
40 0.7125394344329834 51.875 42.5
