In [80]:
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [13]:
list_of_files = 100

In [71]:
data_train = np.zeros(
            (list_of_files, 217, 174), dtype=np.float64
        )
labels = np.zeroes(
    (list_of_files,3), dtype = np.float64

)

In [33]:
data_train.shape

(100, 217, 174)

In [70]:
del data_train

In [18]:
data_directory = 'music_3080_parsed_11.mp3'

In [72]:
def load_features_into_data(waveform,data,labels,file_number,file_name):
    mfcc = librosa.feature.mfcc(waveform,sr = 44100, n_mfcc = 15)
    mel = librosa.feature.melspectrogram(waveform, sr = 44100)
    cens = librosa.feature.chroma_cens(waveform, sr = 44100, n_chroma = 15)
    ton = librosa.feature.tonnetz(waveform,sr = 44100, chroma = cens)
    cqt = librosa.feature.chroma_cqt(waveform,sr = 44100, n_chroma = 5)
    stft = librosa.feature.chroma_stft(waveform, sr = 44100, n_chroma = 5)
    data[file_number,:,0:15] = mfcc.T
    data[file_number,:,15:143] = mel.T
    data[file_number,:,143:158] = cens.T
    data[file_number,:,158:164] = ton.T
    data[file_number,:,164:169] = cqt.T
    data[file_number,:,169:174] = stft.T
    label = file_name.split('_')[0]
    for i in range(3):
        labels[file_number][i] = label[i]
    return data,labels
    
    
    
    

In [73]:
f_n = 0


In [74]:
f = librosa.load(data_directory)

In [75]:
d = load_features_into_data(f[0],data_train,0)

In [40]:
data_train[0].shape

(217, 174)

In [51]:
data_train[0][50]

array([-40.32453537, 104.41842651,  -6.20430183,  41.90998077,
       -25.0476799 , -13.0345459 ,  24.00344086,   4.51055908,
        19.46505737,  17.90101814,  11.11997414,  16.29535675,
       -14.38159561,   9.679636  ,  10.76276207,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.  

In [77]:
d[0][216]

array([-3.28942528e+01,  9.63850174e+01,  1.86830978e+01,  3.96652770e+00,
        2.30371933e+01,  8.59519672e+00,  1.28909550e+01,  1.58068943e+01,
       -1.22594681e+01,  4.18005228e-01,  8.16409492e+00,  2.60835037e+01,
        2.60919094e+00,  5.23773670e+00, -2.76716089e+00,  1.34363266e+02,
        1.82679840e+02,  3.51267761e+02,  2.43906372e+02,  8.57272110e+01,
        5.25289726e+01,  1.56688719e+01,  1.20454493e+01,  5.75646305e+00,
        1.13034592e+01,  5.87525558e+00,  6.01613760e+00,  1.47809803e+00,
        7.61048841e+00,  4.58567619e+00,  1.68514729e+01,  5.26687670e+00,
        9.90575695e+00,  3.93803430e+00,  7.45014286e+00,  2.33907771e+00,
        6.19673967e+00,  4.86912346e+00,  8.71595669e+00,  4.98408222e+00,
        3.34902859e+00,  1.13442385e+00,  1.49114823e+00,  3.28647107e-01,
        1.38497150e+00,  9.20922816e-01,  3.76273298e+00,  9.49288666e-01,
        9.99694228e-01,  1.43198645e+00,  1.05583191e+00,  6.21294677e-01,
        1.20141017e+00,  

In [81]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=3, num_layers=2):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # setup LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # setup output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        return (
            torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
            torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
        )

    def forward(self, input):
        # lstm step => then ONLY take the sequence's final timetep to pass into the linear/dense layer
        # Note: lstm_out contains outputs for every step of the sequence we are looping over (for BPTT)
        # but we just need the output of the last step of the sequence, aka lstm_out[-1]
        lstm_out, hidden = self.lstm(input)
        logits = self.linear(lstm_out[-1])
        genre_scores = F.log_softmax(logits, dim=1)
        return genre_scores

    def get_accuracy(self, logits, target):
        """ compute accuracy for training round """
        corrects = (
            torch.max(logits, 1)[1].view(target.size()).data == target.data
        ).sum()
        accuracy = 100.0 * corrects / self.batch_size
        return accuracy.item()

In [84]:
batch_size = 35  # num of training examples per minibatch
num_epochs = 400

# Define model
print("Build LSTM RNN model ...")
model = LSTM(
    input_dim=174, hidden_dim=256, batch_size=batch_size, output_dim=3, num_layers=2
)
loss_function = nn.NLLLoss()  # expects ouputs from LogSoftmax

optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("\nTraining on GPU")
else:
    print("\nNo GPU, training on CPU")

Build LSTM RNN model ...

Training on GPU


In [85]:
model

LSTM(
  (lstm): LSTM(174, 256, num_layers=2)
  (linear): Linear(in_features=256, out_features=3, bias=True)
)

In [None]:
# Read Whole Directory with all labels
#For each file, add it into data and labels array
# Split into train, test
# Convert them into torch tensors.
# Run for each epoch, for each batch, the forward and backward pass and calculate loss and accuracy.
