In [31]:
from screamdataset import ScreamDataset
import torch 
import torchaudio
from torch import nn
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from cnn import CNNNetwork
import numpy as np
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score

import plotly.express as px

TRAIN_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-train.csv'
TEST_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-test.csv'
VALID_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-validation.csv'

TRAIN_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/train'
TEST_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/test'
VALID_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/validation'

BATCH_SIZE = 1024
SAMPLE_RATE=44100
EPOCHS = 5
LEARNING_RATE = 0.001

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = 44100,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

def create_data_loader(train_data,batch_size=None):
    dataloader = DataLoader(train_data,batch_size=batch_size)
    return dataloader

def train_one_epoch(model, train_data_loader,test_data,loss_function,optimiser,device):
    for inputs,targets in train_data_loader:
        model.train()
        inputs,targets = inputs.to(device),targets.to(device)

        # Calculate Loss
        predictions = model(inputs)
        loss = loss_function(predictions,targets)

        # Backpropagate Loss, update weights
        optimiser.zero_grad()
        loss.backward() # Apply backpropagation
        optimiser.step() # Update weights

        #EVAL
        class_mapping=[
                    'no_vocals',
                    'midfry',
                    'clean',
                    'highfry',
                    'lowfry',
                    'layered'
                ]
        model.eval()
        with torch.no_grad():
            predictions=[]
            expectation=[]
            for i in range(len(test_data)):
                inputs,targets=test_data[i]
                inputs.unsqueeze_(0)
                prediction = model(inputs)
                predicted_index = prediction[0].argmax(0) #Find the predicted class with highest probability
                predicted = class_mapping[predicted_index]
                expected = class_mapping[targets]
                predictions.append(predicted)
                expectation.append(expected)
            accuracy = accuracy_score(predictions,expectation)

            macro_accuracy = precision_score(predictions,expectation,average='macro')

    print(f"Loss : {loss.item()}")
    print(f"Accuracy: {accuracy}")
    print(f"Macro Accuracy: {macro_accuracy}")
    return loss.item(),accuracy_score,macro_accuracy

def train(model, train_data_loader,test_data,validation_data,loss_function, optimiser, device, epochs):
    losses=[]
    epoch=[]
    accuracies=[]
    macro_accuracies=[]
    for i in range(epochs):
        epoch.append(i)
        print(f"Epoch {i+1}:")
        loss,accuracy,macro_accuracy = train_one_epoch(model, train_data_loader,test_data, loss_function, optimiser, device)
        losses.append(loss)
        accuracies.append(accuracy)
        macro_accuracies.append(macro_accuracy)
        print("-------------------------------------------------------")
    print("Training done")
    #Find validation loss
    model.eval()
    class_mapping=[
                    'no_vocals',
                    'midfry',
                    'clean',
                    'highfry',
                    'lowfry',
                    'layered'
                ]
    with torch.no_grad():
        predictions=[]
        expectation=[]
        for i in range(len(validation_data)):
            inputs,targets=validation_data[i]
            inputs = inputs.to(device)
            inputs.unsqueeze_(0)
            prediction = model(inputs)
            predicted_index = prediction[0].argmax(0) #Find the predicted class with highest probability
            predicted = class_mapping[predicted_index]
            expected = class_mapping[targets]
            predictions.append(predicted)
            expectation.append(expected)
        accuracy = accuracy_score(predictions,expectation)

        macro_accuracy = precision_score(predictions,expectation,average='macro')
    print(f"Accuracy Score : {accuracy}")
    print(f"Macro Accuracy : {macro_accuracy}")
    return losses,accuracies,macro_accuracies,epoch

In [32]:
if __name__ == '__main__':
    from sklearn.metrics import confusion_matrix,accuracy_score, precision_score
    #import sys
    #sys.setrecursionlimit(10000)
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

    print(f"Using device: {DEVICE}")

    

    #instantiating dataset object and transform
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )

    sd_train = ScreamDataset(TRAIN_ANNOTATIONS_FILE, TRAIN_AUDIO_DIR, mel_spectrogram, DEVICE)
    train_dataloader= create_data_loader(sd_train,BATCH_SIZE)

    sd_test = ScreamDataset(TEST_ANNOTATIONS_FILE, TEST_AUDIO_DIR, mel_spectrogram, DEVICE)
    # test_dataloader= create_data_loader(sd_test,BATCH_SIZE)
    # Loading entire dataset into lists
    test_inputs=[]
    for i in range(len(sd_test)):
        test_inputs.append(sd_test[i])

    sd_valid = ScreamDataset(VALID_ANNOTATIONS_FILE, VALID_AUDIO_DIR, mel_spectrogram, DEVICE)
    # valid_dataloader= create_data_loader(sd_valid,BATCH_SIZE)
    valid_inputs=[]
    for i in range(len(sd_valid)):
        valid_inputs.append(sd_valid[i])

    # # CREATE TRAIN-VALIDATION SPLIT
    # validation_split = .2
    # shuffle_dataset = True
    # random_seed= 42

    # # Creating data indices for training and validation splits:
    # dataset_size = len(sd)
    # indices = list(range(dataset_size))
    # split = int(np.floor(validation_split * dataset_size))
    # if shuffle_dataset :
    #     np.random.seed(random_seed)
    #     np.random.shuffle(indices)
    # train_indices, val_indices = indices[split:], indices[:split]

    # # Creating PT data samplers and loaders:
    # train_sampler = SubsetRandomSampler(train_indices)
    # valid_sampler = SubsetRandomSampler(val_indices)

    # train_loader = torch.utils.data.DataLoader(sd, batch_size=BATCH_SIZE, 
    #                                         sampler=train_sampler)
    # validation_loader = torch.utils.data.DataLoader(sd, batch_size=BATCH_SIZE,
    #                                                 sampler=valid_sampler)

    cnn = CNNNetwork().to(DEVICE)
    # Instantiating loss function and optimiser
    loss_function = nn.CrossEntropyLoss()
    optimiser=torch.optim.Adam(cnn.parameters(),
                            lr=LEARNING_RATE
                                )

    #Train Model (model, train_data_loader,test_data_loader,validation_data_loader,loss_function, optimiser, device, epochs)
    losses,accuracies,macro_accuracies,epoch = train(cnn,train_dataloader, test_inputs, valid_inputs, loss_function, optimiser, DEVICE, EPOCHS)

    #Save results
    torch.save(cnn.state_dict(),"/home/vedant/projects/ScreamDetection/CNN/trained_models/scream_cnn_crossentropy_adam.pth")

    print("Model trained and stored at /CNN/trained_models/scream_cnn_crossentropy_adam.pth")
    fig = px.line(x=epoch,y=losses)
    fig.add(px.line(x=epoch,y=accuracies))
    fig.add(px.line(x=epoch,y=macro_accuracies))
    fig.show()

Using device: cuda
Epoch 1:


IndexError: list index out of range

# Full Code

In [59]:
from screamdataset import ScreamDataset
import torch 
import torchaudio
from torch import nn
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
#from cnn2 import CNNNetwork
from cnn2 import CNNNetwork
import numpy as np
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score
import pandas as pd
import plotly.express as px

TRAIN_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-train.csv'
TEST_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-test.csv'
VALID_ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/resources/dataset/pytorch-dataset-validation.csv'

TRAIN_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/train'
TEST_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/test'
VALID_AUDIO_DIR = '/home/vedant/projects/ScreamDetection/resources/dataset/blocked_audio/validation'

BATCH_SIZE = 128
SAMPLE_RATE = 44100
EPOCHS = 10
LEARNING_RATE = 0.01

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

def create_data_loader(train_data,batch_size=None,shuffle=True):
    dataloader = DataLoader(train_data,batch_size=batch_size,shuffle=shuffle)
    return dataloader

def train_one_epoch(model, train_data_loader,loss_function,optimiser,device):
    for inputs,targets in train_data_loader:
        # model.train()
        inputs,targets = inputs.to(device),targets.to(device)

        # Calculate Loss
        predictions = model(inputs)
        loss = loss_function(predictions,targets)
        #print(predictions)
        # Backpropagate Loss, update weights
        s=0
        for param in cnn.parameters():
            s=param.view(-1)[0]
            break
        print(f"first weight (before): {s}")
        optimiser.zero_grad()
        loss.backward() # Apply backpropagation
        optimiser.step() # Update weights
        s=0
        for param in cnn.parameters():
            s=param.view(-1)[0]
            break
        print(f"first weight (after): {s}")
        #print([prediction.argmax[0] for prediction in predictions])

        #TRAINING ACCURACY
        class_mapping=[
                    'no_vocals',
                    'midfry',
                    'clean',
                    'highfry',
                    'lowfry',
                    'layered'
                ]
        pred=[]
        for prediction in predictions.detach().cpu():
            p = prediction[0].argmax().item()
            pred.append(p)
        accuracy = accuracy_score(pred,targets.detach().cpu())
        #macro_accuracy = precision_score(pred,targets.detach().cpu(),average='macro')

    print(f"Loss : {loss.item()}")
    print(f"Accuracy: {accuracy}")
    #print(f"Macro Accuracy: {macro_accuracy}")
    return loss.item(),accuracy#,macro_accuracy

def train(model, train_data_loader,loss_function, optimiser, device, epochs):
    # print(f"Initial weights: {}")
    losses=[]
    epoch=[]
    accuracies=[]
    macro_accuracies=[]
    for i in range(epochs):
        # print(f"Weights before epoch {i} : {}")
        epoch.append(i)
        print(f"Epoch {i+1}:")
        # loss,accuracy,macro_accuracy = train_one_epoch(model, train_data_loader,test_data, loss_function, optimiser, device)
        loss,accuracy = train_one_epoch(model, train_data_loader,loss_function, optimiser, device)
        # print(f"Weights after epoch {i} : {}")
        losses.append(loss)
        accuracies.append(accuracy)
        #macro_accuracies.append(macro_accuracy)
        print("-------------------------------------------------------")
    print("Training done")
    return losses,accuracies,epoch#macro_accuracies,epoch


if __name__ == '__main__':
    from sklearn.metrics import confusion_matrix,accuracy_score, precision_score
    #import sys
    #sys.setrecursionlimit(10000)
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

    print(f"Using device: {DEVICE}")

    #instantiating dataset object and transform
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )

    sd_train = ScreamDataset(TRAIN_ANNOTATIONS_FILE, TRAIN_AUDIO_DIR, mel_spectrogram, DEVICE)
    train_dataloader= create_data_loader(sd_train,BATCH_SIZE,shuffle=True)

    sd_test = ScreamDataset(TEST_ANNOTATIONS_FILE, TEST_AUDIO_DIR, mel_spectrogram, DEVICE)
    # test_dataloader= create_data_loader(sd_test,BATCH_SIZE)
    # Loading entire dataset into lists
    test_inputs=[]
    for i in range(len(sd_test)):
        test_inputs.append(sd_test[i])

    sd_valid = ScreamDataset(VALID_ANNOTATIONS_FILE, VALID_AUDIO_DIR, mel_spectrogram, DEVICE)
    # valid_dataloader= create_data_loader(sd_valid,BATCH_SIZE)
    valid_inputs=[]
    for i in range(len(sd_valid)):
        valid_inputs.append(sd_valid[i])

    cnn = CNNNetwork().to(DEVICE)
    # Instantiating loss function and optimiser
    loss_function = nn.CrossEntropyLoss()
    optimiser=torch.optim.Adadelta(cnn.parameters(),
                            lr=LEARNING_RATE
                                )

    #Train Model (model, train_data_loader,test_data_loader,validation_data_loader,loss_function, optimiser, device, epochs)
    #losses,accuracies,macro_accuracies,epoch = train(cnn,train_dataloader, test_inputs, valid_inputs, loss_function, optimiser, DEVICE, EPOCHS)
    losses,accuracies,epoch = train(cnn,train_dataloader, loss_function, optimiser, DEVICE, EPOCHS)

    #Save results
    torch.save(cnn.state_dict(),"/home/vedant/projects/ScreamDetection/CNN/trained_models/scream_cnn_crossentropy_adam.pth")

    print("Model trained and stored at /CNN/trained_models/scream_cnn_crossentropy_adam.pth")
    fig = px.line(x=epoch,y=losses)
    fig.add_scatter(x=epoch, y=accuracies, mode='lines')
    fig.add_scatter(x=epoch, y=macro_accuracies, mode='lines')
    # # fig.add(px.line(x=epoch,y=accuracies))
    # # fig.add(px.line(x=epoch,y=macro_accuracies))
    fig.show()
    df=pd.DataFrame()
    df['epoch'] = epoch
    df['accuracy'] = accuracies
    #df['macro_accuracy'] = macro_accuracies

    df.to_csv('/home/vedant/projects/ScreamDetection/CNN/trained_models/training_results.csv',header=True,index=False,encoding='utf-8-sig')


At least one mel filterbank has all zero values. The value for `n_mels` (512) may be set too high. Or, the value for `n_freqs` (513) may be set too low.



Using device: cuda
Epoch 1:
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (before): -0.06481902301311493
first weight (after): -0.06481902301311493
first weight (b

In [46]:
s=0
for param in cnn.parameters():
    s+=param.view(-1).sum().item()
s

-4.482299838215113

In [52]:
s=0
for param in cnn.parameters():
    s=param.view(-1)[0]
    break
s

s=param.view(-1)[0]
s.item()

-0.01240419689565897

In [45]:
param.view(-1).sum().item()

-0.021897513419389725

In [19]:
for i in range(len(sd_train)): 
    if sd_train[i][1] != 2:
        print(sd_train[i][1])

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5


In [10]:
cnn = CNNNetwork().to(DEVICE)
device=DEVICE
model = cnn
sd_train = ScreamDataset(TRAIN_ANNOTATIONS_FILE, TRAIN_AUDIO_DIR, mel_spectrogram, DEVICE)
train_data_loader= create_data_loader(sd_train,BATCH_SIZE)
for inputs,targets in train_data_loader:
    model.train()
    inputs,targets = inputs.to(device),targets.to(device)

    # Calculate Loss
    predictions = model(inputs)
    loss = loss_function(predictions,targets)
    #print(predictions)
    # Backpropagate Loss, update weights
    optimiser.zero_grad()
    loss.backward() # Apply backpropagation
    optimiser.step() # Update weights
    #print([prediction.argmax[0] for prediction in predictions])
    class_mapping=[
                'no_vocals',
                'midfry',
                'clean',
                'highfry',
                'lowfry',
                'layered'
            ]
    pred=[]
    for prediction in predictions.detach().cpu():
        p = prediction[0].argmax().item()
        pred.append(p)
    #print(pred)
    accuracy = accuracy_score(pred,targets.detach().cpu())
    macro_accuracy = precision_score(pred,targets.detach().cpu(),average='macro')

print(f"Loss : {loss.item()}")
print(f"Accuracy: {accuracy}")
print(f"Macro Accuracy: {macro_accuracy}")

  _warn_prf(average, modifier, msg_start, len(result))


Loss : 2.0435922145843506
Accuracy: 1.0
Macro Accuracy: 1.0


In [8]:
prediction.argmax().item()

5

In [73]:
fig = px.line(x=epoch,y=losses)
fig.add_scatter(x=epoch, y=accuracies, mode='lines')
fig.add_scatter(x=epoch, y=macro_accuracies, mode='lines')
# fig.add(px.line(x=epoch,y=accuracies))
# fig.add(px.line(x=epoch,y=macro_accuracies))


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [48]:
sd_test = ScreamDataset(TEST_ANNOTATIONS_FILE, TEST_AUDIO_DIR, mel_spectrogram, DEVICE)
# test_dataloader= create_data_loader(sd_test,BATCH_SIZE)
# Loading entire dataset into lists
test_inputs=[]
for i in range(len(sd_test)):
    test_inputs.append(sd_test[i])

In [49]:
test_inputs[0][0].shape

torch.Size([1, 64, 87])

In [52]:
for i in range(len(test_inputs)):
    melspec,label = test_inputs[i]
    print(melspec.unsqueeze(0).shape)

torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
torch.Size([1, 1, 64, 87])
t

In [40]:
melspec.unsqueeze(0).shape

torch.Size([1, 1, 64, 87])