In [2]:
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn. functional as F
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import numpy as np
from transformers import AutoProcessor, AutoModel

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
model = AutoModel.from_pretrained("Harveenchadha/wav2vec2-pretrained-clsril-23-10k")

Some weights of the model checkpoint at Harveenchadha/wav2vec2-pretrained-clsril-23-10k were not used when initializing Wav2Vec2Model: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at Harveenchadha/wav2vec2-pretrained-clsril-23-10k and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model 

In [5]:
wav, sr = torchaudio.load("/home/ubuntu/Daniyal/work/data/Prima/SC_audio_Kannada/Abuse_0.wav")
inp = processor(wav.squeeze(0),sampling_rate=sr,return_tensors="pt",return_attention_mask=True,padding="max_length",max_length=16000*30, truncation= True)

In [6]:
inp

{'input_values': tensor([[0.0017, 0.0017, 0.0017,  ..., 0.0000, 0.0000, 0.0000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)}

In [6]:
from transformers import Wav2Vec2Model,Wav2Vec2FeatureExtractor

In [7]:
with torch.no_grad():
    out = model(**inp)

In [8]:
out.last_hidden_state.shape

torch.Size([1, 1499, 768])

In [9]:
class AudioDataset (Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd. read_csv (csv_file)
        self.root_dir = root_dir
        self.transform = transform
        
    def __len__(self) :
        return len(self.annotations)

    def __getitem__(self, index) :

        # 16000 * 30
        file_path = os.path.join(self. root_dir, self.annotations. iloc[index, 0])
        waveform, sr = torchaudio.load(file_path)


        #waveform= truncate_or_pad_waveform(waveform, target_length=100000)
        
        if self .annotations. iloc [index, 1]=='Yes':
            y_label = torch. tensor(1)
        else:
            y_label = torch. tensor (0)
        
        if self.transform:
            waveform = self.transform(waveform)
        
        
        waveform= processor(waveform.squeeze(0),sampling_rate=sr,return_tensors="pt",return_attention_mask=True,padding="max_length",max_length=16000*30, truncation= True)
        #max_pool, _ = torch.max(waveform, dim=2)
        inp=waveform['input_values'].squeeze(0)
        att=waveform['attention_mask'].squeeze(0)

        t={'input_values': inp, 'attention_mask': att}

        return (t, y_label)

In [16]:
dataset = AudioDataset (csv_file = '/home/ubuntu/Daniyal/work/data/Kannada_train.csv', root_dir = "/home/ubuntu/Daniyal/work/data/Prima/SC_audio_Kannada/")

In [17]:
train_size = int (0.90 * len (dataset))
val_size = len (dataset) - train_size

In [18]:
train_set, val_set = torch.utils. data. random_split (dataset, [train_size, val_size])


train_loader = DataLoader (dataset=train_set, batch_size=4, shuffle=True)
val_loader = DataLoader (dataset=val_set, batch_size=4, shuffle=False)

In [19]:
len (train_loader), len(val_loader)

(185, 21)

In [21]:
for au, l in train_loader:
    print(au, l)
    break

{'input_values': tensor([[0.0032, 0.0032, 0.0032,  ..., 0.0000, 0.0000, 0.0000],
        [0.0020, 0.0020, 0.0020,  ..., 0.0000, 0.0000, 0.0000],
        [0.0037, 0.0037, 0.0037,  ..., 0.0000, 0.0000, 0.0000],
        [0.0004, 0.0004, 0.0004,  ..., 0.0000, 0.0000, 0.0000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)} tensor([0, 1, 0, 1])


In [22]:
au['input_values']

tensor([[0.0032, 0.0032, 0.0032,  ..., 0.0000, 0.0000, 0.0000],
        [0.0020, 0.0020, 0.0020,  ..., 0.0000, 0.0000, 0.0000],
        [0.0037, 0.0037, 0.0037,  ..., 0.0000, 0.0000, 0.0000],
        [0.0004, 0.0004, 0.0004,  ..., 0.0000, 0.0000, 0.0000]])

In [35]:
for au, l in train_loader:
    print(model(**au)['last_hidden_state'].shape, l)
    break

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [23]:
temp=torch.rand(1499, 768)
temp

tensor([[0.8438, 0.6494, 0.6316,  ..., 0.2321, 0.3686, 0.4064],
        [0.2750, 0.1236, 0.5593,  ..., 0.1715, 0.1127, 0.3205],
        [0.9500, 0.0806, 0.3946,  ..., 0.7015, 0.0449, 0.1992],
        ...,
        [0.0893, 0.3139, 0.0862,  ..., 0.4433, 0.9469, 0.5088],
        [0.5294, 0.6102, 0.1989,  ..., 0.8358, 0.3585, 0.7121],
        [0.5711, 0.0517, 0.8753,  ..., 0.9776, 0.2903, 0.2674]])

In [24]:
max_pool, _ = torch.max(temp, dim=0)
max_pool.shape

torch.Size([768])

In [25]:
#Sequential model
class NN(nn.Module): #inherit n module
    def __init__(self) :
        super(NN, self).__init__()
        self.feature_vector=model
        self.fc1 = nn. Linear (768, 256)
        self.d1= nn. Dropout (p=0.2)
        self. fc2 = nn. Linear (256, 128)
        self.d2= nn. Dropout (p=0.3)
        self. fc3 = nn. Linear (128, 2)

    def forward (self, inp, att):
        x= model(input_values=inp, attention_mask= att)
        x, _ = torch.max(x['last_hidden_state'], dim=1)
        x = self.d1(F.gelu(self.fc1(x) ) )
        x = self.d2(F.gelu(self. fc2(x) ) )
        x=  self. fc3(x)
        return x

In [26]:
linear_model= NN().to(device)
print(linear_model)

NN(
  (feature_vector): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2

In [27]:
learning_rate = 0.0001
num_epochs = 5

In [1]:
criterion = nn.CrossEntropyLoss ()
optimizer = optim.Adam(linear_model.parameters (), lr=learning_rate, betas=(0.9, 0.999), eps=1e-07)

NameError: name 'nn' is not defined

In [29]:
import torchmetrics as tm
precision=tm.classification.BinaryPrecision().to(device)
recall=tm.classification.BinaryRecall().to(device)
accuracy=tm.classification.BinaryAccuracy().to(device)

In [30]:

for epoch in range(num_epochs):

  print(f"epoch: {epoch+1}/{num_epochs}")
  
  pred=torch.tensor([]).to(device=device)
  label=torch.tensor([]).to(device=device)
  running_loss = 0.0


  for batch_idx, (data, targets) in enumerate(train_loader):

    inp = data['input_values'].to(device=device)
    att= data['attention_mask'].to(device=device)
    targets = targets.to(device=device)

    #data = data.reshape (data.shape [0], -1)

    scores = linear_model(inp= inp, att= att)
    loss = criterion(scores, targets) 

    optimizer.zero_grad()
    loss.backward()
    optimizer .step()

    running_loss += loss.item()
    _, prediction = torch.max(scores, 1)

    pred=torch.cat([pred, prediction])
    label=torch.cat([label, targets])


  num_batches = len(train_loader)
  avg_loss = running_loss / num_batches
  

  print(f'Training: Epoch: {epoch}, loss: {avg_loss:.2f}, train_accuracy: {accuracy(pred, label):.2f}, train_precision: {precision(pred, label): .2f}, train_recall: {recall(pred, label): .2f}, ')

  
  val_pred=torch.tensor([]).to(device=device)
  val_label=torch.tensor([]).to(device=device)

  linear_model.eval()
  with torch.no_grad(): 
    for batch_idx, (data, targets) in enumerate(val_loader):

      inp = data['input_values'].to(device=device)
      att= data['attention_mask'].to(device=device)
      targets = targets.to(device=device)

      #data = data.reshape (data.shape [0], -1)

      scores = linear_model(inp= inp, att= att)

      _, prediction = torch.max(scores, 1)

      val_pred=torch.cat([val_pred, prediction])
      val_label=torch.cat([val_label, targets])

    print(f'val_accuracy: {accuracy(val_pred, val_label):.2f}, val_precision: {precision(val_pred, val_label): .2f}, val_recall: {recall(val_pred, val_label): .2f}, lr: {optimizer.param_groups[0]['lr']}')
  linear_model.train()



  #   if batch_idx%10==0:
  #     print(f"batch: {batch_idx+1} loss: {loss.item()}")

  # print(f"epoch-{epoch+1} loss: {loss.item()} ")

epoch: 1/5
Training: Epoch: 0, loss: 0.66, train_accuracy: 0.65, train_precision:  0.50, train_recall:  0.00,
val_accuracy: 0.66, val_precision:  0.00, val_recall:  0.00,
epoch: 2/5
Training: Epoch: 1, loss: 0.66, train_accuracy: 0.64, train_precision:  0.39, train_recall:  0.05,
val_accuracy: 0.66, val_precision:  0.00, val_recall:  0.00,
epoch: 3/5
Training: Epoch: 2, loss: 0.65, train_accuracy: 0.65, train_precision:  0.00, train_recall:  0.00,
val_accuracy: 0.66, val_precision:  0.00, val_recall:  0.00,
epoch: 4/5
Training: Epoch: 3, loss: 0.65, train_accuracy: 0.65, train_precision:  0.00, train_recall:  0.00,
val_accuracy: 0.66, val_precision:  0.00, val_recall:  0.00,
epoch: 5/5
Training: Epoch: 4, loss: 0.65, train_accuracy: 0.65, train_precision:  0.00, train_recall:  0.00,
val_accuracy: 0.66, val_precision:  0.00, val_recall:  0.00,


### Rough

In [124]:
import torchmetrics as tm
from tqdm import tqdm

In [126]:
correct_prediction = 0
total_prediction = 0
running_loss = 0.0

pred=torch.tensor([]).to(device=device)
label=torch.tensor([]).to(device=device)

for batch_idx, (data, targets) in enumerate(tqdm(train_loader, desc="Training")):

    inp = data['input_values'].to(device=device)
    att= data['attention_mask'].to(device=device)
    targets = targets.to(device=device)

    #data = data.reshape (data.shape [0], -1)

    scores = linear_model(inp= inp, att= att)
    loss = criterion(scores, targets) 

    optimizer.zero_grad()
    loss.backward()
    optimizer .step()

    running_loss += loss.item()
    _, prediction = torch.max(scores, 1)
    print('score',scores)
    print('prediction',prediction)
    print('targets',targets)

    pred=torch.cat([pred, prediction])
    label=torch.cat([label, targets])

    print('pred',pred)
    print('label',label)



    if batch_idx==5:
        break
  


Training:   1%|          | 1/185 [00:01<05:50,  1.91s/it]

score tensor([[ 0.3708, -0.1830],
        [ 0.3392, -0.0823],
        [ 0.3177, -0.1461],
        [ 0.3663, -0.0777]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([1, 1, 0, 0], device='cuda:0')
pred tensor([0., 0., 0., 0.], device='cuda:0')
label tensor([1., 1., 0., 0.], device='cuda:0')


Training:   1%|          | 2/185 [00:03<05:56,  1.95s/it]

score tensor([[ 0.3152, -0.0327],
        [ 0.4366, -0.1371],
        [ 0.4676, -0.1171],
        [ 0.4437, -0.1788]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([0, 1, 1, 0], device='cuda:0')
pred tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
label tensor([1., 1., 0., 0., 0., 1., 1., 0.], device='cuda:0')


Training:   2%|▏         | 3/185 [00:05<05:57,  1.97s/it]

score tensor([[ 0.2851, -0.1009],
        [ 0.4276, -0.1340],
        [ 0.2518, -0.0738],
        [ 0.3417, -0.1436]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([0, 0, 1, 0], device='cuda:0')
pred tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')
label tensor([1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0.], device='cuda:0')


Training:   2%|▏         | 4/185 [00:07<05:48,  1.92s/it]

score tensor([[ 0.4292, -0.2103],
        [ 0.4190, -0.1178],
        [ 0.4714, -0.2155],
        [ 0.4134, -0.1631]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([0, 0, 0, 0], device='cuda:0')
pred tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
label tensor([1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       device='cuda:0')


Training:   3%|▎         | 5/185 [00:09<05:51,  1.95s/it]

score tensor([[ 0.5053, -0.1908],
        [ 0.3894, -0.1592],
        [ 0.3593, -0.0076],
        [ 0.4165, -0.2148]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([1, 0, 0, 1], device='cuda:0')
pred tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
label tensor([1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
        0., 1.], device='cuda:0')


Training:   3%|▎         | 5/185 [00:11<06:53,  2.30s/it]

score tensor([[ 0.4111, -0.1281],
        [ 0.3914, -0.1026],
        [ 0.3718, -0.2169],
        [ 0.4698, -0.2068]], device='cuda:0', grad_fn=<AddmmBackward0>)
prediction tensor([0, 0, 0, 0], device='cuda:0')
targets tensor([0, 1, 0, 0], device='cuda:0')
pred tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
label tensor([1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 1., 0., 0.], device='cuda:0')





In [111]:
precision=tm.classification.BinaryPrecision().to(device)
recall=tm.classification.BinaryRecall().to(device)
accuracy=tm.classification.BinaryAccuracy().to(device)

In [112]:
precision(pred, label)

tensor(0.5000, device='cuda:0')

In [113]:
recall(pred, label)

tensor(1., device='cuda:0')

In [114]:
accuracy(pred, label)

tensor(0.5000, device='cuda:0')

In [115]:
label

tensor([0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1.,
        0., 0., 0., 0., 1., 1.], device='cuda:0')

In [116]:
scores

tensor([[ 0.0084,  0.0834],
        [-0.0085,  0.0926],
        [-0.0145,  0.0973],
        [ 0.0091,  0.1066]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [117]:
pred

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.], device='cuda:0')