In [21]:
import torch
import torch.nn as nn
import torchvision
import torchaudio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm

In [3]:
# Mount Google Drive at ./content; other cells in this notebook read and write
# paths under ./content/MyDrive/.
# NOTE(review): this import sits outside the notebook's main import cell.
from google.colab import drive
drive.mount("./content")

Mounted at ./content


In [6]:
!mkdir ~/.kaggle
!cp ./content/MyDrive/MachineLearning/kaggle/kaggle.json ~/.kaggle

In [19]:
# Download Speech Commands into the working directory (if needed) and open two subsets.
# Note that `test_ds` is backed by the "validation" subset, not the dataset's "testing" subset.
train_ds, test_ds = (
    torchaudio.datasets.SPEECHCOMMANDS("./", download=True, subset=split_name)
    for split_name in ("training", "validation")
)

  0%|          | 0.00/2.26G [00:00<?, ?B/s]

In [27]:
# Collect the distinct labels (third field of every item) seen in `test_ds`.
# This iterates over the whole subset, so every item is loaded once.
commands_list = {label for _, _, label, _, _ in test_ds}

In [29]:
# Fix a deterministic label order, then map each label to its position in that order.
commands_list = sorted(commands_list)
commands_dict = dict(zip(commands_list, range(len(commands_list))))

In [46]:
# Sanity check: shape of one training waveform after resampling with new_freq=8000.
resample_to_8k = torchaudio.transforms.Resample(new_freq=8000)
resample_to_8k(train_ds[20][0]).shape

torch.Size([1, 8000])

In [22]:
# Device used by the training/evaluation helpers: first CUDA GPU when available, else CPU.
if torch.cuda.is_available():
  dev = torch.device("cuda:0")
else:
  dev = torch.device("cpu")

In [23]:
def evaluate(model, test_dl, crit, device=None):
  """Measure the loss and accuracy of ``model`` over ``test_dl`` without training it.

  Args:
    model: module called as ``model(x)``; it is switched to eval mode and left there.
    test_dl: sized iterable yielding ``(x, y)`` tensor batches.
    crit: callable ``crit(outputs, targets)`` returning a scalar loss tensor.
    device: device the batches are moved to. Defaults to the notebook-level ``dev``.

  Returns:
    ``(test_loss, test_acc)``: the sum of the per-batch losses divided by
    ``len(test_dl)``, and the percentage of samples whose argmax over dim 1
    equals the target.

  Raises:
    ZeroDivisionError: if ``test_dl`` yields no batches.
  """
  device = dev if device is None else device
  total = 0
  correct = 0
  tot_loss = 0
  model.eval()
  # Evaluation never calls backward(), so skip building the autograd graph.
  with torch.no_grad():
    for x, y in test_dl:
      x = x.to(device)
      y = y.to(device)
      o = model(x)
      l = crit(o, y)
      tot_loss += l.item()
      correct += (o.argmax(dim=1) == y).sum().item()
      total += y.size(0)
  test_loss = tot_loss / len(test_dl)
  test_acc = 100 * correct / total
  return test_loss, test_acc

def train_one_epoch(model, train_dl, crit, optim, device=None):
  """Run one optimisation pass over ``train_dl`` and report running metrics.

  Args:
    model: module called as ``model(x)``; it is switched to train mode and left there.
    train_dl: sized iterable yielding ``(x, y)`` tensor batches.
    crit: callable ``crit(outputs, targets)`` returning a scalar loss tensor.
    optim: optimizer; ``zero_grad()`` and ``step()`` are called once per batch.
    device: device the batches are moved to. Defaults to the notebook-level ``dev``.

  Returns:
    ``(train_loss, train_acc)``: the sum of the per-batch losses divided by
    ``len(train_dl)``, and the percentage of samples whose argmax over dim 1
    equals the target. Both are measured on the outputs produced before each
    batch's optimizer step.

  Raises:
    ZeroDivisionError: if ``train_dl`` yields no batches.
  """
  device = dev if device is None else device
  total = 0
  correct = 0
  tot_loss = 0
  model.train()
  for x, y in train_dl:
    optim.zero_grad()
    x = x.to(device)
    y = y.to(device)
    o = model(x)
    l = crit(o, y)
    l.backward()
    optim.step()
    tot_loss += l.item()
    correct += (o.argmax(dim=1) == y).sum().item()
    total += y.size(0)
  train_loss = tot_loss / len(train_dl)
  train_acc = 100 * correct / total
  return train_loss, train_acc

def train(model, train_dl, test_dl, crit, optim, lr_sched=None, epochs=10):
  """Train ``model`` for ``epochs`` epochs, printing metrics after each one.

  Each epoch runs ``train_one_epoch`` on ``train_dl``, then ``evaluate`` on
  ``test_dl``, then steps ``lr_sched`` once if one was given.

  Args:
    model: module to optimise.
    train_dl: batches passed to ``train_one_epoch``.
    test_dl: batches passed to ``evaluate``.
    crit: loss callable shared by both passes.
    optim: optimizer passed to ``train_one_epoch``.
    lr_sched: optional scheduler; ``step()`` is called once per epoch.
    epochs: number of epochs to run.

  Returns:
    None. Metrics are only printed.
  """
  # `tqdm.tqdm_notebook` is deprecated (the notebook output says so); import the
  # replacement here because a bare `import tqdm` does not load the submodule.
  from tqdm.notebook import tqdm as notebook_tqdm
  for epoch in notebook_tqdm(range(epochs)):
    train_loss, train_acc = train_one_epoch(model, train_dl, crit, optim)
    test_loss, test_acc = evaluate(model, test_dl, crit)
    if lr_sched is not None:
      lr_sched.step()
    print(f"epoch: {epoch}, train loss: {train_loss}, train accuracy: {train_acc:.2f}%, test loss: {test_loss}, test accuracy: {test_acc:.2f}%")

In [24]:
def make_simple_conv1d_block(in_planes, out_planes, kernel_size=3,stride=1,padding=0):
  """Build a Conv1d -> BatchNorm1d -> ReLU stack as an ``nn.Sequential``.

  The convolution has no bias term, and the ReLU operates in place.

  Args:
    in_planes: input channels of the convolution.
    out_planes: output channels of the convolution and features of the batch norm.
    kernel_size, stride, padding: forwarded to ``nn.Conv1d``.
  """
  conv = nn.Conv1d(in_planes, out_planes, kernel_size, stride=stride, padding=padding, bias=False)
  norm = nn.BatchNorm1d(out_planes)
  activation = nn.ReLU(inplace=True)
  return nn.Sequential(conv, norm, activation)

def make_simple_conv2d_block(in_planes, out_planes, kernel_size=3, stride=1, padding=0):
  """Build a Conv2d -> BatchNorm2d -> ReLU stack as an ``nn.Sequential``.

  The convolution has no bias term, and the ReLU operates in place.

  Args:
    in_planes: input channels of the convolution.
    out_planes: output channels of the convolution and features of the batch norm.
    kernel_size, stride, padding: forwarded to ``nn.Conv2d``.
  """
  conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride=stride, padding=padding, bias=False)
  norm = nn.BatchNorm2d(out_planes)
  activation = nn.ReLU(inplace=True)
  return nn.Sequential(conv, norm, activation)

# Conv1d on raw waveforms

In [49]:
class MyDataset(torch.utils.data.Dataset):
  """Wrap a raw audio dataset as fixed-length ``(waveform, class_index)`` pairs.

  Each item of ``raw_data`` is unpacked as a 5-tuple whose first field is a 2-D
  waveform tensor and whose third field is the label. The waveform is resampled,
  cropped to ``NUM_SAMPLES`` along its last axis and right-padded with zeros up
  to that length.

  Args:
    raw_data: sized, indexable dataset of 5-tuples as described above.
    label_to_index: optional mapping from label to class index. When omitted,
      the notebook-level ``commands_dict`` is looked up at item-access time.
  """

  # Target sample rate handed to the resampler.
  TARGET_RATE = 8000
  # Length (in samples) every returned waveform is cropped or zero-padded to.
  NUM_SAMPLES = 8000

  def __init__(self, raw_data, label_to_index=None):
    super().__init__()
    self.raw_data = raw_data
    self.label_to_index = label_to_index
    # NOTE(review): only new_freq is set, so the source rate is the transform's
    # default and the per-item sample rate from raw_data is ignored. Confirm
    # that every clip matches that default.
    self.transform = torchaudio.transforms.Resample(new_freq=self.TARGET_RATE)

  def __len__(self):
    return len(self.raw_data)

  def __getitem__(self, idx):
    x, _, label_, _, _ = self.raw_data[idx]
    x = self.transform(x)
    x = x[:, :self.NUM_SAMPLES]
    missing = self.NUM_SAMPLES - x.shape[-1]
    # Use the public torch.nn.functional API rather than the incidental
    # `torch.functional.F` attribute.
    x = torch.nn.functional.pad(x, (0, missing), mode="constant", value=0.0)
    mapping = commands_dict if self.label_to_index is None else self.label_to_index
    y = mapping[label_]
    return x, y

In [50]:
# Wrap both raw subsets so they yield fixed-length waveforms and integer labels.
train_ds_, test_ds_ = map(MyDataset, (train_ds, test_ds))

In [65]:
# Batches of 128; only the training loader is shuffled.
train_dl = torch.utils.data.DataLoader(train_ds_, batch_size=128, shuffle=True)
# Evaluation needs no random order. A fixed order keeps the batch-averaged
# loss reported by evaluate() identical from run to run.
test_dl = torch.utils.data.DataLoader(test_ds_, batch_size=128, shuffle=False)

In [73]:
# 1-D CNN: five conv blocks, each followed by a 4x max-pool, then a linear classifier.
model = nn.Sequential(
    *[
        layer
        for in_planes, out_planes, kernel in [
            (1, 16, 40),
            (16, 32, 3),
            (32, 32, 3),
            (32, 64, 3),
            (64, 64, 3),
        ]
        for layer in (
            make_simple_conv1d_block(in_planes, out_planes, kernel_size=kernel),
            nn.MaxPool1d(4),
        )
    ],
    nn.Flatten(),
    # 64 channels x 7 remaining time steps for 8000-sample inputs.
    # 35 logits: assumes len(commands_dict) == 35 (TODO confirm).
    nn.Linear(7*64,35)
).to(dev)

In [74]:
# Cross-entropy loss, plain SGD, and a step schedule (step_size=5, gamma=0.2)
# that train() advances once per epoch.
crit = nn.CrossEntropyLoss()
optim = torch.optim.SGD(params=model.parameters(), lr=0.1)
lr_sched = torch.optim.lr_scheduler.StepLR(optimizer=optim, step_size=5, gamma=0.2)
train(
    model=model,
    train_dl=train_dl,
    test_dl=test_dl,
    crit=crit,
    optim=optim,
    lr_sched=lr_sched,
    epochs=10,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 0, train loss: 1.9375568135651349, train accuracy: 46.07%, test loss: 1.1984088971064641, test accuracy: 65.74%
epoch: 1, train loss: 0.9835253300350356, train accuracy: 72.77%, test loss: 1.0319349078031688, test accuracy: 71.32%
epoch: 2, train loss: 0.745238889441411, train accuracy: 79.13%, test loss: 0.703399386161413, test accuracy: 79.99%
epoch: 3, train loss: 0.6334856946364068, train accuracy: 82.17%, test loss: 0.6311719283843652, test accuracy: 82.15%
epoch: 4, train loss: 0.5626381826706421, train accuracy: 84.13%, test loss: 0.5633378892372816, test accuracy: 84.19%
epoch: 5, train loss: 0.43417159163394486, train accuracy: 88.01%, test loss: 0.44970435209763354, test accuracy: 87.61%
epoch: 6, train loss: 0.41153484625604175, train accuracy: 88.56%, test loss: 0.44563012551038694, test accuracy: 87.66%
epoch: 7, train loss: 0.402007919046494, train accuracy: 88.91%, test loss: 0.42999051969784957, test accuracy: 88.20%
epoch: 8, train loss: 0.3916631625069033, trai

In [75]:
# Persist only the weights (state_dict) of the current `model` to the mounted Drive.
torch.save(
    obj=model.state_dict(),
    f="./content/MyDrive/MachineLearning/models/speech_commands_conv1d.ckpt",
)

# Conv2d on log-spectrograms

In [85]:
class MySpectrogramDataset(torch.utils.data.Dataset):
  """Wrap a raw audio dataset as ``(log_spectrogram, class_index)`` pairs.

  Each item of ``raw_data`` is unpacked as a 5-tuple whose first field is a 2-D
  waveform tensor and whose third field is the label. The waveform is resampled,
  cropped to ``NUM_SAMPLES`` along its last axis and right-padded with zeros up
  to that length. It is then turned into a spectrogram and log-scaled.

  Args:
    raw_data: sized, indexable dataset of 5-tuples as described above.
    label_to_index: optional mapping from label to class index. When omitted,
      the notebook-level ``commands_dict`` is looked up at item-access time.
  """

  # Target sample rate handed to the resampler.
  TARGET_RATE = 8000
  # Length (in samples) every waveform is cropped or zero-padded to.
  NUM_SAMPLES = 8000

  def __init__(self, raw_data, label_to_index=None):
    super().__init__()
    self.raw_data = raw_data
    self.label_to_index = label_to_index
    # NOTE(review): only new_freq is set, so the source rate is the transform's
    # default and the per-item sample rate from raw_data is ignored. Confirm
    # that every clip matches that default.
    self.transform = torchaudio.transforms.Resample(new_freq=self.TARGET_RATE)
    self.spectrogram = torchaudio.transforms.Spectrogram(n_fft=160)

  def __len__(self):
    return len(self.raw_data)

  def __getitem__(self, idx):
    x, _, label_, _, _ = self.raw_data[idx]
    x = self.transform(x)
    x = x[:, :self.NUM_SAMPLES]
    missing = self.NUM_SAMPLES - x.shape[-1]
    # Use the public torch.nn.functional API rather than the incidental
    # `torch.functional.F` attribute.
    x = torch.nn.functional.pad(x, (0, missing), mode="constant", value=0.0)
    mapping = commands_dict if self.label_to_index is None else self.label_to_index
    y = mapping[label_]
    # The small offset keeps log() finite where the spectrogram is exactly zero.
    return torch.log(self.spectrogram(x) + 1e-5), y

In [86]:
# Re-wrap both raw subsets so they yield log-spectrograms. This rebinds the
# train_ds_/test_ds_ names that the Conv1d section also uses.
train_ds_, test_ds_ = map(MySpectrogramDataset, (train_ds, test_ds))

In [87]:
# Sanity check: shape of the first spectrogram; the Conv2d model below is sized for it.
first_spectrogram = test_ds_[0][0]
first_spectrogram.shape

torch.Size([1, 81, 101])

In [88]:
# Batches of 128 over the spectrogram datasets; only the training loader is shuffled.
train_dl = torch.utils.data.DataLoader(train_ds_, batch_size=128, shuffle=True)
# Evaluation needs no random order. A fixed order keeps the batch-averaged
# loss reported by evaluate() identical from run to run.
test_dl = torch.utils.data.DataLoader(test_ds_, batch_size=128, shuffle=False)

In [101]:
# 2-D CNN: five 3x3 conv blocks with a 2x2 max-pool after every block except
# the fourth, then a linear classifier.
model = nn.Sequential(
    *[
        layer
        for in_planes, out_planes, pooled in [
            (1, 16, True),
            (16, 32, True),
            (32, 32, True),
            (32, 64, False),
            (64, 64, True),
        ]
        for layer in [make_simple_conv2d_block(in_planes, out_planes, kernel_size=3)]
        + ([nn.MaxPool2d(2)] if pooled else [])
    ],
    nn.Flatten(),
    # 384 = 64 channels x 2 x 3 remaining cells for (1, 81, 101) inputs.
    # 35 logits: assumes len(commands_dict) == 35 (TODO confirm).
    nn.Linear(384,35)
).to(dev)

In [102]:
# Same recipe as the Conv1d run: cross-entropy loss, plain SGD, and a step
# schedule (step_size=5, gamma=0.2) that train() advances once per epoch.
crit = nn.CrossEntropyLoss()
optim = torch.optim.SGD(params=model.parameters(), lr=0.1)
lr_sched = torch.optim.lr_scheduler.StepLR(optimizer=optim, step_size=5, gamma=0.2)
train(
    model=model,
    train_dl=train_dl,
    test_dl=test_dl,
    crit=crit,
    optim=optim,
    lr_sched=lr_sched,
    epochs=10,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 0, train loss: 1.1247543501098771, train accuracy: 68.35%, test loss: 0.8696442299928421, test accuracy: 74.29%
epoch: 1, train loss: 0.48581422472072044, train accuracy: 85.97%, test loss: 0.5854697915223929, test accuracy: 82.97%
epoch: 2, train loss: 0.3806301877014957, train accuracy: 88.94%, test loss: 0.4559169243543576, test accuracy: 86.55%
epoch: 3, train loss: 0.32432052659413035, train accuracy: 90.56%, test loss: 0.49076379396212405, test accuracy: 85.67%
epoch: 4, train loss: 0.2865928241345318, train accuracy: 91.63%, test loss: 0.38627349184109616, test accuracy: 89.02%
epoch: 5, train loss: 0.21075729453770464, train accuracy: 94.11%, test loss: 0.26599844659750277, test accuracy: 92.18%
epoch: 6, train loss: 0.19493207992876277, train accuracy: 94.58%, test loss: 0.2649967258748336, test accuracy: 92.40%
epoch: 7, train loss: 0.18569903448224068, train accuracy: 94.90%, test loss: 0.26530232710333973, test accuracy: 92.35%
epoch: 8, train loss: 0.179264359322248

In [103]:
# Persist only the weights (state_dict) of the current `model` to the mounted Drive.
torch.save(
    obj=model.state_dict(),
    f="./content/MyDrive/MachineLearning/models/speech_commands_conv2d.ckpt",
)