# Shallow detectors

In [22]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [35]:
drivepath = '/content/gdrive/MyDrive/CMU/11785_Intro_to_Deep_Learning/DL_Group_Project/Dataset/Preprocessed_Data'

In [24]:
!pip install tqdm



In [25]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import time

In [69]:
NUM_EPOCHS = 30
BATCH_SIZE = 64
HIDDEN_SIZE = 128
MODEL_VERSION = 1
LEARNING_RATE = 0.01
LOGISTIC_THRESHOLD = 0.5

In [27]:
cuda = torch.cuda.is_available()
num_workers = 8 if cuda else 0
DEVICE = "cuda" if cuda else "cpu"
print("Cuda = "+str(cuda)+" with num_workers = "+str(num_workers))

Cuda = True with num_workers = 8


In [67]:
class PhonemeDataset(Dataset):
    
    def __init__(self, basepath, phoneme_tag, mode):
      # AA.wav
      # AA recording -> FRAMES, LABELS
      # 70% of FRAMES to train; 20% to dev; 10% to test
      # AA_train_features.npy (NUM_FRAMES, 40)
      # AA_dev_features.npy
      # AA_test_features.npy
      # AA_train_labels.npy

      # load specific recording features and labels
      phoneme_features_path = f"{basepath}/{phoneme_tag}_{mode}_features.npy"
      phoneme_labels_path = f"{basepath}/{phoneme_tag}_{mode}_labels.npy"

      phoneme_features = np.load(phoneme_features_path, allow_pickle=True)
      phoneme_labels = np.load(phoneme_labels_path, allow_pickle=True)

      print(f"[{phoneme_tag}] phoneme_features.shape: {phoneme_features.shape}")
      print(f"[{phoneme_tag}] phoneme_labels.shape: {phoneme_labels.shape}")

      phoneme_labels = np.where(phoneme_labels != 0, 1, 0)

      # load 30% of other_phoneme=1, do not load silence
      # go through all other files in basepath and load them with label=0
      with os.scandir(basepath) as entries:
        for entry in entries:
          if entry.is_file() and phoneme_tag not in entry.name:
            if "features" in entry.name and mode in entry.name:
              other_phoneme_tag = entry.name.split("_")[0]

              features_filepath = entry.path
              labels_filepath = f"{basepath}/{other_phoneme_tag}_{mode}_labels.npy"

              other_phoneme_features = np.load(features_filepath, allow_pickle=True)
              other_phoneme_labels = np.load(labels_filepath, allow_pickle=True)
              
              # find frames where label != 0
              # non_zero_labels = other
              other_phoneme_features[]


              # stack to phoneme features
              phoneme_features = np.concatenate((phoneme_features, other_phoneme_features))



      self.X = phoneme_features
      self.Y = phoneme_labels

    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, index):
        x = torch.Tensor(self.X[index]).float()
        y = torch.as_tensor(self.Y[index]).float()
    
        return x,y

In [64]:
class SilenceDataset(Dataset):
    
    def __init__(self, basepath, mode):
      silence_features = np.zeros((1, 40))  # first row to delete at the end
      silence_labels = np.zeros((1))

      # go through all other files in basepath and load them with label=0
      with os.scandir(basepath) as entries:
        for entry in entries:
          if entry.is_file() and phoneme_tag not in entry.name:
            if "features" in entry.name and mode in entry.name:
              phoneme_features = np.load(entry.path, allow_pickle=True)
              # stack to phoneme features
              phoneme_features = np.concatenate((phoneme_features, other_phoneme_features))

              # we will set all zeros as labels
              other_phoneme_labels = np.zeros((len(other_phoneme_features)))
              phoneme_labels = np.concatenate((phoneme_labels, other_phoneme_labels))
            if "labels" in entry.name and mode in entry.name:
              #pass

      self.X = phoneme_features
      self.Y = phoneme_labels

    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, index):
        x = torch.Tensor(self.X[index]).float()
        y = torch.as_tensor(self.Y[index])
    
        return x,y

IndentationError: ignored

In [30]:
def make_dataloader(dataset, train, batch_size):
  if train:
    shuffle = True
    drop_last = True
  else:
    shuffle = False
    drop_last = False
    
  loader = DataLoader(dataset=dataset, batch_size=batch_size,
                      drop_last=drop_last, shuffle=shuffle,
                      pin_memory=True, num_workers=8)
  
  return loader

In [42]:
class PhonemeShallowDetector(nn.Module):
  
  def __init__(self, hidden_size, activation):
    super(PhonemeShallowDetector, self).__init__()
    
    self.linear_layer = nn.Linear(in_features=40, out_features=hidden_size)
    self.bn_layer = nn.BatchNorm1d(num_features=hidden_size)
    self.activation = activation
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=1)
    self.sigmoid = nn.Sigmoid()
    seq_params = [
      self.linear_layer,
      self.bn_layer,
      self.activation,
      self.output_layer,
      self.sigmoid
    ]

    self.network = nn.Sequential(*seq_params)
    
  def forward(self, x):
    return self.network(x)

In [105]:
class ShallowDetector():

  def __init__(self, phoneme_tag):
    self.phoneme_tag = phoneme_tag

    train_data = PhonemeDataset(basepath=drivepath, phoneme_tag=phoneme_tag, mode="train")
    self.train_loader = make_dataloader(dataset=train_data, train=True, batch_size=BATCH_SIZE)
    print(f"[{phoneme_tag}] train_data.shape: {train_data.X.shape}")

    dev_data = PhonemeDataset(basepath=drivepath, phoneme_tag=phoneme_tag, mode="dev")
    self.dev_loader = make_dataloader(dataset=dev_data, train=False, batch_size=BATCH_SIZE)
    print(f"[{phoneme_tag}] dev_data.shape: {dev_data.X.shape}")

    self.model = PhonemeShallowDetector(hidden_size=HIDDEN_SIZE, 
                                        activation=nn.LeakyReLU()).to(DEVICE)
    self.criterion = nn.BCELoss()
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=LEARNING_RATE, momentum=0.9)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

    self.train_loss_per_epoch = []
    self.train_acc_per_epoch = []
    self.dev_loss_per_epoch = []
    self.dev_acc_per_epoch = []
  
  def save_model(self, epoch):
    model_epoch_path = "{}/shallow_detectors/model_{}_{}_{}".format(drivepath, 
                                                                    self.phoneme_tag, 
                                                                    MODEL_VERSION, 
                                                                    epoch)
    torch.save({
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'scheduler_state_dict': self.scheduler.state_dict(),
    }, model_epoch_path)
    # print('saved model: {}'.format(model_epoch_path))

  def train(self, epochs):
    # Run training and track with wandb
    total_batches = len(self.train_loader) * epochs
    example_ct = 0  # number of examples seen
    batch_ct = 0

    for epoch in tqdm(range(epochs)):
        train_loss = 0.0
        start_time = time.time()
        total_predictions = 0
        correct_predictions = 0

        for _, (features, targets) in enumerate(self.train_loader):
            batch_loss, outputs = self.train_batch(features, targets)
            train_loss += batch_loss

            example_ct += len(features)
            batch_ct += 1

            targets = targets.reshape(-1, 1)

            # check number of correct predictions
            output_classes = torch.where(outputs > LOGISTIC_THRESHOLD, 1, 0)  # convert to class labels
            total_predictions += len(output_classes)
            correct_predictions += torch.sum(targets == output_classes.detach().cpu())

        end_time = time.time()

        train_loss /= example_ct
        print(f"training loss: {train_loss}; time: {end_time - start_time}s")
        train_acc = (correct_predictions/total_predictions) * 100.0
        print(f"training accuracy: {train_acc}%")

        self.train_loss_per_epoch.append(train_loss)
        self.train_acc_per_epoch.append(train_acc)

        # evaluate model with validation data
        dev_loss, dev_acc = self.evaluate_model()
        
        self.dev_loss_per_epoch.append(dev_loss)
        self.dev_acc_per_epoch.append(dev_acc)

        # Step with the scheduler
        self.scheduler.step(dev_loss)

        # epoch completed, save model
        self.save_model(epoch)

  def train_batch(self, features, targets):
    features, targets = features.to(DEVICE), targets.to(DEVICE)
    targets = targets.reshape(-1, 1)

    self.optimizer.zero_grad()

    # Forward pass ➡
    outputs = self.model(features)
    loss = self.criterion(outputs, targets)  # compare with target outputs
    # Backward pass ⬅
    loss.backward()
    # Step with optimizer
    self.optimizer.step()

    return loss.item(), outputs

  def evaluate_model(self):

    with torch.no_grad():
      self.model.eval()

      running_loss = 0.0
      total_predictions = 0.0
      correct_predictions = 0.0

      example_ct = 0

      start_time = time.time()
      for batch_idx, (features, targets) in enumerate(self.dev_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)
        targets = targets.reshape(-1, 1)

        example_ct += len(features)

        outputs = self.model(features)
        outputs = outputs.to(DEVICE)

        # check number of correct predictions
        output_classes = torch.where(outputs > LOGISTIC_THRESHOLD, 1, 0)  # convert to class labels
        total_predictions += len(output_classes)
        correct_predictions += torch.sum(targets == output_classes)

        loss = self.criterion(outputs, targets).detach()
        running_loss += loss.item()
      
      end_time = time.time()

      running_loss /= example_ct
      print(f"testing loss: {running_loss}; time: {end_time - start_time}s")
      acc = (correct_predictions/total_predictions) * 100.0
      print(f"testing accuracy: {acc}%")
      return running_loss, acc

## 2. Training shallow detectors (per phoneme + one for silence)

In [36]:
%cd /content/gdrive/MyDrive/CMU/11785_Intro_to_Deep_Learning/DL_Group_Project/Dataset/Preprocessed_Data

/content/gdrive/.shortcut-targets-by-id/1-5qSMUCOdbNlj6D2Mas-EnlePRAw5fu5/11785_Intro_to_Deep_Learning/DL_Group_Project/Dataset/Preprocessed_Data


In [37]:
from utilities import PHONEME_MAPPER

In [93]:
%cd /

/


In [38]:
print(PHONEME_MAPPER)

{0: 'SIL', 1: 'AE', 2: 'AH', 3: 'AW', 4: 'AY', 5: 'B', 6: 'BIT', 7: 'D', 8: 'DH', 9: 'EE', 10: 'FF', 11: 'G', 12: 'HH', 13: 'IH', 14: 'II', 15: 'J', 16: 'K', 17: 'LL', 18: 'MM', 19: 'NN', 20: 'OH', 21: 'OO', 22: 'OW', 23: 'OY', 24: 'P', 25: 'RR', 26: 'SH', 27: 'SS', 28: 'T', 29: 'TH', 30: 'UE', 31: 'UH', 32: 'VV', 33: 'WW', 34: 'YY', 35: 'ZZ'}


In [106]:
for phoneme_index, phoneme_tag in PHONEME_MAPPER.items():
  if phoneme_tag == "SIL":
    continue
  detector = ShallowDetector(phoneme_tag)
  detector.train(epochs=NUM_EPOCHS)
  break

[AE] phoneme_features.shape: (1438, 40)
[AE] phoneme_labels.shape: (1438,)
[AE] train_data.shape: (49976, 40)
[AE] phoneme_features.shape: (308, 40)
[AE] phoneme_labels.shape: (308,)


  cpuset_checked))




  0%|          | 0/30 [00:00<?, ?it/s][A[A[A[A

[AE] dev_data.shape: (10724, 40)
training loss: 0.0005177332491000878; time: 2.192976236343384s
training accuracy: 99.18470001220703%






  3%|▎         | 1/30 [00:02<01:19,  2.76s/it][A[A[A[A

testing loss: 0.0006671954525628551; time: 0.5540025234222412s
testing accuracy: 99.22603607177734%
training loss: 0.0002465968442809156; time: 2.1364266872406006s
training accuracy: 99.35095977783203%






  7%|▋         | 2/30 [00:05<01:16,  2.74s/it][A[A[A[A

testing loss: 0.0008058253783903661; time: 0.5422475337982178s
testing accuracy: 99.18873596191406%
training loss: 0.00013552397364502896; time: 2.1212968826293945s
training accuracy: 99.43710327148438%






 10%|█         | 3/30 [00:08<01:13,  2.73s/it][A[A[A[A

testing loss: 0.0005109972088274305; time: 0.5594527721405029s
testing accuracy: 99.31928253173828%
training loss: 9.510044630722803e-05; time: 2.158604383468628s
training accuracy: 99.46514129638672%






 13%|█▎        | 4/30 [00:10<01:11,  2.73s/it][A[A[A[A

testing loss: 0.0007120125331185475; time: 0.5842471122741699s
testing accuracy: 99.17008972167969%
training loss: 6.924287536632024e-05; time: 2.1492786407470703s
training accuracy: 99.51722717285156%






 17%|█▋        | 5/30 [00:13<01:08,  2.73s/it][A[A[A[A

testing loss: 0.0005447611424104983; time: 0.5656337738037109s
testing accuracy: 99.27265930175781%
training loss: 5.6270180326443e-05; time: 2.2985050678253174s
training accuracy: 99.51722717285156%






 20%|██        | 6/30 [00:16<01:06,  2.77s/it][A[A[A[A

testing loss: 0.00046617617318120115; time: 0.5620584487915039s
testing accuracy: 99.34725952148438%
training loss: 4.6392637280453424e-05; time: 2.140101909637451s
training accuracy: 99.53125%






 23%|██▎       | 7/30 [00:19<01:03,  2.76s/it][A[A[A[A

testing loss: 0.0006475755275122759; time: 0.5605299472808838s
testing accuracy: 99.25401306152344%


Exception in thread Thread-112:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/usr/local/lib/python3.7/dist-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/usr/lib/pyth

KeyboardInterrupt: ignored