# Specialized Task Shallow Detectors

*   Vowel vs Consonant 
*   Front vowels vs Back vowels
*   High vowels vs Low vowels

What does Prof. Baker mean with:
*   distinguish manner-of-articulation for consonants
*   place of articulation of consonants



In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [43]:
drivepath = '/content/gdrive/MyDrive/Spring_2021/11785_Intro_to_Deep_Learning/DL_Group_Project'
datapath = '/content/gdrive/MyDrive/Spring_2021/11785_Intro_to_Deep_Learning/dl-group-project-code/preprocessed_data'
output_path = f'{drivepath}/experiments/specialized_detectors/'

In [44]:
!pip install tqdm



In [45]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

In [46]:
NUM_EPOCHS = 30
BATCH_SIZE = 64
HIDDEN_SIZE = 128
MODEL_VERSION = 1
LEARNING_RATE = 0.01
LOGISTIC_THRESHOLD = 0.5

In [47]:
cuda = torch.cuda.is_available()
num_workers = 8 if cuda else 0
DEVICE = "cuda" if cuda else "cpu"
print("Cuda = "+str(cuda)+" with num_workers = "+str(num_workers))

Cuda = True with num_workers = 8


In [66]:
class SpecializedDataset(Dataset):
    
    def __init__(self, datapath, mode, task_name, phonemes_class_0, phonemes_class_1):
      """
      phonemes_class_0: list of phoneme names for class 0
      phonemes_class_1: list of phoneme names for class 1
      """
      complete_features = np.zeros((1, 40))  # eliminate this row
      complete_labels = np.zeros((1))  # eliminate this row

      # go through all files in datapath, check phoneme if class=0 or class=1
      # assign that label and discard silence frames
      with os.scandir(datapath) as entries:
        for entry in entries:
          if entry.is_file():
            if "features" in entry.name and mode in entry.name:
              phoneme_tag = entry.name.split("_")[0]

              features_filepath = entry.path
              labels_filepath = f"{datapath}/{phoneme_tag}_{mode}_labels.npy"

              phoneme_features = np.load(features_filepath, allow_pickle=True)
              phoneme_labels = np.load(labels_filepath, allow_pickle=True)
              print(f"{phoneme_tag} total features: {phoneme_features.shape}")
              print(f"{phoneme_labels} total labels: {phoneme_labels.shape}")

              # find frames where label != 0 (non-silence)
              non_zero_indexes = phoneme_labels.nonzero()
              phoneme_features = phoneme_features[non_zero_indexes]
              phoneme_labels = phoneme_labels[non_zero_indexes]
              print(f"{phoneme_tag} no-silence features: {phoneme_features.shape}")
              print(f"{phoneme_labels} no-silence labels: {phoneme_labels.shape}")

              # find phoneme in class_0 or class_1 list and assign label
              phoneme_class = None
              if phoneme_tag in phonemes_class_0:
                phoneme_class = 0
              if phoneme_tag in phonemes_class_1:
                phoneme_class = 1

              if phoneme_class is None:
                raise Exception(f"phoneme '{phoneme_tag}' not found on class 0 nor class 1 lists")

              phoneme_labels[:] = phoneme_class  # label=class

              # stack to phoneme features
              complete_features = np.concatenate((complete_features, phoneme_features))
              complete_labels = np.concatenate((complete_labels, phoneme_labels))
      
      self.X = complete_features[1:]
      self.Y = complete_labels[1:]
      print(f"[task={task_name}] {self.X.shape} features")
      print(f"[task={task_name}] {self.Y.shape} labels")

    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, index):
        x = torch.Tensor(self.X[index]).float()
        y = torch.as_tensor(self.Y[index]).float()
    
        return x,y

In [67]:
def make_dataloader(dataset, train, batch_size):
  if train:
    shuffle = True
    drop_last = True
  else:
    shuffle = False
    drop_last = False
    
  loader = DataLoader(dataset=dataset, batch_size=batch_size,
                      drop_last=drop_last, shuffle=shuffle,
                      pin_memory=True, num_workers=8)
  
  return loader

In [68]:
class SpecializedShallowDetector(nn.Module):
  
  def __init__(self, hidden_size, activation):
    super(PhonemeShallowDetector, self).__init__()
    
    self.linear_layer = nn.Linear(in_features=40, out_features=hidden_size)
    self.bn_layer = nn.BatchNorm1d(num_features=hidden_size)
    self.activation = activation
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=1)
    self.sigmoid = nn.Sigmoid()
    seq_params = [
      self.linear_layer,
      self.bn_layer,
      self.activation,
      self.output_layer,
      self.sigmoid
    ]

    self.network = nn.Sequential(*seq_params)
    
  def forward(self, x):
    return self.network(x)

In [69]:
class SpecializedDetector():

  def __init__(self, task_name, phonemes_class_0, phonemes_class_1):
    self.task_name = task_name

    train_data = SpecializedDataset(datapath=datapath, mode="train", 
                                    task_name=task_name, 
                                    phonemes_class_0=phonemes_class_0, 
                                    phonemes_class_1=phonemes_class_1)
    self.train_loader = make_dataloader(dataset=train_data, train=True, batch_size=BATCH_SIZE)

    dev_data = SpecializedDataset(datapath=datapath, mode="dev", 
                                  task_name=task_name, 
                                  phonemes_class_0=phonemes_class_0, 
                                  phonemes_class_1=phonemes_class_1)
    self.dev_loader = make_dataloader(dataset=dev_data, train=False, batch_size=BATCH_SIZE)

    self.model = SpecializedShallowDetector(hidden_size=HIDDEN_SIZE, 
                                            activation=nn.LeakyReLU()).to(DEVICE)
    self.criterion = nn.BCELoss()
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=LEARNING_RATE, momentum=0.9)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

    self.train_loss_per_epoch = []
    self.train_acc_per_epoch = []
    self.dev_loss_per_epoch = []
    self.dev_acc_per_epoch = []
  
  def save_model(self, epoch):
    model_epoch_path = "{}/models/model_{}_{}_{}".format(output_path, self.task_name, 
                                                         MODEL_VERSION, epoch)
    torch.save({
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'scheduler_state_dict': self.scheduler.state_dict(),
    }, model_epoch_path)
    # print('saved model: {}'.format(model_epoch_path))

  def train(self, epochs):
    # Run training and track with wandb
    total_batches = len(self.train_loader) * epochs
    example_ct = 0  # number of examples seen
    batch_ct = 0

    for epoch in tqdm(range(epochs)):
        train_loss = 0.0
        start_time = time.time()
        total_predictions = 0
        correct_predictions = 0

        true_labels = []
        predictions = []
        for _, (features, targets) in enumerate(self.train_loader):
            batch_loss, outputs = self.train_batch(features, targets)
            train_loss += batch_loss

            example_ct += len(features)
            batch_ct += 1

            targets = targets.reshape(-1, 1)

            # check number of correct predictions
            output_classes = torch.where(outputs > LOGISTIC_THRESHOLD, 1, 0).detach().cpu() # convert to class labels
            total_predictions += len(output_classes)
            correct_predictions += torch.sum(targets == output_classes)

            true_labels += list(targets)
            predictions += list(output_classes)

        end_time = time.time()

        train_loss /= example_ct
        print(f"training loss: {train_loss}; time: {end_time - start_time}s")
        
        if (epoch + 1) % 10 == 0 or epoch == (epochs - 1):
          report = classification_report(true_labels, predictions, output_dict=True)
          df = pd.DataFrame(report).transpose()
          df.to_csv(f"{output_path}/reports/train_{self.task_name}_{MODEL_VERSION}_{epoch + 1}.csv", index=False)

        print(classification_report(true_labels, predictions))
        print(confusion_matrix(true_labels, predictions))

        self.train_loss_per_epoch.append(train_loss)
        #self.train_acc_per_epoch.append(train_acc)

        # evaluate model with validation data
        dev_loss = self.evaluate_model(epoch)
        
        self.dev_loss_per_epoch.append(dev_loss)
        #self.dev_acc_per_epoch.append(dev_acc)

        # Step with the scheduler
        self.scheduler.step(dev_loss)
      
    # epoch completed, save model
    self.save_model(epoch)

  def train_batch(self, features, targets):
    features, targets = features.to(DEVICE), targets.to(DEVICE)
    targets = targets.reshape(-1, 1)

    self.optimizer.zero_grad()

    # Forward pass ➡
    outputs = self.model(features)
    loss = self.criterion(outputs, targets)  # compare with target outputs
    # Backward pass ⬅
    loss.backward()
    # Step with optimizer
    self.optimizer.step()

    return loss.item(), outputs

  def evaluate_model(self, epoch):

    with torch.no_grad():
      self.model.eval()

      running_loss = 0.0
      total_predictions = 0.0
      correct_predictions = 0.0

      true_labels = []
      predictions = []

      example_ct = 0
      start_time = time.time()
      for batch_idx, (features, targets) in enumerate(self.dev_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)
        targets = targets.reshape(-1, 1)

        example_ct += len(features)

        outputs = self.model(features)
        outputs = outputs.to(DEVICE)

        # check number of correct predictions
        output_classes = torch.where(outputs > LOGISTIC_THRESHOLD, 1, 0)  # convert to class labels
        total_predictions += len(output_classes)
        correct_predictions += torch.sum(targets == output_classes)

        loss = self.criterion(outputs, targets).detach()
        running_loss += loss.item()

        true_labels += list(targets.detach().cpu())
        predictions += list(output_classes.detach().cpu())
      
      end_time = time.time()

      running_loss /= example_ct
      print(f"testing loss: {running_loss}; time: {end_time - start_time}s")

      if (epoch + 1) % 10 == 0:
        report = classification_report(true_labels, predictions, output_dict=True)
        df = pd.DataFrame(report).transpose()
        df.to_csv(f"{output_path}/reports/reports_dev_{self.task_name}_{MODEL_VERSION}_{epoch + 1}.csv", index=False)
  
      print(classification_report(true_labels, predictions))
      print(confusion_matrix(true_labels, predictions))
      return running_loss

## 2. Training shallow detectors (per phoneme + one for silence)

In [70]:
%cd /content/gdrive/MyDrive/Spring_2021/11785_Intro_to_Deep_Learning/dl-group-project-code/utilities

/content/gdrive/MyDrive/Spring_2021/11785_Intro_to_Deep_Learning/dl-group-project-code/utilities


In [71]:
from utilities import SPECIALIZED_TASKS

In [72]:
%cd /

/


In [73]:
print(SPECIALIZED_TASKS)

{'vowel_vs_consonant': {0: ['AE', 'AH'], 1: []}, 'frontvowel_vs_backvowel': {0: [], 1: []}, 'highvowel_vs_lowvowel': {0: [], 1: ['AH']}}


In [74]:
for task_name, classes_dict in SPECIALIZED_TASKS.items():
  detector = SpecializedDetector(task_name, classes_dict[0], classes_dict[1])
  detector.train(epochs=NUM_EPOCHS)

AE total features: (1438, 40)
[0 0 0 ... 1 1 1] total labels: (1438,)
AE no-silence features: (548, 40)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

Exception: ignored