# Frame to Phoneme Classifier
Model network 2, includes:
*    Pre-trained Phoneme Shallow Detectors (38)
*    Pre-trained Specialized Task Classifiers (10)

Combines these models into a final linear layer, and outputs phoneme probabilities



In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
drivepath = '/content/gdrive/MyDrive/DL_Group_Project'
specialized_models_path = f'{drivepath}/experiments/specialized_detectors/models'
phoneme_models_path = f'{drivepath}/experiments/phoneme_detectors/'
datapath = '/content/gdrive/MyDrive/DL_Group_Project/Dataset/Preprocessed_Data'
drivepath_final = f'{drivepath}/experiments/complete_network_2/'

In [3]:
!pip install tqdm



In [4]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

In [5]:
NUM_EPOCHS = 100
BATCH_SIZE = 64
HIDDEN_SIZE_shallow = 128
HIDDEN_SIZE_spec = 128
MODEL_VERSION = 1
LEARNING_RATE = 0.01
OTHER_PHONEMES_PERCENT = 0.1

In [6]:
cuda = torch.cuda.is_available()
num_workers = 8 if cuda else 0
DEVICE = "cuda" if cuda else "cpu"
print("Cuda = "+str(cuda)+" with num_workers = "+str(num_workers))

Cuda = True with num_workers = 8


In [7]:
class PhonemesDataset(Dataset):
    
    def __init__(self, basepath, mode):
      phoneme_features = np.zeros((1, 40))  # eliminate this row
      phoneme_labels = np.zeros((1))  # eliminate this row

      with os.scandir(basepath) as entries:
        for entry in entries:
          if entry.is_file():
            if "features" in entry.name and mode in entry.name:
              phoneme_tag = entry.name.split("_")[0]

              features_filepath = entry.path
              labels_filepath = f"{basepath}/{phoneme_tag}_{mode}_labels.npy"

              other_phoneme_features = np.load(features_filepath, allow_pickle=True)
              other_phoneme_labels = np.load(labels_filepath, allow_pickle=True)
              
              # stack to phoneme features
              phoneme_features = np.concatenate((phoneme_features, other_phoneme_features))
              phoneme_labels = np.concatenate((phoneme_labels, other_phoneme_labels))
              
      self.X = phoneme_features[1:]
      self.Y = phoneme_labels[1:]

    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, index):
        x = torch.Tensor(self.X[index]).float()
        y = torch.as_tensor(self.Y[index]).long()
    
        return x,y

    def get_phoneme_label_frames(self, phoneme_label):
        phoneme_indexes = np.where(self.Y == phoneme_label)
        return torch.Tensor(self.X[phoneme_indexes]).float(), torch.as_tensor(self.Y[phoneme_indexes]).long()

In [8]:
def make_dataloader(dataset, train, batch_size):
  if train:
    shuffle = True
    drop_last = True
  else:
    shuffle = False
    drop_last = False
    
  loader = DataLoader(dataset=dataset, batch_size=batch_size,
                      drop_last=drop_last, shuffle=shuffle,
                      pin_memory=True, num_workers=8)
  
  return loader

In [45]:
class PhonemeShallowDetector(nn.Module):
  
  def __init__(self, hidden_size, activation):
    super(PhonemeShallowDetector, self).__init__()
    
    self.linear_layer = nn.Linear(in_features=40, out_features=hidden_size)
    self.linear_layer.weight.requires_grad = False
    self.linear_layer.requires_grad = False
    self.bn_layer = nn.BatchNorm1d(num_features=hidden_size)
    self.bn_layer.weight.requires_grad = False
    self.bn_layer.requires_grad = False
    self.activation = activation
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=1)
    self.output_layer.weight.requires_grad = False
    self.output_layer.requires_grad = False
    self.sigmoid = nn.Sigmoid()
    seq_params = [
      self.linear_layer,
      self.bn_layer,
      self.activation,
      self.output_layer,
      self.sigmoid
    ]

    self.network = nn.Sequential(*seq_params)      
    
  def forward(self, x):
    return self.network(x)

In [59]:
class SpecializedShallowDetector(nn.Module):
  
  def __init__(self, hidden_size, activation):
    super(SpecializedShallowDetector, self).__init__()
    
    self.linear_layer = nn.Linear(in_features=40, out_features=hidden_size)
    self.linear_layer.weight.requires_grad = False
    self.linear_layer.requires_grad = False
    self.bn_layer = nn.BatchNorm1d(num_features=hidden_size)
    self.bn_layer.weight.requires_grad = False
    self.bn_layer.requires_grad = False
    self.activation = activation
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=1)
    self.output_layer.weight.requires_grad = False
    self.output_layer.requires_grad = False
    self.sigmoid = nn.Sigmoid()
    seq_params = [
      self.linear_layer,
      self.bn_layer,
      self.activation,
      self.output_layer,
      self.sigmoid
    ]

    self.network = nn.Sequential(*seq_params)
    
  def forward(self, x):
    return self.network(x)

In [61]:
class FramePhonemeClassifierModel(nn.Module):
  
  def __init__(self, phoneme_mapper, specialized_mapper):
    super(FramePhonemeClassifierModel, self).__init__()
    
    self.phoneme_mapper = phoneme_mapper
    self.specialized_mapper = specialized_mapper

    # generate the SpecializedShallowDetector 
    specialized_detectors = []
    for spec_idx, spec_type in enumerate(self.specialized_mapper):
      specialized_detector = SpecializedShallowDetector(hidden_size=HIDDEN_SIZE_spec, 
                                                        activation=nn.LeakyReLU()).to(DEVICE)
      specialized_detectors.append(specialized_detector)

    self.specialized_detectors = nn.ModuleList(specialized_detectors)

    # generate the PhonemeShallowDetectors 
    phoneme_detectors = []
    for phoneme_index, phoneme_tag in phoneme_mapper.items():
      phoneme_detector = PhonemeShallowDetector(hidden_size=HIDDEN_SIZE_shallow, 
                                                activation=nn.LeakyReLU())
      phoneme_detectors.append(phoneme_detector)

    self.phoneme_detectors = nn.ModuleList(phoneme_detectors)

    self.linear_layer = nn.Linear(in_features=len(phoneme_mapper)+len(specialized_mapper), 
                                  out_features=len(phoneme_mapper))

    self.initialize_specialized_detectors()
    self.initialize_shallow_detectors()

  def initialize_shallow_detectors(self):
    # load weights from shallow detectors pre-trained models
    for phoneme_index, phoneme_tag in self.phoneme_mapper.items():
      phoneme_shallow_detector = self.phoneme_detectors[phoneme_index]

      phoneme_model_path = f"{phoneme_models_path}/model_{phoneme_tag}_{MODEL_VERSION}_99"
      temp = torch.load(phoneme_model_path)
      phoneme_shallow_detector.load_state_dict(temp['model_state_dict'])
  
  def initialize_specialized_detectors(self):
    # load weights from shallow detectors pre-trained models
     for spec_idx, spec_type in enumerate(self.specialized_mapper):
      specialized_detector = self.specialized_detectors[spec_idx]

      spec_model_path = f"{specialized_models_path}/model_{spec_type}_{MODEL_VERSION}_29"
      temp = torch.load(spec_model_path)
      specialized_detector.load_state_dict(temp['model_state_dict'])
  
  def forward(self, x):
    """
    Returns tuple:
      final outputs (B, NUM_PHONEMES) and shallow classifier outputs (B, NUM_SHALLOW_DETECTORS)
    """
    shallow_classifier_outputs = []
    for phoneme_index, phoneme_tag in self.phoneme_mapper.items():
      phoneme_shallow_detector = self.phoneme_detectors[phoneme_index]
      # run frame through shallow detector
      output = phoneme_shallow_detector(x)
      shallow_classifier_outputs.append(output.reshape(-1))
    
    for spec_idx, spec_type in enumerate(self.specialized_mapper):
      # go through specialized
      specialized_detector = self.specialized_detectors[spec_idx]
      output = specialized_detector(x)
      shallow_classifier_outputs.append(output.reshape(-1))

    # convert to torch tensor
    shallow_classifier_outputs = torch.vstack(shallow_classifier_outputs).T
    outputs = self.linear_layer(shallow_classifier_outputs)

    return outputs, shallow_classifier_outputs

In [49]:
import numpy as np
import matplotlib.pyplot as plt

def highlight_highest(series):
  pal = []
  # TODO: print the series labels for the 5 highest
  max_values = sorted(series, reverse=True)[:5]

  max_color = 'turquoise'
  other_color = 'lightgrey'

  for item in series:
    if item in max_values:
        pal.append(max_color)
    else:
        pal.append(other_color)
        
  return pal

def plot_phoneme_probabilities(phoneme_tag, phonemes, probabilities):
  fig = plt.figure(figsize=(35, 3))
 
  # creating the bar plot
  plt.bar(phonemes, probabilities, width = 0.4, color = highlight_highest(probabilities))
  
  plt.xlabel("Phonemes")
  plt.ylabel("Probabilities")
  plt.title(f"Model Probabilities for frame (true label={phoneme_tag})")
  plt.show()


def plot_shallow_networks_outputs(phoneme_tag, shallow_networks, probabilities):
  fig = plt.figure(figsize=(35, 3))
 
  # creating the bar plot
  plt.bar(shallow_networks, probabilities, width = 0.4, color = highlight_highest(probabilities))
  
  plt.xlabel("Shallow Networks")
  plt.ylabel("Probabilities")
  plt.title(f"Shallow Networks Probabilities for frame (true label={phoneme_tag})")
  plt.show()

def plot_shallow_networks_importance(target_phoneme_tag, weight_phoneme_tag,
                                     shallow_networks, probabilities, 
                                     phoneme_neuron_weights):
  fig = plt.figure(figsize=(35, 3))

  importance = probabilities.cpu().detach().numpy() * phoneme_neuron_weights.cpu().detach().numpy()

  # creating the bar plot
  plt.bar(shallow_networks, importance, width = 0.4, color = highlight_highest(importance))
  
  plt.xlabel("Shallow Networks")
  plt.ylabel(f"Probabilities * Weights ({weight_phoneme_tag})")
  plt.title(f"Shallow Networks Importance for frame (true label={target_phoneme_tag}; weights phoneme={weight_phoneme_tag})")
  plt.show()

In [50]:
class FramePhonemeClassifier():

  def __init__(self, train_loader, dev_loader, phoneme_mapper, specialized_mapper):
    
    self.model = FramePhonemeClassifierModel(phoneme_mapper, specialized_mapper).to(DEVICE)
    
    self.train_loader = train_loader
    self.dev_loader = dev_loader

    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=LEARNING_RATE, momentum=0.9)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

    self.softmax = nn.Softmax(dim=1)

    self.train_loss_per_epoch = []
    self.train_acc_per_epoch = []
    self.dev_loss_per_epoch = []
    self.dev_acc_per_epoch = []
  
  def save_model(self, epoch):
    model_epoch_path = f"{drivepath_final}/models/model_{MODEL_VERSION}_{epoch}"
    torch.save({
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'scheduler_state_dict': self.scheduler.state_dict(),
    }, model_epoch_path)
    # print('saved model: {}'.format(model_epoch_path))

  def train(self, epochs):
    self.model.train()

    # Run training and track with wandb
    total_batches = len(self.train_loader) * epochs
    example_ct = 0  # number of examples seen
    batch_ct = 0

    epoch = None
    for epoch in tqdm(range(epochs)):
        train_loss = 0.0
        start_time = time.time()
        total_predictions = 0
        correct_predictions = 0

        true_labels = []
        predictions = []
        for _, (features, targets) in enumerate(self.train_loader):
            batch_loss, outputs = self.train_batch(features, targets)
            train_loss += batch_loss

            example_ct += len(features)
            batch_ct += 1

            # check number of correct predictions
            output_classes = torch.argmax(outputs.log_softmax(1), dim=1).detach().cpu()  # convert to class labels
            total_predictions += len(output_classes)
            correct_predictions += torch.sum(targets == output_classes)

            true_labels += list(targets)
            predictions += list(output_classes)

        end_time = time.time()

        train_loss /= example_ct
        print(f"training loss: {train_loss}; time: {end_time - start_time}s")
        
        if (epoch + 1) % 10 == 0 or epoch == (epochs - 1):
          report = classification_report(true_labels, predictions, output_dict=True)
          df = pd.DataFrame(report).transpose()
          df.to_csv(f"{drivepath_final}/reports/reports_train_{MODEL_VERSION}_{epoch + 1}.csv", index=False)
          self.save_model(epoch)

        train_acc = (correct_predictions/total_predictions) * 100.0
        print(f"training accuracy: {train_acc}%")

        self.train_loss_per_epoch.append(train_loss)
        self.train_acc_per_epoch.append(train_acc)

        # evaluate model with validation data
        dev_loss, dev_acc = self.evaluate_model(epoch)
        
        self.dev_loss_per_epoch.append(dev_loss)
        self.dev_acc_per_epoch.append(dev_acc)

        # Step with the scheduler
        self.scheduler.step(dev_loss)
    
    if epoch is not None:
      # epoch completed, save model
      self.save_model(epoch)

  def train_batch(self, features, targets):
    features, targets = features.to(DEVICE), targets.to(DEVICE)
    targets = targets.reshape(-1, 1)

    self.optimizer.zero_grad()

    # Forward pass ➡
    outputs, shallow_classifier_outputs = self.model(features)
    loss = self.criterion(outputs, targets.reshape(-1))  # compare with target outputs

    # Backward pass ⬅
    loss.backward()
    
    # Step with optimizer
    self.optimizer.step()

    return loss.item(), outputs

  def evaluate_model(self, epoch):
    with torch.no_grad():
      self.model.eval()

      running_loss = 0.0
      total_predictions = 0.0
      correct_predictions = 0.0

      true_labels = []
      predictions = []

      example_ct = 0
      start_time = time.time()
      for batch_idx, (features, targets) in enumerate(self.dev_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)
        targets = targets.reshape(-1, 1)

        example_ct += len(features)

        outputs, shallow_classifier_outputs = self.model(features)
        outputs = outputs.to(DEVICE)

        # check number of correct predictions
        output_classes = torch.argmax(outputs.log_softmax(1), dim=1)  # convert to class labels
        total_predictions += len(output_classes)
        correct_predictions += torch.sum(targets.reshape(-1) == output_classes)

        loss = self.criterion(outputs, targets.reshape(-1)).detach()
        running_loss += loss.item()

        true_labels += list(targets.detach().cpu())
        predictions += list(output_classes.detach().cpu())
      
      end_time = time.time()

      running_loss /= example_ct
      print(f"testing loss: {running_loss}; time: {end_time - start_time}s")
      acc = (correct_predictions/total_predictions) * 100.0
      print(f"testing accuracy: {acc}%")

      if (epoch + 1) % 10 == 0:
        report = classification_report(true_labels, predictions, output_dict=True)
        df = pd.DataFrame(report).transpose()
        df.to_csv(f"{drivepath_final}/reports/reports_dev_{MODEL_VERSION}_{epoch + 1}.csv", index=False)
  
      return running_loss, acc

  def interpret_phoneme_frame(self, phoneme_tag, phoneme_label, features, targets):
    self.model.eval()
    features = features.to(DEVICE).unsqueeze(0)
    targets = targets.to(DEVICE).unsqueeze(0).reshape(-1, 1)

    outputs, shallow_classifier_outputs = self.model(features)
    output_probabilities = self.softmax(outputs)[0]

    output_class = torch.argmax(output_probabilities)  # convert to class label
    predicted_phoneme_tag = PHONEME_MAPPER[output_class.item()]
    print(f"label assigned: {output_class}")
    print(f"predicted_phoneme_tag: {predicted_phoneme_tag}")

    plot_phoneme_probabilities(phoneme_tag=phoneme_tag, 
                               phonemes=PHONEME_MAPPER.values(), 
                               probabilities=output_probabilities.tolist())
    plot_shallow_networks_outputs(phoneme_tag=phoneme_tag, 
                                  shallow_networks=SHALLOW_NETWORKS, 
                                  probabilities=shallow_classifier_outputs[0].tolist())
    
    plot_shallow_networks_importance(target_phoneme_tag=phoneme_tag, 
                                      weight_phoneme_tag=phoneme_tag,
                                      shallow_networks=SHALLOW_NETWORKS,
                                      probabilities=shallow_classifier_outputs[0],
                                      phoneme_neuron_weights=self.model.linear_layer.weight.data[phoneme_label, :])

    if predicted_phoneme_tag == phoneme_tag:
      print(f"correctly classified as {phoneme_tag}")
    else:
      print(f"incorrectly classified as {phoneme_tag} (true={phoneme_tag})")
      plot_shallow_networks_importance(target_phoneme_tag=phoneme_tag, 
                                  weight_phoneme_tag=predicted_phoneme_tag,
                                  shallow_networks=SHALLOW_NETWORKS,
                                  probabilities=shallow_classifier_outputs[0],
                                  phoneme_neuron_weights=self.model.linear_layer.weight.data[output_class, :])

  def plot_graphs_for_each_phoneme(self, dev_data):
    for phoneme_label, phoneme_tag in PHONEME_MAPPER.items():
      print(f"TESTING PHONEME '{phoneme_tag}' (label={phoneme_label})")
      phoneme_feature, phoneme_label = dev_data.get_phoneme_label_frames(phoneme_label=phoneme_label)
      self.interpret_phoneme_frame(phoneme_tag, phoneme_index, phoneme_features[0], phoneme_labels[0])
      print("")


# Train classifier

In [51]:
%cd /content/gdrive/MyDrive/DL_Group_Project/Dataset/Preprocessed_Data

/content/gdrive/.shortcut-targets-by-id/1qwJK2jyGMl2dPnVFe6JNZvrrG45HoonZ/DL_Group_Project/Dataset/Preprocessed_Data


In [52]:
from utilities import PHONEME_MAPPER
from utilities import SPECIALIZED_TASKS

In [53]:
%cd /

/


In [54]:
print(PHONEME_MAPPER)

{0: 'SIL', 1: 'AE', 2: 'AH', 3: 'AW', 4: 'AY', 5: 'B', 6: 'EH', 7: 'D', 8: 'DH', 9: 'EE', 10: 'FF', 11: 'G', 12: 'HH', 13: 'IH', 14: 'II', 15: 'J', 16: 'K', 17: 'LL', 18: 'MM', 19: 'NN', 20: 'OH', 21: 'OO', 22: 'OW', 23: 'OY', 24: 'P', 25: 'RR', 26: 'SH', 27: 'SS', 28: 'T', 29: 'TH', 30: 'UE', 31: 'UH', 32: 'VV', 33: 'WW', 34: 'YY', 35: 'ZZ', 36: 'CH', 37: 'ER', 38: 'NG'}


In [55]:
print(SPECIALIZED_TASKS)

{'1_vowel_vs_consonant': {0: ['EE', 'IH', 'EH', 'AE', 'UH', 'ER', 'AH', 'AW', 'OO', 'UE'], 1: ['FF', 'HH', 'MM', 'NN', 'NG', 'RR', 'SS', 'SH', 'VV', 'WW', 'YY', 'ZZ']}, '3_highvowel_vs_lowvowel': {0: ['EE', 'IH', 'UE', 'OO'], 1: ['AE', 'AH', 'AW']}, '4_voiced_vs_unvoiced_fricatives': {0: ['DH', 'VV', 'ZZ'], 1: ['FF', 'SS', 'SH', 'TH']}, '5_ss_vs_zz': {0: ['SS'], 1: ['ZZ']}, '6_b_vs_p': {0: ['B'], 1: ['P']}, '7_dh_vs_th': {0: ['DH'], 1: ['TH']}, '8_ww_vs_yy': {0: ['WW'], 1: ['YY']}, '9_ee_vs_aw': {0: ['EE'], 1: ['AW']}, '10_ah_vs_aw': {0: ['AH'], 1: ['AW']}, '11_mm_vs_nn': {0: ['MM'], 1: ['NN']}}


In [56]:
SHALLOW_NETWORKS = list(PHONEME_MAPPER.values())
for task_name in SPECIALIZED_TASKS.keys():
  task_id = task_name.split("_")[0]
  SHALLOW_NETWORKS.append(task_id)

In [57]:
train_data = PhonemesDataset(basepath=datapath, mode="train")
train_loader = make_dataloader(dataset=train_data, train=True, batch_size=BATCH_SIZE)
print(f"train_data.shape: {train_data.X.shape}")

dev_data = PhonemesDataset(basepath=datapath, mode="dev")
dev_loader = make_dataloader(dataset=dev_data, train=False, batch_size=BATCH_SIZE)
print(f"dev_data.shape: {dev_data.X.shape}")

train_data.shape: (53755, 40)
dev_data.shape: (11535, 40)


  cpuset_checked))


In [63]:
classifier = FramePhonemeClassifier(train_loader, dev_loader, PHONEME_MAPPER, SPECIALIZED_TASKS)
#classifier.train(epochs=NUM_EPOCHS)
classifier.train(epochs=20)

  cpuset_checked))


training loss: 0.024840148239084592; time: 33.167975425720215s
training accuracy: 63.068016052246094%


  5%|▌         | 1/20 [00:36<11:37, 36.70s/it]

testing loss: 0.02633913896612962; time: 3.5117571353912354s
testing accuracy: 62.14997863769531%
training loss: 0.010170836112638481; time: 30.215386390686035s
training accuracy: 71.15428161621094%


 10%|█         | 2/20 [01:10<10:43, 35.77s/it]

testing loss: 0.023556753676538093; time: 3.3624560832977295s
testing accuracy: 67.87168884277344%
training loss: 0.006116245970955836; time: 29.51873278617859s
training accuracy: 74.7225112915039%


 15%|█▌        | 3/20 [01:43<09:53, 34.92s/it]

testing loss: 0.021964807341405723; time: 3.411412477493286s
testing accuracy: 70.70654296875%
training loss: 0.0042644158292659455; time: 29.57005214691162s
training accuracy: 77.00759887695312%


 20%|██        | 4/20 [02:15<09:08, 34.26s/it]

testing loss: 0.020879374193225424; time: 3.1439619064331055s
testing accuracy: 71.94624328613281%
training loss: 0.0032231396999924983; time: 27.50362277030945s
training accuracy: 78.41365814208984%


 25%|██▌       | 5/20 [02:46<08:17, 33.18s/it]

testing loss: 0.019966310211007192; time: 3.142854690551758s
testing accuracy: 73.18595123291016%
training loss: 0.002563852425623962; time: 27.523662328720093s
training accuracy: 79.21073913574219%


 30%|███       | 6/20 [03:17<07:33, 32.42s/it]

testing loss: 0.019208911265394726; time: 3.116781234741211s
testing accuracy: 73.62808227539062%
training loss: 0.0021146038939160523; time: 29.880637884140015s
training accuracy: 79.75640869140625%


 35%|███▌      | 7/20 [03:50<07:04, 32.66s/it]

testing loss: 0.01868487010670783; time: 3.301795244216919s
testing accuracy: 74.07888793945312%
training loss: 0.0017907824045373516; time: 30.382603406906128s
training accuracy: 80.19963836669922%


 40%|████      | 8/20 [04:24<06:35, 32.97s/it]

testing loss: 0.01830553935560562; time: 3.314042329788208s
testing accuracy: 74.70307159423828%
training loss: 0.0015482206182849635; time: 30.392257690429688s
training accuracy: 80.59818267822266%


 45%|████▌     | 9/20 [04:57<06:05, 33.20s/it]

testing loss: 0.017950395778577256; time: 3.3196873664855957s
testing accuracy: 75.11053466796875%
training loss: 0.0013609284701556216; time: 30.19190216064453s
training accuracy: 80.99671936035156%


  _warn_prf(average, modifier, msg_start, len(result))


testing loss: 0.01753894193869005; time: 3.556731700897217s
testing accuracy: 75.69137573242188%


 50%|█████     | 10/20 [05:33<05:37, 33.77s/it]

training loss: 0.0012104186589262191; time: 30.30970597267151s
training accuracy: 81.39154052734375%


 55%|█████▌    | 11/20 [06:06<05:03, 33.75s/it]

testing loss: 0.017224585889193938; time: 3.3684308528900146s
testing accuracy: 76.02947235107422%
training loss: 0.001089854886234369; time: 29.5367169380188s
training accuracy: 81.56473541259766%


 60%|██████    | 12/20 [06:39<04:28, 33.58s/it]

testing loss: 0.017015692317801014; time: 3.6499125957489014s
testing accuracy: 76.31555938720703%
training loss: 0.0009894460874134126; time: 28.455681800842285s
training accuracy: 81.91299438476562%


 65%|██████▌   | 13/20 [07:11<03:50, 32.99s/it]

testing loss: 0.01684947331255527; time: 3.124772310256958s
testing accuracy: 76.47160339355469%
training loss: 0.00090470999803108; time: 27.24881887435913s
training accuracy: 82.1886215209961%


 70%|███████   | 14/20 [07:41<03:13, 32.20s/it]

testing loss: 0.01665010206307691; time: 3.0912094116210938s
testing accuracy: 76.61031341552734%
training loss: 0.0008338018084432594; time: 27.11652398109436s
training accuracy: 82.38788604736328%


 75%|███████▌  | 15/20 [08:12<02:38, 31.61s/it]

testing loss: 0.016490962688650048; time: 3.1140451431274414s
testing accuracy: 76.87905883789062%
training loss: 0.0007717599948680559; time: 28.995121240615845s
training accuracy: 82.58529663085938%


 80%|████████  | 16/20 [08:44<02:07, 31.83s/it]

testing loss: 0.016322232863579575; time: 3.335376024246216s
testing accuracy: 77.18247985839844%
training loss: 0.0007182527498371414; time: 29.959548711776733s
training accuracy: 82.73055267333984%


 85%|████████▌ | 17/20 [09:17<01:36, 32.27s/it]

testing loss: 0.01608430620365242; time: 3.3225860595703125s
testing accuracy: 77.36454010009766%
training loss: 0.000670926258981291; time: 30.10239577293396s
training accuracy: 82.95217895507812%


 90%|█████████ | 18/20 [09:51<01:05, 32.61s/it]

testing loss: 0.01618253534419949; time: 3.2953221797943115s
testing accuracy: 77.32119750976562%
training loss: 0.0006299860787213776; time: 29.65043306350708s
training accuracy: 83.13282012939453%


 95%|█████████▌| 19/20 [10:24<00:32, 32.80s/it]

testing loss: 0.015883457515345128; time: 3.5680062770843506s
testing accuracy: 77.55525970458984%
training loss: 0.0005927454017511213; time: 29.863985061645508s
training accuracy: 83.25387573242188%
testing loss: 0.01580355052287619; time: 3.205950975418091s
testing accuracy: 77.83267974853516%


100%|██████████| 20/20 [10:58<00:00, 32.94s/it]


In [None]:
# test
phoneme_index = 1
phoneme_tag = PHONEME_MAPPER[phoneme_index]
phoneme_features, phoneme_labels = dev_data.get_phoneme_label_frames(phoneme_label=phoneme_index)

In [None]:
classifier.interpret_phoneme_frame(phoneme_tag, phoneme_index, phoneme_features[0], phoneme_labels[0])

In [None]:
classifier.plot_graphs_for_each_phoneme(dev_data)