<a href="https://colab.research.google.com/github/charliecurnin/clone-compress/blob/master/clone_compress_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and mount Drive

In [None]:
import os
import h5py
import numpy as np
import random
from IPython.display import Audio
import librosa
import torch
from google.colab import drive
from scipy.io import wavfile
from importlib import reload

!pip install speechbrain -q
import speechbrain as sb
from speechbrain.pretrained import EncoderDecoderASR

!pip install google-cloud-speech
!pip install protobuf==3.19
from google.cloud import speech
import io



In [None]:
# Mount the user's Google Drive at /content/gdrive (Colab helper imported
# above as `drive`). The data paths used by later cells live under this mount.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Access GitHub repo

In [130]:
### Access repo
# First-time setup: uncomment the clone below to fetch the repository.
# !git clone https://github.com/charliecurnin/clone-compress
# Pull the latest upstream changes.
# NOTE(review): this presumably needs the working directory to already be
# inside a checkout of the repo — confirm it still works on a fresh runtime.
!git pull https://github.com/charliecurnin/clone-compress

remote: Enumerating objects: 5, done.[K
remote: Counting objects:  20% (1/5)[Kremote: Counting objects:  40% (2/5)[Kremote: Counting objects:  60% (3/5)[Kremote: Counting objects:  80% (4/5)[Kremote: Counting objects: 100% (5/5)[Kremote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (1/1)[Kremote: Compressing objects: 100% (1/1), done.[K
remote: Total 3 (delta 2), reused 3 (delta 2), pack-reused 0[K
Unpacking objects:  33% (1/3)   Unpacking objects:  66% (2/3)   Unpacking objects: 100% (3/3)   Unpacking objects: 100% (3/3), done.
From https://github.com/charliecurnin/clone-compress
 * branch            HEAD       -> FETCH_HEAD
Updating 45be462..f2ccc69
Fast-forward
 compress.py | 2 [32m+[m[31m-[m
 1 file changed, 1 insertion(+), 1 deletion(-)


In [None]:
# Change into the repo checkout and list its contents as a sanity check.
# NOTE(review): later cells import modules by bare name (cs224s_course_utils,
# preprocess, compress); presumably they are resolved from this directory.
%cd clone-compress
!ls -l

/content/clone-compress
total 5772
drwxr-xr-x 2 root root    4096 May 15 23:44 asr-crdnn-rnnlm-librispeech
-rw-r--r-- 1 root root 5884715 May 15 23:44 clone_compress_test.ipynb
-rw-r--r-- 1 root root     255 May 15 23:44 compress.py
-rw-r--r-- 1 root root    4237 May 15 23:44 cs224s_course_utils.py
-rw-r--r-- 1 root root     377 May 15 23:44 preprocess.py
drwxr-xr-x 2 root root    4096 May 15 23:44 __pycache__




### Load data as utterances (borrowed from HW3)


In [None]:
from torch.utils.data import Dataset

from cs224s_course_utils import (
  prune_transcripts, pad_wav, pad_transcript_label, get_transcript_labels,
  get_cer_per_sample)


# Character vocabulary for HarperValleyBank transcripts: nine punctuation /
# markup symbols followed by the lowercase letters a-z.
VOCAB = list(" '~-.<>[]") + [
    chr(code) for code in range(ord('a'), ord('z') + 1)]

# Bracketed non-speech tags that appear in the transcripts.
SILENT_VOCAB = [
    '[baby]',
    '[ringing]',
    '[laughter]',
    '[kids]',
    '[music]',
    '[noise]',
    '[unintelligible]',
    '[dogs]',
    '[cough]',
]


class HarperValleyBank(Dataset):
  """Dataset to be used to train CTC, LAS, and MTL.

  Each item is one utterance: a padded, normalized mel spectrogram together
  with its padded character-level transcript label sequence.

  Args:
    root: string
          path to the data files (must contain `labels.npz` and `data.h5`).
    split: string (default: train)
            choices: train | val | test
            which split of data to load
    n_mels: integer (default: 128)
            number of mel frequencies
    n_fft: integer (default: 256)
            number of fourier components
    win_length: integer (default: 256)
                should be <= n_fft
    hop_length: integer (default: 128)
                number of frames to skip in between
    wav_max_length: integer (default: 200)
                    maximum number of timesteps in a waveform
    transcript_max_length: integer (default: 200)
                            maximum number of timesteps in a transcript
    append_eos_token: boolean (default: False)
                      add EOS token to the end of every transcription
                      this is used for LAS (and LAS+CTC models)

  Raises:
    ValueError: if `split` is not one of train | val | test.
  """
  def __init__(
      self, root, split='train', n_mels=128, n_fft=256, win_length=256, 
      hop_length=128, wav_max_length=200, transcript_max_length=200, 
      append_eos_token=False):
    super().__init__()
    print(f'> Constructing HarperValleyBank {split} dataset...')

    # Fail fast on a bad split name, before any file is read.
    # ValueError is still an Exception, so existing handlers keep working.
    if split not in ('train', 'val', 'test'):
      raise ValueError(f'Split {split} not supported.')

    self.label_data = np.load(os.path.join(root, 'labels.npz'))
    self.root = root
    self.wav_max_length = wav_max_length
    self.transcript_max_length = transcript_max_length

    self.input_dim = n_mels
    self.n_mels = n_mels
    self.n_fft = n_fft
    self.win_length = win_length
    self.hop_length = hop_length

    # Prune away very short examples; this yields the indices of the
    # utterances whose transcripts are long enough to keep.
    valid_indices = prune_transcripts(self.label_data['human_transcripts'])

    # Decide which of the kept indices belong to which split.
    train_indices, val_indices, test_indices = self.split_data(valid_indices)
    indices = {
      'train': train_indices,
      'val': val_indices,
      'test': test_indices,
    }[split]

    raw_human_transcripts = self.label_data['human_transcripts'].tolist()
    human_transcript_labels = get_transcript_labels(
      raw_human_transcripts, VOCAB, SILENT_VOCAB)

    # Increment all indices by 4 to reserve the following special tokens:
    #   0 for epsilon
    #   1 for start-of-sentence (SOS)
    #   2 for end-of-sentence (EOS)
    #   3 for padding
    num_special_tokens = 4
    human_transcript_labels = [list(np.array(lab) + num_special_tokens)
                                for lab in human_transcript_labels]
    # CTC doesn't use SOS nor EOS; LAS doesn't use EPS but add anyway.
    eps_index, sos_index, eos_index, pad_index = 0, 1, 2, 3

    if append_eos_token:
      # Insert an EOS token at the end of every label sequence.
      # This is important for the LAS objective.
      human_transcript_labels = [
        lab + [eos_index] for lab in human_transcript_labels]
    self.human_transcript_labels = human_transcript_labels

    # Include epsilon, SOS, EOS, and padding tokens.
    self.num_class = len(VOCAB) + len(SILENT_VOCAB) + num_special_tokens
    self.num_labels = self.num_class  # These are interchangeable.
    self.eps_index = eps_index
    self.sos_index = sos_index
    self.eos_index = eos_index
    self.pad_index = pad_index # Use this index for padding.

    self.indices = indices

  def indices_to_chars(self, indices):
    """Maps label indices back to their vocabulary symbols.

    Args:
      indices: list of integers in the label space (special tokens included).

    Returns:
      List of strings, one symbol per input index.
    """
    # Special tokens come first, mirroring the +4 offset applied in __init__.
    full_vocab = ['<eps>', '<sos>', '<eos>', '<pad>'] + VOCAB + SILENT_VOCAB
    return [full_vocab[ind] for ind in indices]

  def split_data(self, valid_indices, train_ratio = 0.8, val_ratio = 0.1):
    """Splits data into train, val, and test sets based on speaker. When 
    evaluating methods on the test split, we measure how well they generalize
    to new (unseen) speakers.

    Args:
      valid_indices: utterance indices that survived transcript pruning.
      train_ratio: fraction of speakers assigned to the train split.
      val_ratio: fraction of speakers assigned to the val split; the
                 remaining speakers form the test split.

    Returns:
      (train_indices, val_indices, test_indices): sorted arrays of utterance
      indices, each restricted to `valid_indices`.
    """
    # Fix seed so everyone reproduces the same splits.
    rs = np.random.RandomState(42)

    speaker_ids = self.label_data['speaker_ids']
    unique_speaker_ids = np.array(sorted(set(speaker_ids)))

    # Shuffle so the speaker IDs are distributed.
    rs.shuffle(unique_speaker_ids)

    num_speaker = len(unique_speaker_ids)
    num_train = int(train_ratio * num_speaker)
    num_val = int(val_ratio * num_speaker)

    # Every speaker lands in exactly one split.
    speaker_split = {}
    for speaker_id in unique_speaker_ids[:num_train]:
      speaker_split[speaker_id] = 'train'
    for speaker_id in unique_speaker_ids[num_train:num_train+num_val]:
      speaker_split[speaker_id] = 'val'
    for speaker_id in unique_speaker_ids[num_train+num_val:]:
      speaker_split[speaker_id] = 'test'

    # Route each utterance to the split of its speaker.
    split_indices = {'train': [], 'val': [], 'test': []}
    for i, speaker_id in enumerate(speaker_ids):
      split_indices[speaker_split[speaker_id]].append(i)

    # Keep only the "valid indices", i.e. drop utterances that were pruned
    # for having too short a transcription.
    train_indices = np.intersect1d(
      np.array(split_indices['train']), valid_indices)
    val_indices = np.intersect1d(
      np.array(split_indices['val']), valid_indices)
    test_indices = np.intersect1d(
      np.array(split_indices['test']), valid_indices)

    return train_indices, val_indices, test_indices

  def get_primary_task_data(self, index):
    """Returns audio and transcript information for a single utterance.

    Requires `load_waveforms()` to have been called (`__getitem__` does this).

    Args:
      index: Index of an utterance (into the full, unsplit data).

    Returns:
      input_feature: float tensor holding the normalized mel spectrogram,
                     transposed to (T, n_mels) and padded by `pad_wav` to
                     `wav_max_length` timesteps.
      input_length: length value returned by `pad_wav`.
      human_transcript_label: tensor of label indices padded with `pad_index`
                              to `transcript_max_length`.
      human_transcript_length: length value returned by
                               `pad_transcript_label`.
    """
    wav = self.waveform_data[f'{index}'][:] # An h5py file uses string keys.
    sr = 8000 # The sample rate is fixed for this data.

    # Mel spectrogram of the utterance. The audio is passed as `y=` because
    # recent librosa releases accept keyword arguments only, and win_length
    # is forwarded so the constructor argument actually takes effect.
    mels = librosa.feature.melspectrogram(
      y=wav, sr=sr, n_mels=self.n_mels, n_fft=self.n_fft,
      win_length=self.win_length, hop_length=self.hop_length)
    # NOTE(review): the assignment text asks for a *log* mel spectrogram, but
    # no log is applied here; left as-is so feature values do not change.
    mels_norm = librosa.util.normalize(mels, axis=1) # TODO: ED #163?
    mels_norm = mels_norm.T  # models expect (T, n_mels)
    input_feature, input_length = pad_wav(mels_norm, self.wav_max_length)
    input_feature = torch.Tensor(input_feature)

    # Get and pad transcript labels
    unpadded_label = self.human_transcript_labels[index]
    human_transcript_label, human_transcript_length = pad_transcript_label(
      unpadded_label, self.transcript_max_length, pad=self.pad_index)
    human_transcript_label = torch.tensor(human_transcript_label)

    return input_feature, input_length, human_transcript_label, human_transcript_length

  def load_waveforms(self):
    """Opens `data.h5` under `root` and stores its 'waveforms' entry.

    The file handle is not closed here; it stays referenced through
    `self.waveform_data`.
    """
    waveform_h5 = h5py.File(os.path.join(self.root, 'data.h5'), 'r')
    self.waveform_data = waveform_h5.get('waveforms')

  def __getitem__(self, index):
    """Serves primary task data for a single utterance."""
    if not hasattr(self, 'waveform_data'):
      # Do this in __getitem__ function so we enable multiprocessing.
      self.load_waveforms()
    # Translate the position within this split into a global utterance index.
    index = int(self.indices[index])
    return self.get_primary_task_data(index)

  def __len__(self):
    """Returns total number of utterances in the dataset."""
    return len(self.indices)

In [None]:
# Where the course data lives on the mounted Drive.
SYM_PATH = '/content/gdrive/MyDrive/cs224s_spring2022'
DATA_PATH = f'{SYM_PATH}/data'

print("exists: ", os.path.exists('/content'))

# Open the waveform store and the matching labels for the minified dataset.
root = os.path.join(DATA_PATH, 'harper_valley_bank_minified')
waveform_h5 = h5py.File(os.path.join(root, 'data.h5'), 'r')
waveform_data = waveform_h5.get('waveforms')
label_data = np.load(os.path.join(root, 'labels.npz'))
# There must be exactly one transcript per stored waveform.
assert len(label_data['human_transcripts']) == len(waveform_data)

# Pick one utterance at random; waveforms are keyed by the index as a string.
index = random.randrange(len(waveform_data))
w = waveform_data[str(index)][:]
t = label_data['human_transcripts'][index]

print(f'index {index}: "{t}"\n')
# Play the selected waveform at 8 kHz.
Audio(w, rate=8000)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
exists:  True
index 3932: "uh um"



In [None]:
 # Do not modify.
root = os.path.join(DATA_PATH, 'harper_valley_bank_minified')
# Build the three splits in train, val, test order.
train_dataset, val_dataset, test_dataset = (
    HarperValleyBank(root, split=split_name)
    for split_name in ('train', 'val', 'test'))

# Expected number of utterances in each split.
assert len(train_dataset) == 10402
assert len(val_dataset) == 679
assert len(test_dataset) == 2854

# Spot-check one training example: feature shape and unpadded lengths.
input, input_length, label, label_length = train_dataset[224]
expected_shape = torch.Size(
    [train_dataset.wav_max_length, train_dataset.n_mels])
assert input.size() == expected_shape
assert input_length == 92
assert label_length == 26
print('\nValidated dataset class implementation!')

> Constructing HarperValleyBank train dataset...
> Constructing HarperValleyBank val dataset...
> Constructing HarperValleyBank test dataset...

Validated dataset class implementation!


### Load data as conversations

In [None]:
# Root of the conversation-level HarperValleyBank data on the mounted Drive,
# plus the sub-directories read from it.
AUDIO_PATH = '/content/gdrive/MyDrive/cs224s_spring2022/data/harpervalleybank'
agent_audio_path, caller_audio_path, transcript_path, metadata_path = (
    os.path.join(AUDIO_PATH, subdir)
    for subdir in ('audio/agent', 'audio/caller', 'transcript', 'metadata'))

### Test preprocessing

In [134]:
# Choose one agent-side recording at random and load it with librosa.
agent_audio_file = os.path.join(
    agent_audio_path,
    np.random.choice(os.listdir(agent_audio_path)),
)
audio_ex, sr = librosa.load(agent_audio_file)
# Play it back at the sample rate librosa reported.
Audio(audio_ex, rate=sr)

In [None]:
# Project-local module; reload() re-executes it so that edits made to
# preprocess.py after the first import take effect in this kernel.
import preprocess
reload(preprocess)

# Run the project's preprocessing on the clip loaded in the previous cell and
# play the result.
# NOTE(review): playback reuses `sr` from the earlier librosa.load call, which
# assumes preprocess() leaves the sample rate unchanged — confirm.
pp_audio_ex = preprocess.preprocess(audio_ex)
Audio(pp_audio_ex, rate=sr)

### Test compression

In [135]:
# Project-local module; reload() re-executes it so that edits made to
# compress.py after the first import take effect in this kernel.
import compress
reload(compress)

# NOTE(review): `creds_path` is not defined in any cell visible here, so this
# fails with NameError on a fresh kernel unless it is set elsewhere.
# Presumably it is the path to a Google Cloud credentials file — confirm and
# define it in a setup cell (without committing the credentials themselves).
clip, transcript = compress.compress(agent_audio_file, creds_path)
print(transcript)

# NOTE(review): playback reuses `sr` from the earlier librosa.load call, which
# assumes the returned clip has that same sample rate — confirm.
Audio(clip, rate=sr)

hello this is Harper Valley National Bank my name is Linda how can I help you today  what is the company name  can you repeat the company name  what is the company address  what is the bill amount  we will send your payment to smart electric  is there anything else I can help you with  thank you for calling have a great day
