<a href="https://colab.research.google.com/github/dodofk/End-to-End-Spoken-Language-Understanding/blob/master/SLU_Downstream_Hubert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi
!pip install --upgrade transformers

Tue Dec 21 06:52:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import itertools
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn.utils import rnn
import torch.nn.functional as F


import torchaudio
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing
from tqdm.autonotebook import tqdm

from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel

from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer


In [None]:
test = torch.tensor([1,2,3,4])
t = torch.rand(4, 26, 768)
t.size()

torch.Size([4, 26, 768])

In [None]:
t.max(dim=1)[0].size()

torch.Size([4, 768])

## Notebook Config

In [None]:
class CFG:
    """Notebook-wide settings, read everywhere as plain class attributes."""

    # --- optimisation ---
    epochs = 10
    batch_size = 4
    downstream_lr = 1e-4      # learning rate of the classifier head
    audio_encoder_lr = 1e-5   # learning rate of the pretrained audio encoder
    weight_decay = 1e-2
    # ReduceLROnPlateau settings
    patience = 1
    factor = 0.8
    reduction = "mean"

    # --- model ---
    audio_encoder_model = 'facebook/hubert-base-ls960'
    audio_embedding = 768     # input width of the classifier head
    hidden_dim = 256
    intent_dim = 31           # output width of the classifier head
    trainable = True          # whether the audio encoder weights get gradients

    # --- data loading ---
    num_of_workers = 2
    data_root = "drive/MyDrive/MIULAB_DATASET/fsc/fluent_speech_commands_dataset"
    project_root = "drive/MyDrive/MIULAB_DATASET/fsc"

    # Not needed by the audio-only model; kept only because the dataset code
    # still tokenizes the transcriptions and was never cleaned up.
    text_tokenizer = "distilbert-base-uncased"
    max_length = 128

    # --- runtime ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Utils

In [None]:
class AvgMeter:
    """Running weighted mean of a scalar, e.g. a per-batch loss."""

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Clear the average, the weighted sum and the sample count."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Add `val` as the mean of `count` samples and refresh the average."""
        self.count += count
        self.sum += count * val
        self.avg = self.sum / self.count

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"

def get_lr(optimizer):
    """Return the "lr" of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group["lr"] for group in optimizer.param_groups), None)

## Dataset

### Define Dataset

In [None]:
class FluentSpeechDATASET(Dataset):
    """One split of the Fluent Speech Commands data, read from `<split>_data.csv`.

    Each item is a dict holding the tokenizer outputs for the transcription
    plus 'waveform', 'intent' (integer label) and 'transcription' (raw text).

    NOTE(review): the label encoder is fitted on this split alone, so integer
    labels agree across splits only if every split contains the same set of
    intents -- confirm against the data.
    """

    def __init__(self, 
                 data_root, 
                 tokenizer,
                 split:str ='train'
                 ) -> None:
        assert split in ['train', 'test', 'valid'], 'Invalid split'

        self.data_root = data_root
        csv_path = os.path.join(self.data_root, f'data/{split}_data.csv')
        self.df = pd.read_csv(csv_path)

        # The intent string is the three slot columns joined by '-'.
        slot_columns = self.df[['action', 'object', 'location']]
        self.df['intent'] = slot_columns.apply('-'.join, axis=1)
        self.transcription = self.df['transcription'].values

        # Map intent strings to integer labels.
        self.intent_encoder = preprocessing.LabelEncoder()
        self.intent_encoder.fit(self.df['intent'])
        self.df['intent_label'] = self.intent_encoder.transform(self.df['intent'])

        # For every label, the row positions that carry it.
        self.labels_set = set(self.df['intent_label'])
        self.labels2index = {
            label: np.where(self.df['intent_label'] == label)[0]
            for label in self.labels_set
        }

        # Tokenize every transcription once, padded to a common length.
        self.encoded_transcription = tokenizer(
            list(self.transcription),
            padding=True,
            truncation=True,
            max_length=CFG.max_length,
        )

    def __len__(self):
        return len(self.df)

    def get_dict(self, waveform, intent, transcription, suffix=''):
        """Pack the three values into a dict whose keys end with `suffix`."""
        return {
            'waveform' + suffix: waveform,
            'intent' + suffix: intent,
            'transcription' + suffix: transcription,
        }

    def load_audio(self, idx):
        """Load row `idx`: (squeezed waveform, intent label, transcription)."""
        row = self.df.iloc[idx]
        audio_path = os.path.join(self.data_root, row['path'])
        # The sample rate returned by torchaudio is not used.
        waveform, _ = torchaudio.load(audio_path)
        return waveform.squeeze(), row['intent_label'], row['transcription']

    def __getitem__(self, index):
        item = {}
        for key, values in self.encoded_transcription.items():
            item[key] = values[index]

        waveform, intent, transcription = self.load_audio(index)
        item.update(waveform=waveform, intent=intent, transcription=transcription)
        return item

    def labels_list(self):
        """Intent strings in label-index order."""
        return self.intent_encoder.classes_


def default_collate(inputs):
    """Collate dataset items into one batch dict.

    Waveforms are zero-padded to the longest one in the batch (batch first);
    intents, input_ids and attention_mask become tensors; transcriptions stay
    a plain list.
    """
    def column(key):
        # One field gathered across all samples, in batch order.
        return [sample[key] for sample in inputs]

    batch = {
        'waveform': rnn.pad_sequence(column('waveform'), batch_first=True),
        'intent': torch.tensor(column('intent')),
        'transcription': column('transcription'),
        'input_ids': torch.tensor(column('input_ids')),
        'attention_mask': torch.tensor(column('attention_mask')),
    }
    return batch
  

In [None]:
CFG.num_of_workers

2

In [None]:
def build_loaders(split):
    """Build the DataLoader for one split; only the "train" split is shuffled."""
    is_train = split == "train"
    text_tokenizer = DistilBertTokenizer.from_pretrained(CFG.text_tokenizer)
    split_dataset = FluentSpeechDATASET(
        CFG.data_root,
        tokenizer=text_tokenizer,
        split=split,
    )
    return DataLoader(
        split_dataset,
        batch_size=CFG.batch_size,
        shuffle=is_train,
        num_workers=CFG.num_of_workers,
        collate_fn=default_collate,
    )

## Model

In [None]:
class FinalPool(torch.nn.Module):
    """Max-pool over dimension 1, e.g. (4, 26, 768) -> (4, 768)."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Tensor.max(dim=...) yields (values, indices); only the values are kept.
        pooled, _ = input.max(dim=1)
        return pooled

class AudioEncoder(nn.Module):
    """Pretrained audio model loaded through `AutoModel`.

    `forward` returns the hidden state at position 0 of dimension 1 of the
    model's `last_hidden_state`.

    NOTE(review): only the first position is used as the utterance embedding;
    `FinalPool` (max over dimension 1) is defined in this file but unused --
    confirm which pooling is intended.
    """

    def __init__(
        self, 
        model_name=CFG.audio_encoder_model, 
        trainable=CFG.trainable,
    ):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Freeze or unfreeze every pretrained weight in one pass.
        for weight in self.model.parameters():
            weight.requires_grad = trainable

    def forward(self, x):
        hidden_states = self.model(x).last_hidden_state
        return hidden_states[:, 0, :]


class E2ESLU(nn.Module):
    """End-to-end intent classifier: audio encoder followed by an MLP head.

    Args:
        model_name: checkpoint name handed to `AudioEncoder`.
        embedding: width of the encoder output fed to the head.
        trainable: whether the encoder weights receive gradients.
        intent_dim: number of output logits.
        hidden_dim: width of the two hidden layers of the head.

    `forward(x)` takes the waveform batch (as passed by the training loops,
    i.e. the padded 'waveform' tensor from `default_collate`) and returns
    logits with `intent_dim` entries per sample.
    """

    def __init__(
        self,
        model_name=CFG.audio_encoder_model,
        embedding=CFG.audio_embedding,
        trainable=CFG.trainable,
        intent_dim=CFG.intent_dim,
        hidden_dim=CFG.hidden_dim,
    ):
        super().__init__()
        # Forward both arguments. The encoder used to be built with its own
        # defaults, so a non-default `model_name` was silently ignored.
        # AudioEncoder applies `trainable` to all of its parameters itself.
        self.audio_encoder = AudioEncoder(
            model_name=model_name,
            trainable=trainable,
        )

        self.final_classifier = nn.Sequential(
            nn.Linear(embedding, hidden_dim),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(inplace=True),
            nn.Linear(hidden_dim, intent_dim),
        )

    def forward(self, x):
        output = self.audio_encoder(x)
        output = self.final_classifier(output)
        return output


## Train

In [None]:
from matplotlib import pyplot as plt

def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
    """Run one optimisation pass over `train_loader`.

    Steps `lr_scheduler` after every batch when `step == "batch"`, plots the
    running average loss recorded after each batch, and returns the AvgMeter
    holding the sample-weighted mean loss.
    """
    criterion = nn.CrossEntropyLoss(reduction='mean')
    loss_meter = AvgMeter()
    running_avg_history = []
    n_seen = 0
    n_correct = 0

    progress = tqdm(train_loader, total=len(train_loader))
    for batch in progress:
        waveform = batch['waveform']
        logits = model(waveform.to(CFG.device))
        target = batch['intent'].to(CFG.device)
        pred = torch.argmax(logits, dim=1)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step == "batch":
            lr_scheduler.step()

        batch_size = waveform.size(0)
        n_seen += batch_size
        n_correct += (pred == target).sum().item()

        batch_loss = loss.item()
        loss_meter.update(batch_loss, batch_size)
        progress.set_postfix(train_loss=batch_loss, train_acc=n_correct / n_seen)
        running_avg_history.append(loss_meter.avg)

    plt.plot(running_avg_history)
    return loss_meter

In [None]:
def valid_epoch(model, valid_loader):
    """Evaluate `model` on `valid_loader`; return the AvgMeter of the mean loss.

    The progress bar shows the running mean loss and accuracy. This function
    does not switch the model to eval mode or disable gradients itself; the
    caller in this file does both.
    """
    criterion = nn.CrossEntropyLoss(reduction='mean')
    loss_meter = AvgMeter()
    n_seen = 0
    n_correct = 0

    progress = tqdm(valid_loader, total=len(valid_loader))
    for batch in progress:
        waveform = batch['waveform']
        logits = model(waveform.to(CFG.device))
        target = batch['intent'].to(CFG.device)
        pred = torch.argmax(logits, dim=1)
        loss = criterion(logits, target)

        batch_size = waveform.size(0)
        n_seen += batch_size
        n_correct += (pred == target).sum().item()
        loss_meter.update(loss.item(), batch_size)

        progress.set_postfix(valid_loss=loss_meter.avg, valid_acc=n_correct / n_seen)
    return loss_meter

In [None]:
torch.cuda.empty_cache()

def train_and_valid():
    """Train E2ESLU for CFG.epochs epochs, checkpointing the best validation loss.

    The encoder and the classifier head get separate learning rates, and the
    plateau scheduler is stepped once per epoch on the validation loss.

    NOTE(review): the checkpoint path is CFG.project_root concatenated with
    the file name without a path separator, so the file lands next to the
    project directory rather than inside it. The test call in this notebook
    builds the same path, so change both together if this is unintended.
    """
    loaders = {name: build_loaders(split=name) for name in ("train", "valid")}

    model = E2ESLU().to(CFG.device)
    optimizer = torch.optim.AdamW(
        params=[
            {"params": model.audio_encoder.parameters(), "lr": CFG.audio_encoder_lr},
            {"params": model.final_classifier.parameters(), "lr": CFG.downstream_lr},
        ],
        weight_decay=CFG.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode="min",
        patience=CFG.patience,
        factor=CFG.factor,
    )
    scheduler_step_mode = "epoch"
    lowest_valid_loss = float("inf")

    for epoch_number in range(1, CFG.epochs + 1):
        print(f"Epoch: {epoch_number}")

        model.train()
        train_epoch(model, loaders["train"], optimizer, scheduler, scheduler_step_mode)

        model.eval()
        with torch.no_grad():
            valid_loss = valid_epoch(model, loaders["valid"])

        if valid_loss.avg < lowest_valid_loss:
            lowest_valid_loss = valid_loss.avg
            torch.save(model.state_dict(), CFG.project_root+"no_clap_pretrain_best.pt")
            print("Saved Best Model!")

        scheduler.step(valid_loss.avg)


In [None]:
# train_and_valid()

In [None]:
def test(pretrain_path):
    """Load a saved E2ESLU state dict and return its accuracy on the test split.

    Args:
        pretrain_path: path of a state dict written by `torch.save`.

    Returns:
        Fraction of test samples whose argmax prediction equals the label.
    """
    model = E2ESLU()
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    state_dict = torch.load(pretrain_path, map_location=CFG.device)
    model.load_state_dict(state_dict)
    model.eval()
    model.to(CFG.device)
    test_total = 0
    test_acc = 0

    test_loader = build_loaders(split="test")

    tqdm_object = tqdm(test_loader, total=len(test_loader))
    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for batch in tqdm_object:
            output = model(batch['waveform'].to(CFG.device))
            target = batch['intent'].to(CFG.device)
            pred = torch.argmax(output, dim=1)

            count = batch["waveform"].size(0)

            test_total += count
            test_acc += (pred == target).sum().item()

            tqdm_object.set_postfix(test_acc=test_acc/test_total)
    return test_acc/test_total

In [None]:
 test_acc = test(CFG.project_root+"no_clap_pretrain_best.pt")

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/360M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

  0%|          | 0/949 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
test_acc

In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
width = 64
width ** -0.5

In [None]:
class testTransformer(nn.Module):
    """Scratch module that prepends a learnable class embedding to a sequence.

    Args:
        width: size of the last dimension of the input and of the embedding.

    NOTE(review): `forward` originally stopped at an empty `torch.cat([])`.
    It is completed here as a class-token prepend, following the
    `torch.cat([torch.zeros(b.shape[0], 1, b.shape[-1]), b], dim=1)`
    experiments in the neighbouring cells -- confirm this was the intent.
    """

    def __init__(self, width):
        # Must run before any nn.Parameter is assigned to the module.
        super().__init__()
        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))

    def forward(self, x):
        # Broadcast the (width,) embedding to one token per sample, then put
        # it in front of the sequence along dimension 1.
        cls_token = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
        )
        return torch.cat([cls_token, x], dim=1)


In [None]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True, activation="gelu")
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(32, 1, 512)
output = transformer_encoder(src)
output.size()

In [None]:
data = torch.from_numpy(np.random.randint(1,11, size=(32, 10)))
data[:, 0] = 1
data.size()

In [None]:
a = torch.randn(64)
b = torch.randn(8, 10, 64)

# Broadcast `a` to one extra position per sample and prepend it along dim 1.
# The original `b.s` is not a tensor attribute, and without `dim=1` the
# (8, 1, 64) and (8, 10, 64) tensors cannot be concatenated.
torch.cat([a + torch.zeros(b.shape[0], 1, b.shape[-1]), b], dim=1)

In [None]:
c = torch.zeros(b.shape[0], 1, b.shape[-1])

In [None]:
c.size()

In [None]:
torch.cat([torch.zeros(b.shape[0], 1, b.shape[-1]), b], dim=1).size()

In [None]:
input = torch.zeros(8, 2, 3)
position_embedding = torch.randn(3, 4)
input + position_embedding[:input.shape[1],:input.shape[2]]

In [None]:
# `or` returns its first truthy operand, else the last operand.
a = 3 or 4
b = None or 5

c = None
# `c.f` raises AttributeError when c is None; getattr with a default keeps
# the "fall back to 6" experiment runnable.
d = getattr(c, "f", None) or 6
print(a, b, d)

In [None]:
import torch


text = torch.randn(8, 1024)
r = torch.randn(8, 1024, 768)

# r = r[torch.arange(r.shape[0]), text.argmax(dim=-1)] @ self.text_projection
r.size()

In [None]:
class ConvFeature(nn.Module):
    """Unfinished scratch module; it registers no layers yet."""

    def __init__(self):
        super().__init__()

        # Local factory for a Kaiming-initialised Conv1d(1, 10, 3). It is
        # defined but never called, so the module has no parameters.
        def make_Conv1d():
            layer = nn.Conv1d(1, 10, 3)
            nn.init.kaiming_normal_(layer.weight)
            return layer

        

        

In [None]:
import torch
import torch.nn as nn

In [None]:
x.unsqueeze(1).size()

In [None]:
class test(nn.Module):
    """Scratch stack of three strided 1-D convolutions.

    `forward` prints the tensor size after the first and second convolution
    and returns the output of the third.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 512, kernel_size=10, stride=3)
        self.conv2 = nn.Conv1d(512, 512, kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(512, 512, kernel_size=3, stride=2)

    def forward(self, x):
        for layer in (self.conv1, self.conv2):
            x = layer(x)
            print(x.size())
        return self.conv3(x)

In [None]:
model = test()
x = torch.randn(8, 1, 20000)
y = torch.randn(8,  79)


model(x).size()

In [None]:
t = model(x)
print(t.size())
model(t).size()

In [None]:
eval("[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2")

In [None]:
import torch
x = torch.randn(8, 2, 4)
x

In [None]:
x[torch.arange(x.shape[0]), 0]

In [None]:
t = torch.arange(8)

In [None]:
t.argmax(dim=-1)

In [None]:
t.