In [1]:
from tara.data import load_data_encoded, load_data, load_dataloader
from tara.trainer import TrainingArgs, TrainerBERT

In [2]:
train, test, val = load_data('./data')

KeyboardInterrupt: 

In [None]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification, 
    get_linear_schedule_with_warmup
)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=7, output_attentions=False, output_hidden_states=False)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

In [None]:
batch_size = 32
train_text , test_text, val_text = load_dataloader('./data', tokenizer, batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
args = TrainingArgs()
trainer = TrainerBERT(
    model = bert_model,
    trainloader=train_text,
    testloader=val_text,
    args=args,
    out_dir='./saved_weights'   
)

TypeError: __init__() got an unexpected keyword argument 'out_dir'

In [None]:
trainer.train()


Training...


100%|██████████| 313/313 [00:48<00:00,  6.42it/s]



  Average training loss: 1.38
  Training epcoh took: 0:00:49

Running Validation...


100%|██████████| 35/35 [00:01<00:00, 19.29it/s]


  Accuracy: 0.60

Training...


100%|██████████| 313/313 [00:50<00:00,  6.24it/s]



  Average training loss: 1.26
  Training epcoh took: 0:00:50

Running Validation...


100%|██████████| 35/35 [00:01<00:00, 18.91it/s]


  Accuracy: 0.62

Training...


100%|██████████| 313/313 [00:50<00:00,  6.16it/s]



  Average training loss: 1.18
  Training epcoh took: 0:00:51

Running Validation...


100%|██████████| 35/35 [00:01<00:00, 18.78it/s]


  Accuracy: 0.62

Training...


100%|██████████| 313/313 [00:51<00:00,  6.14it/s]



  Average training loss: 1.13
  Training epcoh took: 0:00:51

Running Validation...


100%|██████████| 35/35 [00:01<00:00, 18.75it/s]


  Accuracy: 0.61


In [None]:
import time
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import random
from tqdm import tqdm

def fix_seeds(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    
def train_epoch(epoch_i, dataloader, schedular):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(tqdm(dataloader)):
    
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
        optimizer.step()
        schedular.step()
        

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    
    return (avg_train_loss, training_time)

def validate(dataloader):
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    preds = []
    labels = []
    
    for _, batch in enumerate(tqdm(dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        preds.extend(np.argmax(logits, axis=1))
        labels.extend(label_ids)
        # total_eval_accuracy += accuracy_score(np.argmax(logits, axis=1), label_ids)
    # Report the final accuracy for this validation run.
    preds, labels = np.array(preds), np.array(labels)
    avg_val_accuracy = f1_score(preds.flatten(), labels.flatten(), average='weighted')# total_eval_accuracy / len(dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    return (avg_val_accuracy, avg_val_loss, validation_time)

def train(num_epochs, train_dataloader, val_dataloader, schedular):
    training_stats = []
    for epoch in range(num_epochs):
        train_loss, train_time = train_epoch(epoch, train_dataloader, schedular)
        val_acc, val_loss, val_time = validate(val_dataloader)
        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': train_loss,
                'Valid. Loss': val_loss,
                'Valid. Accur.': val_acc,
                'Training Time': train_time,
                'Validation Time': val_time
            }
        )
    return training_stats

In [None]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification, 
    get_linear_schedule_with_warmup
)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=7, output_attentions=False, 
    output_hidden_states=False)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

In [None]:
bert_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Training Audio Model

In [None]:
import librosa as lb
from torch.utils.data import Dataset
from typing import Any
from tara.data import load_audio_dataloader, load_text_audio_dataloader

import torch.nn as nn
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from tara.trainer import TrainingArgs, TrainerWave2Vec

# load pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wave2vec = Wav2Vec2ForSequenceClassification.\
    from_pretrained("facebook/wav2vec2-base-960h", num_labels=7)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

wave2vec.classifier = nn.Sequential(
                nn.Linear(256, 256),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.LayerNorm(256)
            )

In [None]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [None]:
train_audio, test_audio, val_audio = load_audio_dataloader(
    './data',
    processor=processor,
    sampling_rate=16000,
    duration=3,
    batch_size=16
)

In [None]:
i, temp = next(enumerate(train_audio))

In [None]:
temp[0].shape

torch.Size([16, 48000])

In [None]:
wave2vec(temp[0]).logits

tensor([[-0.6598,  0.8177, -0.2298,  ..., -0.6598, -0.6598, -0.6598],
        [-0.6384,  1.6621, -0.6384,  ..., -0.6384, -0.6384, -0.6384],
        [ 1.1589,  1.0839, -0.6323,  ..., -0.6323, -0.6323, -0.6323],
        ...,
        [-0.6557,  1.2204,  0.1638,  ..., -0.6557, -0.6557, -0.6557],
        [-0.6450,  1.5619,  0.3052,  ..., -0.6450, -0.6450, -0.6450],
        [-0.5512,  0.9403, -0.6228,  ..., -0.6228, -0.6228, -0.6228]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
train_ta, test_ta, val_ta = load_text_audio_dataloader(
    './data',
    processor,
    tokenizer,
    batch_size=16
)



<class 'dict'>
dict_keys(['input_ids', 'attention_masks', 'labels'])


In [None]:
i, temp = next(enumerate(train_ta))

In [None]:
a_i, t_i, t_m, l = temp
a_i.shape, t_i.shape, t_m.shape, l.shape

(torch.Size([16, 48000]),
 torch.Size([16, 95]),
 torch.Size([16, 95]),
 torch.Size([16]))

In [None]:
trainer = TrainerWave2Vec(
    model=wave2vec,
    trainloader=train_audio,
    testloader=test_audio,
    out_dir='./saved_weights',
    args=TrainingArgs()

)

In [None]:
import warnings

# Some code where you don't want to see deprecation warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DeprecationWarning)
    # Code that might trigger a deprecation warning
    # e.g., deprecated_function()

    trainer.train()

# Text + Audio

In [3]:
# training the combined model
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'


from tara.model import TextAudioModel
from typing import Any
from tara.data import load_audio_dataloader, load_text_audio_dataloader
from transformers import  Wav2Vec2Processor, AutoTokenizer

from tara.trainer import TrainingArgs, TrainerTextAudio

text_model_path = "distilbert-base-uncased"
audio_model_path = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(audio_model_path)
tokenizer = AutoTokenizer.from_pretrained(text_model_path)


model = TextAudioModel(
    audio_model_path,
    text_model_path,
    num_labels=7
)

Loading Text Model from Huggingface


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

Loading Audio model from Huggingface


You are using a model of type distilbert to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_l

In [4]:
train_ta, test_ta, val_ta = load_text_audio_dataloader(
    './data',
    processor,
    tokenizer,
    batch_size=16
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
trainer = TrainerTextAudio(
    model= model,
    trainloader=train_ta,
    testloader=test_ta,
    out_dir='./saved_weights/'
    args = TrainingArgs()
)

TypeError: __init__() missing 1 required positional argument: 'out_dir'

In [None]:
trainer.train()


Training...


  9%|▉         | 58/625 [02:13<21:41,  2.30s/it]


KeyboardInterrupt: 