In [None]:
import os

# Step up to the parent directory only when the project folder is not already
# visible from the current directory. A bare os.chdir('..') is not idempotent:
# re-running this cell (or running the identical cell further down in this
# notebook) would climb one more level each time and break the relative
# "data/..." and "setformer/..." paths used later.
# NOTE(review): assumes a `setformer/` directory marks the intended working
# directory — confirm against the repository layout.
if not os.path.isdir("setformer"):
    os.chdir('..')

In [None]:
!pwd

## Source Model Embeddings

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from ofa.utils import perform_factorize

In [None]:
source_model_name = "xlm-roberta-base"
source_model = AutoModelForMaskedLM.from_pretrained(source_model_name)

In [None]:
source_tokenizer = AutoTokenizer.from_pretrained(source_model_name)

In [None]:
# Pull the model's input-embedding weight matrix out as a NumPy array.
# detach() drops the autograd graph so that .numpy() is permitted.
source_embeddings = source_model.get_input_embeddings().weight.detach().numpy()

In [None]:
# 250K vocab size for XLM-Roberta-Base
source_embeddings.shape

In [None]:
# Factorize the source embedding matrix into two parts.
# NOTE(review): lower_coordinates is indexed by token id later in this
# notebook, so it presumably holds one low-dimensional row per source token,
# with primitive_embeddings as the shared factor — confirm in
# ofa.utils.perform_factorize.
primitive_embeddings, lower_coordinates = perform_factorize(source_embeddings)

In [None]:
lower_coordinates.shape

In [None]:
primitive_embeddings.shape

## Word Tokens

In [None]:
from gensim.models import KeyedVectors
from ofa.utils import WordEmbedding

In [None]:
# Load the multilingual word vectors (a gensim KeyedVectors file) and wrap
# them in the project's WordEmbedding helper.
# The path is relative, so it depends on the working directory set by the
# os.chdir cell at the top of the notebook.
embedding_path = "data/colexnet_vectors_minlang_50_200_10_updated.wv"
loaded_n2v = KeyedVectors.load(embedding_path)
multilingual_embeddings = WordEmbedding(loaded_n2v)

In [None]:
# Number of words in the multilingual embeddings: 3610267
multilingual_words = multilingual_embeddings.get_words()
len(multilingual_words)

In [None]:
multilingual_embeddings.get_word_vector(multilingual_words[0]).shape

## Subword to Word Mappings

In [None]:
from ofa.utils import get_subword_to_word_mappings

In [None]:
# Build the mapping from source-tokenizer subwords to words of the
# multilingual embeddings; the second return value collects the subwords
# for which no word was found.
# NOTE(review): the mapping is indexed by token id below and its values are
# used as indices into multilingual_words — confirm the exact structure in
# ofa.utils.get_subword_to_word_mappings.
subword_to_word_mapping, not_covered_subwords = get_subword_to_word_mappings(
    tokenizer=source_tokenizer, 
    model=multilingual_embeddings,
    multilingual=True,
    languages_considered=None,
    max_n_word_vectors=None)

In [None]:
# Note that there are duplicates in subword_to_word_mapping.
# This displays the whole mapping, so the cell output can be very large.
subword_to_word_mapping

In [None]:
# Example case: show one source token and the first two words matched to it.
example_token_id = 42872
print(f"Source Token: {source_tokenizer.convert_ids_to_tokens(ids=[example_token_id])}")
matched_word_ids = subword_to_word_mapping[example_token_id]
print("Matched Word 1: ", multilingual_words[matched_word_ids[0]])
print("Matched Word 2: ", multilingual_words[matched_word_ids[1]])

In [None]:
# Input will be the word vectors
multilingual_embeddings.get_word_vector(multilingual_words[subword_to_word_mapping[42872][0]]).shape

In [None]:
# Output will be the subword vector
lower_coordinates[42872].shape

In [None]:
# Target tokenizer
target_model_name = 'cis-lmu/glot500-base'
target_tokenizer = AutoTokenizer.from_pretrained(target_model_name)

In [None]:
# Target token mapping with words
# Same mapping as for the source tokenizer, now for the target tokenizer's
# subwords; the second return value collects the target subwords for which
# no word was found.
target_subword_to_word_mapping, target_not_covered_subwords = get_subword_to_word_mappings(
    tokenizer=target_tokenizer, 
    model=multilingual_embeddings,
    multilingual=True,
    languages_considered=None,
    max_n_word_vectors=None)

## Embedding Matrix from ColexNet

In [None]:
from setformer.utils import create_word_embedding_matrix

In [None]:
colexnet_word_embedding_matrix = create_word_embedding_matrix(multilingual_embeddings)

In [None]:
colexnet_word_embedding_matrix.size()

In [None]:
# PAD token embedding.
# Index 3610267 equals the number of words reported earlier in this notebook,
# i.e. it is the first row after the word vectors.
colexnet_word_embedding_matrix[3610267]

In [None]:
# CLS token embedding
colexnet_word_embedding_matrix[3610268]

## SetFormer model tests

In [None]:
from setformer.setformer import SetFormer

In [None]:
# Small hyperparameters for a quick smoke test of SetFormer in this notebook.
# They are passed to the SetFormer constructor in the next cell; the training
# run further down reads its own values from a YAML config instead.
NUM_HEADS = 4
NUM_LAYERS = 4
DIM_FEEDFORWARD = 400
OUTPUT_DIM = 100
CONTEXT_SIZE = 512
DROPOUT = 0.1

In [None]:
# Instantiate the model. The embedding width is taken from the second
# dimension of the word-embedding matrix, and the matrix itself is handed
# over as the word-vector table.
setformer = SetFormer(emb_dim=colexnet_word_embedding_matrix.shape[1], 
                      num_heads=NUM_HEADS, num_layers=NUM_LAYERS, 
                      dim_feedforward=DIM_FEEDFORWARD, output_dim=OUTPUT_DIM, 
                      context_size=CONTEXT_SIZE, dropout=DROPOUT, 
                      word_vector_emb=colexnet_word_embedding_matrix)

In [None]:
# Number of trainable parameters of the model (requires_grad=True only)
trainable_params = (param for param in setformer.parameters() if param.requires_grad)
sum(param.numel() for param in trainable_params)

In [None]:
import torch

In [None]:
# Toy batch of three rows with two indices each: a small index (0, 1, 2)
# followed by 3610267, the index this notebook uses for the PAD token.
example_input = torch.tensor([[0, 3610267],
                              [1, 3610267],
                              [2, 3610267],])
example_input.shape

In [None]:
output = setformer(example_input)

In [None]:
output.shape

In [None]:
output[0]

## Dataset tests

In [None]:
from setformer.utils import create_mapping_dataset

In [None]:
# Build the train/validation datasets and the prediction set.
# NOTE(review): presumably the source mapping supplies the inputs, the
# lower_coordinates rows the regression targets, and the target-tokenizer
# mapping the inputs to predict for — confirm in
# setformer.utils.create_mapping_dataset.
train_set, val_set, prediction_set = create_mapping_dataset(subword_to_word_mapping, lower_coordinates,
                                                            target_subword_to_word_mapping)

In [None]:
# Size of the training split
len(train_set)

In [None]:
# Size of the validation split
len(val_set)

In [None]:
# First training example
train_set[0]

In [None]:
len(prediction_set['inputs'])

In [None]:
import torch
from torch.utils.data import DataLoader

In [None]:
# Indices of the two special rows in the word-embedding matrix.
PAD_IDX = 3610267
CLS_IDX = 3610268

def collate_fn(batch, pad_idx=PAD_IDX, cls_idx=CLS_IDX):
    '''
    Collate function for the dataloader.

    Prepends the CLS index to every input sequence, right-pads the sequences
    with the PAD index so that they share one length, and stacks the targets
    unchanged into a single float tensor.

    Args:
        batch: non-empty sequence of ``(input_ids, target)`` pairs.
            ``input_ids`` is a 1-D sequence of integer indices (it may be
            empty); ``target`` is a 1-D vector. All targets in the batch
            must have the same length.
        pad_idx: index used to pad shorter inputs (defaults to ``PAD_IDX``).
        cls_idx: index prepended to every input (defaults to ``CLS_IDX``).

    Returns:
        ``(padded_inputs, targets)`` where ``padded_inputs`` is a
        ``torch.long`` tensor of shape ``(batch_size, longest_input + 1)``
        and ``targets`` is a ``torch.float32`` tensor of shape
        ``(batch_size, target_dim)``.

    Raises:
        ValueError: if ``batch`` is empty.
    '''
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    inputs, targets = zip(*batch)

    # Force an integer dtype: an empty id list would otherwise become a float
    # tensor and could turn the concatenated indices into floats.
    cls_token = torch.tensor([cls_idx], dtype=torch.long)
    sequences = [torch.cat([cls_token, torch.as_tensor(ids, dtype=torch.long)])
                 for ids in inputs]
    padded_inputs = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True,
                                                    padding_value=pad_idx)

    # Convert each target on its own and stack them; this avoids the slow
    # "tensor from a list of ndarrays" path and also accepts lists or tensors.
    targets = torch.stack([torch.as_tensor(target, dtype=torch.float32)
                           for target in targets])

    return padded_inputs, targets

In [None]:
# Shuffled loader over the training set in batches of 32; collate_fn adds the
# CLS index and pads each batch to a common length.
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [None]:
# Inspect the first batch only, then stop.
# A custom sampler that groups inputs of similar length can be applied later.
for batch in train_loader:
    inputs, targets = batch
    print(inputs.shape, targets.shape, inputs[0], "===", targets[0], sep="\n")
    break

# Training

In [1]:
import os

# Step up to the parent directory only when the project folder is not already
# visible from the current directory. A bare os.chdir('..') is not idempotent:
# if the chdir cell at the top of this notebook has already run (Run All) or
# this cell is re-run, it would climb one level too far and break the
# relative "data/..." and "setformer/configs/..." paths used below.
# NOTE(review): assumes a `setformer/` directory marks the intended working
# directory — confirm against the repository layout.
if not os.path.isdir("setformer"):
    os.chdir('..')

In [2]:
import torch

In [None]:
from setformer.utils import create_word_embedding_matrix
from setformer.train_setformer import train_setformer


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from gensim.models import KeyedVectors
from ofa.utils import WordEmbedding
# Load the multilingual word vectors (a gensim KeyedVectors file) and wrap
# them in the project's WordEmbedding helper. The path is relative to the
# working directory set by the os.chdir cell of this section.
embedding_path = "data/colexnet_vectors_minlang_50_200_10_updated.wv"
loaded_n2v = KeyedVectors.load(embedding_path)
multilingual_embeddings = WordEmbedding(loaded_n2v)

In [5]:
colexnet_word_embedding_matrix = create_word_embedding_matrix(multilingual_embeddings)

  word_vectors = torch.tensor([multilingual_embeddings.get_word_vector(word) for word in words])


In [6]:
# Restore the train_set and val_set variables from their pickle files.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced locally by a trusted run.
import pickle


def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_set = _read_pickle('data/train_set.pkl')
val_set = _read_pickle('data/val_set.pkl')

In [7]:
# Start training with the YAML config, the word-embedding matrix and the
# pickled train/validation sets.
# NOTE(review): the saved output below shows roughly 30 s per batch with 763
# batches per epoch, and it ends in a KeyboardInterrupt, i.e. this run was
# stopped by hand. Expect a multi-hour run on the same setup.
train_setformer("setformer/configs/setformer_config.yaml", 
                colexnet_word_embedding_matrix, train_set, val_set)

Totla model parameter size: 737802668
Model parameter size without word vectors which is FROZEN: 15748868
Model configs: {'model_hps': {'num_layers': 16, 'emb_dim': 200, 'num_heads': 8, 'dim_feedforward': 2048, 'dropout': 0.1, 'output_dim': 100, 'max_context_size': 256, 'padding_idx': 3610267, 'cls_idx': 3610268}, 'training_hps': {'batch_size': 128, 'lr': 0.0001, 'epochs': 3}, 'logging': {'checkpoint_dir': 'outputs/checkpoints', 'save_best_only': True}, 'notes': 'Set Former training hyperparameters'}


Epoch 1/3:   0%|          | 1/763 [00:29<6:20:40, 29.97s/it]

Epoch 1/3, Batch 1/763, Loss: 0.9680014252662659


Epoch 1/3:   0%|          | 3/763 [01:31<6:30:04, 30.79s/it]

Epoch 1/3, Batch 3/763, Loss: 0.9481009244918823


Epoch 1/3:   1%|          | 5/763 [02:31<6:19:58, 30.08s/it]

Epoch 1/3, Batch 5/763, Loss: 0.9172612428665161


Epoch 1/3:   1%|          | 5/763 [02:32<6:26:12, 30.57s/it]


KeyboardInterrupt: 