In [1]:
import os
os.chdir('..')

In [2]:
!pwd

/Users/eno/Documents/my-repos/smarter-ofa


## Source Model Embeddings

In [3]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from ofa.utils import perform_factorize

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
source_model_name = "xlm-roberta-base"
source_model = AutoModelForMaskedLM.from_pretrained(source_model_name)

In [5]:
source_tokenizer = AutoTokenizer.from_pretrained(source_model_name)

In [6]:
source_embeddings = source_model.get_input_embeddings().weight.detach().numpy()

In [7]:
# 250K vocab size for XLM-Roberta-Base
source_embeddings.shape

(250002, 768)

In [8]:
primitive_embeddings, lower_coordinates = perform_factorize(source_embeddings)

In [9]:
lower_coordinates.shape

(250002, 100)

In [10]:
primitive_embeddings.shape

(100, 768)

## Word Tokens

In [11]:
from gensim.models import KeyedVectors
from ofa.utils import WordEmbedding

In [12]:
# loading multilingual embeddings
embedding_path = "data/colexnet_vectors_minlang_50_200_10_updated.wv"
loaded_n2v = KeyedVectors.load(embedding_path)
multilingual_embeddings = WordEmbedding(loaded_n2v)

In [13]:
multilingual_words = multilingual_embeddings.get_words()
len(multilingual_words)

3610267

In [14]:
multilingual_embeddings.get_word_vector(multilingual_words[0]).shape

(200,)

## Subword to Word Mappings

In [19]:
from ofa.utils import get_subword_to_word_mappings

In [20]:
subword_to_word_mapping, not_covered_subwords = get_subword_to_word_mappings(
    tokenizer=source_tokenizer, 
    model=multilingual_embeddings,
    multilingual=True,
    languages_considered=None,
    max_n_word_vectors=None)

Matching subwords and words: 100%|██████████| 3610267/3610267 [03:41<00:00, 16291.15it/s]


===Subword to Word Mapping Statistics===
Matched Subword Token Count: 108438
Unmatched Subword Token Count: 141564
Coverage:  43.37%
# of Subword - Word match percentiles:  {'0%': 1, '25%': 2, '50%': 5, '75%': 23, '90%': 150, '95%': 389, '99%': 1853, '100%': 315693}


In [21]:
# Note that there are duplication in the subword_to_word_mapping
subword_to_word_mapping

{114331: [2973138],
 226088: [623008, 3480580, 994951, 1642664, 3581323, 141844, 2609113, 1806683],
 9178: [2888194,
  461323,
  129163,
  2470803,
  2508285,
  2620450,
  281763,
  1300387,
  2826663,
  2274221,
  3313582,
  1384241,
  2164275,
  2062264,
  3442364,
  2478397,
  3442365,
  3442366,
  2531781,
  3284425,
  785483,
  2653137,
  3443026,
  2125779,
  3443027,
  2973015,
  3076826,
  2501085,
  867047,
  1224938,
  1789932,
  2228205,
  2072573,
  1457139,
  2261108,
  2235773],
 8231: [493570,
  2355206,
  1120265,
  1566732,
  2355212,
  2480141,
  2355215,
  2355217,
  2355222,
  1304600,
  1919001,
  2355229,
  2355232,
  2291746,
  2830373,
  2355238,
  2379818,
  1908787,
  901178,
  2355259,
  2379835,
  1134653,
  2355262,
  2355264,
  2637888,
  2355647,
  1896523,
  2379852,
  1073230,
  2355284,
  2355285,
  2355650,
  2224218,
  2379865,
  2379871,
  1314912,
  2379872,
  2355299,
  2355300,
  2355652,
  2355304,
  327785,
  2355305,
  2355653,
  2355310,
  23

In [22]:
# Example case
print(f"Source Token: {source_tokenizer.convert_ids_to_tokens(ids=[42872])}")
print("Matched Word 1: ", multilingual_words[subword_to_word_mapping[42872][0]])
print("Matched Word 2: ", multilingual_words[subword_to_word_mapping[42872][1]])

Source Token: ['ói']
Matched Word 1:  kze:$ísóigon-aai
Matched Word 2:  nhd:$mbói


In [23]:
# Input will be the word vectors
multilingual_embeddings.get_word_vector(multilingual_words[subword_to_word_mapping[42872][0]]).shape

(200,)

In [24]:
# Output will be the subword vector
lower_coordinates[42872].shape

(100,)

In [25]:
# Target tokenizer
target_model_name = 'cis-lmu/glot500-base'
target_tokenizer = AutoTokenizer.from_pretrained(target_model_name)

In [26]:
# Target token mapping with words
target_subword_to_word_mapping, target_not_covered_subwords = get_subword_to_word_mappings(
    tokenizer=target_tokenizer, 
    model=multilingual_embeddings,
    multilingual=True,
    languages_considered=None,
    max_n_word_vectors=None)

Matching subwords and words: 100%|██████████| 3610267/3610267 [03:59<00:00, 15077.04it/s]


===Subword to Word Mapping Statistics===
Matched Subword Token Count: 194478
Unmatched Subword Token Count: 206667
Coverage:  48.48%
# of Subword - Word match percentiles:  {'0%': 1, '25%': 2, '50%': 6, '75%': 25, '90%': 103, '95%': 223, '99%': 837, '100%': 2375788}


## Embedding Matrix from ColexNet

In [27]:
from setformer.utils import create_word_embedding_matrix

In [28]:
colexnet_word_embedding_matrix = create_word_embedding_matrix(multilingual_embeddings)

  word_vectors = torch.tensor([multilingual_embeddings.get_word_vector(word) for word in words])


In [29]:
colexnet_word_embedding_matrix.size()

torch.Size([3610268, 200])

In [30]:
# CLS token embedding
colexnet_word_embedding_matrix[3610267]

tensor([0.7854, 0.6672, 0.6433, 0.7514, 0.0796, 0.0581, 0.2755, 0.1837, 0.9682,
        0.9558, 0.1871, 0.3438, 0.6403, 0.4127, 0.7417, 0.9719, 0.4842, 0.1541,
        0.9827, 0.2968, 0.4333, 0.6032, 0.5010, 0.3152, 0.2098, 0.3453, 0.5581,
        0.8765, 0.5068, 0.3579, 0.6180, 0.3683, 0.9841, 0.2833, 0.5539, 0.1673,
        0.8642, 0.7112, 0.5199, 0.5354, 0.6323, 0.9676, 0.5371, 0.0043, 0.3772,
        0.7979, 0.8253, 0.7931, 0.8170, 0.6288, 0.7540, 0.0599, 0.2405, 0.3855,
        0.2606, 0.3236, 0.6144, 0.4124, 0.4982, 0.7268, 0.7605, 0.1545, 0.2182,
        0.6333, 0.7157, 0.5253, 0.5273, 0.3620, 0.6114, 0.7551, 0.5749, 0.3965,
        0.7633, 0.4566, 0.6162, 0.4479, 0.6185, 0.7289, 0.7498, 0.5071, 0.5711,
        0.2658, 0.8782, 0.3272, 0.9861, 0.6592, 0.1181, 0.4314, 0.5478, 0.6335,
        0.5476, 0.5027, 0.2705, 0.9595, 0.6928, 0.6864, 0.1119, 0.3390, 0.9819,
        0.0637, 0.3517, 0.8799, 0.0100, 0.3128, 0.7150, 0.3153, 0.4963, 0.1786,
        0.9905, 0.6381, 0.7027, 0.1629, 

## Setformer model tests

In [31]:
from setformer.setformer import SetFormer

In [32]:
NUM_HEADS = 4
NUM_LAYERS = 4
DIM_FEEDFORWARD = 400
OUTPUT_DIM = 100
CONTEXT_SIZE = 512
DROPOUT = 0.1

In [33]:
setformer = SetFormer(emb_dim=colexnet_word_embedding_matrix.shape[1], 
                      num_heads=NUM_HEADS, num_layers=NUM_LAYERS, 
                      dim_feedforward=DIM_FEEDFORWARD, output_dim=OUTPUT_DIM, 
                      context_size=CONTEXT_SIZE, dropout=DROPOUT, 
                      word_vector_emb=colexnet_word_embedding_matrix)

In [34]:
# Number of parameters of the model
sum(p.numel() for p in setformer.parameters() if p.requires_grad)

1308900

In [36]:
import torch

In [37]:
example_input = torch.tensor([[0, 3610267],
                              [1, 3610267],
                              [2, 3610267],])
example_input.shape

torch.Size([3, 2])

In [38]:
output = setformer(example_input)

In [39]:
output.shape

torch.Size([3, 100])

In [40]:
output[0]

tensor([-0.0397, -0.3466, -0.2455,  0.3517,  1.5205,  0.7284, -0.3170, -1.0146,
         0.1091,  0.5687,  0.7922,  0.1409,  0.6161, -0.7855, -1.4606, -0.3852,
        -0.9576, -1.2105,  0.4437,  0.0222,  0.2778,  0.4406, -0.2612, -0.2576,
         0.3077, -0.6664, -0.2993, -1.1925, -0.6371, -0.6428,  0.3063, -0.0197,
         0.5654, -1.8720,  0.4993,  0.8346, -0.8957,  0.7133, -0.0323, -0.3094,
        -0.3887, -0.6455, -0.2009, -1.0095,  0.0863,  0.1647,  0.6778,  0.0647,
        -1.1390,  0.0064,  0.8327,  0.7000, -0.7832,  0.5597, -0.6069,  1.3132,
         0.4362,  0.1314,  0.5840,  0.6415,  0.2254, -0.2172, -0.8715, -0.7630,
        -0.8667, -0.1267, -0.0083,  0.4423,  0.0221,  0.4458, -0.4010, -0.9468,
         0.0674, -0.3888, -0.5352,  0.6752,  0.2445,  0.1367,  0.3529,  0.7539,
        -0.6157, -0.1360,  1.2912,  1.1016,  0.2026,  0.6317,  0.0970, -0.5267,
         0.2425,  0.8588,  0.4040,  0.1947,  0.0264,  0.1129,  0.4715,  0.4333,
        -0.7620,  0.7148, -0.5388,  0.55

## Dataset tests

In [43]:
from setformer.utils import create_mapping_dataset

In [49]:
train_set, val_set, prediction_set = create_mapping_dataset(subword_to_word_mapping, lower_coordinates,
                                                            target_subword_to_word_mapping, multilingual_embeddings)

In [52]:
train_set.__len__()

97595

In [53]:
val_set.__len__()

10843

In [58]:
train_set.__getitem__(56)

([3610267, 1456982, 3448749, 473690, 690346, 2134137, 3534665],
 array([-1.6730862e-03,  1.5891336e-03, -3.0825636e-03,  6.8053824e-04,
         1.9938739e-03, -3.4390560e-03, -1.7398275e-03,  4.7464349e-04,
        -1.9102395e-03,  1.2431863e-03,  2.4262939e-03, -1.5394324e-03,
        -9.8115089e-04, -1.1211250e-04,  3.1605235e-03, -2.4745741e-03,
        -1.0608099e-03,  1.7273498e-03,  3.5462578e-04, -5.6526292e-04,
         1.0879253e-03,  2.2850125e-03,  3.8277956e-03, -1.5926191e-03,
         4.4729823e-04,  3.3470590e-04,  2.7624907e-03, -4.4854532e-04,
        -2.0534457e-03,  5.4308714e-04,  2.7710958e-03,  1.6229927e-03,
        -3.9430070e-03,  2.1083860e-03,  2.3180575e-03, -2.8122226e-03,
         1.7352451e-03,  4.2609475e-03, -2.0777556e-04,  2.8165348e-03,
         2.1691970e-03, -1.7718630e-04,  3.1522708e-03, -6.1200542e-04,
         1.9057143e-05,  9.0967544e-04,  4.1193920e-03,  2.0998325e-03,
        -2.7319447e-03, -4.8538891e-04,  5.2157186e-05,  2.4047652e-03,


In [62]:
len(prediction_set['inputs'])

194478