In [69]:
from datasets import Dataset, DatasetDict
from transformers import (
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import tokenizers

import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

import nlpsig

import random
import math
import pickle

from nlpsig import set_seed
seed = 2022

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.cuda.device_count()

1

## Language dataset

In the `data/` folder, we have several text folders of words from different languages:
- `wordlist_de.txt`: German words
- `wordlist_en.txt`: English words
- `wordlist_fr.txt`: French words
- `wordlist_it.txt`: Italian words
- `wordlist_pl.txt`: Polish words
- `wordlist_sv.txt`: Swedish words

We additionally have a `alphabet.txt` file which just stores the alphabet characters ('a', 'b', 'c', ...).

The task is to split the words into its individual characters and to obtain an embedding for each of them. We can represent a word by a path of its character embeddings and compute its path signature to use as features in predicting the language for which the word belongs.

Here we look at obtaining embeddings using a Transformer model.

In [4]:
ALPHABET_FILE = 'data/alphabet.txt'
with open(ALPHABET_FILE) as f:
    alphabet = f.read().splitlines()
print(alphabet)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Set up Tokenizer for word corpora

If we were to fine-tune an existing pretrained transformer, we could use the same tokenizer that the model was pretrained with. However, here we want to train a model from stratch, and so using a tokenizer that was pretrained on a corpus that looks quite different to ours is suboptimal.

Here, we need to use the `tokenizers` library to set up and train a new tokenizer for our text.

In particular, we're going to start off with a character-based tokenizer (as we're going to split up our words into characters), and train it to our data.

In [5]:
# initialise character based tokenizer
tokenizer = tokenizers.CharBPETokenizer()
tokenizer.train(files=[ALPHABET_FILE],
                show_progress=False,
                special_tokens=['<s>', '</s>', '<unk>', '<pad>', '<mask>'])

# save the tokenizer to "CHAR_BERT/" folder
if not os.path.exists("CHAR_BERT"):
    os.makedirs("CHAR_BERT")

tokenizer.save_model("CHAR_BERT")

['CHAR_BERT/vocab.json', 'CHAR_BERT/merges.txt']

In [6]:
wordlist_files = ["data/wordlist_de.txt",
                  "data/wordlist_en.txt",
                  "data/wordlist_fr.txt",
                  "data/wordlist_it.txt",
                  "data/wordlist_pl.txt",
                  "data/wordlist_sv.txt"]

wordlist_dfs = []
for filename in wordlist_files:
    with open(filename, "r") as f:
        words = f.read().splitlines()
        words_df = pd.DataFrame({"word": words})
        words_df["language"] = filename.split("_")[1][0:2]
        wordlist_dfs.append(words_df)

corpus_df = pd.concat(wordlist_dfs).reset_index(drop=True)

In [7]:
corpus_df

Unnamed: 0,word,language
0,a,de
1,aal,de
2,aale,de
3,aalen,de
4,aalend,de
...,...,...
3922530,zons,sv
3922531,zoo,sv
3922532,zoologisk,sv
3922533,zoologiska,sv


We can see that there are relatively fewer English words than the other languages...

In [8]:
corpus_df["language"].value_counts()

it    1862929
pl    1517274
fr     198538
de     186027
en      80641
sv      77126
Name: language, dtype: int64

We are going to train our language model on the English words, so taking out a sample of English words from the corpus...

In [9]:
english_train_pickle_file = "data/english_train.pkl"
if os.path.isfile(english_train_pickle_file):
    english_train = pd.read_pickle(english_train_pickle_file)
else:    
    # set seed for sampling
    random.seed(seed)
    n_words = 70000
    
    # sample english words from the corpus
    english_train = corpus_df[corpus_df["language"]=="en"].sample(n_words)
    english_train = english_train.reset_index(drop=True)
    
    # save data for later
    english_train.to_pickle(english_train_pickle_file)

In [10]:
english_train

Unnamed: 0,word,language
0,sensitised,en
1,signifying,en
2,wholesomeness,en
3,adware,en
4,chasm,en
...,...,...
69995,entourages,en
69996,axe,en
69997,disdained,en
69998,calibers,en


To make the dataset bit more manageable, I'll just take a sample of each of the languages.

In [11]:
# remove the words that we use to train language model from the corpus
cond = corpus_df["word"].isin(english_train["word"])
corpus_df = corpus_df.drop(corpus_df[cond].index)
corpus_df = corpus_df.reset_index(drop=True)
corpus_df["language"].value_counts()

it    1860112
pl    1513877
fr     190682
de     185275
sv      74994
en      10641
Name: language, dtype: int64

In [12]:
corpus_sample_pickle_file = "data/corpus_sample.pkl"
if os.path.isfile(corpus_sample_pickle_file):
    corpus_sample_df = pd.read_pickle(corpus_sample_pickle_file)
else:
    # set seed for sampling
    set_seed(seed)
    balanced = True

    # take a sample from the rest of the remaining words
    if balanced:
        n_english = 10000
        n_remaining = 2000
        # sampling non-english words
        languages = corpus_df["language"].unique()
        words_per_language = math.floor(n_remaining / (len(languages)-1))
        non_english_df = pd.concat(
            [corpus_df[corpus_df["language"]==lang].sample(words_per_language, random_state=seed)
             for lang in languages if lang != "en"]
        )
        # sampling english words
        english_df = corpus_df[corpus_df["language"]=="en"].sample(n_english, random_state=seed)
        corpus_sample_df = pd.concat([non_english_df, english_df]).reset_index(drop=True)
    else:
        n_words = 12000
        corpus_sample_df = corpus_df.iloc[random.sample(range(len(corpus_df)), n_words)]
        corpus_sample_df = corpus_sample_df.reset_index(drop=True)

    # save data for later
    corpus_sample_df.to_pickle(corpus_sample_pickle_file)

In [13]:
corpus_sample_df

Unnamed: 0,word,language
0,getrippelte,de
1,ordnungsliebenderes,de
2,vermutetem,de
3,beizumischende,de
4,scholastischer,de
...,...,...
11995,swapping,en
11996,pruners,en
11997,teatimes,en
11998,bonk,en


In [14]:
corpus_sample_df["language"].value_counts()

en    10000
de      400
fr      400
it      400
pl      400
sv      400
Name: language, dtype: int64

## Training a language model

We want to train a masked language model for our corpus of English words. In particular, we mask out particular letters and ask our model to try predict the masked letter.

Here, we initialise our tokenizer (here we tokenize by character), data collator (with padding) and set up our transformer model by specifying the config (we use the RoBERTa here).

In [15]:
max_length = 512

# load in tokenizer for architecture
tokenizer = RobertaTokenizerFast.from_pretrained('CHAR_BERT/',
                                                 max_len=max_length)

# set up data_collator to use (intially just one that adds padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# initialise transformer architecture (random weights)
config_args = {"vocab_size": tokenizer.backend_tokenizer.get_vocab_size(),
               "hidden_size": 768,
               "max_length": max_length,
               "max_position_embeddings": max_length + 2,
               "hidden_dropout_prob": 0.1,
               "num_attention_heads": 12,
               "num_hidden_layers": 6,
               "type_vocab_size": 1}

config = RobertaConfig(**config_args)
model = RobertaForMaskedLM(config=config)

## Using the `TextEncoder` class

The `TextEncoder` class in the `nlpsig` package is able to take a dataframe with a column of text. We can use this class to obtain embeddings for the input text, or to train the model with the input text.

Note: In the initial writing of the package, the idea was that we can fit our transformer to some data (some text), and then we can obtain embeddings for them. But in our setting, we actually want to fit our data to our sample of English words (which we call `english_train`), but then obtain embeddings for our sample of the remaining words (which we call `corpus_sample_df`) - noting that this also contains some English words.

So we will actually use two instances of `TextEncoder` - one to pass in `corpus_df` and train the model. And then another to obtain embeddings for the words in `corpus_sample_df`. This is not optimal and not clean, but some adjustment to `TextEncoder` will be able. In particular, we can perhaps make changes to the `.tokenize_text()` method (which we will see how to use later) which can take in some external text data. 

I envisage that we can pass in different data to train the model rather than training it on the data that is passed...

But for now...

In [16]:
english_train.head()

Unnamed: 0,word,language
0,sensitised,en
1,signifying,en
2,wholesomeness,en
3,adware,en
4,chasm,en


In [17]:
text_encoder = nlpsig.TextEncoder(df=english_train,
                                  feature_name="word",
                                  model=model,
                                  config=config,
                                  tokenizer=tokenizer,
                                  data_collator=data_collator)

We can tokenize the text with the `.tokenize_text()` method, which tokenizes each of the items in the column of the dataframe that we passed in. So in the above, we tokenise the `word` column of the `english_train` dataframe.

In [18]:
text_encoder.tokenize_text()

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the datatset...


100%|██████████| 70/70 [00:01<00:00, 50.95ba/s]


[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


100%|██████████| 70/70 [00:03<00:00, 18.33ba/s]


[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['word', 'language', 'input_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 70000
})

Note that the `text_encoder` object (instance of `TextEncoder`) also keeps the data as a Huggingface Dataset object too which is stored in the `.dataset` attribute of the object:

In [19]:
text_encoder.dataset

Dataset({
    features: ['word', 'language', 'input_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 70000
})

We can see that we have tokenized this as there are `input_ids`, `attention_mask`, `special_tokens_mask`, and `tokens` features in the dataset.

In [20]:
text_encoder.dataset["word"][0]

'sensitised'

In [21]:
text_encoder.dataset["input_ids"][0]

[0, 23, 9, 18, 23, 13, 24, 13, 23, 9, 8, 1]

We can see that this word has been tokenized by character:

In [22]:
text_encoder.dataset["tokens"][0]

['s', 'e', 'n', 's', 'i', 't', 'i', 's', 'e', 'd']

We can also see that we have saved the tokenized text in the `'token'` column of the dataframe stored in `.df`:

In [23]:
text_encoder.df

Unnamed: 0,word,language,tokens
0,sensitised,en,"[s, e, n, s, i, t, i, s, e, d]"
1,signifying,en,"[s, i, g, n, i, f, y, i, n, g]"
2,wholesomeness,en,"[w, h, o, l, e, s, o, m, e, n, e, s, s]"
3,adware,en,"[a, d, w, a, r, e]"
4,chasm,en,"[c, h, a, s, m]"
...,...,...,...
69995,entourages,en,"[e, n, t, o, u, r, a, g, e, s]"
69996,axe,en,"[a, x, e]"
69997,disdained,en,"[d, i, s, d, a, i, n, e, d]"
69998,calibers,en,"[c, a, l, i, b, e, r, s]"


We also store the tokens in `.tokens` attribute.

In [24]:
text_encoder.tokens

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 70000
})

After applying the `.tokenize_text()` method, we store a tokenized dataframe in the `.tokenized_df` attribue. Here, we have each token in our corpus and their associated `'text_id'` (which is just the index they were given in the original dataframe that we pass):

In [25]:
text_encoder.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,en,s
1,0,en,e
2,0,en,n
3,0,en,s
4,0,en,i
...,...,...,...
596689,69999,en,a
596690,69999,en,t
596691,69999,en,i
596692,69999,en,o


So if we looked at `text_id==0`:

In [26]:
text_encoder.tokenized_df[text_encoder.tokenized_df["text_id"]==0]

Unnamed: 0,text_id,language,tokens
0,0,en,s
1,0,en,e
2,0,en,n
3,0,en,s
4,0,en,i
5,0,en,t
6,0,en,i
7,0,en,s
8,0,en,e
9,0,en,d


If we had passed in a pre-trained model (remember above, we just initialised one with a config and so have random weight), we can obtain token embeddings by the `.obtain_embeddings()` method. 

There are many ways in which one can get embeddings from the transformer network, as the output is the layers for the full network. A few ways are:

- Returning the output of a particular hidden layer
    - use `.obtain_embeddings(method = "hidden_layer", layers = l)` where `l` is the layer you want
    - If no layer is requested, it will just give you the second-to-last hidden layer of the transformer network.
- Concatenate the output of several hidden layers
    - use `.obtain_embeddings(method = "concatenate", layers = [l_1, l_2, ...])` where `[l_1, l_2, ...]` is a list of layers you want to concatenate
- Element-wise sum the output of several hidden layers
    - use `.obtain_embeddings(method = "sum" , layers = [l_1, l_2, ...])` where `[l_1, l_2, ...]` is a list of layers you want to sum
- Mean the output of several hidden layers
    - use `.obtain_embeddings(method = "mean" , layers = [l_1, l_2, ...])` where `[l_1, l_2, ...]` is a list of layers you want to mean

If a more custom way to obtain embeddings from the hidden layers, you can specify what layers you want, and it will return them (i.e. using `.obtain_embeddings(method = "hidden_layer", layers = [l_1, l_2, ...])` where `[l_1, l_2, ...]` is a list of hidden layers you want) and so the output will be a 3-dimensional array with dimensions `[layer, token, embedding]` for which you would need to combine in such a way that you would have an embedding for each token. The above methods would return a 2-dimensional array with dimensions `[token, embedding]`.

In the below, we just obtain the second-to-last hidden layer of the network:

In [27]:
token_embeddings = text_encoder.obtain_embeddings(method = "hidden_layer")

  0%|          | 0/700 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 700/700 [01:18<00:00,  8.89it/s]


By inspecting the shape of this, we can see that we have a 2-dimensional array with dimensions `[token, embedding]` where the embeddings are 768 dimensional in this network.

In [28]:
token_embeddings.shape

(596694, 768)

Now that we have token embeddings for each text, it is possible to pool these embeddings to obtain an embedding for the full text (for this case, this embedding would represent the word itself. We can use the `.pool_token_embeddings()` method for doing this.

Again, there are several methods and full details can be found in the documentation, but a few are:

- take the mean of the token embeddings
    - use `.pool_token_embeddings(method = "mean")`
- take the element-wise max of the token embeddings
    - use `.pool_token_embeddings(method = "max")`
- take the element-wise sum of the token embeddings
    - use `.pool_token_embeddings(method = "sum")`
- take the token-embedding for the CLS token
    - this is a special token that is used in some transformers like BERT
    - but this is only available to us if we set `skip_special_tokens=False` when tokenizing the text (note by default, this is set to `True`)
    - use `.pool_token_embeddings(method = "sum")`
        - this will produce an error if the CLS token is not available...

In [29]:
pooled_embeddings = text_encoder.pool_token_embeddings()

100%|██████████| 70000/70000 [00:27<00:00, 2543.93it/s]


Again, we can inspect the shape and we can see that we have embeddings for each of our words:

In [30]:
pooled_embeddings.shape

(70000, 768)

## Training the model

The above embeddings will not be good for any downstream task as the model itself has not been trained to the text. For this we will use other methods in the `TextEncoder` class which allows us to do this by using the Huggingface trainer API.

First, we need to set up a data collator for training our model.

In [31]:
# set up data_collator for language modelling (has dynamic padding)
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=True,
                                                       mlm_probability=0.15)

To train our dataset, we will split it into a train, validation and test set with the `.split_dataset()` method.

We can set up the trainer's arguments with `.set_up_training_args()` which sets up a `TrainingArguments` object (from the `transformers` package) and stores it in the `.training_args` attribute. And lastly, we set up a `Trainer` object (from the `transformers` package) and store it in the `.trainer` attribute.

In [33]:
text_encoder.split_dataset()
text_encoder.set_up_training_args(output_dir="CHAR_BERT_trained",
                                  num_train_epochs=300,
                                  per_device_train_batch_size=128,
                                  seed=seed)
text_encoder.set_up_trainer(data_collator=data_collator_for_LM)

[INFO] Splitting up dataset into train / validation / test sets, and saving to `.dataset_split`.
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.


<transformers.trainer.Trainer at 0x14ece0621a30>

Once everything is set up, we just train our model by calling `.fit_transformer_with_trainer_api()` method.

In [34]:
torch.cuda.is_available()

True

In [35]:
torch.cuda.device_count()

1

In [36]:
text_encoder.fit_transformer_with_trainer_api()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: language, tokens, word, special_tokens_mask. If language, tokens, word, special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 56000
  Num Epochs = 300
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 131400
  Number of trainable parameters = 43560249


[INFO] Training model with 43560249 parameters...


Epoch,Training Loss,Validation Loss
1,No log,2.01241
2,2.327600,1.774706
3,1.869500,1.68344
4,1.719600,1.620389
5,1.644500,1.569127
6,1.576000,1.531044
7,1.546200,1.458539
8,1.494000,1.441001
9,1.494000,1.402635
10,1.475300,1.397702


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: language, tokens, word, special_tokens_mask. If language, tokens, word, special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7000
  Batch size = 8
Saving model checkpoint to CHAR_BERT_trained/checkpoint-500
Configuration saved in CHAR_BERT_trained/checkpoint-500/config.json
Model weights saved in CHAR_BERT_trained/checkpoint-500/pytorch_model.bin
tokenizer config file saved in CHAR_BERT_trained/checkpoint-500/tokenizer_config.json
Special tokens file saved in CHAR_BERT_trained/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: language, tokens, word, special_tokens_mask. If language, tokens, word, special_tokens_mask are not expecte

[INFO] Training completed!


Saving our model:

In [37]:
text_encoder.trainer.save_model("CHAR_BERT_trained/")

Saving model checkpoint to CHAR_BERT_trained/
Configuration saved in CHAR_BERT_trained/config.json
Model weights saved in CHAR_BERT_trained/pytorch_model.bin
tokenizer config file saved in CHAR_BERT_trained/tokenizer_config.json
Special tokens file saved in CHAR_BERT_trained/special_tokens_map.json


## Evaluating trained model

Evaluating the performance on predicting the masked letter for the test dataset. To do this, for each word in our test dataset, we will mask each letter on its own and ask the model to predict the masked letter. So for a 5 letter word, we have 5 predictions to make - one for each letter given the other letters.

For our tokenizer, we see that `\<mask>` is used as the mask token:

In [38]:
text_encoder.tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [39]:
def compute_masked_character_accuracy(fill_mask, words):
    was_correct = []
    print(f"Evaluating with {len(words)} words")
    for word in tqdm(words):
        masked_strings = [word[:i] + '<mask>' + word[i+1:] for i in range(len(word))]
        predictions = [fill_mask(word)[0]['sequence'] for word in masked_strings]
        was_correct += [pred == word for pred in predictions]
    
    acc = np.sum(was_correct) / len(was_correct)
    print(f"Accuracy: {acc}")
    return acc

fill_mask = pipeline("fill-mask",
                     model="CHAR_BERT_trained",
                     tokenizer="CHAR_BERT_trained")

compute_masked_character_accuracy(fill_mask, 
                                  text_encoder.dataset_split["test"]["word"])

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 512,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 57
}

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bo

Evaluating with 7000 words


100%|██████████| 7000/7000 [14:04<00:00,  8.29it/s]

Accuracy: 0.8405469243820486





0.8405469243820486

## Obtaining a path for each word

Now that we have trained our model, we want to obtain embeddings for the words in `corpus_df`. Currently, `TextEncoder` only works with the data that is passed into the function and stored in `.df` and `.dataset`, so we need to initialise a new `TextEncoder` object with the `corpus_df` dataframe and also the trained model.

We can then obtain embeddings easily (recall from above we first need to tokenize the text, and then use the `.obtain_embeddings()` and `.pool_token_embeddings()` methods to do this).

In [42]:
text_encoder_2 = nlpsig.TextEncoder(df=corpus_sample_df,
                                    feature_name="word",
                                    model=text_encoder.model,
                                    config=text_encoder.config,
                                    tokenizer=text_encoder.tokenizer,
                                    data_collator=text_encoder.data_collator)

In [48]:
text_encoder_2.tokenize_text()
token_embeddings = text_encoder_2.obtain_embeddings(method = "hidden_layer")

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the datatset...




  0%|          | 0/12 [00:00<?, ?ba/s][A[A

  0%|          | 0/120 [05:33<?, ?it/s].20ba/s][A[A
100%|██████████| 12/12 [00:00<00:00, 43.05ba/s]


[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


100%|██████████| 12/12 [00:00<00:00, 23.78ba/s]


[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


100%|██████████| 120/120 [00:13<00:00,  8.80it/s]


In [49]:
text_encoder_2.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,de,g
1,0,de,e
2,0,de,t
3,0,de,r
4,0,de,i
...,...,...,...
108399,11999,en,u
108400,11999,en,m
108401,11999,en,m
108402,11999,en,e


In [50]:
token_embeddings.shape

(108404, 768)

In [70]:
with open('corpus_sample_token_embeddings.pkl','wb') as f:
    pickle.dump(token_embeddings, f)

## Dimension reduction

We can perform dimension reduction with `nlpsig` using the `DimReduce` class. Here, we will use UMAP (implemented using the [`umap-learn`](https://umap-learn.readthedocs.io/en/latest/api.html) package, but there are other standard methods available:
- PCA (implemented using [`scikit-learn`](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html))
    - `method="pca"`
- TSNE (implemented using [`scikit-learn`](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html))
    - `method="tsne"`
- Post Processing Algorithm (PPA) with PCA (PPA-PCA)
    - `method="ppapca"`
    - see _Mu, J., Bhat, S., and Viswanath, P. (2017). All-but-the-top: Simple and effective postprocessing for word representations. arXiv preprint arXiv:1702.01417._
- PPA-PCA-PPA
    - `method="ppapacppa"`
    - see _Raunak, V., Gupta, V., and Metze, F. (2019). Effective dimensionality reduction for word embeddings. In Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP- 2019), pages 235–243._

In [51]:
reduction = nlpsig.DimReduce(method="umap",
                             n_components=10,
                             dim_reduction_kwargs={
                                 "metric": "cosine",
                             })
embeddings_reduced = reduction.fit_transform(token_embeddings)

In [52]:
embeddings_reduced.shape

(108404, 10)

In [71]:
with open('corpus_sample_reduced_token_embeddings.pkl','wb') as f:
    pickle.dump(embeddings_reduced, f)

As we have embeddings for each token, we can obtain a path for each word by constructing a path of the token embeddings. To do this, we can use the `PrepareData` class and pass in our tokenized dataframe (the dataframe where we have each token in our data and we also have the corresponding id for each word which is saved in the `text_id` column of the tokenized dataframe.

We pass in the column which defines the ids, `text_id`, the column which defines the labels, `language`, the token embeddings and the pooled embeddings.

In [53]:
dataset = nlpsig.PrepareData(text_encoder_2.tokenized_df,
                             id_column="text_id",
                             labels_column="language",
                             embeddings=token_embeddings,
                             embeddings_reduced=embeddings_reduced)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Note 'datetime' is not a column in `.df`, so only 'timeline_index' is added.
[INFO] As 'datetime' is not a column in `.df`, we assume that the data is ordered by time with respect to the id.
[INFO] Adding 'timeline_index' feature...


We can construct a path by using the `.pad()` method, and result of this is a multi-dimensional array or tensor (in particular a numpy array or PyTorch tensor) which can be then used in some downstream task. It is called "pad" because arrays and tensors are rectangular and if there are cases where there isn't enough data (e.g. if a word only has 3 letters/tokens and we want to make paths of length 4), we "pad" with either the last token embedding (set `zero_padding=False`) or with zeros (set `zero_padding=True`).

Here, we construct paths by setting a length of the paths (we call this method `k_last` in the code and we have to specify the length with `k=10`). We alternatively can construct to the longest word possible (by setting `method="max"`). The `time_feature` argument allows us to specify what time features we want to keep. Here we don't have any besides the index in which the word is, which is given by `timeline_index` and we choose not to standardise that by specifying `standardise_time_feature=False`.

In [54]:
word_path = dataset.pad(pad_by="id",
                        zero_padding=True,
                        method="k_last",
                        k=10,
                        time_feature=["timeline_index"],
                        standardise_time_feature=False,
                        embeddings="dim_reduced")

[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


100%|██████████| 12000/12000 [00:15<00:00, 791.47it/s]


By inspecting the shape of `word_path`, we see that we have a path for each word and the dimension of the array is `[batch, length of path, channels]`.

In [55]:
word_path.shape

(12000, 10, 13)

In [56]:
len(dataset.df["text_id"].unique())

12000

We store this array as a dataframe in `.df_padded` so that you can see what the columns correspond to, where columns beginning with `e` denote the dimensions of embeddings obtained from the transformer.

In [57]:
dataset.df_padded

Unnamed: 0,timeline_index,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,text_id,language
0,1,0.642436,0.347501,0.633622,9.498485,-0.536742,7.236354,0.349428,8.476377,10.284984,0.212375,0,de
1,2,1.097917,1.159596,-0.558051,10.146338,0.965994,4.594119,0.839234,1.718462,8.335831,2.011386,0,de
2,3,0.660696,-1.474226,0.324929,8.996325,0.147167,5.626800,-0.326708,2.957045,7.357754,0.119781,0,de
3,4,0.066091,0.934841,1.984142,9.897273,-0.445418,5.485973,0.820208,7.064891,10.352209,0.308647,0,de
4,5,0.018166,2.327596,-0.396088,10.137131,0.667989,3.218513,1.892539,2.235790,8.186775,1.986284,0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,5,0.705757,0.505991,0.561136,9.654332,-0.295937,5.772221,0.606102,8.362031,9.257602,0.483077,11999,en
119996,6,0.281582,0.431672,0.039087,9.760633,0.802404,0.440786,1.116959,1.490295,7.960192,0.338932,11999,en
119997,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11999,-1
119998,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11999,-1


We can see that the first column corresponds to the index, the columns beginning with `d` correspond to the dimension-reduced embeddings (which were 10 dimensional), and we also have the corresponding text-id and language (which we passed in the label above). If we look at the first word:

In [58]:
# still has the labels and the ids
dataset.df_padded[dataset.df_padded["text_id"]==0]

Unnamed: 0,timeline_index,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,text_id,language
0,1,0.642436,0.347501,0.633622,9.498485,-0.536742,7.236354,0.349428,8.476377,10.284984,0.212375,0,de
1,2,1.097917,1.159596,-0.558051,10.146338,0.965994,4.594119,0.839234,1.718462,8.335831,2.011386,0,de
2,3,0.660696,-1.474226,0.324929,8.996325,0.147167,5.6268,-0.326708,2.957045,7.357754,0.119781,0,de
3,4,0.066091,0.934841,1.984142,9.897273,-0.445418,5.485973,0.820208,7.064891,10.352209,0.308647,0,de
4,5,0.018166,2.327596,-0.396088,10.137131,0.667989,3.218513,1.892539,2.23579,8.186775,1.986284,0,de
5,6,1.022772,-0.541542,0.32448,9.801931,0.20029,5.401011,-0.242706,3.182425,7.249306,1.414376,0,de
6,7,0.4257,0.360153,0.315016,9.9994,-0.250613,6.296197,0.384874,8.175232,9.863592,0.369476,0,de
7,8,-0.36171,2.222073,-0.221039,10.026139,0.520481,2.371255,2.405247,1.766772,8.619617,1.6124,0,de
8,9,1.573137,-0.281566,0.601034,9.571445,1.029437,4.327699,-0.008659,3.540253,6.914563,1.514759,0,de
9,10,0.497214,-0.226626,-0.582444,9.96226,1.021137,3.80457,-0.194161,9.59627,7.277706,1.990388,0,de


In [59]:
text_encoder_2.df.iloc[0]

word                              getrippelte
language                                   de
tokens      [g, e, t, r, i, p, p, e, l, t, e]
Name: 0, dtype: object

We pick out a word which has less than 10 letters, and we can see that the path is padded with zeros and we give these a label `-1` to denote that they have been added.

Note that for padding, the method pads from below by default, but we can pad by above by setting `pad_from_below=False`.

In [60]:
dataset.df_padded[dataset.df_padded["text_id"]==3320]

Unnamed: 0,timeline_index,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,text_id,language
33200,0,1.353214,-0.676448,0.215638,9.932939,0.650689,5.326757,-0.377507,-1.604548,9.317955,3.325151,3320,en
33201,1,0.570237,0.459697,0.609359,9.673721,-0.411311,6.86112,0.438183,8.438325,10.143123,0.247765,3320,en
33202,2,0.084992,2.18465,0.310398,10.579418,-0.191729,3.76489,1.148623,1.218172,7.312806,1.726927,3320,en
33203,3,1.039667,0.449838,0.181571,10.064098,0.280863,4.666007,0.288457,1.372774,7.27478,1.99518,3320,en
33204,4,0.555279,1.213909,1.041155,10.354813,-0.680132,5.347591,0.620669,6.782768,9.622807,-1.069551,3320,en
33205,5,0.719879,0.543125,0.123861,9.536921,0.349212,3.724797,0.43282,1.072457,6.54846,0.793852,3320,en
33206,6,0.633168,1.132934,1.056927,10.242445,-0.677864,5.246167,0.525881,6.902331,9.556719,-1.307374,3320,en
33207,7,1.255061,0.524707,-0.323081,9.800545,0.373513,3.522918,0.788738,0.501832,6.481227,1.250318,3320,en
33208,8,1.192596,-0.054308,-0.01553,10.027349,0.571962,3.890334,-0.147352,8.861643,7.323592,1.748386,3320,en
33209,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3320,-1


In [61]:
text_encoder_2.df.iloc[3320]

word                          gemmology
language                             en
tokens      [g, e, m, m, o, l, o, g, y]
Name: 3320, dtype: object

For words which are longer than 10 letters, we only take the last 10:

In [62]:
dataset.df_padded[dataset.df_padded["text_id"]==10]

Unnamed: 0,timeline_index,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,text_id,language
100,2,1.317977,-0.560411,0.088102,9.918494,0.652725,5.139175,-0.433229,-1.957868,9.44171,3.286282,10,de
101,3,0.638559,0.376517,0.551726,9.560905,-0.479745,7.071331,0.380813,8.455746,10.142075,0.265349,10,de
102,4,0.773838,0.858633,-0.19358,9.875707,0.834488,3.804242,0.886892,1.669018,7.961495,1.5248,10,de
103,5,0.768352,-1.414051,0.409404,8.982706,0.151192,5.705562,-0.322427,2.893784,7.38011,0.191888,10,de
104,6,0.394626,0.569939,0.894882,9.464315,-0.599912,5.978182,0.508771,7.398853,9.959645,-0.185058,10,de
105,7,0.33118,0.620971,0.091171,9.666517,0.652566,1.552025,1.014759,1.336801,7.683355,0.604742,10,de
106,8,0.215077,1.710426,2.093915,10.198302,0.208076,4.545253,1.807776,7.774212,9.939435,1.417312,10,de
107,9,1.838432,-0.503554,0.196605,9.628243,1.677094,4.554884,0.082244,3.169625,6.684532,1.792743,10,de
108,10,0.44346,-0.22912,-0.662848,9.978356,1.078365,4.006979,-0.185805,9.632396,7.36542,1.925159,10,de
109,11,0.279706,-0.122955,-0.715107,9.911089,1.680002,-3.555078,-0.182654,4.726729,8.821706,1.976509,10,de


In [63]:
text_encoder_2.df.iloc[10]

word                                ungetrenntes
language                                      de
tokens      [u, n, g, e, t, r, e, n, n, t, e, s]
Name: 10, dtype: object

To obtain a path as a torch tensor, we use the `.get_torch_path()` method which by default keeps the time features and will remove the id and label columns.

In [64]:
# by default keeps the time features
torch_word_path = dataset.get_torch_path()
torch_word_path.shape

torch.Size([12000, 10, 11])

In [65]:
torch_word_path[0]

tensor([[ 1.0000e+00,  6.4244e-01,  3.4750e-01,  6.3362e-01,  9.4985e+00,
         -5.3674e-01,  7.2364e+00,  3.4943e-01,  8.4764e+00,  1.0285e+01,
          2.1238e-01],
        [ 2.0000e+00,  1.0979e+00,  1.1596e+00, -5.5805e-01,  1.0146e+01,
          9.6599e-01,  4.5941e+00,  8.3923e-01,  1.7185e+00,  8.3358e+00,
          2.0114e+00],
        [ 3.0000e+00,  6.6070e-01, -1.4742e+00,  3.2493e-01,  8.9963e+00,
          1.4717e-01,  5.6268e+00, -3.2671e-01,  2.9570e+00,  7.3578e+00,
          1.1978e-01],
        [ 4.0000e+00,  6.6091e-02,  9.3484e-01,  1.9841e+00,  9.8973e+00,
         -4.4542e-01,  5.4860e+00,  8.2021e-01,  7.0649e+00,  1.0352e+01,
          3.0865e-01],
        [ 5.0000e+00,  1.8166e-02,  2.3276e+00, -3.9609e-01,  1.0137e+01,
          6.6799e-01,  3.2185e+00,  1.8925e+00,  2.2358e+00,  8.1868e+00,
          1.9863e+00],
        [ 6.0000e+00,  1.0228e+00, -5.4154e-01,  3.2448e-01,  9.8019e+00,
          2.0029e-01,  5.4010e+00, -2.4271e-01,  3.1824e+00,  7.2493e+0

We can choose to ignore the time features by setting `include_time_features=False`:

In [66]:
# ingore time features (ignore first column of timeline_index)
torch_word_path = dataset.get_torch_path(include_time_features=False)
torch_word_path.shape

torch.Size([12000, 10, 10])

In [67]:
torch_word_path[0]

tensor([[ 6.4244e-01,  3.4750e-01,  6.3362e-01,  9.4985e+00, -5.3674e-01,
          7.2364e+00,  3.4943e-01,  8.4764e+00,  1.0285e+01,  2.1238e-01],
        [ 1.0979e+00,  1.1596e+00, -5.5805e-01,  1.0146e+01,  9.6599e-01,
          4.5941e+00,  8.3923e-01,  1.7185e+00,  8.3358e+00,  2.0114e+00],
        [ 6.6070e-01, -1.4742e+00,  3.2493e-01,  8.9963e+00,  1.4717e-01,
          5.6268e+00, -3.2671e-01,  2.9570e+00,  7.3578e+00,  1.1978e-01],
        [ 6.6091e-02,  9.3484e-01,  1.9841e+00,  9.8973e+00, -4.4542e-01,
          5.4860e+00,  8.2021e-01,  7.0649e+00,  1.0352e+01,  3.0865e-01],
        [ 1.8166e-02,  2.3276e+00, -3.9609e-01,  1.0137e+01,  6.6799e-01,
          3.2185e+00,  1.8925e+00,  2.2358e+00,  8.1868e+00,  1.9863e+00],
        [ 1.0228e+00, -5.4154e-01,  3.2448e-01,  9.8019e+00,  2.0029e-01,
          5.4010e+00, -2.4271e-01,  3.1824e+00,  7.2493e+00,  1.4144e+00],
        [ 4.2570e-01,  3.6015e-01,  3.1502e-01,  9.9994e+00, -2.5061e-01,
          6.2962e+00,  3.8487e-0

## Acknowledgements

The computations described in this notebook were performed using the Baskerville Tier 2 HPC service (https://www.baskerville.ac.uk/). Baskerville was funded by the EPSRC and UKRI through the World Class Labs scheme (EP/T022221/1) and the Digital Research Infrastructure programme (EP/W032244/1) and is operated by Advanced Research Computing at the University of Birmingham.