In [1]:
import os
import random

import numpy as np
import pandas as pd
import tokenizers
import torch
from datasets import Dataset, DatasetDict
from tqdm import tqdm
from transformers import (
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)

import nlpsig

## Language dataset

In the `data/` folder, we have several text files of words from different languages:
- `wordlist_de.txt`: German words
- `wordlist_en.txt`: English words
- `wordlist_fr.txt`: French words
- `wordlist_it.txt`: Italian words
- `wordlist_pl.txt`: Polish words
- `wordlist_sv.txt`: Swedish words

We additionally have an `alphabet.txt` file which just stores the alphabet characters ('a', 'b', 'c', ...).

The task is to split each word into its individual characters and to obtain an embedding for each of them. We can represent a word by a path of its character embeddings and compute its path signature to use as features in predicting the language to which the word belongs.

Here we look at obtaining embeddings using a Transformer model.

In [2]:
# Location of the alphabet file
ALPHABET_FILE = 'data/alphabet.txt'

# Read the whole file and keep one list entry per line (line endings removed)
with open(ALPHABET_FILE) as alphabet_file:
    alphabet = alphabet_file.read().splitlines()

print(alphabet)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Set up Tokenizer for word corpora

If we were to fine-tune an existing pretrained transformer, we could use the same tokenizer that the model was pretrained with. However, here we want to train a model from scratch, and so using a tokenizer that was pretrained on a corpus that looks quite different to ours is suboptimal.

Here, we need to use the `tokenizers` library to set up and train a new tokenizer for our text.

In particular, we're going to start off with a character-based tokenizer (as we're going to split up our words into characters), and train it on our data.

In [3]:
# initialise character based tokenizer and train it on the alphabet file
tokenizer = tokenizers.CharBPETokenizer()
tokenizer.train(files=[ALPHABET_FILE], show_progress=False)

# save the tokenizer to "CHAR_BERT/" folder
# (exist_ok=True creates the folder when missing and does nothing when it already exists)
os.makedirs("CHAR_BERT", exist_ok=True)

tokenizer.save_model("CHAR_BERT")

['CHAR_BERT/vocab.json', 'CHAR_BERT/merges.txt']

In [4]:
wordlist_files = ["data/wordlist_de.txt",
                  "data/wordlist_en.txt",
                  "data/wordlist_fr.txt",
                  "data/wordlist_it.txt",
                  "data/wordlist_pl.txt",
                  "data/wordlist_sv.txt"]

# Build one dataframe per word list, tagging every word with its language code
wordlist_dfs = []
for filename in wordlist_files:
    with open(filename, "r") as f:
        words = f.read().splitlines()

    # Take the language code from the file name only ("wordlist_de.txt" -> "de"),
    # so an underscore in a directory name cannot change the result
    stem = os.path.splitext(os.path.basename(filename))[0]
    language = stem.split("_")[-1]

    words_df = pd.DataFrame({"word": words})
    words_df["language"] = language
    wordlist_dfs.append(words_df)

# Row labels are kept per file (not renumbered), so labels repeat across languages
corpus_df = pd.concat(wordlist_dfs)

In [5]:
corpus_df

Unnamed: 0,word,language
0,a,de
1,aal,de
2,aale,de
3,aalen,de
4,aalend,de
...,...,...
77121,zons,sv
77122,zoo,sv
77123,zoologisk,sv
77124,zoologiska,sv


Question: what do we do with words that appear more than once in the data? That is, different languages can have the same word in their vocabularies, e.g. the word "zoo" is a valid word in English, French, Italian, Polish and Swedish.

In [6]:
# Show every row whose word is exactly "zoo" (one row per language that lists it)
corpus_df[corpus_df["word"]=="zoo"]

Unnamed: 0,word,language
80613,zoo,en
198413,zoo,fr
1861555,zoo,it
1508829,zoo,pl
77122,zoo,sv


We take a random sample of the 3.9 million words in the corpora...

In [7]:
# Sampling configuration
seed = 2022
random.seed(seed)
balanced = True  # True: draw the same number of words from every language

# total number of words to keep in the sample
n_words = 12000

if balanced:
    # equal share per language; any remainder of the integer division is dropped
    languages = corpus_df["language"].unique()
    words_per_language = n_words // len(languages)
    corpus_sample_df = pd.concat(
        [corpus_df[corpus_df["language"]==lang].sample(words_per_language, random_state=seed)
         for lang in languages]
    )
else:
    # sample row positions uniformly (without replacement) from the whole corpus
    corpus_sample_df = corpus_df.iloc[random.sample(range(len(corpus_df)), n_words)]

# replace the row labels inherited from corpus_df with a fresh 0..n-1 index
corpus_sample_df = corpus_sample_df.reset_index(drop=True)

corpus_sample_df

Unnamed: 0,word,language
0,unbehelligte,de
1,herauszuschlagender,de
2,anzahlbarer,de
3,unbeseelte,de
4,imaginativem,de
...,...,...
11995,ifatt,sv
11996,formgavs,sv
11997,bestods,sv
11998,kurirers,sv


In [8]:
# Count how many sampled words belong to each language
corpus_sample_df["language"].value_counts()

de    2000
en    2000
fr    2000
it    2000
pl    2000
sv    2000
Name: language, dtype: int64

In [9]:
max_length = 512

# load in tokenizer for architecture
# (`model_max_length` is the documented keyword; `max_len` is only a legacy alias)
tokenizer = RobertaTokenizerFast.from_pretrained('CHAR_BERT/',
                                                 model_max_length=max_length)

# set up data_collator to use (initially just one that adds padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# initialise transformer architecture (random weights)
# every length setting is derived from `max_length` so tokenizer and config stay in sync
config_args = {"vocab_size": tokenizer.backend_tokenizer.get_vocab_size(),
               "hidden_size": 768,
               "max_length": max_length,
               "max_position_embeddings": max_length + 2,
               "hidden_dropout_prob": 0.1,
               "num_attention_heads": 12,
               "num_hidden_layers": 6,
               "type_vocab_size": 1}

config = RobertaConfig(**config_args)
model = RobertaForMaskedLM(config=config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Bundle the sampled words (text in the "word" column) with the model,
# its config, the tokenizer and the padding collator
text_encoder = nlpsig.TextEncoder(
    df=corpus_sample_df,
    feature_name="word",
    model=model,
    config=config,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [11]:
# Tokenize the words; per the log output this stores the tokens in `.df['tokens']`
# and builds the per-token dataframe `.tokenized_df`
text_encoder.tokenize_text()

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the datatset...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/12 [00:00<?, ?ba/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


  0%|          | 0/12 [00:00<?, ?ba/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['word', 'language', 'input_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 12000
})

In [12]:
text_encoder.dataset["word"][0]

'unbehelligte'

In [13]:
text_encoder.dataset["input_ids"][0]

[53, 21, 14, 2, 5, 8, 5, 12, 12, 9, 7, 20, 5, 54]

In [14]:
text_encoder.dataset["tokens"][0]

['u', 'n', 'b', 'e', 'h', 'e', 'l', 'l', 'i', 'g', 't', 'e']

In [15]:
text_encoder.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,de,u
1,0,de,n
2,0,de,b
3,0,de,e
4,0,de,h
...,...,...,...
130015,11999,sv,n
130016,11999,sv,g
130017,11999,sv,a
130018,11999,sv,r


In [16]:
# Embeddings from the (still untrained) model; the saved output has shape (130020, 768),
# i.e. presumably one row per token in `.tokenized_df` — confirm against nlpsig docs
token_embeddings = text_encoder.obtain_embeddings(method = "hidden_layer")

  0%|                                                                                                               | 0/120 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:45<00:00,  2.62it/s]


In [17]:
token_embeddings.shape

(130020, 768)

In [139]:
text_encoder.tokenized_df[text_encoder.tokenized_df["text_id"]==0]

Unnamed: 0,text_id,language,tokens
0,0,de,u
1,0,de,n
2,0,de,b
3,0,de,e
4,0,de,h
5,0,de,e
6,0,de,l
7,0,de,l
8,0,de,i
9,0,de,g


In [18]:
text_encoder.tokens

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 12000
})

In [19]:
# Pool the token embeddings; the saved output has shape (12000, 768),
# i.e. presumably one vector per word — confirm the pooling rule in nlpsig docs
pooled_embeddings = text_encoder.pool_token_embeddings()

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:01<00:00, 6156.56it/s]


In [20]:
pooled_embeddings.shape

(12000, 768)

## Training the model

In [30]:
# Collator for masked language modelling (pads dynamically);
# mlm_probability is the chance of a token being selected for masking
data_collator_for_LM = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [31]:
# Per the log output: split into train / validation / test (`.dataset_split`),
# then create the TrainingArguments (`.training_args`) and the Trainer (`.trainer`).
# NOTE(review): the saved training log reports 50 epochs and 120000 examples, which does
# not match num_train_epochs=20 and the 12000-word sample — presumably from an older run.
text_encoder.split_dataset()
text_encoder.set_up_training_args(output_dir="CHAR_BERT_trained",
                                  num_train_epochs=20,
                                  per_device_train_batch_size=128,
                                  seed=seed)
# train with the masked-language-modelling collator rather than the padding-only one
text_encoder.set_up_trainer(data_collator=data_collator_for_LM)

[INFO] Splitting up dataset into train / validation / test sets, and saving to `.dataset_split`.
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.


<transformers.trainer.Trainer at 0x2cf48f6d0>

In [26]:
# Train the model with the Trainer configured above.
# NOTE(review): the saved output ends in KeyboardInterrupt, so this run did not finish.
text_encoder.fit_transformer_with_trainer_api()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word, language, special_tokens_mask, tokens. If word, language, special_tokens_mask, tokens are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 120000
  Num Epochs = 50
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 46900
  Number of trainable parameters = 43560249


[INFO] Training model with 43560249 parameters...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
# Write the model (and, per the log, config and tokenizer files) to "CHAR_BERT_trained/",
# the folder the fill-mask pipeline is loaded from during evaluation
text_encoder.trainer.save_model("CHAR_BERT_trained/")

Saving model checkpoint to CHAR_BERT_trained/
Configuration saved in CHAR_BERT_trained/config.json
Model weights saved in CHAR_BERT_trained/pytorch_model.bin
tokenizer config file saved in CHAR_BERT_trained/tokenizer_config.json
Special tokens file saved in CHAR_BERT_trained/special_tokens_map.json


## Evaluating trained model

We evaluate the performance on predicting the masked letter for the test dataset. To do this, for each word in our test dataset, we will mask each letter on its own and ask the model to predict the masked letter. So for a 5-letter word, we have 5 predictions to make - one for each letter given the other letters.

For our tokenizer, we see that "\<mask>" is used as the mask token.

In [21]:
text_encoder.tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [22]:
def compute_masked_character_accuracy(fill_mask, words):
    """Measure how often the top fill-mask prediction restores a masked character.

    Each character of each word is replaced in turn by '<mask>'. A prediction
    counts as correct when the 'sequence' field of the first result returned
    by ``fill_mask`` equals the original word.

    Parameters
    ----------
    fill_mask : callable
        Called with one masked string; must return an indexable result whose
        first element is a mapping with a 'sequence' key.
    words : sequence of str
        Words to evaluate.

    Returns
    -------
    Fraction of masked positions predicted correctly, over all words.
    """
    print(f"Evaluating with {len(words)} words")

    hits = []
    for target in tqdm(words):
        # one prediction per character position of the word
        for position in range(len(target)):
            masked = target[:position] + '<mask>' + target[position + 1:]
            top_sequence = fill_mask(masked)[0]['sequence']
            hits.append(top_sequence == target)

    acc = np.sum(hits) / len(hits)
    print(f"Accuracy: {acc}")
    return acc

# Build a fill-mask pipeline from the model and tokenizer saved in "CHAR_BERT_trained"
fill_mask = pipeline(
    "fill-mask",
    model="CHAR_BERT_trained",
    tokenizer="CHAR_BERT_trained",
)

# Score the pipeline on the words of the test split
compute_masked_character_accuracy(
    fill_mask,
    text_encoder.dataset_split["test"]["word"],
)

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 512,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 57
}

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bo

Evaluating with 15000 words


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [1:16:58<00:00,  3.25it/s]

Accuracy: 0.4963931386184516





0.4963931386184516

## Obtaining a path for each word

In [21]:
text_encoder.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,de,u
1,0,de,n
2,0,de,b
3,0,de,e
4,0,de,h
...,...,...,...
130015,11999,sv,n
130016,11999,sv,g
130017,11999,sv,a
130018,11999,sv,r


In [22]:
token_embeddings.shape

(130020, 768)

In [82]:
# `PrepareData` is not imported on its own, so reach it through the `nlpsig` module
# (a bare `PrepareData` raises NameError on a fresh kernel).
# Rows are grouped by "text_id" and labelled by "language".
dataset = nlpsig.PrepareData(text_encoder.tokenized_df,
                             id_column="text_id",
                             labels_column="language",
                             embeddings=token_embeddings,
                             pooled_embeddings=pooled_embeddings)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Note 'datetime' is not a column in `.df`, so only 'timeline_index' is added.
[INFO] As 'datetime' is not a column in `.df`, we assume that the data is ordered by time with respect to the id.
[INFO] Adding 'timeline_index' feature...


In [83]:
len(dataset.df["text_id"].unique())

12000

In [84]:
# Pad per id with the "k_last" method and k=10, zero-padding where needed;
# the saved output reports an array of shape (12000, 10, 771)
word_path = dataset.pad(
    pad_by="id",
    zero_padding=True,
    method="k_last",
    k=10,
    time_feature=["timeline_index"],
    standardise_time_feature=False,
)
word_path.shape

[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:30<00:00, 387.23it/s]


(12000, 10, 771)

In [85]:
dataset.df_padded

Unnamed: 0,timeline_index,e1,e2,e3,e4,e5,e6,e7,e8,e9,...,e761,e762,e763,e764,e765,e766,e767,e768,text_id,language
0,2,0.143588,-1.417048,1.320182,-0.508459,1.247967,0.034426,-1.468047,-0.961864,0.559406,...,-0.980690,0.403656,0.398150,0.487656,-0.676191,0.807380,-1.213304,-0.957085,0,de
1,3,1.107637,-0.667894,2.376304,-0.928480,3.609042,-0.043664,-1.700852,0.552448,-0.633671,...,-0.324514,-0.190638,-0.429270,1.990711,-0.213387,0.543733,0.014306,-0.571665,0,de
2,4,0.777725,-0.197840,2.643306,-1.788009,-0.637072,-0.254247,0.406706,-0.182973,0.702718,...,-0.377799,2.157183,-0.826511,1.861895,-1.222253,-0.212417,-1.194266,-1.107049,0,de
3,5,0.717557,-0.304074,1.029350,-1.073429,1.875072,-0.084052,-1.756982,0.406479,-0.279253,...,0.262103,1.245355,-0.515692,1.316131,-0.730478,-0.612286,0.887850,-0.179549,0,de
4,6,0.921733,0.051804,0.740113,-0.494959,0.141836,0.293235,0.499587,0.465118,-0.825649,...,-1.601243,1.046709,0.133001,1.805218,-0.145986,0.592019,-2.297613,-1.601534,0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,6,1.178958,-0.148862,1.265958,-0.734617,0.983744,-0.335252,0.362385,0.789220,-0.720118,...,0.082894,2.911485,-0.729118,2.193193,-0.566522,-0.983746,-1.191947,0.632889,11999,sv
119996,7,1.024163,-2.861085,0.770218,-0.320513,0.436667,0.087550,-0.765312,0.123461,-0.976917,...,-0.779674,1.250211,-0.421141,1.575737,-0.471906,1.111535,-1.863977,0.369330,11999,sv
119997,8,1.684086,-0.072866,1.837575,-1.241489,1.546939,0.040575,-0.274818,0.115267,-0.512420,...,-0.870751,1.708995,-0.186876,1.444939,0.052923,0.052861,-0.449925,0.161691,11999,sv
119998,9,2.046932,-0.022341,0.495391,-1.086745,0.310161,-0.112842,0.781924,0.100807,0.078282,...,-1.145458,3.236154,-0.256295,1.222923,-0.354660,-0.288702,-0.704664,-0.622784,11999,sv


In [86]:
# still has the labels and the ids
dataset.df_padded[dataset.df_padded["text_id"]==0]

Unnamed: 0,timeline_index,e1,e2,e3,e4,e5,e6,e7,e8,e9,...,e761,e762,e763,e764,e765,e766,e767,e768,text_id,language
0,2,0.143588,-1.417048,1.320182,-0.508459,1.247967,0.034426,-1.468047,-0.961864,0.559406,...,-0.98069,0.403656,0.39815,0.487656,-0.676191,0.80738,-1.213304,-0.957085,0,de
1,3,1.107637,-0.667894,2.376304,-0.92848,3.609042,-0.043664,-1.700852,0.552448,-0.633671,...,-0.324514,-0.190638,-0.42927,1.990711,-0.213387,0.543733,0.014306,-0.571665,0,de
2,4,0.777725,-0.19784,2.643306,-1.788009,-0.637072,-0.254247,0.406706,-0.182973,0.702718,...,-0.377799,2.157183,-0.826511,1.861895,-1.222253,-0.212417,-1.194266,-1.107049,0,de
3,5,0.717557,-0.304074,1.02935,-1.073429,1.875072,-0.084052,-1.756982,0.406479,-0.279253,...,0.262103,1.245355,-0.515692,1.316131,-0.730478,-0.612286,0.88785,-0.179549,0,de
4,6,0.921733,0.051804,0.740113,-0.494959,0.141836,0.293235,0.499587,0.465118,-0.825649,...,-1.601243,1.046709,0.133001,1.805218,-0.145986,0.592019,-2.297613,-1.601534,0,de
5,7,1.960512,-0.114054,-0.089443,-0.959227,-0.274134,-0.047917,0.303097,0.89557,0.704869,...,-0.654793,2.553124,-0.306052,1.188357,-0.323826,-0.661785,-0.127747,-0.642438,0,de
6,8,0.858337,0.98012,-0.480119,-1.347086,0.100639,0.609157,-1.403996,0.971574,1.116285,...,0.560951,-0.665503,0.56211,1.043689,0.655348,0.996938,-0.654039,-0.913582,0,de
7,9,0.739917,-0.593066,1.324941,-1.596288,0.311475,-0.266285,-0.007784,0.910885,1.055482,...,0.558452,1.238724,-0.340453,1.134031,-0.87936,0.472009,1.334212,-0.187999,0,de
8,10,1.059729,1.284173,0.815329,-0.547035,1.821547,-0.03291,-0.811382,0.412806,0.762482,...,-0.534226,0.42333,0.452225,-0.418872,0.391442,0.30564,0.317617,-0.111468,0,de
9,11,-0.218854,-0.604878,1.608435,-0.389211,1.714435,-0.739268,-1.454955,0.345475,-1.222678,...,-0.966916,0.498792,-0.507472,1.434245,-0.376011,1.072734,-0.002285,-0.825246,0,de


In [87]:
# by default keeps the time features
torch_word_path = dataset.get_torch_path()
torch_word_path.shape

torch.Size([12000, 10, 769])

In [88]:
# Cast the pooled embeddings to NumPy's default float (float64) and wrap them as a torch tensor
emb = torch.from_numpy(dataset.pooled_embeddings.astype("float"))

In [89]:
# Assuming `emb` is 2-D (N, D): add a trailing axis, copy it 10 times, then swap the
# last two axes, giving (N, 10, D) — each row's vector repeated at all 10 steps.
# NOTE(review): 10 presumably mirrors k=10 used when padding — confirm.
repeated_emb = emb.unsqueeze(2).repeat(1,1,10).transpose(1, 2)

In [97]:
# Variant: time features in path and input, embedding in input
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(
    include_time_features_in_path=True,
    include_time_features_in_input=True,
    include_embedding_in_input=True,
)
# element [0] is the path tensor, element [1] the reported number of input channels
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

[INFO] The path was created for each text_id in the dataframe, so to include embeddings in the FFN input, we concatenate the pooled embeddings.
path shape: torch.Size([12000, 10, 1538])
input_channels: 769


In [91]:
# Variant: time features in path and input, no embedding in input
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(
    include_time_features_in_path=True,
    include_time_features_in_input=True,
    include_embedding_in_input=False,
)
# element [0] is the path tensor, element [1] the reported number of input channels
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 770])
input_channels: 769


In [92]:
# Variant: time features in path only
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(
    include_time_features_in_path=True,
    include_time_features_in_input=False,
    include_embedding_in_input=False,
)
# element [0] is the path tensor, element [1] the reported number of input channels
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 769])
input_channels: 769


In [93]:
# Variant: time features in input only
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(
    include_time_features_in_path=False,
    include_time_features_in_input=True,
    include_embedding_in_input=False,
)
# element [0] is the path tensor, element [1] the reported number of input channels
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 769])
input_channels: 768


In [94]:
# Variant: all three flags off
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(
    include_time_features_in_path=False,
    include_time_features_in_input=False,
    include_embedding_in_input=False,
)
# element [0] is the path tensor, element [1] the reported number of input channels
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 768])
input_channels: 768


tensor([[ 2.,  3.,  4.,  ...,  9., 10., 11.],
        [ 9., 10., 11.,  ..., 16., 17., 18.],
        [ 1.,  2.,  3.,  ...,  8.,  9., 10.],
        ...,
        [ 0.,  0.,  0.,  ...,  4.,  5.,  6.],
        [ 0.,  0.,  0.,  ...,  5.,  6.,  7.],
        [ 1.,  2.,  3.,  ...,  8.,  9., 10.]], dtype=torch.float64)

In [137]:
# Take the single column at index `input_channels` (tuple element [1]) and reduce it with
# max over dim 1; `.max(1)` returns (values, indices) and `[0]` keeps the values.
# NOTE(review): the result depends on which variant of `torch_word_path_for_deepsignet`
# is live in the kernel — the execution counts here are out of order.
t1 = torch_word_path_for_deepsignet[0][:,:,torch_word_path_for_deepsignet[1]:torch_word_path_for_deepsignet[1]+1].max(1)[0]
t1.shape

torch.Size([12000, 1])

In [124]:
# From index 0 along dim 1, keep every column after index `input_channels`
t2 = torch_word_path_for_deepsignet[0][:, 0, (torch_word_path_for_deepsignet[1] + 1) :]

In [135]:
# Join t1 and t2 side by side along dim 1 and check the resulting shape
torch.cat([t1,t2], dim=1).shape

torch.Size([12000, 773])

In [130]:
# Same reduction as for t1, but on column 0 of the path tensor (max over dim 1, values only)
x = torch_word_path_for_deepsignet[0]
x[:, :, 0:1].max(1)[0].shape

torch.Size([12000, 1])

In [132]:
# From index 0 along dim 1, keep the columns from index 11 onwards.
# NOTE(review): the hard-coded 10 presumably stands for a channel count — confirm.
x[:, 0, (10 + 1) :].shape

torch.Size([12000, 1527])

In [134]:
# Join the max of column 0 (over dim 1) with the trailing columns at index 0 of dim 1
torch.cat([x[:, :, 0:1].max(1)[0], x[:, 0, (10 + 1) :]], dim=1).shape

torch.Size([12000, 1528])