In [1]:
from datasets import Dataset, DatasetDict
from transformers import (
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline
)
import tokenizers

import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

import nlpsig

## Language dataset

In the `data/` folder, we have several text files, each containing a list of words from a different language:
- `wordlist_de.txt`: German words
- `wordlist_en.txt`: English words
- `wordlist_fr.txt`: French words
- `wordlist_it.txt`: Italian words
- `wordlist_pl.txt`: Polish words
- `wordlist_sv.txt`: Swedish words

We additionally have an `alphabet.txt` file which just stores the alphabet characters ('a', 'b', 'c', ...).

The task is to split each word into its individual characters and to obtain an embedding for each of them. We can then represent a word as a path of its character embeddings, and compute the signature of that path to use as features for predicting the language to which the word belongs.

Here we look at obtaining embeddings using a Transformer model.

In [2]:
# path to the file listing one alphabet character per line
ALPHABET_FILE = 'data/alphabet.txt'
with open(ALPHABET_FILE) as alphabet_file:
    alphabet_text = alphabet_file.read()
# one list entry per line of the file
alphabet = alphabet_text.splitlines()
print(alphabet)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Set up Tokenizer for word corpora

If we were to fine-tune an existing pretrained transformer, we could use the same tokenizer that the model was pretrained with. However, here we want to train a model from scratch, and so using a tokenizer that was pretrained on a corpus that looks quite different to ours is suboptimal.

Here, we need to use the `tokenizers` library to set up and train a new tokenizer for our text.

In particular, we're going to start off with a character-based tokenizer (as we're going to split up our words into characters), and train it to our data.

In [3]:
# initialise character based tokenizer and train it on the alphabet file
tokenizer = tokenizers.CharBPETokenizer()
tokenizer.train(files=[ALPHABET_FILE], show_progress=False)

# save the tokenizer to "CHAR_BERT/" folder
# (exist_ok=True creates the folder only if needed, without a separate existence check)
os.makedirs("CHAR_BERT", exist_ok=True)

tokenizer.save_model("CHAR_BERT")

['CHAR_BERT/vocab.json', 'CHAR_BERT/merges.txt']

In [4]:
wordlist_files = ["data/wordlist_de.txt",
                  "data/wordlist_en.txt",
                  "data/wordlist_fr.txt",
                  "data/wordlist_it.txt",
                  "data/wordlist_pl.txt",
                  "data/wordlist_sv.txt"]

# build one dataframe per word list, tagging every word with its language code
wordlist_dfs = []
for filename in wordlist_files:
    with open(filename, "r") as f:
        words = f.read().splitlines()
    # take the language code from the file name itself (text after the last
    # underscore), so that underscores in the directory part of the path
    # cannot shift which piece is picked up
    file_stem = os.path.splitext(os.path.basename(filename))[0]
    language_code = file_stem.rsplit("_", 1)[-1][0:2]
    words_df = pd.DataFrame({"word": words})
    words_df["language"] = language_code
    wordlist_dfs.append(words_df)

# note: each language keeps its own 0-based row index after concatenation
corpus_df = pd.concat(wordlist_dfs)

In [5]:
corpus_df

Unnamed: 0,word,language
0,a,de
1,aal,de
2,aale,de
3,aalen,de
4,aalend,de
...,...,...
77121,zons,sv
77122,zoo,sv
77123,zoologisk,sv
77124,zoologiska,sv


Question: what do we do with words that appear twice in the data? i.e. different languages have the same word in their vocabularies, e.g. the word "zoo" is a valid word in english, french, italian, polish and swedish.

In [6]:
corpus_df[corpus_df["word"]=="zoo"]

Unnamed: 0,word,language
80613,zoo,en
198413,zoo,fr
1861555,zoo,it
1508829,zoo,pl
77122,zoo,sv


We take a random sample of the 3.9 million words in the corpora...

In [7]:
import random
import math
seed = 2022
random.seed(seed)
balanced = True

# total number of words to draw from the corpus
n_words = 12000
if balanced:
    # draw the same number of words from every language
    languages = corpus_df["language"].unique()
    words_per_language = math.floor(n_words / len(languages))
    per_language_samples = []
    for lang in languages:
        lang_words_df = corpus_df[corpus_df["language"] == lang]
        per_language_samples.append(
            lang_words_df.sample(words_per_language, random_state=seed)
        )
    corpus_sample_df = pd.concat(per_language_samples)
else:
    # draw row positions uniformly over the whole corpus
    sampled_positions = random.sample(range(len(corpus_df)), n_words)
    corpus_sample_df = corpus_df.iloc[sampled_positions]

# both branches finish with a fresh 0..n-1 index
corpus_sample_df = corpus_sample_df.reset_index(drop=True)

corpus_sample_df

Unnamed: 0,word,language
0,unbehelligte,de
1,herauszuschlagender,de
2,anzahlbarer,de
3,unbeseelte,de
4,imaginativem,de
...,...,...
11995,ifatt,sv
11996,formgavs,sv
11997,bestods,sv
11998,kurirers,sv


In [8]:
corpus_sample_df["language"].value_counts()

de    2000
en    2000
fr    2000
it    2000
pl    2000
sv    2000
Name: language, dtype: int64

In [9]:
# maximum sequence length, shared by the tokenizer and the model config
max_length = 512

# load in tokenizer for architecture
# (`model_max_length` is the documented name of the legacy `max_len` argument)
tokenizer = RobertaTokenizerFast.from_pretrained('CHAR_BERT/',
                                                 model_max_length=max_length)

# set up data_collator to use (initially just one that adds padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# initialise transformer architecture (random weights)
# every length-related setting is derived from `max_length`, so changing it
# above cannot leave the config out of sync
config_args = {"vocab_size": tokenizer.backend_tokenizer.get_vocab_size(),
               "hidden_size": 768,
               "max_length": max_length,
               # two extra positions on top of max_length
               # NOTE(review): presumably for RoBERTa's padding-offset position ids - confirm
               "max_position_embeddings": max_length + 2,
               "hidden_dropout_prob": 0.1,
               "num_attention_heads": 12,
               "num_hidden_layers": 6,
               "type_vocab_size": 1}

config = RobertaConfig(**config_args)
model = RobertaForMaskedLM(config=config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
text_encoder = nlpsig.TextEncoder(df=corpus_sample_df,
                                  feature_name="word",
                                  model=model,
                                  config=config,
                                  tokenizer=tokenizer,
                                  data_collator=data_collator)

In [11]:
text_encoder.tokenize_text()

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the datatset...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/12 [00:00<?, ?ba/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


  0%|          | 0/12 [00:00<?, ?ba/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['word', 'language', 'input_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 12000
})

In [12]:
text_encoder.dataset["word"][0]

'unbehelligte'

In [13]:
text_encoder.dataset["input_ids"][0]

[53, 21, 14, 2, 5, 8, 5, 12, 12, 9, 7, 20, 5, 54]

In [14]:
text_encoder.dataset["tokens"][0]

['u', 'n', 'b', 'e', 'h', 'e', 'l', 'l', 'i', 'g', 't', 'e']

In [15]:
text_encoder.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,de,u
1,0,de,n
2,0,de,b
3,0,de,e
4,0,de,h
...,...,...,...
130015,11999,sv,n
130016,11999,sv,g
130017,11999,sv,a
130018,11999,sv,r


In [16]:
token_embeddings = text_encoder.obtain_embeddings(method = "hidden_layer")

  0%|                                                                                                               | 0/120 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:45<00:00,  2.62it/s]


In [17]:
token_embeddings.shape

(130020, 768)

In [18]:
text_encoder.tokens

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 12000
})

In [19]:
pooled_embeddings = text_encoder.pool_token_embeddings()

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:01<00:00, 6156.56it/s]


In [20]:
pooled_embeddings.shape

(12000, 768)

## Training the model

In [30]:
# set up data_collator for language modelling (has dynamic padding)
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=True,
                                                       mlm_probability=0.15)

In [31]:
text_encoder.split_dataset()
text_encoder.set_up_training_args(output_dir="CHAR_BERT_trained",
                                  num_train_epochs=20,
                                  per_device_train_batch_size=128,
                                  seed=seed)
text_encoder.set_up_trainer(data_collator=data_collator_for_LM)

[INFO] Splitting up dataset into train / validation / test sets, and saving to `.dataset_split`.
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.


<transformers.trainer.Trainer at 0x2cf48f6d0>

In [26]:
text_encoder.fit_transformer_with_trainer_api()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word, language, special_tokens_mask, tokens. If word, language, special_tokens_mask, tokens are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 120000
  Num Epochs = 50
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 46900
  Number of trainable parameters = 43560249


[INFO] Training model with 43560249 parameters...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
text_encoder.trainer.save_model("CHAR_BERT_trained/")

Saving model checkpoint to CHAR_BERT_trained/
Configuration saved in CHAR_BERT_trained/config.json
Model weights saved in CHAR_BERT_trained/pytorch_model.bin
tokenizer config file saved in CHAR_BERT_trained/tokenizer_config.json
Special tokens file saved in CHAR_BERT_trained/special_tokens_map.json


## Evaluating trained model

Evaluating the performance on predicting the masked letter for the test dataset. To do this, for each word in our test dataset, we will mask each letter on its own and ask the model to predict the masked letter. So for a 5 letter word, we have 5 predictions to make - one for each letter given the other letters.

For our tokenizer, we see that "\<mask>" is used as the mask token.

In [21]:
text_encoder.tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [22]:
def compute_masked_character_accuracy(fill_mask, words):
    """
    Masked-character accuracy of a fill-mask callable over a list of words.

    For every word, each character position is replaced in turn by the
    '<mask>' token, and the 'sequence' entry of the first result returned by
    `fill_mask` is compared with the original word. The accuracy is the
    fraction of masked positions for which the two strings are equal.

    Parameters
    ----------
    fill_mask : callable
        Called with a masked string; must return an indexable collection of
        dicts that have a 'sequence' key (e.g. a fill-mask pipeline).
    words : list of str
        Words to evaluate on.

    Returns
    -------
    float
        Proportion of masked positions that were reconstructed exactly.
    """
    print(f"Evaluating with {len(words)} words")
    outcomes = []
    for target in tqdm(words):
        for position in range(len(target)):
            # hide exactly one character of the word
            masked = target[:position] + '<mask>' + target[position+1:]
            first_sequence = fill_mask(masked)[0]['sequence']
            outcomes.append(first_sequence == target)

    acc = np.sum(outcomes) / len(outcomes)
    print(f"Accuracy: {acc}")
    return acc

# load the saved model and tokenizer from "CHAR_BERT_trained" into a fill-mask pipeline
trained_model_dir = "CHAR_BERT_trained"
fill_mask = pipeline("fill-mask",
                     model=trained_model_dir,
                     tokenizer=trained_model_dir)

# evaluate on the words of the held-out test split
test_words = text_encoder.dataset_split["test"]["word"]
compute_masked_character_accuracy(fill_mask, test_words)

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 512,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 57
}

loading configuration file CHAR_BERT_trained/config.json
Model config RobertaConfig {
  "_name_or_path": "CHAR_BERT_trained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bo

Evaluating with 15000 words


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [1:16:58<00:00,  3.25it/s]

Accuracy: 0.4963931386184516





0.4963931386184516

## Obtaining a path for each word

In [21]:
text_encoder.tokenized_df

Unnamed: 0,text_id,language,tokens
0,0,de,u
1,0,de,n
2,0,de,b
3,0,de,e
4,0,de,h
...,...,...,...
130015,11999,sv,n
130016,11999,sv,g
130017,11999,sv,a
130018,11999,sv,r


In [22]:
token_embeddings.shape

(130020, 768)

In [81]:
import re
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm


class PrepareData:
    """
    Class to prepare dataset for computing signatures.
    """

    def __init__(
        self,
        original_df: pd.DataFrame,
        embeddings: np.array,
        embeddings_reduced: Optional[np.array] = None,
        pooled_embeddings: Optional[np.array] = None,
        id_column: Optional[str] = None,
        labels_column: Optional[str] = None,
    ):
        """
        Class to prepare dataset for computing signatures.

        Parameters
        ----------
        original_df : pd.DataFrame
            Dataset as a pandas dataframe.
        embeddings : np.array
            Embeddings for each of the items in `original_df`.
        embeddings_reduced : Optional[np.array], optional
            Dimension reduced embeddings, by default None.
        pooled_embeddings : Optional[np.array], optional
            Pooled embeddings for each unique id in `id_column`, by default None.
        id_column : Optional[str]
            Name of the column which identifies each of the text, e.g.
            - "text_id" (if each item in `original_df` is a word or sentence from a particular text),
            - "user_id" (if each item in `original_df` is a post from a particular user)
            - "timeline_id" (if each item in `original_df` is a post from a particular time)
            If None, it will create a dummy id_column named "dummy_id" and fill with zeros.
        labels_column : Optional[str]
            Name of the column which are corresponds to the labels of the data.

        Raises
        ------
        ValueError
            if `original_df` and `embeddings` does not have the same number of rows.
        ValueError
            if `original_df` and `embeddings_reduced` does not have the same number of rows
            (if `embeddings_reduced` is provided).
        """
        # perform checks that original_df have the right column names to work with
        if embeddings.ndim != 2:
            raise ValueError("`embeddings` should be a 2-dimensional array.")
        if original_df.shape[0] != embeddings.shape[0]:
            raise ValueError(
                "`original_df` and `embeddings` should have the same number of rows."
            )
        if embeddings_reduced is not None:
            if embeddings_reduced.ndim != 2:
                raise ValueError("If provided, `embeddings_reduced` should be a 2-dimensional array.")
            if original_df.shape[0] != embeddings_reduced.shape[0]:
                raise ValueError(
                    "`original_df`, `embeddings` and `embeddings_reduced` "
                    "should have the same number of rows"
                )
        self.original_df: pd.DataFrame = original_df
        self.id_column: Optional[str] = id_column
        self.label_column: Optional[str] = labels_column
        # set embeddings
        self.embeddings: np.array = embeddings
        self.embeddings_reduced: Optional[np.array] = embeddings_reduced
        # obtain modelling dataframe
        self.df: Optional[pd.DataFrame] = None
        self.df = self._get_modeling_dataframe()
        # set pooled embeddings if provided
        if pooled_embeddings is not None:
            if pooled_embeddings.ndim != 2:
                raise ValueError("If provided, `pooled_embeddings` should be a 2-dimensional array.")
            if len(self.df[self.id_column].unique()) != pooled_embeddings.shape[0]:
                raise ValueError(
                    "If  provided, `pooled_embeddings` should have the same number of "
                    "rows as there are different ids, i.e. we should have "
                    "`len(self.df[self.id_column].unique()) != pooled_embeddings.shape[0]`."
                )
        self.pooled_embeddings: Optional[np.array] = pooled_embeddings
        # obtain time features
        self._time_feature_choices: List[str] = []
        self.time_features_added: bool = False
        self.df = self._set_time_features()
        self.df_padded: Optional[pd.DataFrame] = None
        self.array_padded: Optional[np.array] = None
        # record method for creating the path
        self.pad_method = None

    def _get_modeling_dataframe(self) -> pd.DataFrame:
        """
        [Private] Combines `.original_df` with the sentence
        embeddings and the dimension reduced embeddings

        Returns
        -------
        pd.DataFrame
            Original dataframe concatenated with the embeddings and
            dimension reduced embeddings (column-wise)
            - columns starting with "e" followed by a number denotes each
              dimension of the embeddings
            - columns starting with "d" followed by a number denotes each
              dimension of the dimension reduced embeddings
        """
        if self.df is not None:
            return self.df
        else:
            print("[INFO] Concatenating the embeddings to the dataframe...")
            print("[INFO] - columns beginning with 'e' denote the full embddings.")
            embedding_df = pd.DataFrame(
                self.embeddings,
                columns=[f"e{i+1}" for i in range(self.embeddings.shape[1])],
            )
            if self.embeddings_reduced is not None:
                print(
                    "[INFO] - columns beginning with 'd' denote the dimension reduced embeddings."
                )
                embeddings_reduced_df = pd.DataFrame(
                    self.embeddings_reduced,
                    columns=[
                        f"d{i+1}" for i in range(self.embeddings_reduced.shape[1])
                    ],
                )
                df = pd.concat(
                    [
                        self.original_df.reset_index(drop=True),
                        embeddings_reduced_df,
                        embedding_df,
                    ],
                    axis=1,
                )
            else:
                df = pd.concat(
                    [self.original_df.reset_index(drop=True), embedding_df],
                    axis=1,
                )
            if self.id_column is None:
                self.id_column = "dummy_id"
                print(
                    f"[INFO] No id_column was passed, so setting id_column to '{self.id_column}'."
                )
            if self.id_column not in self.original_df.columns:
                # set default value to id_column
                print(
                    f"[INFO] There is no column in `.original_df` called '{self.id_column}'. "
                    "Adding a new column named '{self.id_column}' of zeros."
                )
                df[self.id_column] = 0
            return df

    @staticmethod
    def _time_fraction(x: pd.Timestamp) -> float:
        """
        [Private] Converts a date, x, as a fraction of the year.

        Parameters
        ----------
        x : pd.Timestamp
            Date.

        Returns
        -------
        float
            The date as a fraction of the year.
        """
        # compute how many seconds the date is into the year
        x_year_start = pd.Timestamp(x.year, 1, 1)
        seconds_into_cal_year = abs(x - x_year_start).total_seconds()
        # compute the time fraction into the year
        time_frac = seconds_into_cal_year / (365 * 24 * 60 * 60)
        return x.year + time_frac

    def _set_time_features(self) -> pd.DataFrame:
        """
        [Private] Updates the dataframe in `.df` to include time features:
        - `time_encoding`: the date as a fraction of the year
           (only if 'datetime' is a column in `.df` dataframe).
        - `time_diff`: the difference in time (in minutes) between successive records
           (only if 'datetime' is a column in `.df` dataframe).
        - `timeline_index`: the index of each post for each id.

        Returns
        -------
        pd.DataFrame
            Updated dataframe with time features.
        """
        if self.time_features_added:
            print("Time features have already been added.")
            return
        print("[INFO] Adding time feature columns into dataframe in `.df`.")
        if "datetime" in self.df.columns:
            self._time_feature_choices += ["time_encoding", "time_diff"]

            # checking 'datetime' column is datatime type
            self.df["datetime"] = pd.to_datetime(self.df["datetime"])

            # obtain time encoding by computing the fraction of year it is in
            print("[INFO] Adding 'time_encoding' and feature...")
            self.df["time_encoding"] = self.df["datetime"].map(
                lambda t: self._time_fraction(t)
            )
            # sort by the id and the date
            self.df = self.df.sort_values(by=[self.id_column, "datetime"]).reset_index(
                drop=True
            )

            # calculate time difference between posts
            print("[INFO] Adding 'time_diff' and feature...")
            self.df["time_diff"] = list(
                self.df.groupby("timeline_id")
                .apply(
                    lambda x: [0.0]
                    + [
                        (
                            x["datetime"].iloc[i] - x["datetime"].iloc[i - 1]
                        ).total_seconds()
                        / 60
                        for i in range(1, len(x))
                    ]
                )
                .explode()
            )
        else:
            print(
                "[INFO] Note 'datetime' is not a column in `.df`, "
                "so only 'timeline_index' is added."
            )
            print(
                "[INFO] As 'datetime' is not a column in `.df`, "
                "we assume that the data is ordered by time with respect to the id."
            )
        # assign index for each post in each timeline
        self._time_feature_choices += ["timeline_index"]

        print("[INFO] Adding 'timeline_index' feature...")
        self.df["timeline_index"] = list(
            self.df.groupby(self.id_column)
            .apply(lambda x: list(range(len(x))))
            .explode()
        )
        self.time_features_added = True

        return self.df

    def _obtain_colnames(self, embeddings: str) -> List[str]:
        """
        [Private] Obtains the column names storing the embeddings.

        Parameters
        ----------
        embeddings : str
            Options are:
            - "dim_reduced": dimension reduced embeddings.
            - "full": full embeddings.
            - "both": concatenation of dimension reduced and full embeddings.

        Returns
        -------
        List[str]
            List of column names which store the embeddings.

        Raises
        ------
        ValueError
            if embeddings is not either of 'dim_reduced', 'full', or 'both'.
        """
        if embeddings not in ["dim_reduced", "full", "both"]:
            raise ValueError(
                "Embeddings must be either 'dim_reduced', 'full', or 'both'"
            )
        if embeddings == "dim_reduced":
            # obtain columns for the dimension reduced embeddings
            # these are columns which start with 'd' and have a number following it
            colnames = [col for col in self.df.columns if re.match(r"^d\w*[0-9]", col)]
        elif embeddings == "full":
            # obtain columns for the full embeddings
            # these are columns which start with 'e' and have a number following it
            colnames = [col for col in self.df.columns if re.match(r"^e\w*[0-9]", col)]
        elif embeddings == "both":
            # add columns for the embeddings
            colnames = [col for col in self.df.columns if re.match(r"^d\w*[0-9]", col)]
            colnames += [col for col in self.df.columns if re.match(r"^e\w*[0-9]", col)]
        return colnames

    def _obtain_time_feature_columns(
        self,
        time_feature: Optional[Union[List[str], str]],
    ) -> List[str]:
        """
        [Private] Obtains the column names storing the time features requested.

        Parameters
        ----------
        time_feature : Optional[Union[List[str], str]]
            If is a string, it must be the list found in
            `_time_feature_choices` attribute. If is a list,
            each item must be a string and it must be in the
            list found in `_time_feature_choices` attribute.

        Returns
        -------
        List[str]
            List of column names which store the time features.

        Raises
        ------
        ValueError
            if `time_feature` is a string, and it is not found in `_time_feature_choices`.
        ValueError
            if `time_feature` is a list of strings, and one of the items
            is not found in `_time_feature_choices`.
        """
        if time_feature is None:
            time_feature = []
        else:
            if not self.time_features_added:
                self.set_time_features()
            if isinstance(time_feature, str):
                if time_feature not in self._time_feature_choices:
                    raise ValueError(
                        "If `time_feature` is a string, it must "
                        + f"be in {self._time_feature_choices}."
                    )
                else:
                    time_feature = [time_feature]
            elif isinstance(time_feature, list):
                if not all(
                    [item in self._time_feature_choices for item in time_feature]
                ):
                    raise ValueError(
                        f"Each item in   should be in {self._time_feature_choices}."
                    )
            else:
                raise ValueError(
                    "`time_feature` must be either None, a string, or a list of strings."
                )
        return time_feature

    def _pad_dataframe(
        self,
        df: pd.DataFrame,
        k: int,
        padding_n: int,
        zero_padding: bool,
        colnames: List[str],
        time_feature: List[str],
        id: int,
    ) -> pd.DataFrame:
        """
        [Private] If `padding_n > 0`, we pad `padding_n` number of entries
        to the dataframe (either by zeros if `zero_padding==True`, or by the last post
        in df if `zero_padding==False`). If `padding_n <= 0`, we don't need to pad
        and we simply return the last `k` entries (throws error if `k` is less than number
        of entries in `.df`).

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to pad with.
        k : int
            Number of items to keep.
        padding_n : int
            Number of entries to pad.
        zero_padding : bool
            If True, will pad with zeros. Otherwise, pad with the latest
            text associated to the id.
        colnames : List[str]
            List of column names that we wish to keep from the dataframe.
        time_feature : List[str]
            List of time feature column names that we wish to keep from the dataframe.
        id : int
            Which id are we padding.

        Returns
        -------
        pd.DataFrame
            Padded dataframe.

        Raises
        ------
        ValueError
            if k is not a positive integer.
        ValueError
            if padding_n is less than or equal to zero, but there aren't enough entries
            in `df` to take the last `k` entries.
        """
        if k < 0:
            raise ValueError("`k` must be a positive integer")
        columns = time_feature + colnames + [self.id_column]
        if self.label_column is not None:
            columns += [self.label_column]
        if padding_n > 0:
            # need to pad to fill up
            if zero_padding or len(df) == 0:
                # pad by having zero entries
                if self.label_column is not None:
                    # set labels to be -1 to indicate that they're padded values
                    data_dict = {
                        **dict.fromkeys(time_feature, [0]),
                        **{c: [0] for c in colnames},
                        **{self.id_column: [id], self.label_column: [-1]},
                    }
                else:
                    # no label column to add
                    data_dict = {
                        **dict.fromkeys(time_feature, [0]),
                        **{c: [0] for c in colnames},
                        **{self.id_column: [id]},
                    }
                df_padded = pd.concat(
                    [
                        pd.concat([pd.DataFrame(data_dict)] * padding_n),
                        df[columns],
                    ]
                )
            else:
                # pad by repeating the latest text
                latest_text = df[columns].tail(1)
                df_padded = pd.concat(
                    [
                        df[columns],
                        pd.concat([latest_text] * padding_n),
                    ]
                )
            return df_padded.reset_index(drop=True)
        else:
            if len(df) < k:
                raise ValueError(
                    "Requested to not pad, but there aren't enough entries in `df`."
                )
            return df[columns].tail(k).reset_index(drop=True)

    def _pad_id(
        self,
        k: int,
        zero_padding: bool,
        colnames: List[str],
        id_counts: pd.Series,
        id: int,
        time_feature: List[str],
    ) -> pd.DataFrame:
        """
        [Private] For a given id, the function slices the dataframe in .df
        by finding those with id_column == id and keeping only the columns
        found in colnames.
        The function returns a dataframe with k rows:
        - If the number of records with id_column == id is less than k, it "pads" the
        dataframe by adding in empty records (with label = -1 to indicate they're padded).
        - If the number of records with id_column == id is equal to k, it just returns
        the records with id_column == id.

        Parameters
        ----------
        k : int
            Number of items to keep.
        zero_padding : bool
            If True, will pad with zeros. Otherwise, pad with the latest
            text associated to the id.
        colnames : List[str]
            List of column names that we wish to keep from the dataframe.
        id_counts : pd.Series
            The number of records in associated to each id_column.
        id : int
            Which id are we padding.
        time_feature : List[str]
            List of time feature column names that we wish to keep from the dataframe.

        Returns
        -------
        pd.DataFrame
            Padded dataframe for a particular id.

        Raises
        ------
        ValueError
            if k is not a positive integer.
        """
        if k < 0:
            raise ValueError("`k` must be a positive integer")
        history = self.df[self.df[self.id_column] == id]
        padding_n = k - id_counts[id]
        return self._pad_dataframe(
            df=history,
            k=k,
            padding_n=padding_n,
            zero_padding=zero_padding,
            colnames=colnames,
            time_feature=time_feature,
            id=id,
        )

    def _pad_history(
        self,
        k: int,
        zero_padding: bool,
        colnames: List[str],
        index: int,
        time_feature: List[str],
        include_current_embedding: bool,
    ) -> pd.DataFrame:
        """
        [Private]

        Parameters
        ----------
        k : int
            Number of items to keep.
        zero_padding : bool
            If True, will pad with zeros. Otherwise, pad with the latest
            text associated to the id.
        colnames : List[str]
            List of column names that we wish to keep from the dataframe.
        index : int
            Which index of the dataframe are we padding.
        time_feature : List[str]
            List of time feature column names that we wish to keep from the dataframe.

        Returns
        -------
        pd.DataFrame
            Padded dataframe for a particular index of the dataframe by looking
            at the previous texts of a particular id.

        Raises
        ------
        ValueError
            if k is not a positive integer.
        ValueError
            if index is outside the range of indicies of the dataframe ([0, 1, ..., len(.df)]).
        """
        if k < 0:
            raise ValueError("`k` must be a positive integer.")
        if index not in range(len(self.df)):
            raise ValueError("`index` is outside of [0, 1, ..., len(.df)].")
        # look at particular text at a given index
        text = self.df.iloc[index]
        id = text[self.id_column]
        timeline_index = text["timeline_index"]
        # obtain history for the piece of text
        if include_current_embedding:
            history = self.df[
                (self.df[self.id_column] == id)
                & (self.df["timeline_index"] <= timeline_index)
            ]
        else:
            history = self.df[
                (self.df[self.id_column] == id)
                & (self.df["timeline_index"] < timeline_index)
            ]
        padding_n = k - len(history)
        return self._pad_dataframe(
            df=history,
            k=k,
            padding_n=padding_n,
            zero_padding=zero_padding,
            colnames=colnames,
            time_feature=time_feature,
            id=id,
        )

    def pad(
        self,
        pad_by: str,
        method: str = "k_last",
        zero_padding: bool = True,
        k: int = 5,
        time_feature: Optional[Union[List[str], str]] = None,
        standardise_time_feature: bool = True,
        standardise_method: str = "standardise",
        embeddings: str = "full",
        include_current_embedding: bool = False,
    ) -> np.array:
        """
        Creates an array which stores the path.
        We create a path for each id in id_column if `pad_by="id"`
        (by constructing a path of the embeddings of the texts associated to each id),
        or for each item in `.df` if `pad_by="history"`
        (by constructing a path of the embeddings of the previous texts).

        We can decide how long our path is by letting `method="k_last` and specifying `k`.
        Alternatively, we can set `method="max"`, which sets the length of the path
        by setting `k` to be the largest number of texts associated to an individual id.

        The function "pads" if there aren't enough texts to fill in (e.g. if requesting for
        the last 5 posts for an id, but there are less than 5 posts available),
        by adding empty records (if `zero_padding=True`)
        or by the last previous text (if `zero_padding=False`). This ensures that each
        path has the same number of data points.

        Parameters
        ----------
        pad_by : str
            How to construct the path. Options are:
            - "id": constructs a path of the embeddings of the texts associated to each id
            - "history": constructs a path by looking at the embeddings of the previous texts
              for each text
        method : str
            How long the path is, default "k_last". Options are:
            - "k_last": specifying the length of the path through the choice of `k` (see below)
            - "max": the length of the path is chosen by looking at the largest number
              of texts associated to an individual id in `.id_column`
        zero_padding : bool
            If True, will pad with zeros. Otherwise, pad with the latest
            text associated to the id.
        k : int
            The requested length of the path, default 5. This is ignored if `method="max"`.
        time_feature : Optional[Union[List[str], str]]
            Which time feature(s) to keep. If None, then doesn't keep any.
        embeddings : str, optional
            Which embeddings to keep, by default "full". Options:
            - "dim_reduced": dimension reduced embeddings
            - "full": full embeddings
            - "both": keeps both dimension reduced and full embeddings
        include_current_embedding : bool, optional
            If `pad_by="history", this determines whether or not the embedding for the
            text is included in it's history, by default False. If `pad_by="id"`,
            this argument is ignored.

        Returns
        -------
        np.array
            3 dimension array of the path:
            - First dimension is ids (if `pad_by="id"`)
              or each text (if `pad_by="history"`)
            - Second dimension is the associated texts
            - Third dimension are the features (e.g. embeddings /
              dimension reduced embeddings, time features)
        """
        print(
            "[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes."
        )
        if pad_by not in ["id", "history"]:
            raise ValueError("`pad_by` must be either 'id' or 'history'.")
        # obtain id_column counts
        id_counts = self.df[self.id_column].value_counts(sort=False)
        # determine padding length
        if method == "k_last":
            # use k that was passed in
            pass
        elif method == "max":
            # let k be the largest number of items associated to an id
            k = id_counts.max()
        else:
            raise ValueError("Method must be either 'k_last' or 'max'.")
        # obtain time feature colnames
        time_feature_colnames = self._obtain_time_feature_columns(
            time_feature=time_feature
        )
        # obtain colnames of embeddings
        colnames = self._obtain_colnames(embeddings=embeddings)
        if pad_by == "id":
            # pad each of the ids in id_column and store them in a list
            self.pad_method = "id"
            padded_dfs = [
                self._pad_id(
                    k=k,
                    zero_padding=zero_padding,
                    colnames=colnames,
                    id_counts=id_counts,
                    id=id,
                    time_feature=time_feature_colnames,
                )
                for id in tqdm(id_counts.index)
            ]
            self.df_padded = pd.concat(padded_dfs).reset_index(drop=True)
            if standardise_time_feature:
                # standardises the time features in .df_padded
                for tf in time_feature_colnames:
                    self.df_padded[tf] = self._standardise_pd(
                        vec=self.df_padded[tf], method=standardise_method
                    )
            self.array_padded = np.array(self.df_padded).reshape(
                len(id_counts), k, len(self.df_padded.columns)
            )
        elif pad_by == "history":
            # pad each of the ids in id_column and store them in a list
            self.pad_method = "history"
            padded_dfs = [
                self._pad_history(
                    k=k,
                    zero_padding=zero_padding,
                    colnames=colnames,
                    index=index,
                    time_feature=time_feature_colnames,
                    include_current_embedding=include_current_embedding,
                )
                for index in tqdm(range(len(self.df)))
            ]
            self.df_padded = pd.concat(padded_dfs).reset_index(drop=True)
            if standardise_time_feature:
                # standardises the time features in .df_padded
                for tf in time_feature_colnames:
                    self.df_padded[tf] = self._standardise_pd(
                        vec=self.df_padded[tf], method=standardise_method
                    )
            self.array_padded = np.array(self.df_padded).reshape(
                len(self.df), k, len(self.df_padded.columns)
            )
        return self.array_padded

    @staticmethod
    def _standardise_pd(vec: pd.Series, method: str) -> pd.Series:
        # standardised pandas series
        if method == "standardise":
            return (vec - vec.mean()) / vec.std()
        elif method == "normalise":
            return vec / vec.sum()
        else:
            raise ValueError("Method must be either 'standardise' or 'normalise'.")

    def get_torch_time_feature(
        self,
        time_feature: str = "timeline_index",
        standardise: bool = True,
        standardise_method: str = "standardise",
    ) -> torch.tensor:
        """
        Returns a `torch.tensor` object of the time_feature that is requested
        (the string passed has to be one of the strings in `._time_feature_choices`).

        Parameters
        ----------
        time_feature : str, optional
            Which time feature to obtain `torch.tensor` for, by default "timeline_index".
        standardise : bool, optional
            Whether or not to standardise the time feature, by default True.

        Returns
        -------
        torch.tensor
            Time feature.

        Raises
        ------
        ValueError
            if `time_feature` is not in the possible time_features
            (can be found in `._time_feature_choices` attribute).
        """
        if time_feature not in self._time_feature_choices:
            raise ValueError(
                f"`time_feature` should be in {self._time_feature_choices}"
            )
        if not self.time_features_added:
            self.set_time_features()
        if standardise:
            feature = self._standardise_pd(
                vec=self.df[time_feature], method=standardise_method
            )
            return torch.tensor(feature)
        else:
            return torch.tensor(self.df[time_feature])

    def get_torch_path(self,
                       include_time_features: bool = True) -> torch.tensor:
        """
        Returns a torch.tensor object of the path.
        Includes the time features by default (if they are present after the padding).

        Parameters
        ----------
        include_time_features : bool, optional
            Whether or not to keep the time features, by default True.

        Returns
        -------
        torch.tensor
            Path.

        Raises
        ------
        ValueError
            if `self.array_padded` is `None`. In this case, need to call `.pad()` first.
        """
        if self.array_padded is None:
            raise ValueError("Need to first call to create the path `.pad()`.")
        # first strip away the id_column and label_column (if exists)
        if self.label_column is not None:
            # remove last two columns in the third dimension
            # (which store id_column and label_column)
            path = torch.from_numpy(self.array_padded[:, :, :-2].astype("float"))
        else:
            # there are no labels, so just remove last column in third dimension
            # (which stores id_column)
            path = torch.from_numpy(self.array_padded[:, :, :-1].astype("float"))
        if not include_time_features:
            # computes how many time features there are currently
            # (which occur in the first n_time_features columns)
            n_time_features = len(
                [item for item in self._time_feature_choices if item in self.df_padded]
            )
            # removes any time features (if they're present)
            path = path[:, :, n_time_features:]
        return path

    def get_torch_embeddings(self, reduced_embeddings: bool = False) -> torch.tensor:
        """
        Returns a `torch.tensor` object of the the embeddings.

        Parameters
        ----------
        reduced_embeddings : bool, optional
            If True, returns `torch.tensor` of dimension reduced embeddings,
            by default False.

        Returns
        -------
        torch.tensor
            Embeddings.
        """
        if reduced_embeddings:
            if self.embeddings_reduced is None:
                raise ValueError(
                    "There were no reduced embeddings passed into the class."
                )
            else:
                colnames = [
                    col for col in self.df.columns if re.match(r"^d\w*[0-9]", col)
                ]
        else:
            colnames = [col for col in self.df.columns if re.match(r"^e\w*[0-9]", col)]
        return torch.tensor(self.df[colnames].values)

    def get_torch_path_for_deepsignet(self,
                                      include_time_features_in_path: bool,
                                      include_time_features_in_input: bool,
                                      include_embedding_in_input: bool,
                                      reduced_embeddings: bool = False):
        if self.array_padded is None:
            raise ValueError("Need to first call to create the path `.pad()`.")
        
        # obtains a torch tensor which can be inputted into deepsignet
        # computes how many time features there are currently
        # (which occur in the first n_time_features columns)
        n_time_features = len(
            [item for item in self._time_feature_choices if item in self.df_padded]
        )

        if include_embedding_in_input:
            # repeat the embeddings which will be concatenated to the path later
            if self.pad_method == "id":
                print(f"[INFO] The path was created for each {self.id_column} in the dataframe, "
                      "so to include embeddings in the FFN input, we concatenate the "
                      "pooled embeddings.")
                if self.pooled_embeddings is None:
                    raise ValueError(
                        "There were no pooled embeddings passed into the class."
                    )
                if self.array_padded.shape[0] != self.pooled_embeddings.shape[0]:
                    raise ValueError(
                        "If want to include the pooled embeddings in the FFN input, the path "
                        "(found in `.array_padded`) must have the same number of "
                        "samples as there are pooled embeddings, i.e `.array_padded.shape[0]` "
                        "must equal `.pooled_embeddings.shape[0]`."
                    )
                else:
                    emb = torch.from_numpy(self.pooled_embeddings.astype("float"))
            elif self.pad_method == "history":
                print(f"[INFO] The path was created for each item in the dataframe, "
                      "by looking at its history, so to include embeddings in the FFN input, "
                      "we concatenate the embeddings for each sentence / text.")
                if reduced_embeddings:
                    if self.embeddings_reduced is None:
                        raise ValueError(
                            "There were no reduced embeddings passed into the class."
                        )
                    elif self.array_padded.shape[0] != self.embeddings_reduced.shape[0]:
                        raise ValueError(
                            "If want to include reduced embeddings in the FFN input, the path "
                            "(found in `.array_padded`) must have the same number of "
                            "samples as there are embeddings, i.e `.array_padded.shape[0]` "
                            "must equal `.embeddings_reduced.shape[0]`."
                        )
                    else:
                        emb = torch.from_numpy(self.embeddings_reduced.astype("float"))
                else:
                    if self.array_padded.shape[0] != self.embeddings.shape[0]:
                        raise ValueError(
                            "If want to include the full embeddings in the FFN input, the path "
                            "(found in `.array_padded`) must have the same number of "
                            "samples as there are embeddings, i.e `.array_padded.shape[0]` "
                            "must equal `.embeddings.shape[0]`."
                        )
                    else:
                        emb = torch.from_numpy(self.embeddings.astype("float"))
            repeat_emb = emb.unsqueeze(2).repeat(1, 1, self.array_padded.shape[1]).transpose(1, 2)

        if include_time_features_in_path:
            # make sure path includes the time features
            path = self.get_torch_path(include_time_features=True)
            input_channels = path.shape[2]
            if include_time_features_in_input:
                # need to repeat the time feature columns
                # if there are no time features, then we don't need to repeat anything
                if n_time_features == 1:
                    path = torch.cat([path, path[:,:,0].unsqueeze(2)], dim = 2)
                elif n_time_features > 1:
                    path = torch.cat([path, path[:,:,0:n_time_features]], dim = 2)
        else:
            if include_time_features_in_input:    
                # path doesn't need to include the time features
                # but we still want to include them in the input to the FFN for classification
                path = self.get_torch_path(include_time_features=True)                
                input_channels = path.shape[2]-n_time_features
                # need to move time features to the end of the path
                # if there are no time features, then we don't need to move anything
                if n_time_features == 1:
                    path = torch.cat([path[:,:,n_time_features:], path[:,:,0].unsqueeze(2)], dim = 2)
                elif n_time_features > 1:
                    path = torch.cat([path[:,:,n_time_features:], path[:,:,0:n_time_features]], dim = 2)
            else:
                # path doesn't need to include the time features
                # and don't need to include them in the input to the FFN for classification
                path = self.get_torch_path(include_time_features=False)
                input_channels = path.shape[2]

        if include_embedding_in_input:
            path = torch.cat([path, repeat_emb], dim = 2)

        return path, input_channels


In [82]:
# Wrap the tokenized dataframe together with its embeddings:
# - "text_id" is the id column the rows are grouped by when padding,
# - "language" is the label column,
# - `token_embeddings` become the 'e*' columns of `dataset.df`,
# - `pooled_embeddings` are kept for `get_torch_path_for_deepsignet`.
dataset = PrepareData(text_encoder.tokenized_df,
                             id_column="text_id",
                             labels_column="language",
                             embeddings=token_embeddings,
                             pooled_embeddings=pooled_embeddings)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Note 'datetime' is not a column in `.df`, so only 'timeline_index' is added.
[INFO] As 'datetime' is not a column in `.df`, we assume that the data is ordered by time with respect to the id.
[INFO] Adding 'timeline_index' feature...


In [83]:
# number of distinct text_id values; `pad(pad_by="id")` builds one path per id
len(dataset.df["text_id"].unique())

12000

In [84]:
# Build one path per text_id of length k=10, padding with zeros where an id
# has fewer than 10 rows. The raw (unstandardised) timeline_index is kept as
# a time feature. Expected shape: (number of ids, k, number of columns).
word_path = dataset.pad(pad_by="id",
                        zero_padding=True,
                        method="k_last",
                        k=10,
                        time_feature=["timeline_index"],
                        standardise_time_feature=False)
word_path.shape

[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:30<00:00, 387.23it/s]


(12000, 10, 771)

In [85]:
# padded dataframe behind `word_path`: k consecutive rows per text_id
dataset.df_padded

Unnamed: 0,timeline_index,e1,e2,e3,e4,e5,e6,e7,e8,e9,...,e761,e762,e763,e764,e765,e766,e767,e768,text_id,language
0,2,0.143588,-1.417048,1.320182,-0.508459,1.247967,0.034426,-1.468047,-0.961864,0.559406,...,-0.980690,0.403656,0.398150,0.487656,-0.676191,0.807380,-1.213304,-0.957085,0,de
1,3,1.107637,-0.667894,2.376304,-0.928480,3.609042,-0.043664,-1.700852,0.552448,-0.633671,...,-0.324514,-0.190638,-0.429270,1.990711,-0.213387,0.543733,0.014306,-0.571665,0,de
2,4,0.777725,-0.197840,2.643306,-1.788009,-0.637072,-0.254247,0.406706,-0.182973,0.702718,...,-0.377799,2.157183,-0.826511,1.861895,-1.222253,-0.212417,-1.194266,-1.107049,0,de
3,5,0.717557,-0.304074,1.029350,-1.073429,1.875072,-0.084052,-1.756982,0.406479,-0.279253,...,0.262103,1.245355,-0.515692,1.316131,-0.730478,-0.612286,0.887850,-0.179549,0,de
4,6,0.921733,0.051804,0.740113,-0.494959,0.141836,0.293235,0.499587,0.465118,-0.825649,...,-1.601243,1.046709,0.133001,1.805218,-0.145986,0.592019,-2.297613,-1.601534,0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,6,1.178958,-0.148862,1.265958,-0.734617,0.983744,-0.335252,0.362385,0.789220,-0.720118,...,0.082894,2.911485,-0.729118,2.193193,-0.566522,-0.983746,-1.191947,0.632889,11999,sv
119996,7,1.024163,-2.861085,0.770218,-0.320513,0.436667,0.087550,-0.765312,0.123461,-0.976917,...,-0.779674,1.250211,-0.421141,1.575737,-0.471906,1.111535,-1.863977,0.369330,11999,sv
119997,8,1.684086,-0.072866,1.837575,-1.241489,1.546939,0.040575,-0.274818,0.115267,-0.512420,...,-0.870751,1.708995,-0.186876,1.444939,0.052923,0.052861,-0.449925,0.161691,11999,sv
119998,9,2.046932,-0.022341,0.495391,-1.086745,0.310161,-0.112842,0.781924,0.100807,0.078282,...,-1.145458,3.236154,-0.256295,1.222923,-0.354660,-0.288702,-0.704664,-0.622784,11999,sv


In [86]:
# the padded dataframe still has the labels and the ids (the last two columns);
# here we look at the k rows that make up the path of text_id 0
dataset.df_padded[dataset.df_padded["text_id"]==0]

Unnamed: 0,timeline_index,e1,e2,e3,e4,e5,e6,e7,e8,e9,...,e761,e762,e763,e764,e765,e766,e767,e768,text_id,language
0,2,0.143588,-1.417048,1.320182,-0.508459,1.247967,0.034426,-1.468047,-0.961864,0.559406,...,-0.98069,0.403656,0.39815,0.487656,-0.676191,0.80738,-1.213304,-0.957085,0,de
1,3,1.107637,-0.667894,2.376304,-0.92848,3.609042,-0.043664,-1.700852,0.552448,-0.633671,...,-0.324514,-0.190638,-0.42927,1.990711,-0.213387,0.543733,0.014306,-0.571665,0,de
2,4,0.777725,-0.19784,2.643306,-1.788009,-0.637072,-0.254247,0.406706,-0.182973,0.702718,...,-0.377799,2.157183,-0.826511,1.861895,-1.222253,-0.212417,-1.194266,-1.107049,0,de
3,5,0.717557,-0.304074,1.02935,-1.073429,1.875072,-0.084052,-1.756982,0.406479,-0.279253,...,0.262103,1.245355,-0.515692,1.316131,-0.730478,-0.612286,0.88785,-0.179549,0,de
4,6,0.921733,0.051804,0.740113,-0.494959,0.141836,0.293235,0.499587,0.465118,-0.825649,...,-1.601243,1.046709,0.133001,1.805218,-0.145986,0.592019,-2.297613,-1.601534,0,de
5,7,1.960512,-0.114054,-0.089443,-0.959227,-0.274134,-0.047917,0.303097,0.89557,0.704869,...,-0.654793,2.553124,-0.306052,1.188357,-0.323826,-0.661785,-0.127747,-0.642438,0,de
6,8,0.858337,0.98012,-0.480119,-1.347086,0.100639,0.609157,-1.403996,0.971574,1.116285,...,0.560951,-0.665503,0.56211,1.043689,0.655348,0.996938,-0.654039,-0.913582,0,de
7,9,0.739917,-0.593066,1.324941,-1.596288,0.311475,-0.266285,-0.007784,0.910885,1.055482,...,0.558452,1.238724,-0.340453,1.134031,-0.87936,0.472009,1.334212,-0.187999,0,de
8,10,1.059729,1.284173,0.815329,-0.547035,1.821547,-0.03291,-0.811382,0.412806,0.762482,...,-0.534226,0.42333,0.452225,-0.418872,0.391442,0.30564,0.317617,-0.111468,0,de
9,11,-0.218854,-0.604878,1.608435,-0.389211,1.714435,-0.739268,-1.454955,0.345475,-1.222678,...,-0.966916,0.498792,-0.507472,1.434245,-0.376011,1.072734,-0.002285,-0.825246,0,de


In [87]:
# by default keeps the time features; the trailing id and label columns are
# dropped, so the last dimension shrinks by two compared to `word_path`
torch_word_path = dataset.get_torch_path()
torch_word_path.shape

torch.Size([12000, 10, 769])

In [88]:
# manual check of what `get_torch_path_for_deepsignet` does internally for
# pad_by="id": start from the pooled embeddings as a float tensor
emb = torch.from_numpy(dataset.pooled_embeddings.astype("float"))

In [89]:
# repeat each pooled embedding 10 times (the path length k used above) so it
# can be concatenated to every step of the path
# (assumes `emb` is 2-D, i.e. one embedding vector per text_id - TODO confirm)
repeated_emb = emb.unsqueeze(2).repeat(1,1,10).transpose(1, 2)

In [90]:
# time features in the path AND repeated as extra input, plus the pooled
# embeddings appended at the end; `input_channels` counts only the path channels
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(include_time_features_in_path=True,
                                                                       include_time_features_in_input=True,
                                                                       include_embedding_in_input=True)
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

[INFO] The path was created for each text_id in the dataframe, so to include embeddings in the FFN input, we concatenate the pooled embeddings.
path shape: torch.Size([12000, 10, 1538])
input_channels: 769


In [91]:
# time features in the path AND repeated as extra input, no embeddings appended:
# the last dimension is the path channels plus one repeated time feature
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(include_time_features_in_path=True,
                                                                       include_time_features_in_input=True,
                                                                       include_embedding_in_input=False)
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 770])
input_channels: 769


In [92]:
# time features in the path only: nothing is appended, so the last dimension
# equals `input_channels`
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(include_time_features_in_path=True,
                                                                       include_time_features_in_input=False,
                                                                       include_embedding_in_input=False)
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 769])
input_channels: 769


In [93]:
# time features as extra input only: they are moved behind the embedding
# channels, so `input_channels` is one less than the last dimension
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(include_time_features_in_path=False,
                                                                       include_time_features_in_input=True,
                                                                       include_embedding_in_input=False)
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 769])
input_channels: 768


In [94]:
# no time features anywhere and no embeddings appended: the path consists of
# the embedding channels only
torch_word_path_for_deepsignet = dataset.get_torch_path_for_deepsignet(include_time_features_in_path=False,
                                                                       include_time_features_in_input=False,
                                                                       include_embedding_in_input=False)
print(f"path shape: {torch_word_path_for_deepsignet[0].shape}")
print(f"input_channels: {torch_word_path_for_deepsignet[1]}")

path shape: torch.Size([12000, 10, 768])
input_channels: 768
