### Part 1: Saving the hidden representations

In [None]:
import torch
import h5py
import altair as alt
import pandas as pd
import numpy as np
import os

# %pip install datasets transformers


from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"USING DEVICE: {device}")

In [None]:
# Hugging Face identifier of the tokenizer (used for both tasks).
TOKENIZER_NAME = "facebook/xglm-564M"

# For task 2
# TASK selects the output sub-directory under ../data/; MODEL_NAME is either a
# Hugging Face model identifier or a local path to a saved model.
TASK = "task_2"
MODEL_NAME = "facebook/xglm-564M"

# For task 3
# TASK = "task_3"
# MODEL_NAME = "../models/full-nllb.pt"

DATASET_NAME = "facebook/flores"

# Number of hidden-state tensors stored per sentence. Index 0 is treated as the
# embedding layer by the visualization cells.
# NOTE(review): presumably 24 transformer layers + embeddings for xglm-564M — confirm.
NUM_LAYERS = 25
# Number of sentences randomly sampled per language and split.
SAMPLES = 200

AVAILABLE_SPLITS = ["dev", "devtest"]

# This is the minimal set of languages that you should analyze.
# Feel free to experiment with additional languages available in the FLORES dataset.
LANGUAGES = [
    "eng_Latn",
    "spa_Latn",
    "deu_Latn",
    "arb_Arab",
    "tam_Taml",
    "quy_Latn"
]

In [None]:
# First let's handle the random sampling of the dataset
def random_sample(dataset, n, seed=None):
    """Randomly sample n items from a dataset without replacement.

    Args:
        dataset: torch.utils.data.Dataset - the dataset to sample from
        n: int - the number of samples to draw
        seed: int or None - optional seed for a reproducible draw. When None
            (the default) NumPy's global random state is used, so callers that
            seed it with ``np.random.seed`` keep their existing behaviour.

    Returns:
        torch.utils.data.Subset - a subset of the original dataset with n
        samples, ordered by ascending original index

    Raises:
        ValueError: if n is negative or larger than the dataset.
    """
    dataset_len = len(dataset)
    if n > dataset_len:
        raise ValueError("Number of samples cannot exceed the dataset length.")
    if n < 0:
        raise ValueError("Number of samples cannot be negative.")

    if seed is None:
        picked = np.random.choice(dataset_len, n, replace=False)
    else:
        picked = np.random.default_rng(seed).choice(dataset_len, n, replace=False)

    # Plain Python ints: not every dataset type accepts NumPy integer scalars
    # as item keys.
    indices = np.sort(picked).tolist()
    return torch.utils.data.Subset(dataset, indices)

In [None]:
def load_flores_datasets(languages, splits):
    """Load the FLORES datasets for the given languages and splits and subsample them.

    Every requested (language, split) pair is loaded and then reduced to
    SAMPLES randomly chosen sentences.

    Args:
        languages (list): a list of FLORES language codes
        splits (list): a list of split names

    Returns:
        dict: ``{language: {split: Subset}}`` holding the sampled sentences
    """
    flores_data = {}
    for language in languages:
        print(f"Loading dataset for language: {language}")
        flores_data[language] = {}
        for split in splits:
            flores_data[language][split] = load_dataset(
                "facebook/flores",
                language,
                split=split,
                trust_remote_code=True,
                cache_dir="../cache/languages"
            )

    # Sample exactly what was loaded: iterate the arguments, not the
    # module-level LANGUAGES / AVAILABLE_SPLITS, so calls with a different
    # language or split list neither raise KeyError nor skip sampling.
    for language in languages:
        for split in splits:
            print(f"Sampling {SAMPLES} samples for {language} - {split}")
            flores_data[language][split] = random_sample(flores_data[language][split], SAMPLES)
    return flores_data

In [None]:
class Tokenizer:
    """Wrapper around a pretrained tokenizer that remembers its call options."""

    def __init__(self, model_name, padding="longest", truncation="longest_first", return_tensors="pt"):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Options forwarded to the underlying tokenizer on every call.
        self.padding = padding
        self.truncation = truncation
        self.return_tensors = return_tensors

    def tokenize(self, sentences):
        """Encode a batch of sentences.

        Args:
            sentences (list): The sentences to encode

        Returns:
            dict: The underlying tokenizer's output, extended with the raw
            sentences under "sentence" and their token strings under "tokens"
        """
        encoded = self.tokenizer(
            sentences,
            truncation=self.truncation,
            padding=self.padding,
            return_tensors=self.return_tensors,
        )

        # Carry the entire sentences and their token strings along with the
        # encoded batch so callers do not have to recompute them.
        encoded["sentence"] = list(sentences)
        encoded["tokens"] = list(map(self.tokenizer.tokenize, sentences))

        return encoded

In [None]:
def collate_fn(batch, tokenizer):
    """Turn a list of dataset samples into one tokenized batch.

    Args:
        batch (list): samples, each providing its text under "sentence"
        tokenizer (Tokenizer): object whose ``tokenize`` method encodes a list of sentences

    Returns:
        dict: whatever ``tokenizer.tokenize`` returns for the batch's sentences
    """
    sentences = []
    for sample in batch:
        sentences.append(sample["sentence"])
    return tokenizer.tokenize(sentences)

def build_dataloaders(languages, batch_size, collate_fn, tokenizer, shuffle=False):
    """Create one DataLoader per language over the sampled FLORES "devtest" split.

    Args:
        languages (list): a list of languages
        batch_size (int): the batch size
        collate_fn (function): called as ``collate_fn(batch, tokenizer)`` for every batch
        tokenizer (Tokenizer): the tokenizer handed to ``collate_fn``
        shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.

    Returns:
        dict: a mapping from language to its DataLoader
    """
    def collate_with_tokenizer(batch):
        # DataLoader passes only the batch; supply the tokenizer here.
        return collate_fn(batch, tokenizer)

    datasets_by_language = load_flores_datasets(languages, ["dev", "devtest"])

    return {
        language: DataLoader(
            datasets_by_language[language]["devtest"],
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_with_tokenizer,
        )
        for language in languages
    }

In [None]:
def write_data_to_file(data, lang):
    """Write a nested dictionary to ``../data/<TASK>/<lang>.hdf5``.

    Nested dictionaries become HDF5 groups and every other value becomes a
    dataset, so the file mirrors the structure of ``data``. The output
    directory is created if it does not exist yet.

    Args:
        data (dict): The (arbitrarily nested) data to write to the file
        lang (str): The language code, used as the file name
    """
    out_dir = f'../data/{TASK}'
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the file; no explicit close() is needed.
    with h5py.File(f'{out_dir}/{lang}.hdf5', 'w') as f:
        _write_hdf5_node(f, data)


def _write_hdf5_node(group, mapping):
    """Recursively store ``mapping`` under ``group`` (dict -> group, other -> dataset)."""
    for key, value in mapping.items():
        if isinstance(value, dict):
            _write_hdf5_node(group.create_group(key), value)
        else:
            group.create_dataset(key, data=value)

In [None]:
def build_model(model_name, device):
    """Build a model from a local checkpoint or a Hugging Face model name.

    If ``model_name`` is an existing path, the fully pickled model stored
    there is loaded; otherwise it is treated as a model identifier for
    ``AutoModelForCausalLM.from_pretrained``.

    Args:
        model_name (str): the name or path of the model
        device (torch.device): the device to run the model on

    Returns:
        torch.nn.Module: the model, moved to ``device``
    """
    if os.path.exists(model_name):
        print(f"Loading model from path: {model_name}")
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        # weights_only=False is needed because the file holds a whole pickled
        # module, which recent PyTorch versions refuse to unpickle by default.
        # SECURITY: unpickling executes arbitrary code - only load trusted files.
        model = torch.load(model_name, map_location=device, weights_only=False)
    else:
        print(f"Loading model from name: {model_name}")
        model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to(device)
    return model

In [None]:
# Number of sentences per forward pass.
BATCH_SIZE = 2

# Wrapper around the pretrained tokenizer named by TOKENIZER_NAME.
tokenizer = Tokenizer(TOKENIZER_NAME)

# One DataLoader per language over the sampled "devtest" split.
flores_dataloaders = build_dataloaders(LANGUAGES, BATCH_SIZE, collate_fn, tokenizer)

# Load the model onto the selected device and put it in evaluation mode.
model = build_model(MODEL_NAME, device)
model.eval()

In [None]:
# Data format stored in each <lang>.hdf5 file:
#
# data = {
#   "data": {
#     "layers": {                      # One entry per hidden-state tensor
#       "l_0": {                       # Index 0 is the embedding output
#         "s_0": {                     # First sampled sentence
#           "t_0": {                   # First non-padding token of the sentence
#             "t": str,                # The string representation of the token
#             "s": np.array,           # Hidden representation of the token
#             "id": int,               # The encoded id of the token
#           },
#           "t_1": {"t": str, "s": np.array, "id": int},
#           "s": np.array,             # Mean of the token states, shape (1, hidden)
#         },
#         "s_1": {...},
#       },
#       "l_1": {...},
#     },
#     "sentences": ["My cat has an orange cat energy", "My dog is cute"],
#   }
# }
pad_token_id = tokenizer.tokenizer.pad_token_id

with torch.no_grad():
    for lang in LANGUAGES:
        # Pre-create every layer/sentence slot so the file layout is the same
        # for all languages.
        layers = {
            f"l_{idx_layer}": {f"s_{idx_sentence}": {} for idx_sentence in range(SAMPLES)}
            for idx_layer in range(NUM_LAYERS)
        }
        sentences = []
        data = {"data": {"layers": layers, "sentences": sentences}}

        # Running index of the first sentence of the current batch. Counting
        # the sentences actually seen avoids relying on the global BATCH_SIZE
        # matching the dataloader's batch size.
        sentence_offset = 0
        for batch in tqdm(flores_dataloaders[lang]):
            sentences.extend(batch["sentence"])

            # Call the module itself (not .forward) so registered hooks run.
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                output_hidden_states=True,
            )

            # The non-padding positions, ids and token strings are the same
            # for every layer, so compute them once per batch.
            kept = []
            for token_sequence in batch["input_ids"]:
                ids = token_sequence.tolist()
                positions = [pos for pos, token_id in enumerate(ids) if token_id != pad_token_id]
                token_ids = [ids[pos] for pos in positions]
                token_strs = tokenizer.tokenizer.convert_ids_to_tokens(token_ids)
                kept.append((torch.tensor(positions, dtype=torch.long), token_ids, token_strs))

            # h_l -> hidden states of one layer for the whole batch
            for idx_h_l, h_l in enumerate(outputs.hidden_states):
                layer_states = h_l.cpu()  # one device transfer per layer, not per token
                # setdefault: tolerate a model that returns more hidden states
                # than NUM_LAYERS.
                layer_store = layers.setdefault(f"l_{idx_h_l}", {})
                for id_sequence, (positions, token_ids, token_strs) in enumerate(kept):
                    sentence_store = layer_store.setdefault(f"s_{sentence_offset + id_sequence}", {})
                    states = layer_states[id_sequence, positions]  # (num_tokens, hidden)

                    # Tokens are numbered contiguously (t_0 .. t_{n-1}) whatever
                    # the padding side, which is what the readers below expect.
                    for idx_token, (token_id, token_str) in enumerate(zip(token_ids, token_strs)):
                        sentence_store[f"t_{idx_token}"] = {
                            "t": token_str,
                            "s": states[idx_token].numpy(),
                            "id": token_id,
                        }

                    # Sentence representation: mean over the non-padding tokens.
                    # keepdim=True keeps the (1, hidden) shape the readers index with [0].
                    sentence_store["s"] = states.mean(dim=0, keepdim=True).numpy()

            sentence_offset += len(batch["sentence"])

        # Write the data to files
        write_data_to_file(data, lang)

### Part 2: Visualizing the data using PCA and t-SNE

#### Util methods for performing PCA and t-SNE on some data

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def apply_pca(data, n_components=2):
    """Project the given data onto its leading principal components.

    Args:
        data (MatrixLike): The samples to project, one row per sample
        n_components (int, optional): Number of components to keep. Defaults to 2.

    Returns:
        ndarray: The data expressed in the fitted PCA basis
    """
    return PCA(n_components=n_components).fit_transform(data)

def apply_tsne(data, n_components=2, perplexity=30, n_jobs=-1, random_state=None):
    """Applies t-SNE to the given data

    Args:
        data (MatrixLike): The data to be transformed
        n_components (int, optional): Dimension of the embedded space. Defaults to 2.
        perplexity (int, optional): The perplexity is related to the number of nearest
            neighbors. Defaults to 30.
        n_jobs (int, optional): The number of parallel jobs to run. Defaults to -1.
        random_state (int, optional): Seed for a reproducible embedding. Defaults to
            None, which leaves the result non-deterministic as before.

    Returns:
        ndarray: Data transformed by t-SNE
    """
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    return tsne.fit_transform(data)

#### Retrieve the hidden representation from the first hidden layer

In [None]:
LAYER = 1  # Not 0 because we want to visualize the first hidden layer and not the embedding layer

# Collect one record per token (token string, hidden state, language) for
# every sampled sentence of every language.
data = []
for lang in LANGUAGES:
    # The context manager closes the file even if a lookup below fails.
    with h5py.File(f'../data/{TASK}/{lang}.hdf5', 'r') as f:
        layer_group = f[f"data/layers/l_{LAYER}"]
        for idx_sample in range(SAMPLES):
            sentence_group = layer_group[f"s_{idx_sample}"]
            # Every entry except "s" (the hidden state of the entire sentence)
            # is a token group, hence the -1.
            for idx_token in range(len(sentence_group) - 1):
                token_group = sentence_group[f"t_{idx_token}"]
                data.append({
                    "token": token_group["t"][()].decode("utf-8"),
                    "hidden_state": token_group["s"][()],
                    "language": lang
                })

#### Apply PCA to the first-layer hidden representations of each language and visualize the result in 2D

In [None]:
# Let Altair handle large datasets through the VegaFusion data transformer
alt.data_transformers.enable("vegafusion")

# Project every token's hidden state onto the first two principal components
pca_data = apply_pca([d["hidden_state"] for d in data])

# One row per token. The frame is used as-is: concatenating it with itself
# would draw every point twice.
main_df = pd.DataFrame(pca_data, columns=["First component", "Second component"])
main_df["Token"] = [str(d["token"]) for d in data]
main_df["Language"] = [d["language"] for d in data]

alt.Chart(main_df).mark_circle(size=20).encode(
    x='First component',
    y='Second component',
    color='Language',
    tooltip=['Token', 'Language'],
).interactive().properties(title="PCA Visualization of first hidden layer")

#### Apply t-SNE to the first-layer hidden representations of each language and visualize the result in 2D

In [None]:
# Let Altair handle large datasets through the VegaFusion data transformer
alt.data_transformers.enable("vegafusion")

# Embed every token's hidden state in 2D with t-SNE
tsne_data = apply_tsne(np.array([d["hidden_state"] for d in data]))

# One row per token. The frame is used as-is: concatenating it with itself
# would draw every point twice.
main_df = pd.DataFrame(tsne_data, columns=["First component", "Second component"])
main_df["Token"] = [str(d["token"]) for d in data]
main_df["Language"] = [d["language"] for d in data]

alt.Chart(main_df).mark_circle(size=20).encode(
    x='First component',
    y='Second component',
    color='Language',
    tooltip=['Token', 'Language']
).interactive().properties(title="t-SNE Visualization of first hidden layer")

#### Create the directories to store the charts

In [None]:
# Create the output directories for the token-level and sentence-level charts:
# <base>/<format>/<method> for every format (png, svg, html) and method (pca, tsne)
tokens_path = f"../data/{TASK}/charts/tokens"
sentence_path = f"../data/{TASK}/charts/sentence"

paths_to_create = [
    f"{base_path}/{chart_format}/{method}"
    for base_path in (tokens_path, sentence_path)
    for method in ("pca", "tsne")
    for chart_format in ("png", "svg", "html")
]

for path in paths_to_create:
    # exist_ok avoids the check-then-create race and makes the cell re-runnable
    os.makedirs(path, exist_ok=True)

#### Visualizing (PCA and t-SNE) each layer of the model (including the embedding layer)

In [None]:
# For every layer: gather the token hidden states of all languages, project
# them to 2D with PCA and t-SNE, and save both charts as png, svg and html.
for idx_layer in range(NUM_LAYERS):
    data = []

    # Load the data for each language for the current `idx_layer` hidden layer
    for lang in LANGUAGES:
        # The context manager closes the file even if a lookup below fails.
        with h5py.File(f'../data/{TASK}/{lang}.hdf5', 'r') as f:
            layer_group = f[f"data/layers/l_{idx_layer}"]
            for idx_sample in range(SAMPLES):
                sentence_group = layer_group[f"s_{idx_sample}"]
                # We need to subtract 1 because one entry ("s") is the hidden
                # representation of the entire sentence
                for idx_token in range(len(sentence_group) - 1):
                    token_group = sentence_group[f"t_{idx_token}"]
                    data.append({
                        "token": token_group["t"][()].decode("utf-8"),
                        "hidden_state": token_group["s"][()],
                        "language": lang
                    })

    # Shared inputs for both projections
    hidden_states = np.array([d["hidden_state"] for d in data])
    token_labels = [str(d["token"]) for d in data]
    language_labels = [d["language"] for d in data]

    # The frames are built directly instead of being concatenated onto an
    # empty DataFrame, which adds nothing and is deprecated in recent pandas.
    main_df_pca = pd.DataFrame(apply_pca(hidden_states), columns=["First component", "Second component"])
    main_df_pca["Token"] = token_labels
    main_df_pca["Language"] = language_labels

    main_df_tsne = pd.DataFrame(apply_tsne(hidden_states), columns=["First component", "Second component"])
    main_df_tsne["Token"] = token_labels
    main_df_tsne["Language"] = language_labels

    if idx_layer == 0:
        title = "Visualization of the embedding layer"
    else:
        title = f"Visualization of the {idx_layer} hidden layer"

    chart_pca = alt.Chart(main_df_pca).mark_circle(size=20).encode(
        x='First component',
        y='Second component',
        color='Language',
        tooltip=['Token', 'Language'],
    ).properties(title=f"PCA {title}")

    chart_tsne = alt.Chart(main_df_tsne).mark_circle(size=20).encode(
        x='First component',
        y='Second component',
        color='Language',
        tooltip=['Token', 'Language'],
    ).properties(title=f"t-SNE {title}")

    chart_pca.save(f'../data/{TASK}/charts/tokens/png/pca/pca_{idx_layer}.png')
    chart_pca.save(f'../data/{TASK}/charts/tokens/svg/pca/pca_{idx_layer}.svg')
    chart_pca.save(f'../data/{TASK}/charts/tokens/html/pca/pca_{idx_layer}.html')

    chart_tsne.save(f'../data/{TASK}/charts/tokens/png/tsne/tsne_{idx_layer}.png')
    chart_tsne.save(f'../data/{TASK}/charts/tokens/svg/tsne/tsne_{idx_layer}.svg')
    # Single underscore, matching every other chart file name
    chart_tsne.save(f'../data/{TASK}/charts/tokens/html/tsne/tsne_{idx_layer}.html')

#### Sentence representation for each language

In [None]:
# For every layer: gather the sentence-level hidden states of all languages,
# project them to 2D with PCA and t-SNE, and save both charts as png, svg and html.
for idx_layer in tqdm(range(NUM_LAYERS)):
    data = []

    for lang in LANGUAGES:
        # The context manager closes the file even if a lookup below fails.
        with h5py.File(f'../data/{TASK}/{lang}.hdf5', 'r') as f:
            stored_sentences = f["data/sentences"]
            layer_group = f[f"data/layers/l_{idx_layer}"]
            for idx_sample in range(SAMPLES):
                data.append({
                    "sentence": stored_sentences[idx_sample].decode("utf-8"),
                    # The sentence state is stored with shape (1, hidden); take the row
                    "hidden_state": layer_group[f"s_{idx_sample}/s"][()][0],
                    "language": lang
                })

    # Shared inputs for both projections
    hidden_states = np.array([d["hidden_state"] for d in data])
    sentence_labels = [str(d["sentence"]) for d in data]
    language_labels = [d["language"] for d in data]

    # The frames are built directly instead of being concatenated onto an
    # empty DataFrame, which adds nothing and is deprecated in recent pandas.
    main_df_pca = pd.DataFrame(apply_pca(hidden_states), columns=["First component", "Second component"])
    main_df_pca["Sentence"] = sentence_labels
    main_df_pca["Language"] = language_labels

    main_df_tsne = pd.DataFrame(apply_tsne(hidden_states), columns=["First component", "Second component"])
    main_df_tsne["Sentence"] = sentence_labels
    main_df_tsne["Language"] = language_labels

    if idx_layer == 0:
        title = "Visualization of the embedding layer"
    else:
        title = f"Visualization of the {idx_layer} hidden layer"

    chart_pca = alt.Chart(main_df_pca).mark_circle(size=20).encode(
        x='First component',
        y='Second component',
        color='Language',
        tooltip=['Sentence', 'Language'],
    ).properties(title=f"PCA {title}")

    chart_tsne = alt.Chart(main_df_tsne).mark_circle(size=20).encode(
        x='First component',
        y='Second component',
        color='Language',
        tooltip=['Sentence', 'Language'],
    ).properties(title=f"t-SNE {title}")

    chart_pca.save(f'../data/{TASK}/charts/sentence/png/pca/pca_{idx_layer}.png')
    chart_pca.save(f'../data/{TASK}/charts/sentence/svg/pca/pca_{idx_layer}.svg')
    chart_pca.save(f'../data/{TASK}/charts/sentence/html/pca/pca_{idx_layer}.html')

    chart_tsne.save(f'../data/{TASK}/charts/sentence/png/tsne/tsne_{idx_layer}.png')
    chart_tsne.save(f'../data/{TASK}/charts/sentence/svg/tsne/tsne_{idx_layer}.svg')
    chart_tsne.save(f'../data/{TASK}/charts/sentence/html/tsne/tsne_{idx_layer}.html')