# Task 2: Visualize hidden represenations of a model

In [2]:
import h5py

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [6]:
# TODO: your code goes here

In [3]:
import torch 
import h5py

import datasets
import numpy as np
import transformers
import random

In [3]:
MODEL_NAME = "facebook/xglm-564M"
DATASET_NAME = "facebook/flores"

In [4]:
LANGUAGES = [
    "eng_Latn",
    "spa_Latn",
    "deu_Latn",
    "arb_Arab",
    "tam_Taml",
    "quy_Latn"
]

In [5]:
data = {}
for lang in LANGUAGES:
    data[lang] = datasets.load_dataset("facebook/flores", lang, trust_remote_code=True)

In [13]:
len(data["eng_Latn"]["dev"])

997

In [15]:
random_indexes = random.sample(range(len(data["eng_Latn"]["dev"])), 200)

In [17]:
extracted_sentences = {}
for lang in LANGUAGES:
    extracted_sentences[lang] = [data[lang]["dev"][i]["sentence"] for i in random_indexes]

In [19]:
extracted_sentences

{'eng_Latn': ['In his notes he used words which some parents considered coarse, and he reportedly used profanity in class.',
  'Day hiking involves distances of less than a mile up to longer distances that can be covered in a single day.',
  'The town lies immediately next to the falls, and they are the major attraction, but this popular tourist destination offers both adventure seekers and sightseers plenty of opportunities for a longer stay.',
  "There is also no requirement that you obtain a local number from the community in which you live; you can obtain a satellite Internet connection in the wilds of Chicken, Alaska and select a number which claims you're in sunny Arizona.",
  'Although its physical state may change, its chemical state remains the same.',
  'This will ensure that your particular interests and/or constraints are matched with the ship most suitable to them.',
  'Typical for the period, Kirby Muxloe Castle is more of a fortified house than a true castle.',
  'The ea

In [38]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

Some weights of XGLMModel were not initialized from the model checkpoint at facebook/xglm-564M and are newly initialized: ['model.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Function to obtain hidden representations
def get_hidden_representations(sentence):
    tokens = tokenizer.tokenize(sentence)
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.hidden_states
    return tokens, hidden_states

# Save representations to HDF5 file
def save_representations(lang, sentences):
    with h5py.File(f"{lang}_representations.hdf5", "w") as f:
        for sentence, idx in zip(sentences, random_indexes):
            tokens, hidden_states = get_hidden_representations(sentence)
            # Calculate sentence representation (mean-pooling)
            sentence_representation = torch.mean(torch.stack(hidden_states)[:, 1:-1, :], dim=0)  # Mean over all layers, exclude padding tokens
            # Save token representations
            for layer_idx, layer_output in enumerate(hidden_states):
                for token_idx, (token, token_hidden) in enumerate(zip(tokens, layer_output[1:-1])):  # Exclude CLS and SEP tokens
                    dataset_name = f"{idx}/{layer_idx}/{token_idx}_{token}"
                    f.create_dataset(dataset_name, data=token_hidden.numpy())
            # Save sentence representation
            sentence_rep_dataset_name = f"{idx}/sentence_representation"
            f.create_dataset(sentence_rep_dataset_name, data=sentence_representation.numpy())
    print(f"Representations for {lang} saved to {lang}_representations.hdf5")

# Save representations for each language
for lang in LANGUAGES:
    save_representations(lang, extracted_sentences[lang])

Representations for eng_Latn saved to eng_Latn_representations.hdf5
Representations for spa_Latn saved to spa_Latn_representations.hdf5
Representations for deu_Latn saved to deu_Latn_representations.hdf5
Representations for arb_Arab saved to arb_Arab_representations.hdf5
Representations for tam_Taml saved to tam_Taml_representations.hdf5
Representations for quy_Latn saved to quy_Latn_representations.hdf5
