# Task 2: Visualize hidden representations of a model

In [1]:
# import h5py
# 
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE

In [2]:
# Conda package specs required by this notebook (a spec may carry a version pin).
packages_to_install = [
    "ipywidgets",
    "numpy=1.24.0",
    "torch",
    "matplotlib",
    "sentencepiece",
    "protobuf",
    "datasets",
    "transformers",
    "diffusers",
    "peft",
    "h5py",
    "scikit-learn",
    "scipy",
    "wandb",
]

In [3]:
# Register the conda-forge channel in the conda configuration so that the
# package installs in the next cell can resolve from it.
!conda config --add channels conda-forge



In [4]:
%%time
import importlib

for package_name in packages_to_install:
    try:
        importlib.import_module(package_name)
        print(f"{package_name} is already installed.")
    except ImportError:
        print(f"{package_name} is not installed. Installing it now...")
        !conda install -y {package_name}

ipywidgets is already installed.
numpy=1.24.0 is not installed. Installing it now...
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.1.2



# All requested packages already installed.

torch is already installed.
matplotlib is already installed.
sentencepiece is already installed.
protobuf is not installed. Installing it now...
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.1.2



# All requested packages already installed.

datasets is already installed.
transformers is already installed.
diffusers is already installed.


In [5]:
# import dependencies
import matplotlib.pyplot as plt
import numpy as np
import torch

from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, get_dataset_config_names
from transformers import XGLMTokenizer, XGLMTokenizerFast, XGLMForCausalLM, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

In [6]:
# Hugging Face hub identifiers for the evaluation data and the model under study.
DATA_SET_NAME = "facebook/flores"
MODEL_NAME = "facebook/xglm-564M"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# FLORES language configurations to analyse (language code + script).
LANGUAGES = [
    "eng_Latn",  # English
    "spa_Latn",  # Spanish
    "ita_Latn",  # Italian
    "deu_Latn",  # German
    "arb_Arab",  # Modern Standard Arabic
    "tel_Telu",  # Telugu
    "tam_Taml",  # Tamil
    "quy_Latn",  # Ayacucho Quechua
]

In [7]:
%%time
# Create one dataset builder per language configuration and make sure its data
# has been downloaded and prepared locally.
language_flores_data = {}
for language in LANGUAGES:
    builder = load_dataset_builder(DATA_SET_NAME, language, trust_remote_code=True)
    language_flores_data[language] = builder
    builder.download_and_prepare()

language_flores_data

CPU times: user 923 ms, sys: 146 ms, total: 1.07 s
Wall time: 19.4 s


{'eng_Latn': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a2220f670>,
 'spa_Latn': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a1ce29270>,
 'ita_Latn': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a1ce28850>,
 'deu_Latn': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a1ce29540>,
 'arb_Arab': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a19a9c0d0>,
 'tel_Telu': <datasets_modules.datasets.facebook--flores.2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef.flores.Flores200 at 0x7f5a19479060>,
 'tam_Taml': <datasets_modules.datasets.facebo

In [8]:
# Turn the prepared English builder into a dataset object and display its splits.
eng_builder = language_flores_data["eng_Latn"]
eng_dataset = eng_builder.as_dataset()
eng_dataset

DatasetDict({
    dev: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence'],
        num_rows: 997
    })
    devtest: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence'],
        num_rows: 1012
    })
})

In [9]:
# tokenize the data

# Fetch the pre-trained tokenizer that belongs to MODEL_NAME from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# gpt2 ships without a padding token, so its unknown token is registered as one.
if MODEL_NAME == "gpt2":
    tokenizer.add_special_tokens({'pad_token': tokenizer.unk_token})


def tokenize_function(example):
    """Tokenize the ``sentence`` field of ``example`` with the notebook-level tokenizer.

    Used with ``map(..., batched=True)`` below, so ``example['sentence']`` is
    whatever that batched call passes for the ``sentence`` column.
    """
    sentences = example['sentence']
    return tokenizer(sentences)


# Tokenize every split of every language; keys are the language codes.
tokenized_datasets = {}
for lang in LANGUAGES:
    lang_splits = language_flores_data[lang].as_dataset()
    tokenized_datasets[lang] = lang_splits.map(tokenize_function, batched=True)

In [10]:
# Drop every column except the tokenizer outputs and make the datasets return
# torch tensors.
COLUMNS_TO_DROP = ["id", "URL", "domain", "topic", "has_image", "has_hyperlink", "sentence"]

for key, data in tokenized_datasets.items():
    tokenized_datasets[key] = data.remove_columns(COLUMNS_TO_DROP)
    tokenized_datasets[key].set_format("torch")

In [11]:
# construct a pytorch data loader for each dataset
from typing import Dict
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

BATCH_SIZE = 2  # small batch size for testing; can be increased later

# mlm=False selects the collator's non-masked (causal) language-modeling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# language code -> split name -> DataLoader over that tokenized split
flores_dataloaders: Dict[str, Dict[str, DataLoader]] = {}
for language in LANGUAGES:
    flores_dataloaders[language] = {
        split_name: DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=data_collator)
        for split_name, dataset in tokenized_datasets[language].items()
    }

flores_dataloaders

{'eng_Latn': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a1947ba60>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a1947bac0>},
 'spa_Latn': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a194f1420>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a194f2050>},
 'ita_Latn': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a041ae9b0>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a041ae410>},
 'deu_Latn': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a041ae290>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a041af310>},
 'arb_Arab': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a041af400>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a041af4f0>},
 'tel_Telu': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a041af5e0>,
  'devtest': <torch.utils.data.dataloader.DataLoader at 0x7f5a041af730>},
 'tam_Taml': {'dev': <torch.utils.data.dataloader.DataLoader at 0x7f5a

In [12]:
# Sanity check: print only the first collated batch of the English dev split.
english_dev_loader = flores_dataloaders['eng_Latn']['dev']
for i in english_dev_loader:
    print(i)
    break

{'input_ids': tensor([[     2,   1504,  28488,      4, 140003,    501,     32, 200884,   6073,
           9512,     48,  88230,  76168,     32, 160597,     48,     11,    929,
          55516,  35761,    155,    490,   9482,  89288,    235,   6950,     13,
             11,  61368,  24049,   2005,  37295,    155,    490,    113, 213481,
             72,   1117,   5885,  86368,   6929, 111288,      7,     73,  59298,
            769,    743,    242,      5,    211,      5,  19015,   5129,      5],
        [     2, 106920, 169914,   2492,    319,   1246,  13470,  15170,     10,
         178985,     48,  21558,      4, 138365,  73517,    219,      4,  31766,
             53,   1253,   5940,     33,  45526,     22,  12740,      8,     95,
          41440,  33504,      4,   2743,     32, 150346,  73401,     73, 102498,
             88,   3234,    157,  74077,  21558,    490,    113,  11135,   2947,
             48,  34300,     64,  33504,      5,      1,      1,      1,      1]]), 'attention

In [13]:
import os

counter_file = "experiment_counter.txt"


def read_experiment_counter(path):
    """Return the experiment index stored in ``path``.

    Args:
        path: Location of the plain-text counter file.

    Returns:
        The integer stored in the file, or 0 when the file does not exist or
        does not contain a valid integer.
    """
    if not os.path.exists(path):
        return 0
    with open(path, 'r') as counter_handle:
        raw_value = counter_handle.read().strip()
    try:
        return int(raw_value)
    except ValueError:
        # An empty or corrupted file (e.g. left behind by an interrupted write)
        # must not abort the notebook; restart the numbering instead.
        print(f"Could not parse {path!r}; restarting the experiment counter at 0.")
        return 0


# Index of the current run; it prefixes the HDF5 output file name.
experimentCounter = read_experiment_counter(counter_file)

filename = f"{experimentCounter}_hidden_representations.hdf5"

# Persist the index for the next run so each run writes to its own file.
experimentCounter += 1
with open(counter_file, 'w') as f:
    f.write(str(experimentCounter))

In [14]:
# Load the pre-trained causal language model and switch it to evaluation mode;
# the trailing expression displays the module structure.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval()
model

  return self.fget.__get__(instance, owner)()


XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

In [15]:
# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


In [ ]:
import h5py

def add_groups(name, f: h5py.File) -> h5py.Group:
    """Return the group ``name`` below ``f``, creating it if it does not exist yet.

    Args:
        name: Name (or path) of the group, relative to ``f``.
        f: Open HDF5 file or parent group to attach the group to.

    Returns:
        The existing or newly created ``h5py.Group``.
    """
    # create_group raises ValueError when the group already exists, which
    # happens on every re-run against a file opened in append mode.
    # require_group is the idempotent equivalent.
    return f.require_group(name)

def add_dataset(data, name, grp: h5py.Group):
    """Store ``data`` in ``grp`` as a dataset called ``name``.

    ``data`` must support ``.cpu().detach().numpy()`` (presumably a torch
    tensor -- confirm against callers); the resulting array is what gets written.
    """
    array = data.cpu().detach().numpy()
    grp.create_dataset(name, data=array)

def add_token_rep(tokens, data, grp, key, sen_num, layer_no):
    """Write one dataset per non-padding token into ``grp``.

    For every position ``idx`` whose token is not ``'<pad>'``, ``data[idx]`` is
    stored under the name ``token_<token>_<idx>``. ``key``, ``sen_num`` and
    ``layer_no`` are accepted but currently unused.
    """
    # NOTE(review): h5py treats '/' in a name as a path separator, so a token
    # containing '/' would end up in a nested group -- confirm this is acceptable.
    non_padding = ((idx, token) for idx, token in enumerate(tokens) if token != '<pad>')
    for position, tok in non_padding:
        dataset_name = f'token_{tok}_{position}'
        add_dataset(data[position], dataset_name, grp)

def add_sentence_rep(f, sentence, data):
    """Store a full-sentence representation in the ``full_sentence_encoding`` group.

    Args:
        f: Open HDF5 file or group that holds (or will hold) the
            ``full_sentence_encoding`` group.
        sentence: Identifier of the sentence; its string form is used as the
            dataset name.
        data: Representation to store; forwarded unchanged to ``add_dataset``.
    """
    # require_group returns the group whether or not it already exists, so no
    # exception has to be swallowed and ``grp`` is always bound.
    grp = f.require_group('full_sentence_encoding')
    # The original call passed only ``data`` and therefore always raised
    # TypeError; add_dataset needs the dataset name and the target group too.
    # NOTE(review): h5py treats '/' in a name as a path separator -- confirm
    # the sentence identifiers never contain one.
    add_dataset(data, str(sentence), grp)

In [ ]:
%%time
import h5py
losses = {lang: [] for lang in LANGUAGES} # store per-batch losses for each language

# Check if GPU is available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

# Iterate over the dataset of each language and compute the cross-entropy loss
# per batch. The context manager closes the HDF5 file even when a batch fails,
# so the file is not left open in append mode.
with h5py.File(filename, 'a') as f:
    for lang, loaders in flores_dataloaders.items():
        print(lang)
        # require_group returns the existing group on a re-run instead of
        # raising, so lang_grp always refers to the current language.
        lang_grp = f.require_group(lang)
        for split, loader in loaders.items():
            # Group reserved for this split's data; nothing is written to it yet.
            type_grp = lang_grp.require_group(split)
            for batch in loader:
                # Inference only: no gradients are needed to compute the loss.
                with torch.no_grad():
                    inputs = batch.to(device=device)
                    outputs = model(**inputs)
                    loss = outputs.loss.cpu()
                    losses[lang].append(loss)
                    print(loss.item())
            print(f"Finished losses for {lang}")

# Visualization