# Experimenting with flores-200.py

In [1]:
from flores_200 import Flores200, Flores200Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Directories that later cells expect to find in the working directory:
# "dataset" holds the FLORES-200 files and "small_100" is imported as a package.
PROJECT_ROOT_MARKERS = ("dataset", "small_100")


def find_project_root(start_dir, markers=PROJECT_ROOT_MARKERS):
    """Return the nearest directory at or above ``start_dir`` that contains every marker.

    Args:
        start_dir: Directory where the upward search begins.
        markers: Names of sub-directories that must all exist in the root.

    Returns:
        Absolute path of the first matching directory, or ``None`` when the
        filesystem root is reached without finding a match.
    """
    current = os.path.abspath(start_dir)
    while True:
        if all(os.path.isdir(os.path.join(current, name)) for name in markers):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # dirname() is a fixed point only at the filesystem root
            return None
        current = parent


# Locate the root from its contents instead of comparing against a
# machine-specific absolute path. Re-running this cell is a no-op once the
# kernel is already in the project root, so it never keeps climbing.
project_root = find_project_root(os.getcwd())
cd_home = project_root is not None  # True when the kernel ends up in the project root

if project_root is None:
    print("Project root not found above", os.getcwd(), "- working directory left unchanged")
elif project_root != os.getcwd():
    os.chdir(project_root)
    print("Changed working directory to project root:", os.getcwd())  # Check current directory
    print("Contents of directory:", os.listdir())  # Equivalent to `%ls`

Moved up one directory: /Users/emmanuelrassou/Desktop/HarvardClasses/spring_2025/neuro_240/final_project
Contents of directory: ['out.txt', '.myenv', '.DS_Store', 'requirements.txt', 'loss.txt', 'VAE_output_per_exp.png', 'images', 'dataset', 'fairseq', 'loss_curve_eng_fra.png', 'ar1-pytorch', 'training_demo.sh', 'modern_latent_replay', 'loss_01.txt', 'loss_02.txt', 'loss_curve_eng_afr__fra.png', 'model', 'Assignment 3_ Midterm Report.pdf', 'predictions.txt', 'miniforge.sh', 'Combatting Language Forgetting in MultiLingual Settings.pdf', '.vscode', 'avalanche_demo', 'small_100']


# Creating a PyTorch Dataset for FLORES-200 (wrapped as an Avalanche dataset later)

In [3]:
import os
from torch.utils.data import Dataset
from avalanche.benchmarks.utils import AvalancheDataset

class FloresDataset(Dataset):
    """Parallel sentence pairs read from two line-aligned text files.

    The files are looked up at ``<data_dir>/<suffix>/<lang>.<suffix>``. Line
    ``i`` of the source file is paired with line ``i`` of the target file.

    Attributes:
        data: Raw source lines exactly as read (trailing newlines included).
        targets: Raw target lines exactly as read (trailing newlines included).
    """

    def __init__(self, data_dir, src_lang, tgt_lang, suffix):
        """Load both language files into memory.

        Args:
            data_dir: Directory that contains one sub-directory per split.
            src_lang: File stem of the source-language file.
            tgt_lang: File stem of the target-language file.
            suffix: Split name; used as sub-directory name and file extension.

        Raises:
            FileNotFoundError: If either language file does not exist.
            ValueError: If the two files have a different number of lines.
        """
        split_dir = os.path.join(data_dir, suffix)
        src_file_path = os.path.join(split_dir, f"{src_lang}.{suffix}")
        tgt_file_path = os.path.join(split_dir, f"{tgt_lang}.{suffix}")

        # Fail early with a message naming the missing file (source checked first).
        src_missing = not os.path.exists(src_file_path)
        if src_missing:
            raise FileNotFoundError(f"Source file {src_file_path} does not exist.")
        tgt_missing = not os.path.exists(tgt_file_path)
        if tgt_missing:
            raise FileNotFoundError(f"Target file {tgt_file_path} does not exist.")

        with open(src_file_path, "r", encoding="utf-8") as src_handle, \
             open(tgt_file_path, "r", encoding="utf-8") as tgt_handle:
            src_lines = list(src_handle)
            tgt_lines = list(tgt_handle)

        # The files are paired line by line, so their lengths must agree.
        if len(src_lines) != len(tgt_lines):
            raise ValueError("Source and target files must have the same number of lines.")

        self.data = src_lines
        self.targets = tgt_lines

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair at ``idx`` with surrounding whitespace stripped."""
        source_text = self.data[idx].strip()
        target_text = self.targets[idx].strip()
        return source_text, target_text



# Creating PyTorch dataloaders

In [5]:
from torch.nn.utils.rnn import pad_sequence
import torch
import sys
import os
from small_100.tokenization_small100 import SMALL100Tokenizer

def collate_fn(batch, src_lang="en", tgt_lang="fr"):
    """Tokenize a batch of (source, target) sentence pairs with SMALL100Tokenizer.

    The tokenizer is created on first use and cached on the function object,
    so it is not rebuilt for every batch the DataLoader produces.

    Args:
        batch: Sequence of ``(src_text, tgt_text)`` string pairs, as returned
            by ``FloresDataset.__getitem__``.
        src_lang: Value assigned to ``tokenizer.src_lang``. Defaults to ``"en"``.
        tgt_lang: Value assigned to ``tokenizer.tgt_lang``. Defaults to ``"fr"``.
            Use ``functools.partial(collate_fn, src_lang=..., tgt_lang=...)``
            to give a DataLoader a collate function for another language pair.

    Returns:
        ``(src_tokens, tgt_tokens)``: two lists holding one tokenizer output
        per sentence, in batch order. Nothing is padded or stacked here, so
        sequences inside a batch keep their individual lengths.
    """
    if not batch:
        # zip(*[]) cannot be unpacked into two names; an empty batch maps to two empty lists.
        return [], []

    src_texts, tgt_texts = zip(*batch)  # Unzip source and target sequences

    tokenizer = getattr(collate_fn, "_tokenizer", None)
    if tokenizer is None:
        # Built once and reused for every later batch.
        tokenizer = SMALL100Tokenizer()
        collate_fn._tokenizer = tokenizer

    # NOTE(review): the defaults stay "en"/"fr", but the loaders in this notebook pair
    # eng/fra/afr sources with cjk_Latn targets - confirm the intended language codes.
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # NOTE(review): target sentences go through the same call as source sentences;
    # confirm whether SMALL100Tokenizer needs a dedicated target-side call.
    src_tokens = [tokenizer(text, return_tensors="pt") for text in src_texts]
    tgt_tokens = [tokenizer(text, return_tensors="pt") for text in tgt_texts]

    return src_tokens, tgt_tokens

In [6]:
from torch.utils.data import DataLoader

# Where the FLORES-200 files live, and which split feeds training vs. testing
data_directory = os.path.join("dataset", "flores200_dataset")
train_suffix, test_suffix = "dev", "devtest"

# Each source language gets its own loader pair; all of them share one target language
src_languages = ["eng_Latn", "fra_Latn", "afr_Latn"]
tgt_language = "cjk_Latn"

# Loader lists are index-aligned with src_languages
train_loaders, test_loaders = [], []

for task_id, src_lang in enumerate(src_languages):
    # Training split: batches are reshuffled on every pass
    train_flores = FloresDataset(
        data_directory,
        src_lang,
        tgt_language,
        train_suffix,
    )
    train_loader = DataLoader(
        train_flores,
        batch_size=16,
        shuffle=True,
        collate_fn=collate_fn,
    )
    train_loaders.append(train_loader)

    # Test split: same directory layout as training, kept in file order
    test_flores = FloresDataset(
        data_directory,
        src_lang,
        tgt_language,
        test_suffix,
    )
    test_loader = DataLoader(
        test_flores,
        batch_size=16,
        shuffle=False,
        collate_fn=collate_fn,
    )
    test_loaders.append(test_loader)


In [7]:
def _print_first_batch_sizes(loaders):
    """Print, for each loader in order, the number of items in its first batch.

    Only the first batch of every loader is pulled; a loader that yields
    nothing prints just its ``Task`` header.
    """
    for task_id, loader in enumerate(loaders):
        print(f"Task {task_id}:")
        for src_batch, tgt_batch in loader:
            # collate_fn returns two lists, so len() is the number of sentences
            print(f"Source batch shape: {len(src_batch)}")
            print(f"Target batch shape: {len(tgt_batch)}")
            break


# explore train loader
_print_first_batch_sizes(train_loaders)

# explore test loader
_print_first_batch_sizes(test_loaders)

Task 0:
Source batch shape: 16
Target batch shape: 16
Task 1:
Source batch shape: 16
Target batch shape: 16
Task 2:
Source batch shape: 16
Target batch shape: 16
Task 0:
Source batch shape: 16
Target batch shape: 16
Task 1:
Source batch shape: 16
Target batch shape: 16
Task 2:
Source batch shape: 16
Target batch shape: 16


In [42]:
from avalanche.benchmarks import benchmark_from_datasets
from avalanche.benchmarks.utils import AvalancheDataset

def _wrap_loader_datasets(loaders):
    """Return one AvalancheDataset per loader, built from the loader's ``.dataset``."""
    # NOTE(review): a single dataset is passed to AvalancheDataset here; recent
    # Avalanche releases appear to document a list of datasets - confirm against
    # the installed version.
    return [AvalancheDataset(loader.dataset) for loader in loaders]


# Reuse the datasets already attached to the loaders, one per source language
train_datasets = _wrap_loader_datasets(train_loaders)
test_datasets = _wrap_loader_datasets(test_loaders)

scenario = benchmark_from_datasets(
    train=train_datasets,
    test=test_datasets,
)
print("Benchmark created successfully!")

Benchmark created successfully!
