In [1]:
# imports
import os
import pickle
from collections import defaultdict

import torch
from datasets import load_dataset, Dataset, DatasetDict
from dotenv import load_dotenv
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
from tqdm import tqdm
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

In [2]:
# load environment variables from a .env file (the next cell reads HF_HOME and
# TRANSFORMERS_CACHE from os.environ)
load_dotenv()

True

In [3]:
# Cache locations are taken from the environment; a missing variable raises KeyError here.
hf_home_dir = os.environ['HF_HOME']
transformer_dir = os.environ['TRANSFORMERS_CACHE']
# One path per line, same as two separate print calls.
print(hf_home_dir, transformer_dir, sep="\n")

/data/users/dhananjay/.cache/huggingface
/data/users/dhananjay/.cache/huggingface


In [4]:
# pick the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# load the three English SNLI splits, all cached under hf_home_dir
train_, val_, test_ = (
    load_dataset("snli", split=split_name, cache_dir=str(hf_home_dir))
    for split_name in ("train", "validation", "test")
)

Found cached dataset snli (/data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


In [6]:
def clip_dataset(data, num_dataset):
    """Return ``data`` limited to its first ``num_dataset`` rows.

    Two kinds of input are handled:

    * a ``dict`` mapping column name -> sequence of values: every column is
      sliced to ``num_dataset`` entries and a new dict is returned (the input
      is not modified);
    * any other object exposing ``__len__`` and ``select(indices)``: the first
      ``num_dataset`` rows are selected when the object is longer than that.

    Args:
        data: column dict or dataset-like object to clip.
        num_dataset: number of rows the result must contain.

    Returns:
        The clipped data (the original object when no clipping was needed on
        a dataset-like input).

    Raises:
        ValueError: if ``num_dataset`` is negative.
        AssertionError: if the result does not hold exactly ``num_dataset``
            rows, i.e. the input was shorter than requested.
    """
    if num_dataset < 0:
        raise ValueError(f"num_dataset must be non-negative, got {num_dataset}")

    if isinstance(data, dict):
        # Slice each column from itself so premise/hypothesis/label stay aligned.
        clipped = {column: values[:num_dataset] for column, values in data.items()}
        # Row count is the shortest column; len(dict) would count columns instead.
        num_rows = min((len(values) for values in clipped.values()), default=0)
    else:
        # Dataset-like objects are clipped through select() rather than item assignment.
        clipped = data.select(range(num_dataset)) if len(data) > num_dataset else data
        num_rows = len(clipped)

    assert num_rows == num_dataset, (
        f"expected {num_dataset} rows after clipping, found {num_rows}"
    )
    return clipped

In [7]:
class SNLIEnglish(TorchDataset):
    """Map-style dataset yielding T5 translation prompts for SNLI examples.

    The class derives from ``torch.utils.data.Dataset`` (imported as
    ``TorchDataset``) because it is only consumed through ``__len__`` and
    ``__getitem__`` by a ``DataLoader``.
    NOTE(review): the previous base was the Hugging Face ``datasets.Dataset``,
    which was never initialised here; recent ``datasets`` releases appear to
    expose a batched ``__getitems__`` that ``DataLoader`` would prefer over
    ``__getitem__`` - confirm against the installed versions.

    Args:
        data: input split; must provide ``filter``, ``__len__`` and integer
            indexing that returns a mapping with ``premise``, ``hypothesis``
            and ``label`` keys.
        clip_dataset: how many datapoints to consider for translation. Useful
            when the split is huge and translating all of it takes too long.
            Values larger than the cleaned split are reduced to its length,
            negative values are treated as 0.
    """

    # Prompt prefix prepended to both sentences of every example.
    TASK_PREFIX = "translate English to French: "

    def __init__(self, data, clip_dataset):
        self.data1 = self.clean_data(data)
        # Clamp into [0, len(data1)] so __len__ is always a valid length.
        self.clip_dataset = max(0, min(clip_dataset, len(self.data1)))

    def clean_data(self, data):
        """Drop examples whose label is -1."""
        return data.filter(lambda datapoint: datapoint['label'] != -1)

    def __len__(self):
        """Number of examples exposed after clipping."""
        return self.clip_dataset

    def __getitem__(self, n: int):
        """Return ``(premise_prompt, hypothesis_prompt, label)`` for example ``n``.

        Raises:
            IndexError: if ``n`` falls outside the clipped range, so rows
                beyond ``clip_dataset`` are never served.
        """
        size = self.clip_dataset
        if not -size <= n < size:
            raise IndexError(f"index {n} is out of range for dataset of size {size}")
        if n < 0:
            # Negative indices count back from the end of the clipped view.
            n += size

        datapoint = self.data1[n]
        premise = self.TASK_PREFIX + datapoint['premise']
        hypothesis = self.TASK_PREFIX + datapoint['hypothesis']
        label = datapoint['label']
        return premise, hypothesis, label

In [8]:
# Wrap each split; only the training split is capped (first 100,000 cleaned examples).
english_train_dataset = SNLIEnglish(data=train_, clip_dataset=100_000)
english_val_dataset = SNLIEnglish(data=val_, clip_dataset=len(val_))
english_test_dataset = SNLIEnglish(data=test_, clip_dataset=len(test_))

Loading cached processed dataset at /data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-eba2ca8d6f1fbd02.arrow
Loading cached processed dataset at /data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-ddf279f7e4ed9c36.arrow
Loading cached processed dataset at /data/users/dhananjay/.cache/huggingface/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-56afeaebeebd3b97.arrow


In [9]:
# custom collate function to batchify data
def custom_collate_fn(batch):
    """Turn a list of ``(premise, hypothesis, label)`` examples into device tensors.

    Uses the notebook-level ``t5small_tokenizer`` and ``device``. Only the
    ``input_ids`` of each encoding are kept; the rest of the tokenizer output
    (e.g. the attention mask) is dropped.

    Returns:
        Tuple ``(premise_ids, hypothesis_ids, labels)`` of tensors on ``device``.
    """
    premises, hypotheses, labels = zip(*batch)

    def encode(sentences):
        # Encode one side of the pair as a padded batch and move it to the device.
        encoded = t5small_tokenizer.batch_encode_plus(sentences, padding=True, return_tensors='pt')
        return encoded['input_ids'].to(device)

    premise_ids = encode(premises)
    hypothesis_ids = encode(hypotheses)
    label_ids = torch.tensor(labels).to(device)
    return premise_ids, hypothesis_ids, label_ids

In [10]:
# translate and save datafile locally to filepath
def generate_french_data(dataloader, model, tokenizer, filepath, max_new_tokens=None):
    """Translate every batch of ``dataloader`` and pickle the result to ``filepath``.

    Args:
        dataloader: iterable yielding ``(premise_enc, hypothesis_enc, label_enc)``
            batches; the first two are passed to ``model.generate`` and the
            labels must support ``tolist()``.
        model: object whose ``generate`` method maps encoded inputs to output
            token ids (called with ``do_sample=False``).
        tokenizer: object whose ``batch_decode`` converts token ids to text.
        filepath: destination of the pickled dataset. Missing parent
            directories are created before translation starts.
        max_new_tokens: optional cap forwarded to ``model.generate``. ``None``
            (the default) passes nothing, so the model's own length limit applies.
            NOTE(review): that default limit is presumably short for t5-small
            and may truncate long sentences - confirm and set this if needed.

    Returns:
        ``Dataset`` with ``premise``, ``hypothesis`` and ``label`` columns.
    """
    # Fail (or create the folder) up front instead of after the slow translation loop.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    generate_kwargs = {"do_sample": False}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    french_data = defaultdict(list)
    for premise_enc, hypothesis_enc, label_enc in tqdm(dataloader):
        output_premise = model.generate(premise_enc, **generate_kwargs)
        french_data['premise'].extend(tokenizer.batch_decode(output_premise, skip_special_tokens=True))
        output_hypothesis = model.generate(hypothesis_enc, **generate_kwargs)
        french_data['hypothesis'].extend(tokenizer.batch_decode(output_hypothesis, skip_special_tokens=True))
        french_data['label'].extend(label_enc.tolist())

    hf_french_data = Dataset.from_dict(french_data)

    # Write to a temporary file and rename it into place, so an interrupted dump
    # never leaves a truncated pickle where load_french_data expects a finished one.
    tmp_filepath = os.fspath(filepath) + ".tmp"
    try:
        with open(tmp_filepath, "wb") as fp:   # Pickling
            pickle.dump(hf_french_data, fp)
        os.replace(tmp_filepath, filepath)
    except Exception:
        if os.path.exists(tmp_filepath):
            os.remove(tmp_filepath)
        raise
    return hf_french_data

In [11]:
def load_french_data(dataloader, model, tokenizer, filepath):
    """Return the translated dataset stored at ``filepath``, creating it if missing.

    When ``filepath`` does not exist, ``generate_french_data`` is called with
    the given dataloader, model and tokenizer to produce it first. The dataset
    is then always read back from disk.

    NOTE: the file is read with ``pickle.load``, which can execute arbitrary
    code - only point this at files produced by this notebook.
    """
    cache_missing = not os.path.exists(filepath)
    if cache_missing:
        print("File doesn't exist :", filepath)
        print("creating file...")
        generate_french_data(dataloader, model, tokenizer, filepath)
        print("File created!")

    # Unpickling
    with open(filepath, "rb") as cache_file:
        hf_french_data = pickle.load(cache_file)

    print("Successfully loaded file - ", filepath)
    return hf_french_data

In [12]:
# create one dataloader per split; all share the collate function, batch size and fixed order
english_train_datloader, english_val_dataloader, english_test_dataloader = (
    DataLoader(dataset=split_dataset, collate_fn=custom_collate_fn, batch_size=512, shuffle=False)
    for split_dataset in (english_train_dataset, english_val_dataset, english_test_dataset)
)

In [13]:
# file paths used to save intermediate files locally, all under <cwd>/data/french
curr_dir = os.getcwd()
french_train_filepath, french_validation_filepath, french_test_filepath, final_filepath = (
    os.path.join(curr_dir, "data", "french", file_name)
    for file_name in ("train", "validation", "test", "final")
)

In [14]:
# Tokenizer and model are loaded from the same checkpoint; the model is moved to `device`.
t5_checkpoint = "t5-small"
t5small_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)
t5small_model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
t5small_model = t5small_model.to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
# Load each translated split from disk, translating it first when its file is missing.
french_train_dataset = load_french_data(
    dataloader=english_train_datloader,
    model=t5small_model,
    tokenizer=t5small_tokenizer,
    filepath=french_train_filepath,
)
french_val_dataset = load_french_data(
    dataloader=english_val_dataloader,
    model=t5small_model,
    tokenizer=t5small_tokenizer,
    filepath=french_validation_filepath,
)
french_test_dataset = load_french_data(
    dataloader=english_test_dataloader,
    model=t5small_model,
    tokenizer=t5small_tokenizer,
    filepath=french_test_filepath,
)

Successfully loaded file -  /data/users/dhananjay/nlp244/quest_4/data/french/train
File doesn't exist : /data/users/dhananjay/nlp244/quest_4/data/french/validation
creating file...


100%|███████████████████████████████████████████| 20/20 [01:59<00:00,  5.95s/it]


File created!
Successfully loaded file -  /data/users/dhananjay/nlp244/quest_4/data/french/validation
File doesn't exist : /data/users/dhananjay/nlp244/quest_4/data/french/test
creating file...


100%|███████████████████████████████████████████| 20/20 [01:58<00:00,  5.91s/it]

File created!
Successfully loaded file -  /data/users/dhananjay/nlp244/quest_4/data/french/test





In [16]:
# combine train, validation, test dataset into a single DatasetDict
final_data = DatasetDict({
    'train': french_train_dataset,
    'validation': french_val_dataset,
    'test': french_test_dataset,
})

In [17]:
# Pickle the combined splits once; an existing file at final_filepath is left untouched.
final_file_exists = os.path.exists(final_filepath)
if not final_file_exists:
    with open(final_filepath, "wb") as fp:   # Pickling
        pickle.dump(final_data, fp)

In [23]:
# push translated data to the Hugging Face Hub: authenticate first
# NOTE(review): this import sits outside the imports cell at the top of the notebook, and the
# execution count jumps from 17 to 23 here - re-run from a fresh kernel to confirm the order.
from huggingface_hub import login
login()

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /soe/dsonawan/.huggingface/token
Login successful


In [24]:
# Upload every split held in final_data to the "dhananjay1210/SNLI_French" dataset repo.
final_data.push_to_hub("dhananjay1210/SNLI_French")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]