In [6]:
# Install the third-party packages this notebook uses into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different
# releases than the ones this notebook was developed against — consider pinning.
%pip install transformers datasets pandas scikit-learn sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = ''

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Hindi/English phrase dictionary.
# Malformed CSV lines are skipped rather than raising (on_bad_lines='skip').
dictionary_raw = pd.read_csv('dictionary.csv', on_bad_lines='skip')

# Drop rows where either side of the pair is missing. Preprocessing later calls
# str() on every cell, which would otherwise turn a missing value into the
# literal text "nan"/"None" and train the model on it.
# reset_index keeps a clean RangeIndex so no stray index column is carried into
# the Dataset built from this frame.
df = (
    dictionary_raw
    .dropna(subset=["Hindi Translation", "English Phrase"])
    .reset_index(drop=True)
)



In [3]:
# Create a Dataset object from the DataFrame.
# preserve_index=False prevents a non-default pandas index from being stored
# as an extra `__index_level_0__` column.
full_dataset = Dataset.from_pandas(df, preserve_index=False)

# Split into train (90%) and validation (10%) sets.
# A fixed seed makes the split identical on every Restart & Run All, so
# evaluation numbers are comparable between runs.
dataset = full_dataset.train_test_split(test_size=0.1, seed=42)



In [4]:
# Load the IndicTrans2 tokenizer and model.
# The training pairs are Hindi (source) -> English (target), so the
# Indic->English checkpoint is the right starting point; the English->Indic
# checkpoint translates in the opposite direction.
model_name = "ai4bharat/indictrans2-indic-en-dist-200M"

# IndicTrans2 ships its tokenizer/model classes as custom code on the Hub.
# trust_remote_code=True is required to load them without an interactive
# prompt (which would break a non-interactive "Run All").
# NOTE(review): this executes code downloaded from the model repository —
# consider pinning a `revision` once one has been reviewed.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)



In [5]:
# Record the translation direction on the tokenizer: Hindi in, English out.
# NOTE(review): confirm that this tokenizer actually reads `src_lang`/`tgt_lang`
# and that "hi"/"en" are the codes it expects.
tokenizer.src_lang, tokenizer.tgt_lang = "hi", "en"


def preprocess_function(examples):
    """Tokenize one batch of Hindi/English phrase pairs.

    Args:
        examples: A batch (column name -> list of values) containing the
            "Hindi Translation" and "English Phrase" columns.

    Returns:
        The tokenizer's output for the batch, with the Hindi phrases as the
        inputs and the English phrases passed as ``text_target``; both sides
        are truncated to at most 128 tokens.
    """
    # Every cell is coerced to str so non-string values do not break the tokenizer.
    hindi_texts = list(map(str, examples["Hindi Translation"]))
    english_texts = list(map(str, examples["English Phrase"]))
    return tokenizer(
        hindi_texts,
        text_target=english_texts,
        max_length=128,
        truncation=True,
    )


# Apply the tokenization to every split, one batch at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)



Map: 100%|██████████| 26748/26748 [00:02<00:00, 11338.61 examples/s]
Map: 100%|██████████| 2972/2972 [00:00<00:00, 12174.69 examples/s]


In [6]:
# Early-stopping callback: patience of 3 evaluations, improvement threshold 0.001.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001,
)

In [7]:
# Define training arguments.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch. The two strategies must match for
    # load_best_model_at_end, which early stopping also depends on.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # bound disk usage across up to 100 epochs
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-5,
    # Batch size 1 with 64 accumulation steps gives an effective batch of 64
    # while keeping per-step activation memory minimal.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,
    # Adafactor keeps far less optimizer state than the default AdamW. A
    # previous run of this notebook ran out of memory on a ~4 GiB GPU, so this
    # is chosen to reduce the training memory footprint.
    optim="adafactor",
    weight_decay=0.01,
    # Upper bound only; intended to be cut short by early stopping.
    num_train_epochs=100,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    logging_steps=100,
)



In [8]:
# Create a data collator that pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Attach the early-stopping callback defined earlier; without it the run always
# goes through every configured epoch. EarlyStoppingCallback requires
# load_best_model_at_end to be enabled, so it is only attached in that case to
# avoid a failure at the start of training.
callbacks = [early_stopping] if training_args.load_best_model_at_end else []

# Initialize the Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=callbacks,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:

# Fine-tune the model using the trainer configured above.
# NOTE(review): the recorded run of this cell failed at step 0 with a CUDA
# OutOfMemoryError on a GPU with 3.81 GiB total capacity, so the saved model
# below was not produced by this run.
trainer.train()


  0%|          | 0/41700 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 240.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 7.06 MiB is free. Including non-PyTorch memory, this process has 3.80 GiB memory in use. Of the allocated memory 3.64 GiB is allocated by PyTorch, and 76.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

# Write the fine-tuned model and its tokenizer to the same directory so the
# pair can be reloaded together.
save_dir = "./finetuned_indictrans2_hi_en"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)