In [15]:
# Install transformers (with its torch extra), datasets, pandas, scikit-learn,
# sentencepiece and accelerate into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# different releases than the ones this notebook was run with.
%pip install transformers[torch] datasets pandas scikit-learn sentencepiece accelerate

Note: you may need to restart the kernel to use updated packages.


In [9]:
# Install torch, torchvision and torchaudio from the PyTorch cu118 wheel index
# (the index URL selects the CUDA 11.8 builds).
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp310-cp310-win_amd64.whl (5.0 MB)
     ---------------------------------------- 0.0/5.0 MB ? eta -:--:--
     ---- ----------------------------------- 0.5/5.0 MB 4.2 MB/s eta 0:00:02
     ---------- ----------------------------- 1.3/5.0 MB 4.8 MB/s eta 0:00:01
     -------------------- ------------------- 2.6/5.0 MB 4.9 MB/s eta 0:00:01
     ------------------------------- -------- 3.9/5.0 MB 5.2 MB/s eta 0:00:01
     ------------------------------------- -- 4.7/5.0 MB 4.8 MB/s eta 0:00:01
     ---------------------------------------- 5.0/5.0 MB 4.6 MB/s eta 0:00:00
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.4.0%2Bcu118-cp310-cp310-win_amd64.whl (4.0 MB)
     ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
     ---------- ----------------------------- 1.0/4.0 MB 6

  You can safely remove it manually.


In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the parallel phrase list from CSV.
# on_bad_lines='skip' makes the parser silently drop rows it cannot read.
# Rows missing either text column are removed here: the tokenization step
# coerces every cell with str(), which would otherwise turn a missing value
# into a literal "nan"/"None" training example.
# reset_index keeps a clean 0..n-1 index after the rows are dropped.
df = (
    pd.read_csv('dictionary.csv', on_bad_lines='skip')
    .dropna(subset=["Hindi Translation", "English Phrase"])
    .reset_index(drop=True)
)



In [3]:
# Wrap the DataFrame in a Hugging Face Dataset. preserve_index=False keeps the
# pandas index from being carried over as an extra column.
full_dataset = Dataset.from_pandas(df, preserve_index=False)

# Hold out 10% of the rows for evaluation. The seed is fixed so the same rows
# land in train/test on every Restart & Run All; without it the split (and
# therefore every evaluation number) changes from run to run.
dataset = full_dataset.train_test_split(test_size=0.1, seed=42)



In [4]:
import os

# huggingface_hub reads these variables when it is first imported. Because
# transformers/datasets are already imported in an earlier cell, assigning
# them here is too late to move the cache for this kernel. They are still set
# (child processes inherit them), and the cache location is additionally
# passed explicitly to from_pretrained below so it actually takes effect.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
os.environ['HF_HOME'] = 'F:\\HFCache'
# NOTE(review): absolute, machine-specific path — make it configurable before sharing.
hub_cache_dir = os.path.join(os.environ['HF_HOME'], 'hub')

# Load the IndicTrans2 tokenizer and model.
# trust_remote_code=True executes Python code shipped in the model repository.
# NOTE(review): the checkpoint name says "en-indic" (English -> Indic), while
# the rest of this notebook trains Hindi -> English and saves to a "*_hi_en"
# directory — confirm this is the intended checkpoint direction.
model_name = "ai4bharat/indictrans2-en-indic-dist-200m"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=hub_cache_dir,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=hub_cache_dir,
)



In [5]:
# Tag the tokenizer with the source ("hi") and target ("en") language codes.
# NOTE(review): whether this tokenizer class consults these attributes is not
# visible from this notebook — confirm.
tokenizer.src_lang, tokenizer.tgt_lang = "hi", "en"


def preprocess_function(examples):
    """Tokenize one batch of phrase pairs.

    ``examples`` is indexed by column name; the "Hindi Translation" column is
    used as the model input and the "English Phrase" column as ``text_target``.
    Every cell is coerced with ``str`` first. The tokenizer is called with
    ``max_length=128`` and truncation enabled, and its result is returned
    unchanged.
    """
    source_texts = list(map(str, examples["Hindi Translation"]))
    target_texts = list(map(str, examples["English Phrase"]))
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )


# Run the tokenization over the dataset in batches
tokenized_dataset = dataset.map(preprocess_function, batched=True)



Map: 100%|██████████| 26748/26748 [00:03<00:00, 8875.89 examples/s]
Map: 100%|██████████| 2972/2972 [00:00<00:00, 9812.60 examples/s]


In [6]:
# Training configuration handed to the Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    # --- output / persistence ---
    output_dir="./results",
    save_strategy="no",          # no intermediate checkpoints are written
    push_to_hub=False,
    # --- schedule / evaluation ---
    num_train_epochs=100,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # --- batching: 4 examples per device, accumulated over 32 steps ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
)



In [7]:
# Batch collator for seq2seq examples, built from the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Assemble the trainer from the model, its arguments, both tokenized splits,
# the tokenizer and the collator
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [8]:

# Fine-tune the model by running the training loop configured on `trainer`.
# The recorded run's progress bar showed 20800 total steps at roughly 27 s per
# step, so expect this cell to be very long-running on comparable hardware.
trainer.train()


  0%|          | 18/20800 [07:54<158:36:49, 27.48s/it]

In [None]:

# Write the fine-tuned model and its tokenizer side by side into one directory
finetuned_dir = "./finetuned_indictrans2_hi_en"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)