In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under that directory.
drive.mount('/content/drive')

In [None]:
def load_parallel_texts(src_file, tgt_file):
    """Lazily yield aligned, whitespace-stripped line pairs from two text files.

    Both files are opened as UTF-8 text and consumed in lockstep, one line
    from each per step. Iteration ends as soon as either file has no more
    lines, so any surplus lines in the longer file are ignored.

    Args:
        src_file: Path of the source-language file.
        tgt_file: Path of the target-language file.

    Yields:
        ``(source_line, target_line)`` tuples with leading and trailing
        whitespace (including the newline) removed from each element.
    """
    with open(src_file, 'r', encoding='utf-8') as src_handle, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_handle:
        # zip() stops at the shorter file, i.e. when either side reaches EOF.
        for raw_src, raw_tgt in zip(src_handle, tgt_handle):
            yield raw_src.strip(), raw_tgt.strip()

    #src_texts = [line.strip() for line in src_lines]
    #tgt_texts = [line.strip() for line in tgt_lines]

# src_file_path = ['/content/drive/My Drive/AIProject/datasets/de-en/ELRC-1089-German_Foreign_Offic.de-en.en']
# tgt_file_path = ['/content/drive/My Drive/AIProject/datasets/de-en/ELRC-1089-German_Foreign_Offic.de-en.de']

# src_file_path = ['/content/drive/My Drive/AIProject/datasets/en-es/ELRC-3571-EUR_LEX_covid.en-es.en']
# tgt_file_path = ['/content/drive/My Drive/AIProject/datasets/en-es/ELRC-3571-EUR_LEX_covid.en-es.es']

# Parallel corpus files: entry k of src_file_path is paired with entry k of tgt_file_path.
src_file_path = ['/content/drive/My Drive/AIProject/datasets/en-fr/ELRC-EUR_LEX.en-fr.en']
tgt_file_path = ['/content/drive/My Drive/AIProject/datasets/en-fr/ELRC-EUR_LEX.en-fr.fr']

# Accumulators for the aligned sentences; index k of one list pairs with
# index k of the other.
lang1_texts, lang2_texts = [], []

# Stream every aligned line pair out of each file pair and collect both sides.
for src, tgt in zip(src_file_path, tgt_file_path):
    for sentence_pair in load_parallel_texts(src, tgt):
        lang1_texts.append(sentence_pair[0])
        lang2_texts.append(sentence_pair[1])

# Number of sentence pairs loaded.
print(len(lang1_texts))


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained 't5-small' tokenizer once as a notebook-level global;
# batch_tokenize reads it, and it is saved next to the model after training.
tokenizer = T5Tokenizer.from_pretrained('t5-small')


def batch_tokenize(src_texts, tgt_texts, src_lang, tgt_lang, batch_size):
    """Tokenize parallel texts in batches for T5 fine-tuning.

    Each source text is prefixed with ``"translate {src_lang} to {tgt_lang}: "``
    and both sides are tokenized with the notebook-level ``tokenizer``,
    padded/truncated to 512 tokens and returned as PyTorch tensors.

    Padding positions in the labels are replaced by -100 so that the model's
    cross-entropy loss ignores them; otherwise most of the loss would come
    from predicting pad tokens in the 512-wide label rows.

    Args:
        src_texts: Sequence of source-language strings.
        tgt_texts: Sequence of target-language strings, aligned with ``src_texts``.
        src_lang: Source language name used in the task prefix (e.g. ``'English'``).
        tgt_lang: Target language name used in the task prefix (e.g. ``'French'``).
        batch_size: Number of text pairs per yielded batch.

    Yields:
        ``(model_inputs, labels)`` where ``model_inputs`` is the tokenizer
        output for the prefixed source texts and ``labels`` is the tensor of
        target token ids with pad positions set to -100.
    """
    for start in range(0, len(src_texts), batch_size):
        stop = start + batch_size
        batch_src_texts = src_texts[start:stop]
        batch_tgt_texts = tgt_texts[start:stop]
        input_texts = [f"translate {src_lang} to {tgt_lang}: {text}" for text in batch_src_texts]
        model_inputs = tokenizer(input_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        labels = tokenizer(batch_tgt_texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt").input_ids
        # -100 is the index the loss ignores; mask padding so it does not
        # contribute to the training loss.
        labels[labels == tokenizer.pad_token_id] = -100
        yield model_inputs, labels

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
# Note: the message below says "available", but the value printed is `.total`.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
high_ram_message = (
    'You are using a high-RAM runtime!' if ram_gb >= 20
    else 'Not using a high-RAM runtime'
)
print(high_ram_message)

In [None]:
from transformers import T5ForConditionalGeneration, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
import gc

# Preparation
# Training hyper-parameters, named once so the training loop and the
# learning-rate schedule below cannot drift apart.
NUM_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 0.0001
WARMUP_FRACTION = 0.1

lang1 = 'English'
lang2 = 'French'

# Use the GPU when one is attached; otherwise fall back to CPU instead of
# failing on `.to('cuda')`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# scheduler.step() runs once per batch, so the schedule length must be the
# number of optimizer steps (epochs x batches per epoch), not the number of
# sentences. Ceiling division accounts for a final partial batch.
steps_per_epoch = (len(lang1_texts) + BATCH_SIZE - 1) // BATCH_SIZE
total_steps = NUM_EPOCHS * steps_per_epoch
warmup_steps = int(total_steps * WARMUP_FRACTION)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

model.train()
i = 0  # global iteration counter across all epochs, used only for logging

# Manual Training Loop
for epoch in range(NUM_EPOCHS):
    for model_inputs, labels in batch_tokenize(lang1_texts, lang2_texts, lang1, lang2, BATCH_SIZE):
        # Move the batch to the same device as the model.
        input_ids = model_inputs['input_ids'].to(device)
        attention_mask = model_inputs['attention_mask'].to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        i = i + 1
        print(f"Iteration: {i} Loss: {loss.item()}")

        # Explicit memory management: drop references to this batch's tensors
        # and release cached GPU blocks before the next batch is built.
        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()
        gc.collect()

# Saving the model and tokenizer (path is relative to the current working directory)
model.save_pretrained('./drive/MyDrive/AIProject/t5-small-translation-en-fr')
tokenizer.save_pretrained('./drive/MyDrive/AIProject/t5-small-translation-en-fr')


In [None]:
# Saving the model and tokenizer
# NOTE(review): this directory name ends in 'en-hi', but the training cell in
# this notebook sets lang2 = 'French' and already saves to
# '...t5-small-translation-en-fr' — confirm this second save location is intended.
model.save_pretrained('./drive/MyDrive/AIProject/t5-small-translation-en-hi')
tokenizer.save_pretrained('./drive/MyDrive/AIProject/t5-small-translation-en-hi')