In [1]:
!nvidia-smi

Sat May 18 09:41:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# !pip install transformers[sentencepiece]
# !pip install datasets
# !pip install sacrebleu rouge_score
# !pip install py7zr # For unziping the file

!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q # -q means quite mode.
!pip install --upgrade accelerate # accelerate is for GPU to run distributed processes.
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.41.0
Uninstalling transformers-4.41.0:
  Successfully uninstalled transformers-4.41.0
Found existing installation: accelerate 0.30.1
Uninstalling accelerate-0.30.1:
  Successfully uninstalled accelerate-0.30.1
Collecting transformers
  Using cached transformers-4.41.0-py3-none-any.whl (9.1 MB)
Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.30.1 transformers-4.41.0


## Different Approaches.
- Approach 1. Fine-Tuning of Pre-Trained Model or
- Approach 2. Create A Knowledge Base and Extract the information from the Knowledge Base and Connect it through Query to generate the Output.
This process is called as RAG i.e. Retrieval Augmented Generation.

- `Sacrebleu` is a metric commonly used for evaluating the quality of machine-generated translations, typicaly in the context of machine translation tasks. It often used in NLP competions and research to access the performace of translation model.

- Rouge (Recall-Oriented Understudy for Gisting Evaluation) is a metrics used for evaluating the quality of summaries by comparing them to reference summaries. Rouge metrics are commonly used in NLP processing tasks, particularly in text summarization systems.

In [4]:
import os, sys, transformers, datasets
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamWeightDecay # Adam Optimizer

### A Language Translation System Using HuggingFace.

## Fine-Tuning of Pre-Trained Model.
- Step 1: Training of the model
- Step 2: Inferencing of the model i.e. loading the pretrained model and predicting the results.

In [None]:
# device = 'cuda' if torch.cuda.is_available() else "cpu"
# device

In [None]:
# Using the Pre-trained Helsinki-NLP/opus-mt-en-hi Model. This is for English to Hindi Language.
model_checkpoint = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Loading the In-built Dataset.
dataset = load_dataset('cfilt/iitb-english-hindi')
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [None]:
dataset['train']

Dataset({
    features: ['translation'],
    num_rows: 1659083
})

In [None]:
dataset['test'][1]

{'translation': {'en': "As America's road planners struggle to find the cash to mend a crumbling highway system, many are beginning to see a solution in a little black box that fits neatly by the dashboard of your car.",
  'hi': 'जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए हाईवे सिस्टम को सुधारने के लिए धन की कमी से जूझ रहे हैं, वहीं बहुत-से लोग इसका समाधान छोटे से ब्लैक बॉक्स में देख रहे हैं, जो आपकी कार के डैशबोर्ड पर सफ़ाई से फिट हो जाता है।'}}

In [None]:
for data in dataset:
    print(dataset[data])

Dataset({
    features: ['translation'],
    num_rows: 1659083
})
Dataset({
    features: ['translation'],
    num_rows: 520
})
Dataset({
    features: ['translation'],
    num_rows: 2507
})


In [None]:
dataset['train'].column_names

['translation']

In [None]:
tokenizer('These is a professor who is teaching')

{'input_ids': [1055, 23, 19, 10876, 66, 23, 1762, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer('जबकि अमेरिका के सड़क योजनाकार')

{'input_ids': [44, 2273, 1400, 2146, 44, 4499, 1056, 174, 428, 1908, 260, 44, 906, 44, 1, 1543, 716, 44, 1321, 917, 2273, 314, 260, 716, 260, 428, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
with tokenizer.as_target_tokenizer(): # These are target specific tokens.
    print(tokenizer(['जबकि अमेरिका के सड़क योजनाकार']))

{'input_ids': [[992, 1117, 6, 6341, 4408, 4125, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}




In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [None]:
dataset['train']['translation']

[{'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
 {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
 {'en': 'The default plugin layout for the bottom panel',
  'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'The default plugin layout for the top panel',
  'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'A list of plugins that are disabled by default',
  'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'},
 {'en': 'Highlight duration', 'hi': 'अवधि को हाइलाइट रकें'},
 {'en': 'The duration of the highlight box when selecting accessible nodes',
  'hi': 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि'},
 {'en': 'Highlight border color',
  'hi': 'सीमांत (बोर्डर) के रंग को हाइलाइट करें'},
 {'en': 'The color and opacity of the highlight border.',
  'hi': 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। '},
 {'en': 'Highlight fill color', 'hi': 

In [None]:
for i in dataset['train']['translation'][:20]:
    print(i['en'])

Give your application an accessibility workout
Accerciser Accessibility Explorer
The default plugin layout for the bottom panel
The default plugin layout for the top panel
A list of plugins that are disabled by default
Highlight duration
The duration of the highlight box when selecting accessible nodes
Highlight border color
The color and opacity of the highlight border.
Highlight fill color
The color and opacity of the highlight fill.
API Browser
Browse the various methods of the current accessible
Hide private attributes
Method
Property
Value
IPython Console
Interactive console for manipulating currently selected accessible
Event monitor


In [None]:
max_input_length = 128
max_target_length = 128

source_lang = 'en'
target_lang = 'hi'

def process_function(sentence):
    inputs = [ex[source_lang] for ex in sentence['translation']] # English sentences
    targets = [ex[target_lang] for ex in sentence['translation']] # Hindi Sentences
    model_inputs = tokenizer(inputs, max_length = max_input_length, truncation = True) # Tokenizer for the inputs
    with tokenizer.as_target_tokenizer(): # Tokenizer for targets using 'as_target_tokenizer'
        labels = tokenizer(targets, max_length = max_target_length, truncation = True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
tokenized_data = dataset.map(process_function, batched = True)

In [None]:
# Downloading the Pre-Trained Weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# Training Parameters.
batch_size = 32
learning_rate = 0.001
weight_decay = 0.01
num_train_epochs = 1

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model, return_tensors = 'tf')

In [None]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model, return_tensors = 'tf', pad_to_multiple_of=128)

In [None]:
train_dataset = model.prepare_tf_dataset(
    tokenized_data['test'],
    batch_size = batch_size,
    shuffle = True,
    collate_fn = data_collator,
)

TypeError: Cannot convert [array([6.000e+01, 2.170e+02, 5.340e+02, 6.000e+00, 3.900e+01, 1.776e+03,
       1.100e+01, 5.448e+03, 1.591e+03, 5.000e+00, 4.000e+01, 0.000e+00])] to EagerTensor of dtype int64

In [None]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_data['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = data_collator,
)

TypeError: Cannot convert [array([2.6970e+03, 1.0395e+04, 8.9000e+01, 1.3914e+04, 2.4701e+04,
       1.5000e+01, 1.8410e+03, 9.0750e+03, 1.8600e+02, 4.1000e+01,
       4.5500e+02, 2.9000e+01, 5.1000e+01, 4.2890e+03, 4.0000e+01,
       0.0000e+00])] to EagerTensor of dtype int64

In [None]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_data['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = generation_data_collator
)

In [None]:
optimizer = AdamWeightDecay(learning_rate = learning_rate, weight_decay_rate = weight_decay)

In [None]:
model.compile(optimizer = optimizer)

In [None]:
model.fit(train_dataset, validation_data = validation_dataset, epochs = num_train_epochs)

In [None]:
# Saving the Fine-Tuned Model.
model.save_pretrained('tf_translation')

### `Model Inference`

In [None]:
input_text = 'I am learning Coding. How about your teaching'

In [None]:
tokenized = tokenizer([input_text], return_tensors = 'np') # np means numpy array
tokenized

In [None]:
outputs = model.generate(**tokenized, max_length = 128) # ** means passing complete dict.

In [None]:
outputs[0]

In [None]:
with tokenizer.as_target_tokenizer():
  print(tokenizer.decode(outputs[0]))

In [None]:
# To remove the padding tokens from the result, use 'skip_special_tokens = True'.
with tokenizer.as_target_tokenizer():
  print(tokenizer.decode(outputs[0]), skip_special_tokens = True)

In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(outputs[0]), skip_special_tokens = True)