## **Loading the Model**

In [140]:
!pip install --quiet sacremoses

In [141]:
from transformers import MarianMTModel, MarianTokenizer

In [142]:
model_name = 'Helsinki-NLP/opus-mt-en-fr'

In [143]:
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [144]:
model = MarianMTModel.from_pretrained(model_name)

In [145]:
text = "Schedule"

In [146]:
model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [147]:
translation = model.generate(**model_inputs)

In [148]:
translation

tensor([[59513,  4576,     0]])

In [149]:
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)

In [150]:
translated_text # give something else

'Tableau'

## **Preparing the Dataset**

In [151]:
!pip install datasets --quiet

In [152]:
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd

In [153]:
df = pd.read_excel('Chat.xlsx')

In [154]:
df.head()

Unnamed: 0,English (en),French (fr)
0,"""Tasks"" <Page Heading>",Tâches
1,"""Search By task title or unit name"" <Search ba...",Recherche par titre de la tâche ou nom de ...
2,"""no filters applied"" the selected filter bar",Aucun filtre n'est appliqué (no filter is appl...
3,"""Select Property"" <dropdown to select property>",Sélectionner une propriété (select a property)
4,"""Assign Employee "" <Button>",Affectation d'un employé (Assignment of an emp...


In [155]:
df.isnull().sum()

English (en)    0
French (fr)     2
dtype: int64

In [156]:
data = df.dropna()

In [157]:
data.head()

Unnamed: 0,English (en),French (fr)
0,"""Tasks"" <Page Heading>",Tâches
1,"""Search By task title or unit name"" <Search ba...",Recherche par titre de la tâche ou nom de ...
2,"""no filters applied"" the selected filter bar",Aucun filtre n'est appliqué (no filter is appl...
3,"""Select Property"" <dropdown to select property>",Sélectionner une propriété (select a property)
4,"""Assign Employee "" <Button>",Affectation d'un employé (Assignment of an emp...


In [158]:
data.isnull().sum()

English (en)    0
French (fr)     0
dtype: int64

In [159]:
data.columns

Index(['English (en)', 'French (fr)'], dtype='object')

In [160]:
import re

In [161]:
def extract_text(text):
    match = re.search(r'"([^"]*)"', text)
    if match:
        return match.group(1)
    else:
        return ''

In [162]:
data['English (en)'] = data['English (en)'].apply(extract_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['English (en)'] = data['English (en)'].apply(extract_text)


In [163]:
data.head()

Unnamed: 0,English (en),French (fr)
0,Tasks,Tâches
1,Search By task title or unit name,Recherche par titre de la tâche ou nom de ...
2,no filters applied,Aucun filtre n'est appliqué (no filter is appl...
3,Select Property,Sélectionner une propriété (select a property)
4,Assign Employee,Affectation d'un employé (Assignment of an emp...


In [164]:
# Function to remove text within parentheses
def remove_text_in_parentheses(text):
    return re.sub(r'\([^)]*\)', '', text)

In [165]:
data['French (fr)'] = data['French (fr)'].apply(remove_text_in_parentheses)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['French (fr)'] = data['French (fr)'].apply(remove_text_in_parentheses)


In [166]:
data

Unnamed: 0,English (en),French (fr)
0,Tasks,Tâches
1,Search By task title or unit name,Recherche par titre de la tâche ou nom de ...
2,no filters applied,Aucun filtre n'est appliqué
3,Select Property,Sélectionner une propriété
4,Assign Employee,Affectation d'un employé
5,Assign Team,Assigner l'équipe
6,Copy,Copier
7,Bulk Upload,Chargement en vrac
8,Add,Ajouter
10,TASK DETAILS,DÉTAILS DE TÂCHE


## **Check all the translations**

In [167]:
def translate(text, model, tokenizer):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate translation outputs
    translated = model.generate(**inputs)

    # Decode the translated text
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_text

In [168]:
# Apply the translation function to all English sentences
data['translated_french_before_finetuning'] = data['English (en)'].apply(lambda x: translate(x, model, tokenizer)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['translated_french_before_finetuning'] = data['English (en)'].apply(lambda x: translate(x, model, tokenizer)[0])


In [169]:
# Display sample translations and their corresponding original French sentences
for index, row in data.iterrows():
    print(f"English: {row['English (en)']}")
    print(f"Original French: {row['French (fr)']}")
    print(f"Translated French Before Finetuning: {row['translated_french_before_finetuning']}\n")

English: Tasks
Original French: Tâches
Translated French Before Finetuning: Tâches

English: Search By task title or unit name
Original French:     Recherche par titre de la tâche ou nom de l'unité
Translated French Before Finetuning: Recherche par titre de la tâche ou nom de l'unité

English: no filters applied
Original French: Aucun filtre n'est appliqué 
Translated French Before Finetuning: Aucun filtre n'est appliqué

English: Select Property
Original French: Sélectionner une propriété 
Translated French Before Finetuning: Sélectionner une propriété

English: Assign Employee 
Original French: Affectation d'un employé 
Translated French Before Finetuning: Affectation d'un employé

English: Assign Team
Original French: Assigner l'équipe
Translated French Before Finetuning: Assigner l'équipe

English: Copy
Original French: Copier
Translated French Before Finetuning: Copier

English: Bulk Upload
Original French: Chargement en vrac 
Translated French Before Finetuning: Chargement en vra

## **Training the Model**

In [170]:
dataset = Dataset.from_pandas(data)

In [171]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples["English (en)"], max_length=128, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["French (fr)"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [172]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/33 [00:00<?, ? examples/s]



In [173]:
!pip install transformers[torch] --quiet

In [174]:
!pip install accelerate -U --quiet

In [175]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,  # You can adjust the number of epochs based on your dataset size and desired performance
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

In [176]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [177]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=90, training_loss=0.08743160565694173, metrics={'train_runtime': 181.4374, 'train_samples_per_second': 1.819, 'train_steps_per_second': 0.496, 'total_flos': 460276236288.0, 'train_loss': 0.08743160565694173, 'epoch': 10.0})

## **Translate English Sentences to check**

In [178]:
text = "Schedule"

In [179]:
model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [180]:
translation = model.generate(**model_inputs)

In [181]:
translation

tensor([[59513, 17923,     0]])

In [182]:
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)

In [183]:
translated_text

'Calendrier'

In [184]:
def translate(text, model, tokenizer):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate translation outputs
    translated = model.generate(**inputs)

    # Decode the translated text
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_text

In [185]:
# Apply the translation function to all English sentences
data['translated_french'] = data['English (en)'].apply(lambda x: translate(x, model, tokenizer)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['translated_french'] = data['English (en)'].apply(lambda x: translate(x, model, tokenizer)[0])


## **Compare Translations**

In [186]:
# Display sample translations and their corresponding original French sentences
for index, row in data.iterrows():
    print(f"English: {row['English (en)']}")
    print(f"Original French: {row['French (fr)']}")
    print(f"Translated French: {row['translated_french']}\n")
    print(f"Translated French Before Finetuning: {row['translated_french_before_finetuning']}\n")

English: Tasks
Original French: Tâches
Translated French: Tâches

Translated French Before Finetuning: Tâches

English: Search By task title or unit name
Original French:     Recherche par titre de la tâche ou nom de l'unité
Translated French: Recherche par titre de la tâche ou nom de l'unité

Translated French Before Finetuning: Recherche par titre de la tâche ou nom de l'unité

English: no filters applied
Original French: Aucun filtre n'est appliqué 
Translated French: Aucun filtre n'est appliqué

Translated French Before Finetuning: Aucun filtre n'est appliqué

English: Select Property
Original French: Sélectionner une propriété 
Translated French: Sélectionner une propriété

Translated French Before Finetuning: Sélectionner une propriété

English: Assign Employee 
Original French: Affectation d'un employé 
Translated French: Affectation d'un employé

Translated French Before Finetuning: Affectation d'un employé

English: Assign Team
Original French: Assigner l'équipe
Translated Fre

## **Save the Model**

In [187]:
model_save_path = './trained_marian_model'
tokenizer_save_path = './trained_marian_tokenizer'

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


('./trained_marian_tokenizer/tokenizer_config.json',
 './trained_marian_tokenizer/special_tokens_map.json',
 './trained_marian_tokenizer/vocab.json',
 './trained_marian_tokenizer/source.spm',
 './trained_marian_tokenizer/target.spm',
 './trained_marian_tokenizer/added_tokens.json')

## **Loading the Saved Model**

In [188]:
from transformers import MarianMTModel, MarianTokenizer

# Specify the path where your model and tokenizer are saved
model_path = './trained_marian_model'
tokenizer_path = './trained_marian_tokenizer'

# Load the model
model = MarianMTModel.from_pretrained(model_path)

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained(tokenizer_path)

In [189]:
translated_text # give something else

'Calendrier'

In [190]:
text = "Schedule"

In [191]:
model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [192]:
translation = model.generate(**model_inputs)

In [193]:
translation

tensor([[59513, 17923,     0]])

In [194]:
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)

In [195]:
translated_text

'Calendrier'

## **Save Model to Drive**

In [196]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [197]:
!cp -r ./trained_marian_model "/content/drive/MyDrive/Object Detection Project/Saved Models/opus-mt-en-fr_finetuned"
!cp -r ./trained_marian_tokenizer "/content/drive/MyDrive/Object Detection Project/Saved Models/opus-mt-en-fr_tokeniser"


## **Push the Model to Hub**

In [198]:
!pip install -q transformers huggingface_hub --upgrade

In [199]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [200]:
model_name_on_hub = "dev02chandan/opus-mt-en-fr_finetuned"

# Push to hub
model.push_to_hub(model_name_on_hub)
tokenizer.push_to_hub(model_name_on_hub)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


CommitInfo(commit_url='https://huggingface.co/dev02chandan/opus-mt-en-fr_finetuned/commit/87da5cda7a234934f3148e7903f6a94723f761a8', commit_message='Upload tokenizer', commit_description='', oid='87da5cda7a234934f3148e7903f6a94723f761a8', pr_url=None, pr_revision=None, pr_num=None)

## **Inference From Hub**

In [201]:
from transformers import MarianMTModel, MarianTokenizer

repository_id = 'dev02chandan/opus-mt-en-fr_finetuned'

# Load the model
model = MarianMTModel.from_pretrained(repository_id)

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained(repository_id)

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

In [202]:
def translate(text, model, tokenizer):
    # Tokenize the text
    inputs = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)

    # Generate translation outputs
    outputs = model.generate(inputs)

    # Decode the translated text
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Example usage
english_text = "Schedule"
translated_text = translate(english_text, model, tokenizer)

print(f"Translated text: {translated_text}")


Translated text: Calendrier


## **Conclusion**

Hence we finetuned the model to give certain translations the way we want.

Access finetuned model here: [Here](https://huggingface.co/dev02chandan/opus-mt-en-fr_finetuned)