### Installing and importing the packages needed for our project

In [None]:
import locale


def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"


# Replace the locale lookup so that every later caller is told the encoding is UTF-8.
# NOTE(review): presumably a workaround for hosted notebooks whose locale is not UTF-8 - confirm.
setattr(locale, "getpreferredencoding", getpreferredencoding)


In [None]:
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install sentencepiece
import transformers
import pandas as pd
import numpy as np
import random
from huggingface_hub import notebook_login
from transformers.utils import send_example_telemetry
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from IPython.display import display, HTML
from transformers import AutoTokenizer
from keras.callbacks import ModelCheckpoint
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


### Defining our model: mT0 Base

In [None]:
#Defining our model: mT0 base. This checkpoint name is reused below to load
#both the tokenizer and the model, and to name the training output directory.
model_checkpoint = "bigscience/mt0-base"


###Loading the dataset

In [None]:
#Loading the dataset ("Dora"), which contains more than 10 000 rows
df =  pd.read_csv ('./sentences_.csv')
#Loading the sacreBLEU metric used later to score the translations
metric = load_metric("sacrebleu")

In [None]:
#Dropping the rows that contain null values
df = df.dropna()
#Prepending the task instruction to the 'english' column, so that every source
#sentence carries the same prompt that is used when querying the model later on.
#Rows that already start with the instruction are left untouched, which keeps this
#cell safe to re-run (the instruction is never added twice).
task_prefix = 'translate to arabic : '
english = df['english'].astype(str)
df['english'] = english.where(english.str.startswith(task_prefix), task_prefix + english)
df['english'].head()

### Splitting the data into 2 parts: 80% for training, 20% for testing

In [None]:
# Split the data into a train set and a test set (20% of the rows go to the test set).
# random_state=42 fixes the shuffle, so the same split is produced on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [None]:
test.shape

(2000, 2)

###Hugging face dataset load from CSV

In [None]:
#Converting the pandas splits to Hugging Face datasets.
#preserve_index=False keeps the shuffled pandas index out of the datasets, so no
#"__index_level_0__" column is generated and nothing has to be removed afterwards.
train_ = Dataset.from_pandas(train, preserve_index=False)
test_ = Dataset.from_pandas(test, preserve_index=False)
#Grouping both splits in the Hugging Face DatasetDict format.
#The name `df` is kept because the following cells access df["train"] and df["test"].
df = DatasetDict({"train": train_, "test": test_})


### Displaying random samples from the dataset

In [None]:
!pip install datasets


In [None]:
!pip install datasets


In [None]:
from datasets import Dataset, DatasetDict, load_dataset, load_metric

In [None]:
#Showing some random examples from our dataset to understand what it looks like
def show_random_elements(df, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of a dataset as an HTML table.

    Args:
        df: the dataset to sample from. It must support len(), indexing with a list
            of row positions, and expose a `features` mapping
            (presumably a Hugging Face `datasets.Dataset` - TODO confirm).
        num_examples: number of distinct rows to display.

    Raises:
        AssertionError: if `num_examples` is larger than the number of rows.
    """
    assert num_examples <= len(df), "Can't pick more elements than there are in the dataset."
    # Imported locally: the notebook only imports individual names from `datasets`,
    # so the bare module name `datasets` is not available in the namespace.
    from datasets import ClassLabel

    # random.sample draws distinct positions directly, no retry loop needed.
    picks = random.sample(range(len(df)), num_examples)

    data = pd.DataFrame(df[picks])
    for column, typ in df.features.items():
        if isinstance(typ, ClassLabel):
            # Show the label names instead of their integer ids.
            data[column] = data[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(data.to_html()))

In [None]:
show_random_elements(df["train"])

###Generating our metric

In [None]:
#Sanity-checking the metric on two made-up predictions that are identical to their references
# NOTE(review): the bare `metric` expression below has no visible effect, since it is not
# the last expression of the cell.
metric
fake_preds = ["hello brother", "salam khouya"]
# Each prediction gets a list of references (a single reference per prediction here).
fake_labels = [["hello brother"], ["salam khouya"]]
metric.compute(predictions=fake_preds, references=fake_labels)

### Preprocessing data
We need to preprocess our data before giving it to our model.
We use the Transformers tokenizer to tokenize the inputs.

In [None]:
#Preprocessing data: loading the tokenizer that belongs to the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
#mBART tokenizers need the source and target language codes to be set explicitly.
#The codes use an underscore (e.g. "en_XX", "ar_AR"); "en-XX" is not a valid code.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "ar_AR"

In [None]:
#Testing our tokenizer on a single sentence
tokenizer("Hello, this one sentence!")

{'input_ids': [30273, 261, 714, 1371, 259, 98923, 309, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
#The targets are tokenized through the `text_target` argument.
#This makes sure the tokenizer uses the special tokens corresponding to the targets,
#and replaces the deprecated `as_target_tokenizer` context manager.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

In [None]:
#T5 checkpoints require a special prefix to put before the inputs.
#For every other checkpoint (including the mT0 one used here) the prefix stays empty.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate English to darija: "
else:
    prefix = ""

In [None]:
#the function that will preprocess our samples
max_input_length = 128
max_target_length = 128
source_lang = "english"
target_lang = "darija"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for sequence-to-sequence training.

    Args:
        examples: a batch that maps column names to lists of values; the
            `source_lang` and `target_lang` columns are read.

    Returns:
        The tokenized inputs (truncated to `max_input_length`), with the token ids
        of the tokenized targets (truncated to `max_target_length`) stored under
        the "labels" key.
    """
    inputs = [prefix + ex for ex in examples[source_lang]]
    targets = list(examples[target_lang])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets through `text_target`, the replacement for the
    # deprecated `as_target_tokenizer` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
show_random_elements(df["train"])

To apply the preprocess function on all the pairs of sentences in our dataset, we just use the map method of our dataset object we created earlier

In [None]:
tokenized_datasets = df.map(preprocess_function, batched=True)


###Finetuning

In [None]:
#Finetuning
#We use the AutoModelForSeq2SeqLM class. As with the tokenizer,
#the from_pretrained method will download and cache the model for us.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

To instantiate a Seq2SeqTrainer, we will need to define three more things. The most important is the Seq2SeqTrainingArguments, which is a class that contains all the attributes to customize the training. 

In [None]:
# Column names of the language pair; also used to name the output directory below.
source_lang = "english"
target_lang = "darija"
batch_size = 16
# Keep only the last path component of the checkpoint name ("bigscience/mt0-base" -> "mt0-base").
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory for this training run.
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # Evaluate at the end of every epoch.
    # NOTE(review): recent transformers releases rename this argument to `eval_strategy` - confirm
    # against the installed version.
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most 3 saved checkpoints.
    save_total_limit=3,
    num_train_epochs=5,
    # Generate full translations during evaluation so that compute_metrics can score them.
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

The last thing to define for our Seq2SeqTrainer is how to compute the metrics from the predictions.

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Each label is additionally wrapped in a one-element list.

    Returns:
        A `(preds, labels)` tuple: a list of stripped predictions and a list of
        one-element lists holding the stripped labels.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and the mean generated length for an evaluation run.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays; when the
            predictions are a tuple, only its first element is used.

    Returns:
        A dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the predictions and labels as we can't decode them.
    # Doing it for the predictions too keeps batch_decode from failing on -100
    # padding and keeps that padding out of the generated-length count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

We need to pass all of this, along with our datasets, to the Seq2SeqTrainer.

In [None]:
# Bring together the model, the training arguments, the tokenized splits,
# the batch collator, the tokenizer and the metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    # The 20% test split is used as the evaluation set.
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

We can now finetune our model by just calling the train method

In [None]:

trainer.train()

###Testing with new data

In [None]:
#The inputs are moved to the device the model is on (instead of a hard-coded "cuda"),
#so this cell also works when no GPU is available.
inputs = tokenizer("translate to arabic : hi, I am salma and you?", return_tensors="pt").to(model.device)
output = model.generate(**inputs)
output

In [None]:
tokenizer.decode(output.cpu().numpy()[0])

In [None]:
#The inputs are moved to the device the model is on (instead of a hard-coded "cuda"),
#so this cell also works when no GPU is available.
inputs = tokenizer("translate to arabic : hello, would you like to give a pitch tomorrow?", return_tensors="pt").to(model.device)
output = model.generate(**inputs)

In [None]:
tokenizer.decode(output.cpu().numpy()[0])

### Gradio interface

In [None]:
!pip install gradio

In [None]:
!pip install transformers

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:

!pip install diffusers==0.12.1

In [4]:
import os
import torch
import gradio as gr
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


In [1]:
# (tokenizer, model) pairs that were already loaded, keyed by checkpoint name,
# so that the model is not loaded again on every translation request.
_TRANSLATION_MODELS = {}


def _load_translation_model(model_checkpoint):
    """Return the (tokenizer, model) pair for `model_checkpoint`, loading it on first use only."""
    if model_checkpoint not in _TRANSLATION_MODELS:
        _TRANSLATION_MODELS[model_checkpoint] = (
            AutoTokenizer.from_pretrained(model_checkpoint),
            AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
        )
    return _TRANSLATION_MODELS[model_checkpoint]


def translation(text):
    """Translate an English text with the "bigscience/mt0-base" checkpoint.

    Note that this loads the base checkpoint by name, not the model object that
    was fine-tuned earlier in the notebook.

    Args:
        text: the English text to translate.

    Returns:
        The decoded translation without special tokens, or an empty string when
        `text` is None, empty or only whitespace.
    """
    # Nothing to translate: answer immediately without touching the model.
    if text is None or not text.strip():
        return ""

    model_checkpoint = "bigscience/mt0-base"
    tokenizer, model = _load_translation_model(model_checkpoint)
    #inference
    # Same instruction (including spacing) as the one prepended to the training data.
    inputs = tokenizer("translate to arabic : " + text, return_tensors="pt")
    output = model.generate(**inputs)
    return tokenizer.decode(output.cpu().numpy()[0], skip_special_tokens=True)

In [8]:
translation('how are you doing today ?')

'كيف فعلت اليوم؟'

In [None]:
if __name__ == '__main__':
    print('\tinit models')

    # The components are created with gr.Textbox directly: the gr.inputs / gr.outputs
    # namespaces were removed from Gradio.
    inputs = [gr.Textbox(lines=5, label="Input text")]

    outputs = gr.Textbox(label="Output text")

    title = "Derej M3aya"

    demo_status = "Demo is running on CPU"
    # Link to the model that `translation` actually loads.
    description = f"Details: https://huggingface.co/bigscience/mt0-base. {demo_status}"
    # One row per example, one value per input component (a single text box here).
    examples = [
        ['Hi nice to meet you']
    ]

    gr.Interface(translation,
                 inputs,
                 outputs,
                 title=title,
                 description=description,
                 examples=examples,
                 ).launch(share=True, debug = True)

