## Install and import the needed libraries

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

In [3]:
from utils.prepare import *
from utils.translation import *
from utils.summarizer import Summarize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Load the dataset and split it into train, test, and validation sets

In [4]:
# Read the annotated tweet corpus from a local CSV file. No split is named in
# the call, and the printed summary shows every row under a single "train" split.
file_loc = "Data/twitter_data.csv"
dataset = load_dataset("csv", data_files=file_loc)
# A single print with a newline separator emits the banner and the dataset
# summary on consecutive lines.
print("---------- Dataset ----------", dataset, sep="\n")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-532d11f144688f31/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-532d11f144688f31/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 100
    })
})


In [5]:
# Partition the corpus into train / test / validation splits.
# NOTE: `dataset` is rebound to the split result, so re-running this cell
# would call split_data on an already-split dataset; re-run from the loading
# cell instead.
dataset = split_data(dataset)
# Show the new split layout followed by one sample training row.
print(
    "---------- Updated dataset ----------",
    dataset,
    "---------- Example output ----------",
    dataset["train"][31],
    sep="\n",
)

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day."}


### Step 1: Language identification
Identify the language of each token in the tweet using the lingua library.

In [6]:
# Run language identification over every split; the result gains a
# 'Language' column (a list of per-token labels, None where no label was
# assigned -- presumably punctuation or undetected tokens; confirm in utils).
dataset = identified(dataset)
# Show the updated schema followed by the same sample training row.
print(
    "---------- Updated dataset ----------",
    dataset,
    "---------- Example output ----------",
    dataset["train"][31],
    sep="\n",
)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'YORUBA', 'YORUBA', 'YORUBA', 

### Step 2: Code-switch detection
Detect the language switches in each tweet using regular expressions.

In [7]:
# Locate code-switch points in every split; the result gains a
# 'Code_switches' column alongside the existing 'Language' labels.
dataset = detect(dataset)
# Show the updated schema followed by the same sample training row.
print(
    "---------- Updated dataset ----------",
    dataset,
    "---------- Example output ----------",
    dataset["train"][31],
    sep="\n",
)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 

### Step 3: Translation
Translate each tweet into English using Google Translate.

In [8]:
# Translate the tweets in every split; the result gains a
# 'Translated_tweet' column.
dataset = translate_tweet(dataset)
# Show the updated schema followed by the same sample training row.
print(
    "---------- Updated dataset ----------",
    dataset,
    "---------- Example output ----------",
    dataset["train"][31],
    sep="\n",
)



Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'E

#### Evaluate the performance of the translator using the BLEU (Bilingual Evaluation Understudy) metric

**We calculate the BLEU score for each tweet and then compute the average.**

In [9]:
# Score each translation; the result gains a per-row 'Bleu_score' column.
# NOTE(review): presumably 'Translated_tweet' is scored against 'Eng_source'
# -- confirm in the utils implementation.
dataset = computed_bleu_score(dataset)
# Show the updated schema followed by the same sample training row.
print(
    "---------- Updated dataset ----------",
    dataset,
    "---------- Example output ----------",
    dataset["train"][31],
    sep="\n",
)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGL

In [11]:
# Report the overall BLEU figure for the translation step. `bleu_avg` comes
# from one of the star-imported utils modules; presumably it averages the
# per-tweet 'Bleu_score' column -- confirm against its definition.
bleu_avg(dataset)

Bleu score: 0.5415


### Step 4: Summarization

In [12]:
# Create the Summarize helper (imported from utils.summarizer); the cells
# below call its methods to tokenize the data, load the model, build the
# collator and trainer, and generate a test summary.
summarize = Summarize()

In [13]:
# Tokenize the dataset for the summarization model.
# NOTE(review): which columns act as model input and target is decided inside
# Summarize.tokenize_data -- presumably 'Translated_tweet' -> 'Summary'; confirm.
tokenize_data = summarize.tokenize_data(dataset)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [14]:
# Load the summarization model through the Summarize helper. On a fresh
# environment this first downloads the pretrained weights.
model = summarize.load_model()

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [15]:
# Build the data collator for this model; it is handed to the trainer in the
# next step.
collator = summarize.collate_data(model)

In [16]:
# Assemble the trainer from the model, the tokenized dataset, and the collator.
# Training hyperparameters are set inside Summarize.model_trainer, not here.
trainer = summarize.model_trainer(model, tokenize_data, collator)

In [17]:
# Fine-tune the model. The call is deliberately left as the cell's last
# expression so the notebook displays the returned TrainOutput summary.
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,9.70136,0.518673,0.321318,0.479959,0.490327
2,No log,7.2876,0.518948,0.315119,0.483345,0.489831
3,No log,5.982377,0.518743,0.312458,0.47945,0.486789


TrainOutput(global_step=48, training_loss=9.161104202270508, metrics={'train_runtime': 28.2252, 'train_samples_per_second': 6.802, 'train_steps_per_second': 1.701, 'total_flos': 58534699991040.0, 'train_loss': 9.161104202270508, 'epoch': 3.0})

In [18]:
# Evaluate the fine-tuned model; the returned metrics dict (loss, ROUGE
# scores, runtime statistics) is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 5.982377052307129,
 'eval_rouge1': 0.5187429377046906,
 'eval_rouge2': 0.3124579906952967,
 'eval_rougeL': 0.4794502022674353,
 'eval_rougeLsum': 0.48678911024190163,
 'eval_runtime': 2.6757,
 'eval_samples_per_second': 5.98,
 'eval_steps_per_second': 2.99,
 'epoch': 3.0}

In [19]:
# Qualitative check: generate a summary for one held-out tweet and show it
# next to the original and translated text.
index = 16  # row position within the "test" split
output = summarize.test_model(dataset, index, trainer)
# Fetch the row once. Row-first access (dataset[i][column]) reads a single
# example, whereas column-first access (dataset[column][i]) loads the whole
# column before indexing into it.
example = dataset["test"][index]
print("Original tweet: ", example["Tweets"])
print("Translated tweet: ", example["Translated_tweet"])
# NOTE(review): the generated text still carries tokenizer special tokens
# (</s>, <s>); presumably Summarize.test_model decodes without
# skip_special_tokens=True -- confirm in utils.summarizer.
print("Generated summary: ", output)

Original tweet:  My mate has an offer as a junior project manager at his company, Ta lo fe?
Translated tweet:  My mate has an offer as a junior project manager at his company, who wants it?
Generated summary:  </s><s>My mate has an offer as a junior project manager at his company, who wants it</s>
