## Install and import the needed libraries

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

In [68]:
from utils.prepare import *
from utils.translation import *
from utils.summarizer import Summarize

## Load the dataset and split it into train, test, and validation sets

In [69]:
# Relative path to the CSV file the dataset is built from.
file_loc = "Data/twitter_data.csv"

# Load the CSV through the generic "csv" builder of `datasets`.
dataset = load_dataset("csv", data_files=file_loc)

# Banner and dataset summary, one per line.
print("---------- Dataset ----------", dataset, sep="\n")



  0%|          | 0/1 [00:00<?, ?it/s]

---------- Dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 100
    })
})


In [70]:
# Replace the loaded dataset with the result of the project's split helper.
dataset = split_data(dataset)

# Show the resulting structure, then one training row as a spot check.
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][31], sep="\n")



---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day."}


### Step 1: Language Identification
Identify the language of each token in the tweet using the lingua library.

In [71]:
# Run the project's language-identification helper over the dataset.
dataset = identified(dataset)

# Show the resulting structure, then the same training row as a spot check.
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][31], sep="\n")



---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'YORUBA', 'YORUBA', 'YORUBA', 

### Step 2: Code-switch detection
Detect the language switches in each tweet using regular expressions.

In [72]:
# Run the project's code-switch detection helper over the dataset.
dataset = detect(dataset)

# Show the resulting structure, then the same training row as a spot check.
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][31], sep="\n")



---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 

### Step 3: Translation
Translate each tweet into English using Google Translate.

In [73]:
# Run the project's translation helper over the dataset.
dataset = translate_tweet(dataset)

# Show the resulting structure, then the same training row as a spot check.
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][31], sep="\n")

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, 'ENGLISH', 'E

#### Evaluate the translator using the BLEU (Bilingual Evaluation Understudy) metric

**We calculate the BLEU score for each tweet and then compute the average.**

In [74]:
# Run the project's BLEU scoring helper over the dataset.
dataset = computed_bleu_score(dataset)

# Show the resulting structure, then the same training row as a spot check.
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][31], sep="\n")

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 64
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 16
    })
})
---------- Example output ----------
{'Tweets': 'Happy birthday to my darling Adebisi! Ojo ayo yi ni yi o, mo wa dupe lowo Oluwa.', 'Eng_source': "Happy birthday to my darling Adebisi! This is a day of joy, and I'm grateful to God.", 'Summary': "Happy Birthday Adebisi! I'm grateful to God for this joyful day.", 'Language': ['ENGLISH', None, 'ENGLISH', 'ENGLISH', 'ENGLISH', None, 'YORUBA', 'YORUBA', 'ENGLISH', 'ENGLISH', 'ENGLISH', 'ENGL

In [76]:
# Report the average BLEU score for the dataset.
# NOTE(review): which splits are averaged is decided inside the helper and is
# not visible from this notebook — confirm in utils.
bleu_avg(dataset)

Bleu score: 0.5392


### Step 4: Summarization

In [77]:
# Create the Summarize helper (imported from utils.summarizer). The
# tokenization, model loading, collation, trainer construction and test
# calls later in this section all go through this object.
summarize = Summarize()

In [78]:
# Tokenize the dataset via the Summarize helper; the result is later passed to
# summarize.model_trainer.
# NOTE(review): which columns are tokenized is defined in utils.summarizer —
# confirm there.
tokenize_data = summarize.tokenize_data(dataset)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [79]:
# Load the model used for summarization.
# NOTE(review): no checkpoint name is passed here, so the choice of model
# lives inside Summarize.load_model — confirm there.
model = summarize.load_model()

In [80]:
# Build the data collator for the loaded model; it is handed to
# summarize.model_trainer together with the tokenized data.
collator = summarize.collate_data(model)

In [81]:
# Create the trainer from the model, the tokenized data and the collator.
# NOTE(review): training hyperparameters and metric computation are not set
# here; presumably they are configured inside Summarize.model_trainer — verify.
trainer = summarize.model_trainer(model, tokenize_data, collator)

In [82]:
# Run training. The call is left as the cell's last expression so that its
# return value is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,9.700435,0.519893,0.324254,0.4854,0.49268
2,No log,7.263669,0.519331,0.318614,0.48948,0.492605
3,No log,5.977238,0.518875,0.315634,0.48556,0.488729


TrainOutput(global_step=48, training_loss=9.165999094645182, metrics={'train_runtime': 26.4779, 'train_samples_per_second': 7.251, 'train_steps_per_second': 1.813, 'total_flos': 58534699991040.0, 'train_loss': 9.165999094645182, 'epoch': 3.0})

In [83]:
# Evaluate the trained model; the returned metrics are displayed as the cell
# output.
# NOTE(review): presumably this scores the validation split — confirm against
# the trainer setup in utils.summarizer.
trainer.evaluate()

{'eval_loss': 5.977237701416016,
 'eval_rouge1': 0.5188752870015364,
 'eval_rouge2': 0.31563374139742206,
 'eval_rougeL': 0.48556009342718953,
 'eval_rougeLsum': 0.4887288363529416,
 'eval_runtime': 2.6069,
 'eval_samples_per_second': 6.138,
 'eval_steps_per_second': 3.069,
 'epoch': 3.0}

In [91]:
# Spot-check the trained model on a single example.
# NOTE(review): test_model presumably reads the same "test" split that is
# printed below — confirm in utils.summarizer.
index = 16
output = summarize.test_model(dataset, index, trainer)

# Fetch the row once and read both text fields from it.
test_row = dataset["test"][index]
print("Original tweet: ", test_row["Tweets"])
print("Translated tweet: ", test_row["Translated_tweet"])
print("Generated summary: ", output)

Original tweet:  My mate has an offer as a junior project manager at his company, Ta lo fe?
Translated tweet:  My mate has an offer as a junior project manager at his company, who wants it?
Generated summary:  </s><s>My mate has an offer as a junior project manager at his company, who wants it</s>
