## Import the needed libraries

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

In [3]:
from utils.prepare import *
from utils.translation import *
from utils.summarizer import Summarize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Import and split dataset into train, test, and validation

In [4]:
# Read the tweet CSV through the Hugging Face "csv" loader; the result is a
# DatasetDict holding a single "train" split.
file_loc = "Data/twitter_data.csv"
dataset = load_dataset(path="csv", data_files=file_loc)
print("---------- Dataset ----------", dataset, sep="\n")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-09e5227859e07750/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-09e5227859e07750/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 66
    })
})


In [5]:
# Break the loaded data into train / test / validation splits.
# NOTE: `dataset` is rebound to the helper's own output, so re-running this
# cell on its own passes the already-split data back into split_data;
# re-run from the loading cell instead.
dataset = split_data(dataset)
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][4], sep="\n")

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 41
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 11
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?'}


### Step 1: Language Identification
Identify the language of each token in the tweet using the lingua library.

In [6]:
# Run language identification over every split; the helper adds a
# 'Language' column holding the per-token labels.
dataset = identified(dataset)
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][4], sep="\n")

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 41
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 11
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'ENGLISH', 'ENGLISH', None, None, None, 'YORUBA', 'YORUBA', None, 'ENG

### Step 2: Code-switch detection
Detect the language switches in each tweet using regular expressions.

In [7]:
# Run code-switch detection over every split; the helper adds a
# 'Code_switches' column.
dataset = detect(dataset)
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][4], sep="\n")

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 41
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 11
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'ENGLISH', 'ENGLISH

### Step 3: Translation
Translate each tweet using Google Translate.

In [8]:
# Translate the tweets in every split; the helper adds a
# 'Translated_tweet' column.
dataset = translate_tweet(dataset)
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][4], sep="\n")



Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 41
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 11
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH

#### Evaluate the performance of the translator using BLEU (Bilingual Evaluation Understudy) metric

**We calculate the BLEU score for each tweet and then compute the average.**

In [9]:
# Score each translated tweet; the helper adds a per-row 'Bleu_score'
# column to every split.
dataset = computed_bleu_score(dataset)
print("---------- Updated dataset ----------", dataset, sep="\n")
print("---------- Example output ----------", dataset["train"][4], sep="\n")

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

---------- Updated dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 41
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 11
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENG

In [10]:
# Report the average BLEU score over the dataset; the helper prints it
# as "Bleu score: <value>".
bleu_avg(dataset)

Bleu score: 0.4990


### Step 4: Summarization

In [11]:
# Create an instance of the Summarize class (imported from utils.summarizer).
# The following cells use it to tokenize the data, load the model, build the
# data collator and trainer, and run a test prediction.
summarize = Summarize()

In [12]:
# Tokenize the dataset with the Summarize helper; the result is what the
# trainer cell below consumes.
tokenize_data = summarize.tokenize_data(dataset)

Map:   0%|          | 0/41 [00:00<?, ? examples/s]



Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [13]:
# Load the summarization model.
# NOTE(review): the training log refers to BartForConditionalGeneration, so
# this presumably loads a pretrained BART checkpoint — confirm in
# utils/summarizer.
model = summarize.load_model()

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [14]:
# Build the data collator for the model; it is passed to the trainer in the
# next cell.
collator = summarize.collate_data(model)

In [15]:
# Define the model trainer from the model, the tokenized data, and the
# data collator created in the previous cells.
trainer = summarize.model_trainer(model, tokenize_data, collator)

In [16]:
# Train the model. The call is the cell's last expression, so the returned
# TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Tweets, Summary, Translated_tweet, Code_switches, Eng_source, Bleu_score, Language. If Tweets, Summary, Translated_tweet, Code_switches, Eng_source, Bleu_score, Language are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 41
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 30
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,No log,11.119075,0.470957,0.229379,0.441364,0.445964
1,No log,9.991323,0.470957,0.229379,0.441364,0.445964
2,No log,9.450173,0.470069,0.22928,0.43935,0.444815


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Tweets, Summary, Translated_tweet, Code_switches, Eng_source, Bleu_score, Language. If Tweets, Summary, Translated_tweet, Code_switches, Eng_source, Bleu_score, Language are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "nu

TrainOutput(global_step=30, training_loss=11.0942138671875, metrics={'train_runtime': 21.5648, 'train_samples_per_second': 5.704, 'train_steps_per_second': 1.391, 'total_flos': 37193923952640.0, 'train_loss': 11.0942138671875, 'epoch': 2.95})

In [18]:
# Test the model on a single example from the test split.
index = 8
output = summarize.test_model(dataset, index, trainer)

# Fetch the row once and read its columns from it. Indexing the column first
# (dataset["test"]["Tweets"][index]) materializes the entire column just to
# read a single value, and the original did that twice.
test_example = dataset["test"][index]
print("Original tweet: ", test_example["Tweets"])
print("Translated tweet: ", test_example["Translated_tweet"])

# NOTE(review): the generated summary is printed with the tokenizer's special
# tokens still attached (e.g. "</s><s>...</s><pad>"). Presumably test_model
# decodes without skip_special_tokens=True — confirm in utils/summarizer.
print("Generated summary: ", output)

***** Running Prediction *****
  Num examples = 1
  Batch size = 2
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Original tweet:  I hope say you dey alright, mo n'ife re gan ni.
Translated tweet:  I hope you say you're alright, I really love you.
Generated summary:  </s><s>I hope you say you're alright, I really love you.</s><pad><pad><pad><pad>
