## Install and import the needed libraries

In [1]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install rouge_score
!pip install evaluate
!pip install torch
!pip3 install torch torchvision
!pip install lingua-language-detector
!pip3 install -U numpy
!pip3 install googletrans==3.1.0a0

In [2]:
import transformers
import nltk
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import re
from googletrans import Translator
from lingua import Language, LanguageDetectorBuilder
from nltk.tokenize import sent_tokenize, word_tokenize
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from nltk.tokenize import RegexpTokenizer
from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer



In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Import and split dataset into train, test, and validation

In [5]:
file_loc = "Data/twitter_data.csv"
dataset = load_dataset("csv", data_files=file_loc)
print("---------- Dataset ----------")
print(dataset)



  0%|          | 0/1 [00:00<?, ?it/s]

---------- Dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 66
    })
})


In [6]:
datasets_train_test = dataset["train"].train_test_split(test_size=0.1, seed=0)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=0.1, seed=0)

dataset["train"] = datasets_train_validation["train"]
dataset["test"] = datasets_train_test["test"]
dataset["validation"] = datasets_train_validation["test"]

print("---------- Updatad dataset ----------")
print(dataset)
print("---------- Example output ----------")
print(dataset["train"][4])



---------- Updatad dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 53
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary'],
        num_rows: 6
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?'}


### Language Identification
Identify the language of each token in the tweet using the lingua library.

In [7]:
def identify(tweet):
  languages = [Language.ENGLISH, Language.YORUBA]
  detector = LanguageDetectorBuilder.from_languages(*languages).build()
  lang_list = []
  for word in tweet:
    lang = detector.detect_language_of(word)
    if (lang == None):
      lang_list.append(None)
    else:
      lang_list.append(lang.name)
  return lang_list

dataset = dataset.map(lambda x: {"Language": identify(x['Tweets'])})
print("---------- Updatad dataset ----------")
print(dataset)
print("---------- Example output ----------")
print(dataset["train"][4])



---------- Updatad dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 53
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language'],
        num_rows: 6
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'ENGLISH', 'ENGLISH', None, None, None, 'YORUBA', 'YORUBA', None, 'ENGLI

### Step 2: Code-switch detection
Detect the language switch in the tweet using regular expression

In [8]:
def detect(tweet):
  return re.findall(r'\b\w+\b', tweet)
  
dataset = dataset.map(lambda x: {"Code_switches": detect(x['Tweets'])})
print("---------- Updatad dataset ----------")
print(dataset)
print("---------- Example output ----------")
print(dataset["train"][4])



---------- Updatad dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 53
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches'],
        num_rows: 6
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'YORUBA', 'ENGLISH', 'ENGLISH',

### Step 3: Translation
Translate each tweet using google translate

In [9]:
def translate_tweet(tweet):
  return translator.translate(tweet, src='yo', dest='en').text

translator = Translator()
dataset = dataset.map(lambda x: {"Translated_tweet": translate_tweet(x['Tweets'])})
print("---------- Updatad dataset ----------")
print(dataset)
print("---------- Example output ----------")
print(dataset["train"][4])



Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

---------- Updatad dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 53
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet'],
        num_rows: 6
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLISH', 'YORUBA', 'ENGLISH', None, 'ENGLISH',

#### Evaluate the performance of the translator using BLEU (Bilingual Evaluation Understudy) metric

In [11]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def compute_bleu_score(predictions, references):
    smoothing = SmoothingFunction()
    # Tokenize the predictions and references
    predictions = [prediction.split() for prediction in predictions]
    references = [[reference.split()] for reference in references]

    # Compute the BLEU score
    bleu_score = corpus_bleu(references, predictions, smoothing_function=smoothing.method2)

    return bleu_score

# def compute_bleu_score(predictions, references):
#     # Tokenize the predictions and references
#     # predictions = [prediction.split() for prediction in predictions]
#     # references = [[reference.split()] for reference in references]

#     predictions = [predictions]
#     references = [[references]]

#     bleu = evaluate.load("bleu")
#     bleu_score = bleu.compute(predictions=predictions, references=references)

#     return bleu_score

**We calculated the bleu score for each tweet and compute the average.**

In [12]:
# dataset = dataset.map(lambda x: {"Bleu_score": compute_bleu_score([x['Translated_tweet']], [x['Eng_source']])})
dataset = dataset.map(lambda x: {"Bleu_score": compute_bleu_score([x['Translated_tweet']], [x['Eng_source']])})
print("---------- Updatad dataset ----------")
print(dataset)
print("---------- Example output ----------")
print(dataset["train"][4])

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

---------- Updatad dataset ----------
DatasetDict({
    train: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 53
    })
    test: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['Tweets', 'Eng_source', 'Summary', 'Language', 'Code_switches', 'Translated_tweet', 'Bleu_score'],
        num_rows: 6
    })
})
---------- Example output ----------
{'Tweets': 'E jowo, mo fe ra phone charger, abi o le help me ni?', 'Eng_source': 'Please, I want to buy a phone charger, or can you help me?', 'Summary': 'I need to buy a phone charger; can you please assist me?', 'Language': ['ENGLISH', None, 'YORUBA', 'ENGLISH', 'YORUBA', 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', 'ENGLISH', None, 'ENGLISH', None, None, 'ENGLISH', 'ENGLISH', 'ENGLI

In [13]:
from statistics import mean

# bleu = mean(dataset["train"]["Bleu_score"])
bleu = mean(dataset["train"]["Bleu_score"])
print(f"Bleu score: {bleu:.4f}")

Bleu score: 0.5010


### Step 4: Summarization
Fine-tune the BART model for summarization and evaluate its performance using the ROUGE metrics.

In [14]:
# define the variables
max_input = 512
max_target = 128
batch_size = 3
model_checkpoints = "facebook/bart-base"

In [15]:
# toenize the data
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [16]:
# preprocess the data
def preprocess_data(data_to_process):
  # get all the translated tweets
  inputs = [tweet for tweet in data_to_process['Translated_tweet']]
  # tokenize the translated tweets
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  # tokenize the summaries
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['Summary'], max_length=max_target, padding='max_length', truncation=True)
    
  #set labels
  model_inputs['labels'] = targets['input_ids']

  #return the tokenized data
  return model_inputs

In [17]:
tokenize_data = dataset.map(preprocess_data, batched = True)

Map:   0%|          | 0/53 [00:00<?, ? examples/s]



Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [18]:
# load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [19]:
collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt")

In [20]:
rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
    encoded_arg = tokenizer(arg)
    return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

# define function for computing the rouge score
def compute_rouge(eval_arg):
    preds, labels = eval_arg
    
      # Replace -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
      # Convert id tokens to text
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    text_preds = [(p if p.endswith(("!", "！", "?", "？", ".")) else p + ".") for p in text_preds]
    text_labels = [(l if l.endswith(("!", "！", "?", "？", ".")) else l + ".") for l in text_labels]
    sent_tokenizer_c = RegexpTokenizer(u'[^!！?？.]*[!！?？.]')
    text_preds = ["\n".join(np.char.strip(sent_tokenizer_c.tokenize(p))) for p in text_preds]
    text_labels = ["\n".join(np.char.strip(sent_tokenizer_c.tokenize(l))) for l in text_labels]
    
      # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# set parameters for training the model
args = Seq2SeqTrainingArguments(
    'code-switch-summ', 
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    # fp16=True ,
    seed = 42
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

# train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Translated_tweet, Code_switches, Bleu_score, Eng_source, Summary, Tweets, Language. If Translated_tweet, Code_switches, Bleu_score, Eng_source, Summary, Tweets, Language are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 53
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 39
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,No log,10.444443,0.491832,0.264381,0.461352,0.47655
1,No log,8.98771,0.491832,0.264381,0.461352,0.47655
2,No log,8.274302,0.491832,0.264381,0.461352,0.47655


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: Translated_tweet, Code_switches, Bleu_score, Eng_source, Summary, Tweets, Language. If Translated_tweet, Code_switches, Bleu_score, Eng_source, Summary, Tweets, Language are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6
  Batch size = 2
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num

TrainOutput(global_step=39, training_loss=10.715683374649439, metrics={'train_runtime': 998.0824, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.039, 'total_flos': 48169180200960.0, 'train_loss': 10.715683374649439, 'epoch': 2.96})

### Test fine-tuned model on Test data

In [29]:
#tokenize the conversation
model_inputs = tokenizer(dataset["test"]["Translated_tweet"][6],  max_length=max_input, padding='max_length', truncation=True)
#make prediction
raw_pred, _, _ = trainer.predict([model_inputs])
#decode the output
output = tokenizer.decode(raw_pred[0])
print("Original tweet: ", dataset["test"]["Tweets"][6])
print("Translated tweet: ", dataset["test"]["Translated_tweet"][6])
print("Generated summary: ", output)

***** Running Prediction *****
  Num examples = 1
  Batch size = 2
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Original tweet:  Ah, o ti o! I don forget say I suppose call my mama today o.
Translated tweet:  Ah, that's it! I don't forget to say I suppose call my mom today.
Generated summary:  </s><s>Ah, that's it! I don't forget to say I suppose call my mom</s>
