In [None]:
!pip install pytorch_lightning



# IMPORT

In [None]:
# Model
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AdamW
from pytorch_lightning import LightningModule, Trainer

# Dataset
import pandas as pd
import zipfile
import gdown
#from datasets import Dataset, DatasetDict

import os
import io
import traceback

import csv

import numpy as np
import matplotlib.pyplot as plt


**Reproducibility**

In [None]:
# Set the seed
seed = 46

# Set seed for torch, numpy and random libraries
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)

# Set the devide mode on GPU (if available CUDA for Nvidia and  MPS for Apple Silicon) or CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# DATASET

In [None]:
def download_csv(link_zipped_csv, gdrive_link, zipped_file):

    file_id = link_zipped_csv.split('/')[-2]  # Take the file_id (Ex. "https://drive.google.com/file/d/1BMj4BGXxIMzsd-GYSAEMpB7CF0XB87UT/view?usp=sharing" => file_id: 1BMj4BGXxIMzsd-GYSAEMpB7CF0XB87UT)
    download_link = gdrive_link + file_id # Create the path for gdown (Ex. https://drive.google.com/uc?id={YOUR_FILE_ID})

    try:
        if not os.path.exists(zipped_file):

            gdown.download(
                download_link,
                zipped_file,
                quiet=False
                )
        else:
            print("CSV file already downloaded!")


    except Exception as error:
        print("An error occured:", error)
        traceback.print_exc()

In [None]:
def unzip_csv(csv_zip, csv_dir):

    try:
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir) # Create the csv_dir directory where we extract files (if not exists)

        if len(os.listdir(csv_dir)) == 0:
            with zipfile.ZipFile(csv_zip, 'r') as zip:
                filelist = zip.namelist() # list of the file inside zip : ['csv /multilingual_nli_test_df.csv', 'csv /tweet_emotions.csv', ...]

                # Iterate over all file in the zip file to extract them
                for filename in filelist:
                    zip.extract(filename, csv_dir) # Extract the file inside the csv_dir

        else:
            print("CSV file already unzipped!")


    except Exception as error:
        print("An error occured:", error)
        traceback.print_exc()

# Download CSV

We have downloaded different csv files containing text from:

- Kaggle: https://www.kaggle.com/datasets?search=text
- Hugging Face: https://huggingface.co/datasets?modality=modality:text&sort=trending

Then we have zipped all csv and loading on Google Drive the csv.zip file.

In this part there is the downloading of the entire zipped dataset to preprocess (eliminate labels, taking only contents).

In particular:

1. muhammadravi251001/multilingual-nli-dataset: https://huggingface.co/datasets/muhammadravi251001/multilingual-nli-dataset/tree/main

?? 2. Multi-lingual HateSpeech Dataset: https://www.kaggle.com/datasets/wajidhassanmoosa/multilingual-hatespeech-dataset?select=Dataset

3. Multilingual Sentiment Datasets: https://github.com/tyqiangz/multilingual-sentiment-datasets/tree/main

4. molamin/Kinyarwanda_Engligh_Multilingual_ASR: https://huggingface.co/datasets/molamin/Kinyarwanda_Engligh_Multilingual_ASR/tree/main

5. odunola/multilingual-sentiments: https://huggingface.co/datasets/odunola/multilingual-sentiments/tree/main

To simplify the process, we have renamed as "text" each column that we want to take from csv file, discarding the others!


In [None]:
link_zipped_csv = 'https://drive.google.com/file/d/1WrXbLSkZzS-QGDjWk2_6dLOBIo-4KMBd/view?usp=drive_link'
gdrive_link = 'https://drive.google.com/uc?id='
csv_dir = './csv'
zipped_file = './csv.zip'

download_csv(
    link_zipped_csv,
    gdrive_link,
    zipped_file
)

unzip_csv(
    zipped_file,
    csv_dir,
)


CSV file already downloaded!
CSV file already unzipped!


In [None]:
def load_dataset_dict(csv_path):

    # Initialization of the dictionary of input data (text) {'filename': ['hi ...','I am ...']}
    text_dict = {}

    filelist = os.listdir(csv_path) # List of all csv file ['filename_1.csv', ...]

    # Iterate over all csv file in the directory
    for filename in filelist:
        # Initialization of the list containing all the texts in a csv file

        text_dict[filename] = []
        # Open the i-th csv file
        with open(csv_path + '/' + filename) as csv_file:

            # Read row by row. Each row is a dictionary with:
            # 1. Key: Name of the column
            # 2. Value: Content of the row
            reader = csv.DictReader(csv_file)

            # Iterate over each row of the csv
            # Ex. ['audio_filepath': 'test_data/37506.wav', 'duration': '15.9279375', 'text': 'maze mvugisha ururimi nti everyone should have..]
            for row in reader:
                # Append to the list each row of the column text
                text_dict[filename].append(row['text'])

    return text_dict

In [None]:
def texty(text_dict, min=10, max=500):

    # Create a list of the type ["my name is", "Hi, how are you?", ....]
    text_list = []

    text_dict_values = []
    # Put in a single list all the values (rows of all csv files)
    for value in text_dict.values():
        text_dict_values += value

    # Number of total row in all csv files
    total_row = len(text_dict_values)

    #
    sum_ = 0
    random_length_list = []

    while(True):
        random_length = np.random.randint(min,max)
        sum_ += random_length
        random_length_list.append(random_length)
        if sum_ >= total_row - max:
            random_length_list.append(total_row - sum_)
            break

    start = 0
    for length in random_length_list:
        text_list.append('\n'.join(text_dict_values[start:start+length+1]))
        start += length

    return text_list

In [None]:
def zippy(text_list):

  # Initialization of the dictionary of labels (zip file) {'filename': ['101 ...','010 ...']}
  labels_list = []

  for text in text_list:
    zip_buffer = io.BytesIO()

    with zipfile.ZipFile(file=zip_buffer, mode='w', compression=zipfile.ZIP_LZMA) as byte:
      byte.writestr("file.txt", text)

    labels_list.append(zip_buffer.getvalue().hex())

  return labels_list

In [None]:
def split(listToSplit, splitPercentages):
  length = len(listToSplit)
  splittedList = []
  lastIndex = 0
  for percentageIndex in range(len(splitPercentages)):
    print(lastIndex)
    splittedList.append(listToSplit[lastIndex:lastIndex+int(splitPercentages[percentageIndex]*length)])
    lastIndex += int(splitPercentages[percentageIndex]*length)

  return splittedList



In [None]:
class ZipDataset(Dataset):

    def __init__(self, data, labels, model_name, to_tokenize=True):
        self.data = data
        self.labels = labels
        self.to_tokenize = to_tokenize

        if self.to_tokenize:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.tokenize(data, labels)

    def __getitem__(self, index):
        if self.to_tokenize:
            return self.tokenized_data[index], self.tokenized_labels[index]
        else:
            return self.data[index]['input_ids'], self.data[index]['attention_mask'], self.labels[index]['input_ids']

    def __len__(self):
        return len(self.data)

    def tokenize(self, text_list, labels_list):
        self.tokenized_data = []
        self.tokenized_labels = []
        for text, label in zip(text_list,labels_list):

            tokenized_text = self.tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.tokenized_data.append(tokenized_text)


            tokenized_label = self.tokenizer(
                label,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.tokenized_labels.append(tokenized_label)

In [None]:
text_dict = load_dataset_dict("./csv")

text_list = texty(text_dict=text_dict)

splitPerc = [0.7,0.2,0.1]


#print(text_list[3])
#print(len(text_list[5]))

labels_list = zippy(text_list)
print(labels_list[4])

504b03043f0002000e008a737459b874a0e583590000becf00000800000066696c652e747874090405005d00008000002a194a86237d1a9223db3fcef2c9053a79b75cc880c01affaa998d7901ef4cf87eedc9482f2226f15e4b5014a9977e1bfc11e40b2445be75b3424aeaf0243e305859f4b8d27fb171d2457fbfcbedc61721c14b4434c92216eac43f7e3e927a4afae289046d2b8b8670d44dc25c5228afd9510da0eed5d19c77bb83f8ca5647b72da59e49655ab608718683f4e6ca1102875f0c40ca5588ed12cd5712aad627a323f984a51663c903e12ebea6358a2bcd3a7b15c75ad537d272dda59e4ab2265179f1055dc3984c61d80d4a9639ff23ce032832647f64311d4b305eaca1c952b443b301feac5010be3ca2823aa4f800e19ff784d4af6cc3d2f3ac5d7d8b06974f73b20ab5f77e896be44b62c1e8d475342820f15d353c6ae261694a3cfe5003ff0aa7ee63aeecf6eada582ab97be1c9146434721b3f2e780fc81ef33d6da0e08d9448da44da2f8ea116f465bc94b5ad4dec2ba0a14b2c8e171d8f73a2a6ce80c67a3585173ea4850db89bbe6d7c7a949ebec5f50fd592102d80918bb933adb27524beafa78f8102dcd6d33202ad91fc88c5cba9929ec35ec4ee269bc57e2ab3153f4c275a652401f3c8ba6090a4ec4787bce694676bd51b3aa762635ec0250378d6b26a41

In [None]:
splitted_text = split(text_list, splitPerc)

0
144
185


In [None]:
len(splitted_text[1])

41

In [None]:
len(text_list)

206

In [None]:
splitted_labels = split(labels_list, splitPerc)

0
144
185


# MODEL

In [None]:
model_name = "bert-base-uncased"
dataset_train = ZipDataset(text_list[0], labels_list[0], model_name, to_tokenize=True)
dataset_test = ZipDataset(text_list[1], labels_list[1], model_name, to_tokenize=True)
dataset_val = ZipDataset(text_list[2], labels_list[2], model_name, to_tokenize=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
data, label = dataset_train[0]

print(data)
print(label)

{'input_ids': tensor([[ 101, 1047,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

In [None]:
class FineTuneModelSeq2seq(LightningModule):
    def __init__(self, model_name, learning_rate=2e-5):
        super(FineTuneModelSeq2seq, self).__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
class FineTuneModelCausal(LightningModule):
    def __init__(self, model_name, learning_rate=2e-5):
        super(FineTuneModelCausal, self).__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        print(batch)
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
dataloader_train = DataLoader(dataset_train)
dataloader_test = DataLoader(dataset_test)
dataloader_val = DataLoader(dataset_val)


In [None]:
model = FineTuneModelCausal(model_name)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [None]:
trainer = Trainer()

trainer.fit(model, dataloader_train, dataloader_val)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type            | Params | Mode
-------------------------------------------------
0 | model | BertLMHeadModel | 109 M  | eval
-------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
438.057   Total estimated model params size (MB)
0         Modules in train mode
233       Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

[{'input_ids': tensor([[[ 101, 1047,  102,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    

ValueError: not enough values to unpack (expected 3, got 2)