In [60]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.11.8 pytorch_lightning-2.4.0 torchmetrics-1.6.0


# IMPORT

In [61]:
# Model
import torch
import torch.nn as nn
from torch.utils.data import Dataset, Dataloader
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AdamW
from pytorch_lightning import LightningModule, Trainer

# Dataset
import pandas as pd
import zipfile
import gdown
#from datasets import Dataset, DatasetDict

import os
import io
import traceback

import csv

import numpy as np
import matplotlib.pyplot as plt


**Reproducibility**

In [30]:
# Set the seed
seed = 46

# Set seed for torch, numpy and random libraries
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)

# Set the devide mode on GPU (if available CUDA for Nvidia and  MPS for Apple Silicon) or CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# DATASET

In [31]:
def download_csv(link_zipped_csv, gdrive_link, zipped_file):

    file_id = link_zipped_csv.split('/')[-2]  # Take the file_id (Ex. "https://drive.google.com/file/d/1BMj4BGXxIMzsd-GYSAEMpB7CF0XB87UT/view?usp=sharing" => file_id: 1BMj4BGXxIMzsd-GYSAEMpB7CF0XB87UT)
    download_link = gdrive_link + file_id # Create the path for gdown (Ex. https://drive.google.com/uc?id={YOUR_FILE_ID})

    try:
        if not os.path.exists(zipped_file):

            gdown.download(
                download_link,
                zipped_file,
                quiet=False
                )
        else:
            print("CSV file already downloaded!")


    except Exception as error:
        print("An error occured:", error)
        traceback.print_exc()

In [32]:
def unzip_csv(csv_zip, csv_dir):

    try:
        if not os.path.exists(csv_dir):
            os.mkdir(csv_dir) # Create the csv_dir directory where we extract files (if not exists)

        if len(os.listdir(csv_dir)) == 0:
            with zipfile.ZipFile(csv_zip, 'r') as zip:
                filelist = zip.namelist() # list of the file inside zip : ['csv /multilingual_nli_test_df.csv', 'csv /tweet_emotions.csv', ...]

                # Iterate over all file in the zip file to extract them
                for filename in filelist:
                    zip.extract(filename, csv_dir) # Extract the file inside the csv_dir

        else:
            print("CSV file already unzipped!")


    except Exception as error:
        print("An error occured:", error)
        traceback.print_exc()

# Download CSV

We have downloaded different csv files containing text from:

- Kaggle: https://www.kaggle.com/datasets?search=text
- Hugging Face: https://huggingface.co/datasets?modality=modality:text&sort=trending

Then we have zipped all csv and loading on Google Drive the csv.zip file.

In this part there is the downloading of the entire zipped dataset to preprocess (eliminate labels, taking only contents).

In particular:

1. muhammadravi251001/multilingual-nli-dataset: https://huggingface.co/datasets/muhammadravi251001/multilingual-nli-dataset/tree/main

?? 2. Multi-lingual HateSpeech Dataset: https://www.kaggle.com/datasets/wajidhassanmoosa/multilingual-hatespeech-dataset?select=Dataset

3. Multilingual Sentiment Datasets: https://github.com/tyqiangz/multilingual-sentiment-datasets/tree/main

4. molamin/Kinyarwanda_Engligh_Multilingual_ASR: https://huggingface.co/datasets/molamin/Kinyarwanda_Engligh_Multilingual_ASR/tree/main

5. odunola/multilingual-sentiments: https://huggingface.co/datasets/odunola/multilingual-sentiments/tree/main

To simplify the process, we have renamed as "text" each column that we want to take from csv file, discarding the others!


In [33]:
link_zipped_csv = 'https://drive.google.com/file/d/1WrXbLSkZzS-QGDjWk2_6dLOBIo-4KMBd/view?usp=drive_link'
gdrive_link = 'https://drive.google.com/uc?id='
csv_dir = './csv'
zipped_file = './csv.zip'

download_csv(
    link_zipped_csv,
    gdrive_link,
    zipped_file
)

unzip_csv(
    zipped_file,
    csv_dir,
)


CSV file already downloaded!
CSV file already unzipped!


In [34]:
def load_dataset_dict(csv_path):

    # Initialization of the dictionary of input data (text) {'filename': ['hi ...','I am ...']}
    text_dict = {}

    filelist = os.listdir(csv_path) # List of all csv file ['filename_1.csv', ...]

    # Iterate over all csv file in the directory
    for filename in filelist:
        # Initialization of the list containing all the texts in a csv file

        text_dict[filename] = []
        # Open the i-th csv file
        with open(csv_path + '/' + filename) as csv_file:

            # Read row by row. Each row is a dictionary with:
            # 1. Key: Name of the column
            # 2. Value: Content of the row
            reader = csv.DictReader(csv_file)

            # Iterate over each row of the csv
            # Ex. ['audio_filepath': 'test_data/37506.wav', 'duration': '15.9279375', 'text': 'maze mvugisha ururimi nti everyone should have..]
            for row in reader:
                # Append to the list each row of the column text
                text_dict[filename].append(row['text'])

    return text_dict

In [35]:
def texty(text_dict, min=10, max=500):

    # Create a list of the type ["my name is", "Hi, how are you?", ....]
    text_list = []

    text_dict_values = []
    # Put in a single list all the values (rows of all csv files)
    for value in text_dict.values():
        text_dict_values += value

    # Number of total row in all csv files
    total_row = len(text_dict_values)

    #
    sum_ = 0
    random_length_list = []

    while(True):
        random_length = np.random.randint(min,max)
        sum_ += random_length
        random_length_list.append(random_length)
        if sum_ >= total_row - max:
            random_length_list.append(total_row - sum_)
            break

    start = 0
    for length in random_length_list:
        text_list.append('\n'.join(text_dict_values[start:start+length+1]))
        start += length

    return text_list

In [36]:
def zippy(text_list):

  # Initialization of the dictionary of labels (zip file) {'filename': ['101 ...','010 ...']}
  labels_list = []

  for text in text_list:
    zip_buffer = io.BytesIO()

    with zipfile.ZipFile(file=zip_buffer, mode='w', compression=zipfile.ZIP_LZMA) as byte:
      byte.writestr("file.txt", text)

    labels_list.append(zip_buffer.getvalue().hex())

  return labels_list

In [54]:
def split(listToSplit, splitPercentages):
  length = len(listToSplit)
  splittedList = []
  lastIndex = 0
  for percentageIndex in range(len(splitPercentages)):
    print(lastIndex)
    splittedList.append(listToSplit[lastIndex:lastIndex+int(splitPercentages[percentageIndex]*length)])
    lastIndex += int(splitPercentages[percentageIndex]*length)

  return splittedList



In [38]:
class ZipDataset(Dataset):

    def __init__(self, data, labels, model_name, to_tokenize=True):
        self.data = data
        self.labels = labels
        self.to_tokenize = to_tokenize

        if self.to_tokenize:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.tokenize(data, labels)

    def __getitem__(self, index):
        if self.to_tokenize:
            return self.tokenized_data[index], self.tokenized_labels[index]
        else:
            return self.data[index], self.labels[index]

    def __len__(self):
        return len(self.data)

    def tokenize(self, text_list, labels_list):
        self.tokenized_data = []
        self.tokenized_labels = []
        for text, label in zip(text_list,labels_list):

            tokenized_text = self.tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.tokenized_data.append(tokenized_text)


            tokenized_label = self.tokenizer(
                label,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.tokenized_labels.append(tokenized_label)

In [44]:
text_dict = load_dataset_dict("./csv")

text_list = texty(text_dict=text_dict)

splitPerc = [0.7,0.2,0.1]


#print(text_list[3])
#print(len(text_list[5]))

labels_list = zippy(text_list)
print(labels_list[4])

504b03043f0002000e00305872594e0b05f50d3a0000ff8100000800000066696c652e747874090405005d00008000003a9e4aee6e8c97c822794e51fdfe61e867d558b6fe3cfa51ada6dfb3e9889882b8676cdf4da9f57aee81ec50338a6bd8f3badba09d8d12311c17c434e5579e887b41b4bacd47f7d6326b0d60693cb4074db84ce7792c1069da27d38c44a064127c471fedcdaca8492f87ede0162dd45a863d42aee26cc58f76fbb5e6ab4d1ad2a60d2b9d3fabb5e273876d4f7c26f5ee3f160f00c63eb54d66583a4cb69de7ddec5f59c15a323b356c738428135b018ac88d6ef4db8e5ec8a8434e6e374aa2eb23afabacdd4d743103925e6e560560385cd02ddfdb729b71b54485d1f7c9bc50bf382a63d1cfe49195770d694cf2fcd4e65c98da325eb4f6ed87fc55f37dadc12f56624fd0af93033447116224fdfc76dbc32f66dd998ef7c9bc87dab392910fd69f172772a98924659431cbca7ee3b526b3131d309889518cd4940190f239a057bfb9a66f7e5204b763bfd394261c9b0fa098a04e755da042a12c74731fe9f263ad6d973c4a11a49baa71ad1e77c1330ed9873858f448c7ebf0642f48b7e946eff4165f34073204e82c047305ff2829f53fa1ae8594c419f1a0e04c6a9ab9fb1bdc605c3d2bc1ba87eede1f04f781e3ad3d236937b7c6d9fc9bfb6ce44d76d77a85188f

In [55]:
splitted = split(text_list, splitPerc)

0
136
175


In [58]:
len(splitted[1])

39

In [57]:
len(text_list)

195

# MODEL

In [None]:
model_name = "bert-base-uncased"
dataset = ZipDataset(text_list, labels_list, model_name, to_tokenize=True)

In [62]:
data, label = dataset[0]

print(data)
print(label)

{'input_ids': tensor([[  101,  2026,  8481,  2097,  2468,  2307,  2273,  2002,  2209,  2005,
         22820,  1998, 28352, 24238,  1999,  1996,  2827,  4043,  2223,  2971,
          2016,  2001,  2059,  2805,  9162,  1999,  6825,  2012,  1996,  2118,
          2021,  2017,  2071,  3288,  2068,  2045, 14386,  2001,  2141,  1999,
          1996, 17727,  8865,  2225,  2212,  1997, 23624,  5311, 12464,  5400,
         28120,  3366, 24471,  2072,  8529,  4904,  4648,  4890,  2072, 14163,
          4095,  3148,  9152,  3676, 23961, 20411,  3736,  7298,  2020,  3603,
          1999,  1996,  6007,  2030,  2132,  3351,  2906,  1997,  5895,  1997,
          2162,  1996,  4825,  2001,  3243,  6450,  8670,  3683,  4168,  6460,
         12849,  7352, 17915,  1057,  8569,  5831,  2863,  1038,  2860,  1005,
         19557, 16078,  5162, 16021,  6979,  3775, 20008, 21335,  1057, 10139,
         11735,  2050, 14163,  2011, 23692,  1045,  2572,  2183,  2000,  4021,
          4826,  9152, 16313, 18410,  

In [None]:
class FineTuneModelSeq2seq(LightningModule):
    def __init__(self, model_name, num_classes, learning_rate=2e-5):
        super(FineTuneModelSeq2seq, self).__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, num_labels=num_classes)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
class FineTuneModelCausal(LightningModule):
    def __init__(self, model_name, num_classes, learning_rate=2e-5):
        super(FineTuneModelCausal, self).__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name, num_labels=num_classes)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer