In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
from tqdm.auto import tqdm
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score
import csv
import os
import matplotlib.pyplot as plt

In [16]:
# Target groups the classifier distinguishes; a group's position in the list is its class id.
OUR_TARGET = [
    "women", "jews", "asian", "black", "lgbtq", "latino",
    "muslim", "indigenous", "arab", "disabilities", "others",
]
# Group name -> class id, plus the reverse lookup used to turn a predicted id back into a name.
MAPPING = {name: idx for idx, name in enumerate(OUR_TARGET)}
INV_MAPPING = dict(zip(MAPPING.values(), MAPPING.keys()))

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Working directory with a trailing slash, so file paths can be built by concatenation.
DIR = f"{os.getcwd()}/"
print(DIR)

/content/


In [3]:
class HATEDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their class labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example values, indexed by example.
        # labels: one label per example, indexed the same way.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with its label under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example


def model_prediction(model, tokenizer, prompt=None, device = DEVICE):
    """Classify a single prompt, print the result and return the predicted class id.

    Args:
        model: sequence-classification model whose output exposes `.logits`.
        tokenizer: tokenizer called on the prompt with `return_tensors="pt"`.
        prompt: text to classify; when empty or None the user is asked for one
            via `input()`.
        device: device the model and the encoded prompt are moved to.

    Returns:
        The predicted class id (an int, key of INV_MAPPING).

    Raises:
        KeyError: if the predicted id has no entry in INV_MAPPING.
    """
    if not prompt:
        prompt = input("Prompt? ")

    # Model and inputs must live on the same device for the forward pass.
    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)

    # Run in eval mode so dropout does not make the prediction random, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # A single prompt yields one row of logits, so the flat argmax is the class id.
    predicted_class_id = logits.argmax().item()
    print("Prompt: ", prompt)
    print(" - Predicted class id: ", predicted_class_id)
    print(" - Predicted category: ", INV_MAPPING[predicted_class_id])

    return predicted_class_id


def read_target_split(file):
    """Load a CSV and return its `text` column and its `target` column as two lists.

    Target values that are keys of MAPPING are replaced by their mapped value;
    any other value is returned as it was read.
    """
    frame = pd.read_csv(file)
    label_column = frame["target"].replace(MAPPING)
    return frame["text"].to_list(), label_column.to_list()


In [8]:
def prepare_data(file_train, file_test, test_size=1000, random_state=None):
    """Merge the per-target CSVs under `DIR + "Data"` into one training CSV and sample a test CSV.

    Each input file is expected to be named `<prefix>_<target>.csv` and to hold
    one text per row with no header. The target is taken from the file name
    ('other' is treated as 'others') and stored as its MAPPING id. Files that
    are not CSVs are ignored; files with an unusable name, an unknown target or
    unreadable content are skipped with a printed message.

    Args:
        file_train: path of the combined CSV to write (columns: target, text).
        file_test: path of the sampled CSV to write (same columns).
        test_size: number of rows to sample for the test file; capped at the
            number of rows available.
        random_state: seed forwarded to `DataFrame.sample` for a reproducible
            test file; None keeps the sampling random.

    Raises:
        FileNotFoundError: if the `Data` directory does not exist. This is
            raised before `file_train` is created or truncated.
    """
    # List the directory first (sorted, so the row order is deterministic).
    with os.scandir(DIR + "Data") as entries:
        candidates = sorted(entries, key=lambda entry: entry.name)

    frames = []
    for entry in candidates:
        # print("Processing file: ", entry.name)
        if os.path.splitext(entry.name)[1] != '.csv':
            continue

        name_parts = entry.name.split("_")
        if len(name_parts) < 2:
            print(f"Skipping {entry.name}: expected a '<prefix>_<target>.csv' file name")
            continue

        target = name_parts[1].split(".")[0]
        if target == 'other':
            target = 'others'
        if target not in MAPPING:
            print(f"Skipping {entry.name}: unknown target '{target}'")
            continue

        try:
            df = pd.read_csv(entry.path, header=None, names=['text'])
        except (OSError, ValueError) as err:
            # pandas parser, empty-file and decoding errors are all ValueError subclasses.
            print(f"Skipping {entry.name}: could not be read ({err})")
            continue

        df['target'] = MAPPING[target]
        frames.append(df[['target', 'text']])

    if frames:
        combined = pd.concat(frames, ignore_index=True)
    else:
        # No usable input: still produce files that contain just the header.
        combined = pd.DataFrame(columns=['target', 'text'])
    combined.to_csv(file_train, index=False)

    # Create test file
    # NOTE(review): the test rows are drawn from the rows written to file_train,
    # so they are not held out from training.
    sampled = combined.sample(n=min(test_size, len(combined)), random_state=random_state)
    sampled.to_csv(file_test, columns=['target', 'text'], index=False)


def f1(preds, target):
    """Return the macro-averaged F1 score of `preds` measured against `target`.

    NOTE(review): this notebook registers the function under the metric name
    'F1-weighted', but the averaging used here is 'macro'; confirm which is intended.
    """
    return f1_score(y_true=target, y_pred=preds, average='macro')

def acc(preds, target):
    """Return the accuracy score of `preds` measured against `target`."""
    return accuracy_score(y_true=target, y_pred=preds)


In [17]:
# Combined training CSV and the test CSV sampled from it, both in the working directory.
FILE_TRAIN = f"{DIR}full_target_id.csv"
FILE_TEST = f"{DIR}full_target_id_test.csv"

# Build both files from the per-target CSVs found under DIR + "Data".
prepare_data(file_train=FILE_TRAIN, file_test=FILE_TEST)

FileNotFoundError: [Errno 2] No such file or directory: '/content/Data'

In [11]:
def model_training(model, train_dataset, test_dataset, epochs, optimization, criterion, metrics, device = DEVICE, batch_size = 16):
    """Train `model` and evaluate it on `test_dataset` after every epoch.

    Per-epoch results are printed and written to `DIR + "model/log.csv"` with
    the columns epoch, type, metric, value. The file is recreated at the start
    of the call and gets one group of rows per epoch.

    Args:
        model: classifier called as `model(input_ids, attention_mask=...)`
            whose output exposes `.logits`.
        train_dataset: dataset yielding dicts with 'input_ids',
            'attention_mask' and 'labels'; iterated in shuffled batches.
        test_dataset: dataset of the same form used for the evaluation pass.
        epochs: number of train + evaluate rounds.
        optimization: optimizer over the model's parameters.
        criterion: loss called as `criterion(logits, labels)`.
        metrics: dict of name -> `fn(preds, labels)`. It must contain the keys
            'ACC' and 'F1-weighted', which are the ones recorded.
        device: device the model and every batch are moved to.
        batch_size: batch size of both data loaders.

    Returns:
        Six lists with one entry per epoch: train loss, train accuracy,
        train F1, eval loss, eval accuracy, eval F1. Losses are floats,
        metric entries are 0-d tensors.

    Notes:
        Metrics are computed per batch and averaged over batches, so they are
        not identical to a metric computed once over the whole epoch.
        The model is left in eval mode when the function returns.
    """
    # device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    train_loss = list()
    train_accuracy = list()
    train_f1 = list()
    eval_loss = list()
    eval_accuracy = list()
    eval_f1 = list()

    # Write the header once; each epoch appends below so earlier epochs are kept.
    file_path = DIR + "model/log.csv"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w", newline="") as file:
        csv.writer(file).writerow(["epoch", "type", "metric", "value"])

    for epoch in range(epochs):

        print(f"Epoch {epoch + 1} / {epochs}")

        ## Do the training
        epoch_loss, epoch_metrics = _run_epoch(model, train_loader, criterion, metrics, device, optimization)

        train_loss.append(epoch_loss)
        train_accuracy.append(epoch_metrics['ACC'])
        train_f1.append(epoch_metrics['F1-weighted'])

        print('train Loss: {:.4f}, '.format(epoch_loss),
          ', '.join(['{}: {:.4f}'.format(k, epoch_metrics[k]) for k in epoch_metrics.keys()]), "\n")

        # Evaluate the model on the held-out dataset (no optimizer -> eval mode, no gradients)
        epoch_loss, epoch_metrics = _run_epoch(model, eval_loader, criterion, metrics, device)

        eval_loss.append(epoch_loss)
        eval_accuracy.append(epoch_metrics['ACC'])
        eval_f1.append(epoch_metrics['F1-weighted'])

        print('eval Loss: {:.4f}, '.format(epoch_loss),
          ', '.join(['{}: {:.4f}'.format(k, epoch_metrics[k]) for k in epoch_metrics.keys()]), "\n")

        # Log the values of the epoch that just finished.
        rows = [
            [epoch, "train", "loss", train_loss[-1]],
            [epoch, "train", "acc", train_accuracy[-1].item()],
            [epoch, "train", "f1", train_f1[-1].item()],
            [epoch, "eval", "loss", eval_loss[-1]],
            [epoch, "eval", "acc", eval_accuracy[-1].item()],
            [epoch, "eval", "f1", eval_f1[-1].item()],
        ]
        with open(file_path, "a", newline="") as file:
            csv.writer(file).writerows(rows)

    return train_loss, train_accuracy, train_f1, eval_loss, eval_accuracy, eval_f1


def _run_epoch(model, loader, criterion, metrics, device, optimization=None):
    """Run one pass over `loader`; return (mean batch loss, dict of mean batch metrics).

    With an optimizer the model is put in train mode and updated after every
    batch. Without one it is put in eval mode and run with gradients disabled.
    """
    training = optimization is not None
    model.train(training)

    total_loss = 0.0
    # 0-d tensors, so the recorded metric values support `.item()`.
    totals = {name: torch.zeros(()) for name in metrics}

    for batch in tqdm(loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.set_grad_enabled(training):
            # The loss comes from `criterion`, so the labels are not passed to the model.
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)

        if training:
            optimization.zero_grad()
            loss.backward()
            optimization.step()

        pred_cpu = logits.detach().argmax(dim=1).cpu()
        labels_cpu = labels.cpu()
        for name, metric in metrics.items():
            totals[name] += metric(pred_cpu, labels_cpu)
        total_loss += loss.item()

    # Guard against an empty loader, which would otherwise divide by zero.
    n_batches = max(len(loader), 1)
    for name in totals:
        totals[name] /= n_batches

    return total_loss / n_batches, totals

In [13]:
train_texts, train_labels = read_target_split(FILE_TRAIN)
# The test split has its own file; reading FILE_TRAIN here made the "test" set the full training data.
test_texts, test_labels = read_target_split(FILE_TEST)
results_models_weights_dir = DIR + 'model/'
EPOCHS = 2

# Split into train and validation sets (fixed seed so the split is the same on every run)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.10, random_state=42
)

# Define tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create instances of HATEDataset (gives all the attributes)
train_dataset = HATEDataset(train_encodings, train_labels)
val_dataset = HATEDataset(val_encodings, val_labels)
test_dataset = HATEDataset(test_encodings, test_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 / 1


  0%|          | 0/5856 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Model training
criterion = nn.CrossEntropyLoss()
model_ = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(MAPPING))
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# set to that class's defaults (1e-6, no decay) to keep the previous optimiser settings.
optim = torch.optim.AdamW(model_.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# model_training records exactly these two keys.
# NOTE(review): `f1` computes a macro average although the key says "weighted".
metrics = {'ACC': acc, 'F1-weighted': f1}

model_.to(DEVICE)

# Create the output directory up front so a finished training run cannot fail at save time.
os.makedirs(results_models_weights_dir, exist_ok=True)

train_loss, train_accuray, train_f1, eval_loss, eval_accuray, eval_f1 = model_training(model_, train_dataset, val_dataset, EPOCHS, optim, criterion, metrics)

torch.save(model_.state_dict(), results_models_weights_dir + 'weights_sentiment_analysis.pth')