Extensive use of : https://huggingface.co/transformers/v3.2.0/custom_datasets.html

In [1]:
import pandas as pd

In [2]:
data_no_tone = pd.read_csv("dataset/no_tone_output.csv")

In [49]:
mapping = {'middle_east': 0,'latino': 1,'chinese': 2,'muslim': 3,'bisexual': 4,'mexican': 5,'lgbtq': 6,'physical_disability': 7,'mental_disability': 8,'asian': 9,'women': 10,'jewish': 11,'immigrant': 12,'native_american': 13,'black': 14, 'trans':15}
mapping_inv = {k:v for (k,v) in enumerate(mapping)}

In [20]:
from pathlib import Path

def read_target_split(file):
    data = pd.read_csv(file)
    texts = data.text.to_list()
    labels = data.target.replace(mapping).to_list()

    return texts, labels

train_texts, train_labels = read_target_split('dataset/no_tone_output.csv')
# need to do something cleaner for the test dataset
d = pd.read_csv('dataset/no_tone_output.csv').sample(20).to_csv('dataset/no_tone_output_test.csv')
test_texts, test_labels = read_target_split('dataset/no_tone_output_test.csv')

  labels = data.target.replace(mapping).to_list()
  labels = data.target.replace(mapping).to_list()


In [18]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.05)

In [19]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [22]:
import torch

class HATEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HATEDataset(train_encodings, train_labels)
val_dataset = HATEDataset(val_encodings, val_labels)
test_dataset = HATEDataset(test_encodings, test_labels)

In [107]:
for i in range(2):
    print(i)

0
1


In [108]:
# Copied to trainer.py - kept for double check

from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
from tqdm.auto import tqdm
import torch.nn as nn


def model_training(model, train_dataset, epochs, optimization, criterion, metrics):
    
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    for epoch in range(epochs):

        epoch_loss = 0
        epoch_metrics = dict(zip(metrics.keys(), torch.zeros(len(metrics))))

        print("Epoch", epoch + 1)
        for batch in tqdm(train_loader):
            optimization.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_(input_ids, attention_mask=attention_mask, labels=labels)

            with torch.no_grad():
                _, pred = torch.max(outputs.logits, 1)

            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimization.step()
            with torch.no_grad():
                for k in epoch_metrics.keys():
                    epoch_metrics[k] += metrics[k](pred, labels)
            epoch_loss += loss.item()
        epoch_loss /= len(train_loader)

        for k in epoch_metrics.keys():
          epoch_metrics[k] /= len(train_loader)

        print('train Loss: {:.4f}, '.format(epoch_loss),
          ', '.join(['{}: {:.4f}'.format(k, epoch_metrics[k]) for k in epoch_metrics.keys()]))

    return epoch_loss,  epoch_metrics

In [103]:
from sklearn.metrics import f1_score, accuracy_score
def f1(preds, target):
    return f1_score(target, preds, average='macro')

def acc(preds, target):
    return accuracy_score(target, preds)

In [109]:
criterion = nn.CrossEntropyLoss()
model_ = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(mapping))
optim = AdamW(model_.parameters(), lr=5e-5)

metrics = {'ACC': acc, 'F1-weighted': f1}

loss, metric = model_training(model_,train_dataset,2,optim,criterion,metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


100%|██████████| 42/42 [01:38<00:00,  2.35s/it]


train Loss: 2.1826,  ACC: 0.3879, F1-weighted: 0.2371
Epoch 2


100%|██████████| 42/42 [01:39<00:00,  2.37s/it]

train Loss: 0.8928,  ACC: 0.8795, F1-weighted: 0.7976





In [85]:
## First try, can be removed

# from torch.utils.data import DataLoader
# from transformers import DistilBertForSequenceClassification, AdamW
# from tqdm.auto import tqdm

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# num_classes = 16
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)
# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# optim = AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for batch in tqdm(train_loader):
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

# model.eval()

In [61]:
torch.save(model.state_dict(), "model/weights")

In [80]:
text = "There are too many of your community in this country"
inputs = tokenizer(text, return_tensors="pt",  padding = True, truncation = True)
inputs = inputs.to(device)

with torch.no_grad():
    logits = model(**inputs.to(device)).logits

predicted_class_id = logits.argmax().item()
print("Prompt: ", text)
print(" - Predicted class id: ", predicted_class_id)
print(" - Predicted category: ", mapping_inv[predicted_class_id])

Prompt:  There are too many of your community in this country
 - Predicted class id:  12
 - Predicted category:  immigrant


In [63]:
model.load_state_dict(torch.load("model/weights"))

<All keys matched successfully>