In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm.notebook import tqdm
from torch.optim import AdamW

# Further Pretraining

Further pretrain the model on the unlabelled corpora gab_1M_unlabelled and reddit_1M_unlabelled.

In [2]:
# import emoji
# from tqdm import tqdm
# df_gab = pd.read_csv('data/gab_1M_unlabelled.csv')
# df_reddit = pd.read_csv('data/reddit_1M_unlabelled.csv')
# further_corpus = [emoji.demojize(text, delimiters=('', ''), language='en').replace('_', ' ') for text in tqdm(df_gab['text'].tolist() + df_reddit['text'].tolist())]

In [3]:
# with open('data/further_corpus.txt', 'w') as file:
#     # Write each element of the list to the file
#     for item in further_corpus:
#         file.write(f"{item}\n")  # Add a newline after each item

In [4]:
# import random
# random.shuffle(further_corpus)
# split_index = int(len(further_corpus) * 0.8)
# further_corpus_train = further_corpus[:split_index]
# further_corpus_eval = further_corpus[split_index:]
# 
# with open('data/further_corpus_train.txt', 'w') as file:
#     # Write each element of the list to the file
#     for item in further_corpus_train:
#         file.write(f"{item}\n")  # Add a newline after each item
# 
# with open('data/further_corpus_eval.txt', 'w') as file:
#     # Write each element of the list to the file
#     for item in further_corpus_eval:
#         file.write(f"{item}\n")  # Add a newline after each item

```
python run_mlm.py \
    --model_name_or_path roberta-base \
    --train_file data/further_corpus_train.txt \
    --validation_file data/further_corpus_eval.txt \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --line_by_line \
    --do_train \
    --do_eval \
    --output_dir tmp/test-mlm 
```

# Data Loading

In [2]:
# Load the texts and labels from the cleaned EDOS CSV (path is relative to the notebook's working directory)
df = pd.read_csv('data/edos_cleaned.csv')

In [3]:
# Binary task, built per split ('train', 'dev', 'test'):
#   texts  -> the 'text_en_emoji_rm_url' column
#   labels -> 1 when 'label_sexist' equals "sexist", otherwise 0
train_texts_binary, dev_texts_binary, test_texts_binary = (
    df.loc[df['split'] == split_name, 'text_en_emoji_rm_url'].tolist()
    for split_name in ('train', 'dev', 'test')
)

train_labels_binary, dev_labels_binary, test_labels_binary = (
    [int(label == "sexist") for label in df.loc[df['split'] == split_name, 'label_sexist'].tolist()]
    for split_name in ('train', 'dev', 'test')
)

In [4]:
# Fine-grained task: restrict to rows labelled "sexist" and encode each
# category name as an integer class id.
df_sexist = df[df['label_sexist'] == 'sexist']
categories = {
    '1. threats, plans to harm and incitement': 0,
    '2. derogation': 1,
    '3. animosity': 2,
    '4. prejudiced discussions': 3,
}

train_texts_four_categories, dev_texts_four_categories, test_texts_four_categories = (
    df_sexist.loc[df_sexist['split'] == split_name, 'text_en_emoji_rm_url'].tolist()
    for split_name in ('train', 'dev', 'test')
)

# A category name that is not a key of `categories` is passed through unchanged.
train_labels_four_categories, dev_labels_four_categories, test_labels_four_categories = (
    [
        categories.get(category_name, category_name)
        for category_name in df_sexist.loc[df_sexist['split'] == split_name, 'label_category'].tolist()
    ]
    for split_name in ('train', 'dev', 'test')
)

## Define Dataset Class

In [5]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes a single text on each lookup and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the texts, their labels, the tokenizer and the fixed sequence length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for sample `idx`.

        The text is padded/truncated to `max_length` by the tokenizer and the
        resulting tensors are flattened to one dimension.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(sample_label)
        return sample

# Define Model

In [6]:
class RobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and one linear classification layer."""

    def __init__(self, roberta_model_name, num_classes):
        """Load the pretrained encoder `roberta_model_name` and build a `num_classes`-way head."""
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.dropout = nn.Dropout(p=0.1)
        hidden_size = self.roberta.config.hidden_size
        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits computed from the encoder's `pooler_output`."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(self.dropout(encoded.pooler_output))
    
# class RobertaMulti(nn.Module):
#     def __init__(self, roberta_model_name, num_classes):
#         super(RobertaMulti, self).__init__()
#         self.roberta = RobertaModel.from_pretrained(roberta_model_name)
#         self.dropout = nn.Dropout(0.1)
#         self.linear = nn.Linear(self.roberta.config.hidden_size, num_classes)
#         # self.relu = nn.ReLU()
# 
#     def forward(self, input_ids, attention_mask):
#         outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output
#         droupout_output = self.dropout(pooled_output)
#         logits = self.linear(droupout_output)
#         return logits

# Training function

In [7]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and report the mean per-batch cross-entropy loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            its output is passed to ``nn.CrossEntropyLoss`` together with the labels.
        data_loader: Sized iterable of batches; each batch is indexed with the
            keys 'input_ids', 'attention_mask' and 'label'.
        optimizer: Optimizer whose gradients are zeroed before, and which is
            stepped after, every batch.
        scheduler: Learning-rate scheduler, stepped once per batch right after
            the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Sum of the batch losses divided by ``len(data_loader)``. The same
        value is printed with four decimals.

    Raises:
        ZeroDivisionError: If ``len(data_loader)`` is 0.
    """
    model.train()
    # The criterion does not depend on the batch, so build it once per epoch
    # instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    train_loss = 0.0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(data_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    # Returned as well as printed so callers can log or compare epochs.
    return avg_train_loss

# Evaluation function

In [8]:
def evaluate(model, data_loader, device):
    """Score `model` on every batch of `data_loader` without tracking gradients.

    Puts the model in eval mode, takes the highest-scoring class along dim 1
    of the model output as the prediction, and collects predictions and labels
    as Python lists.

    Returns:
        tuple: ``(accuracy_score(labels, predictions),
        classification_report(labels, predictions))``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids, attention_mask, gold = (
                batch[field].to(device) for field in ('input_ids', 'attention_mask', 'label')
            )
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted += logits.max(dim=1).indices.cpu().tolist()
            expected += gold.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

# To_train

In [9]:
# Set up parameters
#     roberta_model_name = 'roberta-base'
#     num_classes = 2
#     max_length = 128
#     batch_size = 16
#     num_epochs = 4
#     learning_rate = 2e-5

def to_train(model_name, num_classes,
             train_texts, train_labels, val_texts, val_labels,
             max_length=256, batch_size=16, num_epochs=4, learning_rate=2e-5,
             device=None):
    """Fine-tune a `RobertaClassifier` and validate it after every epoch.

    Args:
        model_name: Name or path passed to both ``RobertaTokenizer.from_pretrained``
            and ``RobertaClassifier``.
        num_classes: Number of output classes of the classification head.
        train_texts, train_labels: Training samples and their labels.
        val_texts, val_labels: Validation samples and their labels.
        max_length: Token length every sample is padded/truncated to.
        batch_size: Batch size of both data loaders (only the training loader shuffles).
        num_epochs: Number of passes over the training data.
        learning_rate: Initial AdamW learning rate; decayed linearly over all
            training steps with no warmup.
        device: Optional ``torch.device`` or device string. When omitted, "mps"
            is used if available, otherwise "cuda", otherwise "cpu".

    Returns:
        The trained `RobertaClassifier`, left on the selected device.
    """
    tokenizer = RobertaTokenizer.from_pretrained(model_name)

    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    if device is None:
        # Keep preferring Apple's MPS backend, but fall back so the notebook
        # also runs on machines without it.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            device = torch.device("mps")
        elif torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(device)

    model = RobertaClassifier(model_name, num_classes).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # The scheduler is stepped once per batch, so its horizon is batches * epochs.
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    for epoch in tqdm(range(num_epochs)):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)
    return model

# Roberta_binary

In [13]:
# Fine-tune roberta-base as a 2-class classifier (label 1 = "sexist") on the
# train split, validating on the dev split after each of the 4 epochs.
roberta_base_binary = to_train(model_name = 'roberta-base', num_classes= 2,
                               train_texts = train_texts_binary, train_labels = train_labels_binary,
                               val_texts = dev_texts_binary, val_labels = dev_labels_binary,
                               num_epochs = 4)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1/4


  0%|          | 0/875 [00:00<?, ?it/s]

Average Training Loss: 0.3833


  0%|          | 0/125 [00:00<?, ?it/s]

Validation Accuracy: 0.8705
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1514
           1       0.80      0.62      0.70       486

    accuracy                           0.87      2000
   macro avg       0.84      0.79      0.81      2000
weighted avg       0.87      0.87      0.86      2000

Epoch 2/4


  0%|          | 0/875 [00:00<?, ?it/s]

Average Training Loss: 0.2554


  0%|          | 0/125 [00:00<?, ?it/s]

Validation Accuracy: 0.8725
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1514
           1       0.81      0.62      0.70       486

    accuracy                           0.87      2000
   macro avg       0.85      0.79      0.81      2000
weighted avg       0.87      0.87      0.87      2000

Epoch 3/4


  0%|          | 0/875 [00:00<?, ?it/s]

Average Training Loss: 0.1655


  0%|          | 0/125 [00:00<?, ?it/s]

Validation Accuracy: 0.8770
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1514
           1       0.79      0.67      0.73       486

    accuracy                           0.88      2000
   macro avg       0.85      0.81      0.82      2000
weighted avg       0.87      0.88      0.87      2000

Epoch 4/4


  0%|          | 0/875 [00:00<?, ?it/s]

Average Training Loss: 0.0908


  0%|          | 0/125 [00:00<?, ?it/s]

Validation Accuracy: 0.8765
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1514
           1       0.77      0.69      0.73       486

    accuracy                           0.88      2000
   macro avg       0.84      0.81      0.83      2000
weighted avg       0.87      0.88      0.87      2000


# Roberta_multi

In [14]:
# from collections import Counter
# print('dev')
# print(Counter(dev_labels_four_categories))
# print('train')
# print(Counter(train_labels_four_categories))
# print('test')
# print(Counter(test_labels_four_categories))

In [15]:
# train_texts_three_categories = []
# train_labels_three_categories = []
# dev_texts_three_categories = []
# dev_labels_three_categories = []
# 
# for i in range(len(train_texts_four_categories)):
#     if train_labels_four_categories[i] == 4 or train_labels_four_categories[i] == 3:
#         continue
#     train_texts_three_categories.append(train_texts_four_categories[i])
#     train_labels_three_categories.append(train_labels_four_categories[i])
#     
# for i in range(len(dev_texts_four_categories)):
#     if dev_labels_four_categories[i] == 4 or dev_labels_four_categories[i] == 3:
#         continue
#     dev_texts_three_categories.append(dev_texts_four_categories[i])
#     dev_labels_three_categories.append(dev_labels_four_categories[i])

In [16]:
# Counter(train_labels_three_categories)

In [17]:
# Counter(train_labels_four_categories)

In [10]:
# Fine-tune roberta-base as a 4-class category classifier on the sexist-only
# train split, validating on the sexist-only dev split after each of the 8 epochs.
roberta_base_multi = to_train(model_name = 'roberta-base', num_classes= 4,
                               train_texts = train_texts_four_categories, train_labels = train_labels_four_categories,
                               val_texts = dev_texts_four_categories, val_labels = dev_labels_four_categories,
                              num_epochs =  8)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 1.0812


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.5720
              precision    recall  f1-score   support

           0       0.65      0.55      0.59        44
           1       0.58      0.77      0.66       227
           2       0.70      0.26      0.38       167
           3       0.41      0.73      0.53        48

    accuracy                           0.57       486
   macro avg       0.59      0.58      0.54       486
weighted avg       0.61      0.57      0.55       486

Epoch 2/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.8396


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6276
              precision    recall  f1-score   support

           0       0.52      0.77      0.62        44
           1       0.62      0.86      0.72       227
           2       0.75      0.28      0.41       167
           3       0.71      0.60      0.65        48

    accuracy                           0.63       486
   macro avg       0.65      0.63      0.60       486
weighted avg       0.66      0.63      0.60       486

Epoch 3/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.6296


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6379
              precision    recall  f1-score   support

           0       0.75      0.61      0.67        44
           1       0.68      0.68      0.68       227
           2       0.55      0.67      0.61       167
           3       0.77      0.35      0.49        48

    accuracy                           0.64       486
   macro avg       0.69      0.58      0.61       486
weighted avg       0.65      0.64      0.64       486

Epoch 4/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.4209


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6523
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        44
           1       0.65      0.81      0.72       227
           2       0.65      0.44      0.53       167
           3       0.63      0.56      0.59        48

    accuracy                           0.65       486
   macro avg       0.66      0.63      0.64       486
weighted avg       0.65      0.65      0.64       486

Epoch 5/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.2565


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6667
              precision    recall  f1-score   support

           0       0.65      0.70      0.67        44
           1       0.69      0.74      0.71       227
           2       0.65      0.58      0.61       167
           3       0.63      0.60      0.62        48

    accuracy                           0.67       486
   macro avg       0.65      0.66      0.65       486
weighted avg       0.67      0.67      0.66       486

Epoch 6/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.1347


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6502
              precision    recall  f1-score   support

           0       0.71      0.68      0.70        44
           1       0.65      0.80      0.72       227
           2       0.65      0.49      0.56       167
           3       0.63      0.46      0.53        48

    accuracy                           0.65       486
   macro avg       0.66      0.61      0.63       486
weighted avg       0.65      0.65      0.64       486

Epoch 7/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.0807


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6646
              precision    recall  f1-score   support

           0       0.72      0.64      0.67        44
           1       0.68      0.78      0.72       227
           2       0.63      0.56      0.59       167
           3       0.66      0.52      0.58        48

    accuracy                           0.66       486
   macro avg       0.67      0.62      0.64       486
weighted avg       0.66      0.66      0.66       486

Epoch 8/8


  0%|          | 0/213 [00:00<?, ?it/s]

Average Training Loss: 0.0584


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.6687
              precision    recall  f1-score   support

           0       0.76      0.64      0.69        44
           1       0.68      0.76      0.72       227
           2       0.63      0.59      0.61       167
           3       0.66      0.52      0.58        48

    accuracy                           0.67       486
   macro avg       0.68      0.63      0.65       486
weighted avg       0.67      0.67      0.67       486


# Predict

In [19]:
# Persist the fine-tuned binary classifier. The whole module object (not only a
# state_dict) is passed to torch.save.
# NOTE(review): loading it back presumably requires the RobertaClassifier class
# to be defined/importable, and the "models/" directory to exist beforehand — confirm.
torch.save(roberta_base_binary, "models/roberta_base_binary.pt")

In [11]:
# Persist the fine-tuned 4-class classifier. The whole module object (not only a
# state_dict) is passed to torch.save.
# NOTE(review): loading it back presumably requires the RobertaClassifier class
# to be defined/importable, and the "models/" directory to exist beforehand — confirm.
torch.save(roberta_base_multi, "models/roberta_base_multi.pt")

In [None]:
def predict_sexist(text, model, tokenizer, device, max_length=512):
    """Classify one text and return the index of its highest-scoring class.

    The model is switched to eval mode, the text is tokenized with
    padding/truncation to `max_length`, and the forward pass runs without
    gradient tracking on `device`.

    Returns:
        int: Position of the largest value along dim 1 of the model output.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        predicted = logits.max(dim=1).indices
    return predicted.item()

In [None]:
# Two-stage sexism prediction demo: run the binary classifier first and only
# ask the 4-class category classifier when the text is flagged as sexist.
test_text = ("I need to kill that bitch")
# Load the tokenizer once and share it between both predictions.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Send the inputs to whatever device each model's parameters live on, rather
# than assuming "mps".
sexist = predict_sexist(test_text, roberta_base_binary, tokenizer,
                        next(roberta_base_binary.parameters()).device)
category = (
    predict_sexist(test_text, roberta_base_multi, tokenizer,
                   next(roberta_base_multi.parameters()).device)
    if sexist else None
)
print(test_text)
print(f"Predicted: {sexist} -> {category}")