In [4]:
# !pip install emoji

In [1]:
import numpy as np
import pandas as pd
import nltk
import emoji
import string
from textblob import TextBlob
from tqdm import tqdm

# Fetch the NLTK resources used by Preprocessor: the English stop-word list
# (remove_stopwords) and the 'punkt' models behind nltk.tokenize.word_tokenize.
# NOTE(review): the 'lemmatization' mode uses WordNetLemmatizer, which
# presumably also needs the 'wordnet' corpus — confirm and download it if so.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/louys/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/louys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class Preprocessor:
    """Configurable text-cleaning pipeline for a list of strings.

    Every constructor argument switches one step on. ``process`` applies the
    enabled steps in a fixed order: emoji handling, ``[URL]``/``[USER]``
    placeholder removal, lowercasing, spelling correction, punctuation
    removal, stop-word removal and finally stemming/lemmatization.

    Args:
        emoji: falsy to leave emojis untouched, ``'rm'`` to delete them or
            ``'translate'`` to replace them with their English names.
        urls_users_remover: drop the ``[URL]`` and ``[USER]`` placeholders.
        lowercaser: lowercase the text.
        punctuation_remover: strip every character of ``string.punctuation``.
        spell_checker: correct spelling with ``TextBlob.correct``.
        stopwords_remover: drop English NLTK stop words.
        word_transformer: falsy for no transformation, ``'lemmatization'`` or
            ``'stemming'``.

    Raises:
        ValueError: if ``emoji`` or ``word_transformer`` is set to a mode that
            is not supported (previously this surfaced later as an
            ``UnboundLocalError`` or as ``None`` entries in the output).
    """

    # Modes accepted by the ``emoji`` and ``word_transformer`` options.
    EMOJI_MODES = ('rm', 'translate')
    WORD_TRANSFORMERS = ('lemmatization', 'stemming')

    def __init__(self,
                 emoji = None, #None, rm, translate
                 urls_users_remover = False, #False, True
                 lowercaser = False, #False, True
                 punctuation_remover = False, #False, True
                 spell_checker = False, #False, True
                 stopwords_remover = False, #False, True
                 word_transformer = None, #lemmatization, stemming
                 ):
        # Fail fast on typos instead of breaking in the middle of process().
        if emoji and emoji not in self.EMOJI_MODES:
            raise ValueError(
                f"Unsupported emoji mode {emoji!r}; expected one of {self.EMOJI_MODES} or None")
        if word_transformer and word_transformer not in self.WORD_TRANSFORMERS:
            raise ValueError(
                f"Unsupported word_transformer {word_transformer!r}; "
                f"expected one of {self.WORD_TRANSFORMERS} or None")

        self.emoji = emoji
        self.urls_users_remover = urls_users_remover
        self.lowercaser = lowercaser
        self.spell_checker = spell_checker
        self.punctuation_remover = punctuation_remover
        self.stopwords_remover = stopwords_remover
        self.word_transformer = word_transformer
        # Filled lazily by _english_stop_words() on first use.
        self._stop_words = None

    def process(self, data:list)->list:
        """Apply every enabled step, in order, to each text of ``data``.

        One tqdm progress bar is shown per enabled step. With no step enabled
        the input list is returned as is.
        """
        steps = [
            (self.emoji, self.de_emoji),
            (self.urls_users_remover, self.remove_urls_users),
            (self.lowercaser, self.lowercase),
            (self.spell_checker, self.correct_spelling),
            (self.punctuation_remover, self.remove_punctuations),
            (self.stopwords_remover, self.remove_stopwords),
            (self.word_transformer, self.word_transform),
        ]
        for enabled, step in steps:
            if enabled:
                data = [step(text) for text in tqdm(data)]
        return data

    def de_emoji(self,text:str) -> str:
        """Remove or translate emojis according to ``self.emoji``.

        Returns ``text`` unchanged when no emoji mode is configured.

        Raises:
            ValueError: if ``self.emoji`` holds an unsupported mode.
        """
        if not self.emoji:
            return text
        if self.emoji == 'rm':
            return emoji.replace_emoji(text,'')
        if self.emoji == 'translate':
            # Empty delimiters drop the surrounding ':'; underscores in the
            # emoji name are then turned into spaces.
            translated = emoji.demojize(text, delimiters=('', ''), language= 'en')
            return translated.replace('_',' ')
        raise ValueError(f"Unsupported emoji mode {self.emoji!r}")

    def remove_urls_users(self, text:str) ->str:
        """Delete the literal ``[URL]`` and ``[USER]`` placeholders."""
        return text.replace('[URL]','').replace('[USER]','')

    def remove_punctuations(self,text:str) -> str:
        """Delete every character listed in ``string.punctuation``."""
        return text.translate(str.maketrans("", "", string.punctuation))

    def lowercase(self, text:str) -> str:
        """Return ``text`` in lower case."""
        return text.lower()

    def correct_spelling(self,text:str) -> str:
        """Return ``text`` after TextBlob's spelling correction."""
        return TextBlob(text).correct().string

    def _english_stop_words(self) -> set:
        """Return the English NLTK stop words, loading them only once."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = set(nltk.corpus.stopwords.words('english'))
            self._stop_words = cached
        return cached

    def remove_stopwords(self,text:str) -> str:
        """Tokenize ``text`` and drop tokens whose lowercase form is a stop word."""
        stop_words = self._english_stop_words()
        tokens = nltk.tokenize.word_tokenize(text)
        return ' '.join(word for word in tokens if word.lower() not in stop_words)

    def word_transform(self,text:str) -> str:
        """Lemmatize or stem every word according to ``self.word_transformer``.

        Lemmatization tokenizes with ``word_tokenize``; stemming splits on
        whitespace. Returns ``text`` unchanged when no transformer is set.

        Raises:
            ValueError: if ``self.word_transformer`` holds an unsupported mode.
        """
        if not self.word_transformer:
            return text
        if self.word_transformer == 'lemmatization':
            lemmatizer = nltk.stem.WordNetLemmatizer()
            lemmatized_words = [lemmatizer.lemmatize(word) for word in nltk.tokenize.word_tokenize(text)]
            return " ".join(lemmatized_words)
        if self.word_transformer == 'stemming':
            stemmer = nltk.stem.PorterStemmer()
            stemmed_words = [stemmer.stem(word) for word in text.split()]
            return " ".join(stemmed_words)
        raise ValueError(f"Unsupported word_transformer {self.word_transformer!r}")

In [3]:
# Load the labelled dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/edos_labelled_aggregated.csv')

In [4]:
# Clean the raw text: translate emojis to words, drop the [URL]/[USER]
# placeholders, strip punctuation and stem every word.
preprocessor = Preprocessor(
    emoji='translate',
    urls_users_remover=True,
    punctuation_remover=True,
    word_transformer='stemming',
)
data = df['text'].tolist()
text_preprocessed = preprocessor.process(data)
# Keep the cleaned text next to the original column.
df['text_preprocessed'] = text_preprocessed

100%|██████████| 20000/20000 [00:00<00:00, 22384.79it/s]
100%|██████████| 20000/20000 [00:00<00:00, 3183170.04it/s]
100%|██████████| 20000/20000 [00:00<00:00, 405893.84it/s]
100%|██████████| 20000/20000 [00:02<00:00, 7701.32it/s]


In [5]:
# Inspect the frame, including the new text_preprocessed column.
# NOTE(review): consider df.head() to avoid rendering the whole frame.
df

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split,text_preprocessed
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev,in nigeria if you rape a woman the men rape yo...
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train,then she a keeper wink face
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train,thi is like the metallica video where the poor...
3,sexism2022_english-13021,woman?,not sexist,none,none,train,woman
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev,i bet she wish she had a gun
...,...,...,...,...,...,...,...
19995,sexism2022_english-5228,girls really get fucked almost every weekend ?,not sexist,none,none,train,girl realli get fuck almost everi weekend
19996,sexism2022_english-10140,The hatred for moslems are Reasonable and Just...,not sexist,none,none,train,the hatr for moslem are reason and justifi the...
19997,sexism2022_english-9726,Now this is a woman who gets it. 👆,not sexist,none,none,train,now thi is a woman who get it backhand index p...
19998,sexism2022_english-13365,“American Idol” finalist [USER] said nothing i...,not sexist,none,none,train,“american idol” finalist said noth is go to st...


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
# train_data.to_csv('/content/train.csv', index=False)
# test_data.to_csv('/content/test.csv', index=False)

In [11]:
# !pip install transformers

In [7]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [13]:
def load_data(df):
    """Extract the model inputs and binary targets from ``df``.

    Args:
        df: DataFrame with a ``text_preprocessed`` column and a
            ``label_sexist`` column. Labels may be strings (``'sexist'`` /
            ``'not sexist'``) or boolean/numeric flags.

    Returns:
        ``(texts, labels)`` where ``texts`` is the list of preprocessed
        strings and ``labels`` is a list of ints (1 = sexist, 0 = not sexist).
    """
    def to_binary(label):
        # String labels never compare equal to True, so the former
        # ``label == True`` test mapped every string-labelled row to 0.
        if isinstance(label, str):
            return 1 if label.strip().lower() == 'sexist' else 0
        # Boolean / numeric flags keep the original comparison.
        return 1 if label == True else 0  # noqa: E712

    texts = df['text_preprocessed'].tolist()
    labels = [to_binary(label) for label in df['label_sexist'].tolist()]
    return texts, labels

In [39]:
# Pull the preprocessed texts and their binary labels out of the frame.
texts, labels = load_data(df)

In [15]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` produced by the tokenizer (padded/truncated to
    ``max_length``) and the label wrapped in a tensor under ``label``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten drops it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label)
        return item

In [16]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [17]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    For every batch: reset the gradients, compute the cross-entropy loss of
    the model's logits against the batch labels, back-propagate, then step
    the optimizer and the learning-rate scheduler.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        targets = batch['label'].to(device)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, not once per epoch.
        scheduler.step()

In [18]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        ``(accuracy, report)``: the accuracy score and the text
        classification report computed over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Index of the highest logit along the class axis is the prediction.
            batch_preds = torch.max(logits, dim=1).indices
            predicted += batch_preds.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [27]:
def predict_sexist(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return True when the predicted class is 1.

    The text is tokenized (padded/truncated to ``max_length``), moved to
    ``device`` and passed through ``model`` in eval mode without gradients.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted_class = torch.max(logits, dim=1).indices
    return predicted_class.item() == 1

In [20]:
# Training configuration shared by the cells below.
bert_model_name = 'bert-base-uncased'  # checkpoint name given to BertTokenizer and BERTClassifier
num_classes = 2  # output size of the classifier head
max_length = 128  # max_length handed to the tokenizer (padding/truncation target)
batch_size = 16  # batch size of both DataLoaders
num_epochs = 4  # iterations of the training loop
learning_rate = 2e-5  # lr passed to the AdamW optimizer

In [21]:
# Hold out 20% of the samples for validation; random_state fixes the shuffle
# so the split is the same on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [22]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap both splits in datasets that tokenize on access.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Pick the best available device instead of hard-coding Apple's "mps"
# backend, so the notebook also runs on CUDA or CPU-only machines.
# This cell needs the torch import cell to have been executed first.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

NameError: name 'torch' is not defined

In [24]:
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the transformers defaults (1e-6, 0.0)
# because the PyTorch defaults differ (1e-8, 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [40]:
# Train for num_epochs passes and report validation metrics after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    accuracy, report = evaluate(model=model, data_loader=val_dataloader, device=device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


100%|██████████| 50/50 [00:37<00:00,  1.33it/s]


Validation Accuracy: 0.7850
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       145
           1       0.62      0.58      0.60        55

    accuracy                           0.79       200
   macro avg       0.73      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200

Epoch 2/4


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]


Validation Accuracy: 0.7850
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       145
           1       0.62      0.58      0.60        55

    accuracy                           0.79       200
   macro avg       0.73      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200

Epoch 3/4


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]


Validation Accuracy: 0.7850
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       145
           1       0.62      0.58      0.60        55

    accuracy                           0.79       200
   macro avg       0.73      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200

Epoch 4/4


100%|██████████| 50/50 [00:24<00:00,  2.00it/s]


Validation Accuracy: 0.7850
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       145
           1       0.62      0.58      0.60        55

    accuracy                           0.79       200
   macro avg       0.73      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200


In [26]:
# torch.save(model.state_dict(), "berta_classifier.pth")

In [36]:
# Smoke-test the trained classifier on a single hand-written sentence.
test_text = "men should be in the kitchen."
sexist = predict_sexist(text=test_text, model=model, tokenizer=tokenizer, device=device)
print(test_text)
print(f"Predicted: {sexist}")

men should be in the kitchen.
Predicted: False
