In [7]:
import torch
import nltk

import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [20]:
def balance_data(data):
        # Identify the majority and minority classes
    majority_class = data['label'].value_counts().idxmax()
    minority_class = data['label'].value_counts().idxmin()
    
    # Separate the majority and minority classes
    majority = data[data['label'] == majority_class]
    minority = data[data['label'] == minority_class]
    
    # Upsample the minority class
    minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
    
    # Combine the majority class and the upsampled minority class
    data_balanced = pd.concat([majority, minority_upsampled])
    
    # Now split the balanced data into training and test sets
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data_balanced['text'], data_balanced['label'], test_size=0.2, random_state=42
    )

    return train_texts, test_texts, train_labels, test_labels

In [21]:
data = json.loads(open("../data/processed/Oppositional_thinking_analysis_dataset.json").read())
data = pd.DataFrame(data)

# Encode labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['category'])

# Train-test split
train_texts, test_texts, train_labels, test_labels = balance_data(data)
# verify the balance
print(train_labels.value_counts())

label
0    2102
1    2091
Name: count, dtype: int64


In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification

# Tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Models for binary classification
bert_model_binary = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
roberta_model_binary = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
bert_model_binary = bert_model_binary.to(device)
roberta_model_binary = roberta_model_binary.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from torch.utils.data import DataLoader, Dataset
import torch

class OppositionalThinkingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label, dtype=torch.long)}

# Create datasets
train_dataset = OppositionalThinkingDataset(train_texts, train_labels, bert_tokenizer)
test_dataset = OppositionalThinkingDataset(test_texts, test_labels, bert_tokenizer)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [26]:
len(train_dataloader)

263

In [None]:
from transformers import AdamW, get_scheduler

def train_model(model, train_dataloader, test_dataloader, device, epochs=3, lr=5e-5):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        total_acc, total_count = 0, 0
        with torch.no_grad():
            for batch in test_dataloader:
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
                predictions = outputs.logits.argmax(dim=-1)
                total_acc += (predictions == batch['label']).sum().item()
                total_count += batch['label'].size(0)
        
        accuracy = total_acc / total_count
        print(f"Epoch {epoch+1}: Accuracy {accuracy:.4f}")

# Train BERT and RoBERTa models for binary and multiclass classification
train_model(bert_model_binary, train_dataloader, test_dataloader, device)
train_model(roberta_model_binary, train_dataloader, test_dataloader, device)

