In [1]:
import torch
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, RobertaTokenizer
import pandas as pd
import json
import re
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/niclasstoffregen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data = json.loads(open("../data/processed/Oppositional_thinking_analysis_dataset.json").read())
data = pd.DataFrame(data)

In [3]:
def preprocess_basic(text, lem_tag = True, stem_tag = False):
    tokens = nltk.word_tokenize(text.lower())

    if lem_tag:
    # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    if stem_tag:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)



def preprocess_advanced(text:str, lem_tag = True, stem_tag = False) -> int:

    # Lowercasing
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove decimals  
    text = re.sub(r'[\:\-\']', '', text)  # Remove specific punctuation
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text) # Remove special characters
    text = re.sub(r'\d+\.\d+', '', text)  # Matches one or more digits followed by a dot and one or more digits
    text = re.sub(r'\bcom\b', '', text, flags=re.IGNORECASE)  # Matches "com" at word boundaries (whole word)


    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    if lem_tag:
    # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    if stem_tag:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

In [5]:
data['cleaned_text'] = data['text'].apply(preprocess_advanced)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(data['cleaned_text'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
#encoded_inputs = pd.DataFrame.from_dict(encoded_inputs)

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['category'])
labels = torch.tensor(data['label_encoded'].values)


#print("Shape of encoded_inputs: ", encoded_inputs.shape)
print("Shape of labels: ", labels.shape)

# Split data

input_ids, val_input_ids, train_labels, val_labels = train_test_split(encoded_inputs['input_ids'], labels, test_size=0.2, random_state=42)
token_type_ids, val_token_type_ids, _, _ = train_test_split(encoded_inputs['token_type_ids'], labels, test_size=0.2, random_state=42)
attention_mask, val_attention_mask, _, _ = train_test_split(encoded_inputs['attention_mask'], labels, test_size=0.2, random_state=42)
train_inputs = pd.DataFrame(columns=["input_ids", "token_type_ids","attention_mask"], data=zip(input_ids, token_type_ids, attention_mask))
val_inputs = pd.DataFrame(columns=["input_ids", "token_type_ids","attention_mask"], data=zip(val_input_ids, val_token_type_ids, val_attention_mask))


Shape of labels:  torch.Size([4000])


In [None]:
train_inputs['input_ids']

In [7]:


# Prepare DataLoader
train_data = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], torch.tensor(val_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer and Learning Rate Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)
        
        model.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss for epoch {epoch}: {avg_train_loss}")

    # Validation Loop
    model.eval()
    eval_loss = 0
    eval_accuracy = 0
    for batch in validation_dataloader:
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits
        eval_loss += loss.item()
        predictions = torch.argmax(logits, dim=1).flatten()
        eval_accuracy += (predictions == batch_labels).cpu().numpy().mean()

    avg_val_loss = eval_loss / len(validation_dataloader)
    avg_val_accuracy = eval_accuracy / len(validation_dataloader)
    print(f"Validation loss for epoch {epoch}: {avg_val_loss}")
    print(f"Validation accuracy for epoch {epoch}: {avg_val_accuracy}")

  train_data = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], torch.tensor(train_labels))


TypeError: 'int' object is not callable