In [77]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset, load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, Trainer, TrainingArguments
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader

In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [57]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

In [58]:
print(f"Original classifier: {model.classifier}")  # Should show a layer with 5 outputs

Original classifier: Linear(in_features=768, out_features=5, bias=True)


In [59]:
model.num_labels = 3
model.classifier = nn.Linear(model.config.hidden_size, 3)

In [60]:
print(f"Updated classifier: {model.classifier}")  # Should now show 3 outputs

Updated classifier: Linear(in_features=768, out_features=3, bias=True)


# Finetune BERT

### Comments

In [61]:
def relabel(file):
    df = pd.read_csv(file)
    print(df.head(5))

    df = df.rename(columns={'sentiment': 'label'})

    def map_labels(rating):
        if rating == "negative":
            return 0  # Negative
        elif rating == "neutral":
            return 1  # Neutral
        else:
            return 2
        
    df['label'] = df['label'].apply(map_labels)
    return df

In [62]:
df2020 = relabel('./data/labeled_comments_2020.csv')
df2024 = relabel('./data/labeled_comments_2024.csv')

                                                text  created_utc  ups  \
0  Very true, but the problem is that even at tha...   1590975768    1   
1  You’re full of crap.\r\n\r\nYou present no fac...   1590984758    2   
2  Politically speaking, there is absolutely no w...   1590995791   29   
3  This is what happens when the president of the...   1591008451    1   
4  Silly comment. No matter how bad a president h...   1591013669    1   

   subreddit    neg    neu    pos  compound sentiment  
0  democrats  0.239  0.645  0.116   -0.9769  negative  
1  democrats  0.163  0.692  0.145   -0.1101  negative  
2  democrats  0.062  0.823  0.115    0.9925  positive  
3  democrats  0.000  0.903  0.097    0.4215  positive  
4  democrats  0.261  0.647  0.092   -0.8060  negative  
                                                text  created_utc  ups  \
0  They sure are making the rounds today. Our sys...   1717200019    1   
1  I would say it’s both. Because after a super c...   1717200509    6 

In [64]:
df2020.head()

Unnamed: 0,text,created_utc,ups,subreddit,neg,neu,pos,compound,label
0,"Very true, but the problem is that even at tha...",1590975768,1,democrats,0.239,0.645,0.116,-0.9769,0
1,You’re full of crap.\r\n\r\nYou present no fac...,1590984758,2,democrats,0.163,0.692,0.145,-0.1101,0
2,"Politically speaking, there is absolutely no w...",1590995791,29,democrats,0.062,0.823,0.115,0.9925,2
3,This is what happens when the president of the...,1591008451,1,democrats,0.0,0.903,0.097,0.4215,2
4,Silly comment. No matter how bad a president h...,1591013669,1,democrats,0.261,0.647,0.092,-0.806,0


In [65]:
# split into train and test data
def split_data(df2020, df2024):
    train_size = int(df2020.shape[0] * 0.8)
    test_size = min(df2020.shape[0] - train_size, df2024.shape[0])    # choose either 20% or the entirety of the df2024 data (whichever is smaller)

    train_df = df2020.sample(n=train_size, random_state=42)      # get random rows
    test_df = df2024.sample(n=test_size, random_state=42)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return train_dataset, test_dataset

train_dataset, test_dataset = split_data(df2020, df2024)

In [66]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 158/158 [00:00<00:00, 411.98 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 379.85 examples/s]


In [67]:
print(train_dataset[0])

{'text': "Eh, I think that's shortsighted. If someone kills trump, he becomes martyred and all of these protests will lose their power. The American people by and large are usually swayed towards the side of the victim. Right now, the victims are victims of police brutality. If we make the cops or president a bigger victim, public opinion will sway towards them.\r\n\r\nWe're all just bleeding hearts, at the end of the day. But our hearts bleed for different causes, and different victims. That's why non-violent protests are the number 1 way to go right now, along with writing letters to our representatives. I'm not saying non-violent acts are effective 100% of the time when it comes to producing change, but they do end up being on the morally just side of history more often than the violent acts. They killed Lincoln, and he freed the slaves, and now people side with Lincoln. They killed MLK, and he promoted civil rights, and now people side with MLK. The killers just don't end up as the

In [68]:
print(f"Classifier output size: {model.classifier.out_features}")
print(train_dataset['label'][:100]) 

Classifier output size: 3
[0, 0, 2, 0, 0, 1, 2, 0, 0, 2, 2, 2, 1, 0, 2, 1, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 2, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 1, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 0, 0, 1, 2, 1]


In [None]:
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [69]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [73]:
batch_size = 16
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='./logs'
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

eval_log = None
trainer.train()
eval_log = trainer.evaluate()


print(f'Evaluation Log: {eval_log}')

  0%|          | 0/50 [29:51<?, ?it/s]
 20%|██        | 10/50 [03:08<12:04, 18.11s/it]
[A
[A
[A

[A[A                                       
                                               
 20%|██        | 10/50 [03:26<12:04, 18.11s/it]
[A

{'eval_loss': 0.8660032153129578, 'eval_accuracy': 0.55, 'eval_runtime': 17.4333, 'eval_samples_per_second': 2.294, 'eval_steps_per_second': 0.172, 'epoch': 1.0}


 40%|████      | 20/50 [07:16<14:10, 28.34s/it]
[A
[A
[A

[A[A                                       
                                               
 40%|████      | 20/50 [08:08<14:10, 28.34s/it]
[A

{'eval_loss': 0.8487539291381836, 'eval_accuracy': 0.6, 'eval_runtime': 52.3692, 'eval_samples_per_second': 0.764, 'eval_steps_per_second': 0.057, 'epoch': 2.0}


 60%|██████    | 30/50 [11:49<08:00, 24.01s/it]
[A
[A
[A

[A[A                                       
                                               
 60%|██████    | 30/50 [12:06<08:00, 24.01s/it]
[A

{'eval_loss': 0.9155132174491882, 'eval_accuracy': 0.525, 'eval_runtime': 17.4396, 'eval_samples_per_second': 2.294, 'eval_steps_per_second': 0.172, 'epoch': 3.0}


 80%|████████  | 40/50 [15:57<04:37, 27.78s/it]
[A
[A
[A

[A[A                                       
                                               
 80%|████████  | 40/50 [16:15<04:37, 27.78s/it]
[A

{'eval_loss': 0.9621081352233887, 'eval_accuracy': 0.55, 'eval_runtime': 17.6977, 'eval_samples_per_second': 2.26, 'eval_steps_per_second': 0.17, 'epoch': 4.0}


100%|██████████| 50/50 [22:10<00:00, 26.76s/it]
[A
[A
[A

[A[A                                       
                                               
100%|██████████| 50/50 [22:54<00:00, 26.76s/it]
[A

{'eval_loss': 0.9641491770744324, 'eval_accuracy': 0.525, 'eval_runtime': 39.5872, 'eval_samples_per_second': 1.01, 'eval_steps_per_second': 0.076, 'epoch': 5.0}



100%|██████████| 50/50 [22:58<00:00, 27.58s/it]


{'train_runtime': 1378.861, 'train_samples_per_second': 0.573, 'train_steps_per_second': 0.036, 'train_loss': 0.5398244476318359, 'epoch': 5.0}


100%|██████████| 3/3 [00:10<00:00,  3.48s/it]

Evaluation Log: {'eval_loss': 0.8487539291381836, 'eval_accuracy': 0.6, 'eval_runtime': 17.4915, 'eval_samples_per_second': 2.287, 'eval_steps_per_second': 0.172, 'epoch': 5.0}





In [80]:
def finetune_confusion(test_dataset, fig_name):
    batch_size = 32  # Set the batch size
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    all_labels = []
    all_preds = []

    # Iterate over the test dataset
    for batch in test_dataloader:
        # Get inputs and labels, move them to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get predicted class (argmax on logits)
        preds = torch.argmax(outputs.logits, dim=1)

        # Append to lists
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

    cm = confusion_matrix(all_labels, all_preds)
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', cbar=True)

    plt.title(fig_name)
    plt.ylabel('Ground Truth Label')
    plt.xlabel('Predicted Label')

    plt.show()

finetune_confusion(test_dataset, 'Finetuned Model Confusion Matrix for Comments')

AttributeError: 'list' object has no attribute 'to'

### Posts

In [None]:
pdf2020 = relabel('./data/labeled_posts_2020.csv')
pdf2024 = relabel('./data/labeled_posts_2024.csv')

train_dataset, test_dataset = split_data(df2020, df2024)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

In [None]:
batch_size = 16
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='./logs'
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

eval_log = None
trainer.train()
eval_log = trainer.evaluate()


print(f'Evaluation Log: {eval_log}')