In [1]:
import pandas as pd
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, TextClassificationPipeline
from datasets import Dataset

In [None]:
# Load the cleaned training tweets for each candidate and rename the raw
# `tweets` / `class` columns to the `text` / `label` names used downstream.
dfo = (
    pd.read_csv('data/obama_cleaned.csv')
    .rename(columns={'tweets' : 'text', 'class' : 'label'})
)

dfr = (
    pd.read_csv('data/romney_cleaned.csv')
    .rename(columns={'tweets' : 'text', 'class' : 'label'})
)

# Pre-trained model: BERTweet

Fine-tune the pre-trained model separately on the Obama and Romney tweet data:

In [None]:
# Obama: text features plus labels remapped from -1/0/1 to the class ids 0/1/2
# (Series.map yields NaN for any value not listed in the mapping).
Xo, yo = dfo['text'], dfo['label'].map({1 : 2, 0 : 1, -1 : 0})
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
Xo_train, Xo_eval, yo_train, yo_eval = train_test_split(
    Xo,
    yo,
    test_size=0.25,
    random_state=21,
)

# Romney: identical procedure and seed.
Xr, yr = dfr['text'], dfr['label'].map({1 : 2, 0 : 1, -1 : 0})
Xr_train, Xr_eval, yr_train, yr_eval = train_test_split(
    Xr,
    yr,
    test_size=0.25,
    random_state=21,
)


# Put text and label back side by side so each split is a single two-column frame.
traindf_o, evaldf_o = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((Xo_train, yo_train), (Xo_eval, yo_eval))
)

traindf_r, evaldf_r = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((Xr_train, yr_train), (Xr_eval, yr_eval))
)

In [None]:
# Wrap every pandas split in a `datasets.Dataset`, tagging it with its split name.
train_o, eval_o = (
    Dataset.from_pandas(traindf_o, split='train'),
    Dataset.from_pandas(evaldf_o, split='eval'),
)

train_r, eval_r = (
    Dataset.from_pandas(traindf_r, split='train'),
    Dataset.from_pandas(evaldf_r, split='eval'),
)

In [None]:
# Tokenizer belonging to the same pre-trained checkpoint the models start from.
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            it must provide a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated where necessary.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_o = train_o.map(tokenize_function, batched=True)
tokenized_eval_o = eval_o.map(tokenize_function, batched=True)

tokenized_train_r = train_r.map(tokenize_function, batched=True)
# The Romney evaluation set must be built from the Romney eval split (`eval_r`),
# otherwise the Romney model would be scored on Obama tweets.
tokenized_eval_r = eval_r.map(tokenize_function, batched=True)

In [None]:
# Start from the pre-trained BERTweet sentiment checkpoint with a 3-class head.
model_o = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    num_labels=3,
)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair supplied by the ``Trainer``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Evaluate at the end of every epoch; checkpoints go under checkpoints/test_trainer_o.
training_args_o = TrainingArguments(
    output_dir="checkpoints/test_trainer_o",
    evaluation_strategy="epoch",
    num_train_epochs=2,
)

trainer_o = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval_o,
    train_dataset=tokenized_train_o,
    args=training_args_o,
    model=model_o,
)

trainer_o.train()

In [None]:
# Persist the fine-tuned Obama model; it is reloaded from this directory later on.
trainer_o.save_model(output_dir='models/obama_final')

In [None]:
# Romney model: same pre-trained starting point and 3-class head as the Obama model.
model_r = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    num_labels=3,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: The pair handed over by the ``Trainer`` during evaluation.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions.
    """
    raw_logits, gold = eval_pred
    guesses = np.argmax(raw_logits, axis=-1)
    return metric.compute(predictions=guesses, references=gold)

# Three epochs here (the Obama run uses two), evaluating after each epoch.
training_args_r = TrainingArguments(
    output_dir="checkpoints/test_trainer_r",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

trainer_r = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval_r,
    train_dataset=tokenized_train_r,
    args=training_args_r,
    model=model_r,
)

trainer_r.train()

In [None]:
# Persist the fine-tuned Romney model; it is reloaded from this directory later on.
trainer_r.save_model(output_dir='models/romney_final')

# Load from checkpoints

In [None]:
# Reload the Obama model weights from the checkpoint directory instead of the hub.
# NOTE(review): this expects model files directly inside checkpoints/test_trainer_o;
# if only `checkpoint-<step>` sub-folders exist there, point at one of those — confirm.
model_o = AutoModelForSequenceClassification.from_pretrained("checkpoints/test_trainer_o", num_labels=3)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: The pair handed over by the ``Trainer`` during evaluation.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Keep the output directory under checkpoints/, matching the other training cells
# and the directory the model above is loaded from.
training_args_o = TrainingArguments(output_dir="checkpoints/test_trainer_o", evaluation_strategy="epoch", num_train_epochs=2)

trainer_o = Trainer(
    model=model_o,
    args=training_args_o,
    train_dataset=tokenized_train_o,
    # The held-out Obama split is named `tokenized_eval_o`; no `tokenized_test_o`
    # is ever defined in this notebook.
    eval_dataset=tokenized_eval_o,
    compute_metrics=compute_metrics,
)

# Continues fine-tuning from the loaded weights for the configured number of epochs.
trainer_o.train()

In [None]:
# Reload the Romney model weights from the checkpoint directory instead of the hub.
# NOTE(review): this expects model files directly inside checkpoints/test_trainer_r;
# if only `checkpoint-<step>` sub-folders exist there, point at one of those — confirm.
model_r = AutoModelForSequenceClassification.from_pretrained(
    "checkpoints/test_trainer_r",
    num_labels=3,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair supplied by the ``Trainer``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    model_outputs, true_labels = eval_pred
    chosen = np.argmax(model_outputs, axis=-1)
    return metric.compute(predictions=chosen, references=true_labels)

training_args_r = TrainingArguments(
    output_dir="checkpoints/test_trainer_r",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

trainer_r = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_eval_r,
    train_dataset=tokenized_train_r,
    args=training_args_r,
    model=model_r,
)

# Continues fine-tuning from the loaded weights for the configured number of epochs.
trainer_r.train()

# Load fine-tuned models

In [None]:
# Reload both final models from the directories written by `save_model`.
model_o, model_r = (
    AutoModelForSequenceClassification.from_pretrained('models/obama_final'),
    AutoModelForSequenceClassification.from_pretrained('models/romney_final'),
)

In [None]:
# One text-classification pipeline per candidate; both share the same tokenizer.
pipe_o, pipe_r = (
    TextClassificationPipeline(model=model_o, tokenizer=tokenizer),
    TextClassificationPipeline(model=model_r, tokenizer=tokenizer),
)

In [None]:
# Held-out test tweets for each candidate; the original `tweets` / `class`
# column names are kept as-is here.
test_o, test_r = (
    pd.read_csv('data/test_obama_cleaned.csv'),
    pd.read_csv('data/test_romney_cleaned.csv'),
)

In [None]:
# Classify each test tweet individually and keep the label of the first result.
pred_o = [pipe_o(tweet)[0]['label'] for tweet in test_o['tweets']]

pred_r = [pipe_r(tweet)[0]['label'] for tweet in test_r['tweets']]

In [None]:
# Convert the numeric test classes (-1 / 0 / 1) into label strings so they can be
# compared with the pipeline predictions.
# NOTE(review): presumably 'NEG' / 'NEU' / 'POS' are the label names the pipelines
# emit — confirm against the saved models' id2label config.
target_o, target_r = (
    frame['class'].map({-1: 'NEG', 0: 'NEU', 1:'POS'})
    for frame in (test_o, test_r)
)

In [None]:
# Obama test-set metrics. `average=None` requests one score per class rather than
# a single aggregate; precision uses NaN where it is undefined (zero_division).
acc_o = accuracy_score(y_true=target_o, y_pred=pred_o)
prec_o = precision_score(
    y_true=target_o,
    y_pred=pred_o,
    average=None,
    zero_division=np.nan,
)
rec_o = recall_score(y_true=target_o, y_pred=pred_o, average=None)
f1_o = f1_score(y_true=target_o, y_pred=pred_o, average=None)

print("Accuracy:", acc_o)
print("Precision:", prec_o)
print("Recall:", rec_o)
print("F1:", f1_o)

In [None]:
# Romney test-set metrics. `average=None` requests one score per class rather than
# a single aggregate; precision uses NaN where it is undefined (zero_division).
acc_r = accuracy_score(y_true=target_r, y_pred=pred_r)
prec_r = precision_score(
    y_true=target_r,
    y_pred=pred_r,
    average=None,
    zero_division=np.nan,
)
rec_r = recall_score(y_true=target_r, y_pred=pred_r, average=None)
f1_r = f1_score(y_true=target_r, y_pred=pred_r, average=None)

print("Accuracy:", acc_r)
print("Precision:", prec_r)
print("Recall:", rec_r)
print("F1:", f1_r)