In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import TextClassificationPipeline

In [23]:
# Both candidate files are read the same way: the raw `tweets` / `class`
# columns are renamed to the `text` / `label` names used by the rest of the
# notebook.
column_renames = {'tweets' : 'text', 'class' : 'label'}
dfo, dfr = (
    pd.read_csv(csv_path).rename(columns=column_renames)
    for csv_path in ('data/obama_cleaned.csv', 'data/romney_cleaned.csv')
)

# Stack the two frames into a single corpus with a fresh 0..n-1 index.
df = pd.concat([dfo, dfr], ignore_index=True)
df.info()

# Remove pandas' cap on the number of rows shown when a frame is displayed.
pd.set_option('display.max_rows', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  object
 1   label   11271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.2+ KB


In [24]:
# Store the tweets with pandas' dedicated `string` dtype instead of `object`.
df = df.assign(text=df['text'].astype('string'))

In [25]:
# Confirm that the `text` column now reports the `string` dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11271 non-null  string
 1   label   11271 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 176.2 KB


# Pre-trained model: BERTweet

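Fine-tune the pre-trained BERTweet sentiment model on the combined Obama and Romney tweets, using a train/validation/test split: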

In [26]:
# Shift the sentiment codes -1 / 0 / 1 onto the contiguous class ids 0 / 1 / 2
# needed by a 3-class classification head.
X0, y0 = df['text'], df['label'].map({1 : 2, 0 : 1, -1 : 0})

# Hold out 20% of the rows for testing, then a quarter of the remainder for
# validation; the fixed random_state makes both splits repeatable.
X, X_test, y, y_test = train_test_split(X0, y0, test_size=0.2, random_state=27)
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=27)

# Put each split's text and label back side by side in a two-column frame.
traindf, evaldf, testdf = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((X_train, y_train), (X_eval, y_eval), (X_test, y_test))
)



In [27]:
# Convert each split to a Hugging Face `Dataset`.
# The frames still carry the shuffled row index left behind by
# `train_test_split`; without `preserve_index=False` that index would be stored
# as an extra `__index_level_0__` column that is neither a feature nor a label.
train = Dataset.from_pandas(traindf, split='train', preserve_index=False)
val = Dataset.from_pandas(evaldf, split='eval', preserve_index=False)
test = Dataset.from_pandas(testdf, split='test', preserve_index=False)


In [28]:
# Load the tokenizer published with the BERTweet sentiment checkpoint that is
# fine-tuned below, so text is encoded with the matching vocabulary.
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Parameters
    ----------
    examples : mapping
        Batch whose ``"text"`` entry holds the raw tweets.

    Returns
    -------
    The output of the notebook-level ``tokenizer``, called with
    ``padding="max_length"`` and ``truncation=True`` so that every encoded
    example has the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)
    

# Encode the three splits in batched mode, in train / eval / test order.
tokenized_train, tokenized_eval, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train, val, test)
)


Map:   0%|          | 0/6762 [00:00<?, ? examples/s]

Map:   0%|          | 0/2254 [00:00<?, ? examples/s]

Map:   0%|          | 0/2255 [00:00<?, ? examples/s]

In [29]:
# Start from the published BERTweet sentiment checkpoint with a 3-class head.
model = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    num_labels=3,
)

# Accuracy scorer that `compute_metrics` reports at each evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Parameters
    ----------
    eval_pred : pair
        Unpacks into ``(logits, labels)``; the predicted class of each example
        is the argmax of its logits over the last axis.

    Returns
    -------
    Whatever ``metric.compute`` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Three epochs, evaluating on the validation split at the end of each one;
# trainer output is written below `checkpoints/test_trainer_unique`.
training_args = TrainingArguments(
    output_dir="checkpoints/test_trainer_unique",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

trainer = Trainer(model=model, args=training_args,
                  train_dataset=tokenized_train, eval_dataset=tokenized_eval,
                  compute_metrics=compute_metrics)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8588,0.775103,0.660603
2,0.5972,0.893185,0.696983
3,0.3768,1.039635,0.691216


TrainOutput(global_step=2538, training_loss=0.5951146617285185, metrics={'train_runtime': 1718.2219, 'train_samples_per_second': 11.806, 'train_steps_per_second': 1.477, 'total_flos': 1334379698007552.0, 'train_loss': 0.5951146617285185, 'epoch': 3.0})

In [30]:
# Save the trained model under `models/model_unique` so it can be reloaded
# later without repeating the training run.
trainer.save_model('models/model_unique')

# Continue fine-tuning from a saved checkpoint

In [None]:
# Restore the model weights from the step-1000 checkpoint of the earlier run.
model = AutoModelForSequenceClassification.from_pretrained(
    "checkpoints/test_trainer_unique/checkpoint-1000",
    num_labels=3,
)

# Accuracy scorer that `compute_metrics` reports at each evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Parameters
    ----------
    eval_pred : pair
        Unpacks into ``(logits, labels)``; the predicted class of each example
        is the argmax of its logits over the last axis.

    Returns
    -------
    Whatever ``metric.compute`` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Five epochs, evaluating on the validation split at the end of each one.
# NOTE(review): this run writes to `test_trainer_unique`, not the
# `checkpoints/test_trainer_unique` directory the checkpoint was read from;
# confirm that the different location is intended.
training_args = TrainingArguments(
    output_dir="test_trainer_unique",
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

trainer = Trainer(model=model, args=training_args,
                  train_dataset=tokenized_train, eval_dataset=tokenized_eval,
                  compute_metrics=compute_metrics)

# NOTE(review): `train()` is called without `resume_from_checkpoint`, so
# presumably only the model weights carry over and the optimizer/schedule
# start fresh; confirm this warm start is what is wanted.
trainer.train()

In [11]:
# NOTE(review): this writes to `models/model_unique`, the same directory used
# by the save after the first training run, so running this cell replaces that
# model; confirm this is intended.
trainer.save_model('models/model_unique')

# Load the fine-tuned model

In [31]:
# Reload the model that was saved to `models/model_unique`, replacing the
# in-memory `model` used during training.
model = AutoModelForSequenceClassification.from_pretrained('models/model_unique')

In [32]:
# Bundle the reloaded model with the tokenizer loaded earlier in the notebook
# so that raw strings can be classified directly.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [33]:
# Pipeline label name -> score column used by the rest of the notebook.
# The mapping is explicit on purpose: an unexpected label raises a KeyError
# here instead of being silently counted as neutral.
label_to_column = {'POS': 'pos', 'NEG': 'neg', 'NEU': 'neu'}

# One record per test tweet keeps the three scores of a tweet together, so the
# columns cannot drift out of step with one another.
score_rows = []
for t in testdf['text']:
    # top_k=None asks for the score of every class rather than only the best
    # one; truncation=True matches the truncation applied when tokenizing the
    # training data.
    prediction = pipe(t, top_k=None, truncation=True)
    score_rows.append({label_to_column[l['label']]: l['score'] for l in prediction})

# Rows are in the same order as `testdf`, on a fresh 0..n-1 index.
pred = pd.DataFrame(score_rows, columns=['pos', 'neg', 'neu'])

# Keep the per-class score lists available under their original names.
pos, neg, neu = (pred[column].tolist() for column in ('pos', 'neg', 'neu'))

In [34]:
# Attach the ground truth, converting class ids 0 / 1 / 2 back to the original
# -1 / 0 / 1 codes. A plain list is assigned so values are matched to `pred`
# by position rather than by the test frame's shuffled index.
pred['class'] = testdf['label'].map({0 : -1, 1 : 0, 2 : 1}).tolist()

# Predict label using maximum probability


In [35]:
def pred_label(df):
    """Label each row with its highest-scoring class and print test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the score columns ``pos``, ``neg`` and ``neu`` and the reference
        column ``class``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: a ``pred`` column is added
        holding 1 (positive), -1 (negative) or 0 (neutral).

    Raises
    ------
    ValueError
        If any score is NaN, since no class can be chosen for such a row.

    Notes
    -----
    Ties are resolved in the order positive, negative, neutral.
    Accuracy and the per-class precision, recall and F1 (``average=None``)
    are printed as a side effect.
    """
    # Column order matters: argmax returns the first maximum, which gives the
    # tie-breaking order pos > neg > neu.
    score_columns = ['pos', 'neg', 'neu']
    class_codes = np.array([1, -1, 0], dtype=np.int64)

    scores = df[score_columns].to_numpy(dtype=float)
    if np.isnan(scores).any():
        # Fail up front with a clear message instead of producing a label
        # vector that is shorter than the frame.
        raise ValueError("pred_label: score columns 'pos', 'neg' and 'neu' must not contain NaN")

    df['pred'] = class_codes[scores.argmax(axis=1)]

    acc = accuracy_score(df['class'], df['pred'])
    prec = precision_score(df['class'], df['pred'], average = None, zero_division = np.nan)
    rec = recall_score(df['class'], df['pred'], average = None)
    f1 = f1_score(df['class'], df['pred'], average = None)
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    return df

In [36]:
# Label the test-set scores and print the metrics. `pred_label` adds the
# `pred` column to `pred` itself and returns that same frame.
predicted_labels = pred_label(pred)


Accuracy: 0.7024390243902439
Precision: [0.75       0.62426036 0.70777989]
Recall: [0.78664008 0.58367911 0.70510397]
F1: [0.76788321 0.60328806 0.70643939]
