In [1]:
import pandas as pd
import numpy as np
from pysentimiento import create_analyzer
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# Load the cleaned Obama and Romney tweet sets and stack them into one frame
# with a fresh 0..n-1 index.
dfo, dfr = (
    pd.read_csv(csv_path)
    for csv_path in ('data/obama_cleaned.csv', 'data/romney_cleaned.csv')
)
df = pd.concat((dfo, dfr), ignore_index=True)
df.info()

# Lift pandas' row cap so later displays are never truncated.
pd.set_option('display.max_rows', None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  11271 non-null  object
 1   class   11271 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.2+ KB


In [3]:
# Store the tweet text with pandas' dedicated string dtype instead of object.
df = df.assign(tweets=df['tweets'].astype('string'))

In [4]:
# Re-print the frame summary to check the dtype now reported for `tweets`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  11271 non-null  string
 1   class   11271 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 176.2 KB


# Pre-trained model: BERTweet

In [5]:

# Build the pysentimiento analyzer for the sentiment task on English text.
# NOTE(review): the section heading says this is BERTweet-based; the model
# itself is chosen inside create_analyzer and is not visible here.
analyzer = create_analyzer(task="sentiment", lang="en")


In [6]:
def get_prob_dictionary(p):
    """Parse the brace-delimited ``label: probability`` list found in ``p``.

    The text between the first ``{`` and the last ``}`` is split on commas and
    each entry is split on its first colon, e.g.
    ``"...{POS: 0.9, NEG: 0.05, NEU: 0.05}..."`` gives
    ``{'POS': 0.9, 'NEG': 0.05, 'NEU': 0.05}``. Whitespace around labels and
    values is ignored.

    Parameters
    ----------
    p : str
        Text containing one ``{...}`` section of ``label: number`` entries.

    Returns
    -------
    dict
        Maps each label (str) to its probability (float). Empty when the
        braces enclose nothing.

    Raises
    ------
    ValueError
        If ``p`` has no ``{...}`` section, or an entry is not of the form
        ``label: number``.
    """
    start_index = p.find("{")
    end_index = p.rfind("}")
    # find()/rfind() return -1 when a brace is missing; slicing with those
    # values would silently drop the last character of the text and yield a
    # wrong number, so reject such input explicitly.
    if start_index == -1 or end_index < start_index:
        raise ValueError(f"no '{{...}}' probability section in {p!r}")

    body = p[start_index + 1:end_index].strip()
    if not body:
        return {}

    probs = {}
    for entry in body.split(','):
        label, colon, value = entry.partition(':')
        if not colon:
            raise ValueError(f"malformed probability entry {entry!r} in {p!r}")
        # float() tolerates surrounding whitespace and raises ValueError on
        # anything that is not a number.
        probs[label.strip()] = float(value)
    return probs


In [7]:

# Score every tweet once and keep the analyzer's own probability mapping.
# Reading `.probas` from the prediction object avoids converting it to text
# and parsing that text back: the printed form is meant for display and
# rounds the probabilities (NOTE(review): to three decimals in the
# pysentimiento releases I know of -- confirm for the installed version).
tweet_probas = [analyzer.predict(tweet).probas for tweet in df['tweets']]

# Keep the three plain lists under their original names, one value per tweet.
pos = [float(probas['POS']) for probas in tweet_probas]
neg = [float(probas['NEG']) for probas in tweet_probas]
neu = [float(probas['NEU']) for probas in tweet_probas]

df['pos'] = pos
df['neg'] = neg
df['neu'] = neu

In [8]:
# Print the frame summary again to confirm the pos/neg/neu columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11271 entries, 0 to 11270
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tweets  11271 non-null  string 
 1   class   11271 non-null  int64  
 2   pos     11271 non-null  float64
 3   neg     11271 non-null  float64
 4   neu     11271 non-null  float64
dtypes: float64(3), int64(1), string(1)
memory usage: 440.4 KB


# Predict label using maximum probability


In [9]:
# Label each tweet with the sentiment of largest probability:
# 1 = positive, -1 = negative, 0 = neutral.
# Ties are resolved pos > neg > neu, which is what the condition order below
# encodes (pos wins any tie, neg wins a tie with neu).
if df[['pos', 'neg', 'neu']].isna().any().any():
    # Comparisons against NaN are all False, so such rows would match no rule;
    # fail with a clear message rather than an unrelated length mismatch.
    raise ValueError("pos/neg/neu contain missing values; cannot pick a label")

pos_wins = (df['pos'] >= df['neu']) & (df['pos'] >= df['neg'])
neg_wins = (df['neg'] >= df['neu']) & (df['neg'] > df['pos'])

# np.select takes the first condition that holds; every remaining row has neu
# strictly greater than both other scores, hence the neutral default.
preds = np.select([pos_wins, neg_wins], [1, -1], default=0).tolist()
df['pred'] = preds

In [10]:
# Compare the max-probability labels against the annotated classes.
y_true, y_pred = df['class'], df['pred']

acc = accuracy_score(y_true, y_pred)
# average=None keeps one score per class instead of a single aggregate.
prec = precision_score(y_true, y_pred, average=None, zero_division=np.nan)
rec = recall_score(y_true, y_pred, average=None)
f1 = f1_score(y_true, y_pred, average=None)

for heading, score in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1:", f1),
):
    print(heading, score)

Accuracy: 0.6247005589566144
Precision: [0.68694362 0.51322085 0.68081761]
Recall: [0.76213992 0.55728739 0.47167756]
F1: [0.72259071 0.53434714 0.55727156]
