In [1]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 72.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# specify GPU
device = torch.device("cuda")

# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# tweets regarding the politicans
mehmet_oz_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/mehmet_oz.csv")
john_fetterman_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/john_fetterman.csv")
adam_laxalt_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/adam_laxalt.csv")
catherine_cortez_masto_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/catherine_cortez_masto.csv")
ron_johnson_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/ron_johnson.csv")
mandela_barnes_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/mandela_barnes.csv")
donald_bolduc_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/donald_bolduc.csv")
maggie_hassan_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/maggie_hassan.csv")
ted_budd_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/ted_budd.csv")
cheri_beasly_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/cheri_beasly.csv")
joe_pinion_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/joe_pinion.csv")
charles_schumer_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/charles_schumer.csv")
jd_vance_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/jd_vance.csv")
tim_ryan_df = pd.read_csv("/content/drive/MyDrive/Twitter/twitter_api_data/original/tim_ryan.csv")

In [8]:
# Load Model
path = '/content/drive/MyDrive/Twitter/models/saved_weights.pt'

In [9]:
# sample data
text = ["Hi! This is a testing sample to see if this works. I hope it does!"]

# encode text
sent_id = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False)

In [10]:
# define model class
class BERT_Arch(nn.Module):
        def __init__(self, bert):
            super(BERT_Arch, self).__init__()
            self.bert = bert
            # dropout layer
            self.dropout = nn.Dropout(0.1)
            # relu activation function
            self.relu =  nn.ReLU()
            # dense layer 1
            self.fc1 = nn.Linear(768,512)
            # dense layer 2 (Output layer)
            self.fc2 = nn.Linear(512,2)
            #softmax activation function
            self.softmax = nn.LogSoftmax(dim=1)

        #define the forward pass
        def forward(self, sent_id, mask):
            #pass the inputs to the model
            _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
            x = self.fc1(cls_hs)
            x = self.relu(x)
            x = self.dropout(x)
            # output layer
            x = self.fc2(x)
            # apply softmax activation
            x = self.softmax(x)
            return x

In [11]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(device)

model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [12]:
texts = ["I love my life so much", "I hate life"]

In [13]:
# use model on sample tweet
model.eval()
for text in texts:
  with torch.no_grad():
      # convert text to ids
      sent_id = tokenizer.batch_encode_plus([text], padding=True, return_token_type_ids=False)
      
      # convert to tensor
      input_ids = torch.tensor(sent_id['input_ids']).to(device)
      attention_mask = torch.tensor(sent_id['attention_mask']).to(device)
      
      # get model predictions for the given text
      output = model(input_ids, attention_mask)
      print(output)
      print("Sentiment: ", np.argmax(output.cpu().detach().numpy(), axis=1))


tensor([[-0.8349, -0.5690]], device='cuda:0')
Sentiment:  [1]
tensor([[-0.0370, -3.3146]], device='cuda:0')
Sentiment:  [0]


In [14]:
# run model on all csvs
def run_model(df):
    # create new column for sentiment
    df['sentiment'] = 0
    
    # use model on
    model.eval()
    for index, row in df.iterrows():
        with torch.no_grad():
            # convert text to ids
            sent_id = tokenizer.batch_encode_plus([row['Tweet']], padding=True, return_token_type_ids=False)
            
            # convert to tensor
            input_ids = torch.tensor(sent_id['input_ids']).to(device)
            attention_mask = torch.tensor(sent_id['attention_mask']).to(device)
            
            # get model predictions for the given text
            output = model(input_ids, attention_mask)
            df.at[index, 'sentiment'] = np.argmax(output.cpu().detach().numpy(), axis=1)
            

In [15]:
# run model on all csvs
run_model(mehmet_oz_df)
run_model(john_fetterman_df)
run_model(adam_laxalt_df)
run_model(catherine_cortez_masto_df)
run_model(ron_johnson_df)
run_model(mandela_barnes_df)
run_model(donald_bolduc_df)
run_model(maggie_hassan_df)
run_model(ted_budd_df)
run_model(cheri_beasly_df)
run_model(joe_pinion_df)
run_model(charles_schumer_df)
run_model(jd_vance_df)
run_model(tim_ryan_df)

In [16]:
# get number of positive and negatives tweets from each df
def get_sentiment(df):
    pos = 0
    neg = 0
    for index, row in df.iterrows():
        if row['sentiment'] == 0:
            neg += 1
        else:
            pos += 1
    return pos, neg

In [17]:
# get number of positive and negatives tweets from each df
pos, neg = get_sentiment(mehmet_oz_df)
print("Mehmet Oz: ", pos, neg)
pos, neg = get_sentiment(john_fetterman_df)
print("John Fetterman: ", pos, neg)
pos, neg = get_sentiment(adam_laxalt_df)
print("Adam Laxalt: ", pos, neg)
pos, neg = get_sentiment(catherine_cortez_masto_df)
print("Catherine Cortez Masto: ", pos, neg)
pos, neg = get_sentiment(ron_johnson_df)
print("Ron Johnson: ", pos, neg)
pos, neg = get_sentiment(mandela_barnes_df)
print("Mandela Barnes: ", pos, neg)
pos, neg = get_sentiment(donald_bolduc_df)
print("Donald Bolduc: ", pos, neg)
pos, neg = get_sentiment(maggie_hassan_df)
print("Maggie Hassan: ", pos, neg)
pos, neg = get_sentiment(ted_budd_df)
print("Ted Budd: ", pos, neg)
pos, neg = get_sentiment(cheri_beasly_df)
print("Cheri Beasley: ", pos, neg)
pos, neg = get_sentiment(joe_pinion_df)
print("Joe Pinion: ", pos, neg)
pos, neg = get_sentiment(charles_schumer_df)
print("Charles Schumer: ", pos, neg)
pos, neg = get_sentiment(jd_vance_df)
print("JD Vance: ", pos, neg)
pos, neg = get_sentiment(tim_ryan_df)
print("Tim Ryan: ", pos, neg)

Mehmet Oz:  130 180
John Fetterman:  260 240
Adam Laxalt:  105 115
Catherine Cortez Masto:  60 70
Ron Johnson:  480 20
Mandela Barnes:  220 220
Donald Bolduc:  5 20
Maggie Hassan:  15 115
Ted Budd:  305 195
Cheri Beasley:  135 30
Joe Pinion:  30 20
Charles Schumer:  130 45
JD Vance:  300 200
Tim Ryan:  300 200


In [18]:
# get pos/neg ,pos/all, neg/all ratio for all dfs
def get_ratios(df):
    pos, neg = get_sentiment(df)
    pos_all = pos / (pos + neg)
    neg_all = neg / (pos + neg)
    pos_neg = pos / neg
    # round all ratios to 2 decimal places
    pos_all = round(pos_all, 2)
    neg_all = round(neg_all, 2)
    pos_neg = round(pos_neg, 2)
    return pos_all, neg_all, pos_neg

In [19]:
pos_all, neg_all, pos_neg = get_ratios(mehmet_oz_df)
print("Mehmet Oz: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(john_fetterman_df)
print("John Fetterman: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(adam_laxalt_df)
print("Adam Laxalt: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(catherine_cortez_masto_df)
print("Catherine Cortez Masto: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(ron_johnson_df)
print("Ron Johnson: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(mandela_barnes_df)
print("Mandela Barnes: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(donald_bolduc_df)
print("Donald Bolduc: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(maggie_hassan_df)
print("Maggie Hassan: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(ted_budd_df)
print("Ted Budd: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(cheri_beasly_df)
print("Cheri Beasley: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(joe_pinion_df)
print("Joe Pinion: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(charles_schumer_df)
print("Charles Schumer: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(jd_vance_df)
print("JD Vance: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = get_ratios(tim_ryan_df)
print("Tim Ryan: ", pos_all, neg_all, pos_neg)

Mehmet Oz:  0.42 0.58 0.72
John Fetterman:  0.52 0.48 1.08
Adam Laxalt:  0.48 0.52 0.91
Catherine Cortez Masto:  0.46 0.54 0.86
Ron Johnson:  0.96 0.04 24.0
Mandela Barnes:  0.5 0.5 1.0
Donald Bolduc:  0.2 0.8 0.25
Maggie Hassan:  0.12 0.88 0.13
Ted Budd:  0.61 0.39 1.56
Cheri Beasley:  0.82 0.18 4.5
Joe Pinion:  0.6 0.4 1.5
Charles Schumer:  0.74 0.26 2.89
JD Vance:  0.6 0.4 1.5
Tim Ryan:  0.6 0.4 1.5


In [20]:
# winners vs losers
# winners: john fetterman, catherine cortez masto, ron johnson, maggie hassan, ted budd, charles schumer, jd vance
# losers: mehmet oz, adam laxalt, mandela barnes, donald bolduc, cheri beasley, joe pinion, tim ryan
winners = [john_fetterman_df, catherine_cortez_masto_df, ron_johnson_df, maggie_hassan_df, ted_budd_df, charles_schumer_df, jd_vance_df]
losers = [mehmet_oz_df, adam_laxalt_df, mandela_barnes_df, donald_bolduc_df, cheri_beasly_df, joe_pinion_df, tim_ryan_df]

In [21]:
# sum number of positive and negatives tweets from list of df
def sum_sentiment(dfs):
    pos = 0
    neg = 0
    for df in dfs:
        pos_df, neg_df = get_sentiment(df)
        pos += pos_df
        neg += neg_df
    return pos, neg

In [22]:
pos, neg = sum_sentiment(winners)
print("Winners: ", pos, neg)
pos, neg = sum_sentiment(losers)
print("Losers: ", pos, neg)

Winners:  1550 885
Losers:  925 785


In [23]:
# average the ratios of winners and losers
def avg_ratios(dfs):
    pos_all = 0
    neg_all = 0
    pos_neg = 0
    for df in dfs:
        pos_all_df, neg_all_df, pos_neg_df = get_ratios(df)
        pos_all += pos_all_df
        neg_all += neg_all_df
        pos_neg += pos_neg_df
    pos_all = pos_all / len(dfs)
    neg_all = neg_all / len(dfs)
    pos_neg = pos_neg / len(dfs)
    return pos_all, neg_all, pos_neg

In [24]:
pos_all, neg_all, pos_neg = avg_ratios(winners)
print("Winners: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = avg_ratios(losers)
print("Losers: ", pos_all, neg_all, pos_neg)

Winners:  0.5728571428571428 0.4271428571428571 4.574285714285714
Losers:  0.5171428571428571 0.48285714285714293 1.4828571428571427


In [25]:
# democrats vs republicans
# democrats: mehmet oz, adam laxalt, ron johnson, donald bolduc, ted budd, joe pinion, jd vance
# republicans: john fetterman, catherine cortez masto, mandela barnes, maggie hassan, cheri beasley, charles schumer, tim ryan
dems = [mehmet_oz_df, adam_laxalt_df, ron_johnson_df, donald_bolduc_df, ted_budd_df, joe_pinion_df, jd_vance_df]
reps = [john_fetterman_df, catherine_cortez_masto_df, mandela_barnes_df, maggie_hassan_df, cheri_beasly_df, charles_schumer_df, tim_ryan_df]

In [26]:
pos, neg = sum_sentiment(dems)
print("Democrats: ", pos, neg)
pos, neg = sum_sentiment(reps)
print("Republicans: ", pos, neg)

Democrats:  1355 750
Republicans:  1120 920


In [27]:
pos_all, neg_all, pos_neg = avg_ratios(dems)
print("Democrats: ", pos_all, neg_all, pos_neg)
pos_all, neg_all, pos_neg = avg_ratios(reps)
print("Republicans: ", pos_all, neg_all, pos_neg)

Democrats:  0.5528571428571428 0.4471428571428571 4.348571428571428
Republicans:  0.5371428571428571 0.4628571428571428 1.7085714285714286


In [29]:
# convert all dfs to csv
john_fetterman_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/john_fetterman_annotated.csv")
mehmet_oz_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/mehmet_oz_annotated.csv")
adam_laxalt_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/adam_laxalt_annotated.csv")
catherine_cortez_masto_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/catherine_cortez_masto_annotated.csv")
ron_johnson_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/ron_johnson_annotated.csv")
mandela_barnes_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/mandela_barnes_annotated.csv")
donald_bolduc_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/donald_bolduc_annotated.csv")
maggie_hassan_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/maggie_hassan_annotated.csv")
ted_budd_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/ted_budd_annotated.csv")
cheri_beasly_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/cheri_beasly_annotated.csv")
joe_pinion_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/joe_pinion_annotated.csv")
charles_schumer_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/charles_schumer_annotated.csv")
jd_vance_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/jd_vance_annotated.csv")
tim_ryan_df.to_csv("/content/drive/MyDrive/Twitter/twitter_api_data/bert_annotated/tim_ryan_annotated.csv")