In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, set_seed

In [2]:
PATH = "./entire_model.pt"
BERT_MODEL = 'bert-base-uncased' # BERT model name for Transformer
MAX_LENGTH = 90
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Check for GPU
BATCH_SIZE = 128

In [3]:
model = torch.load(PATH)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [4]:
df = pd.read_csv('./data/analysis/Right_clean.csv')

In [5]:
df['tweet_text']

0       trump first ran president hard many imagine ro...
1       trishregan left realize secret service always ...
2                                dbongino good luck buddy
3       jaltucher feel like need restraining order guy...
4       msm might want look actual stats cdc next time...
                              ...                        
3671    breaking biden begins staffing commission aime...
3672              cloris leachman legendary actress dead 
3673    biden says impose equity see susan rice charge...
3674    bidens shaken teachers union tell teachers tea...
3675    clean energy solar wind make  energy portfolio...
Name: tweet_text, Length: 3676, dtype: object

In [6]:
def get_output(texts):
    tokens = tokenizer(
            texts,
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)
    # labels = tokens['labels'].to(device)

    input_ids, attention_mask
    output = model(input_ids, attention_mask=attention_mask)
    yhat = output.logits

    yhat = F.softmax(yhat, dim=1)
    yhat = torch.argmax(yhat, dim=1)

    return ["Hate" if x else "Not Hate" for x in yhat.tolist()]

In [20]:
FILE_PATH = './data/analysis/'

for file in ['Right', "Left", "Fake"]:
    filename = FILE_PATH + file + '_clean.csv'
    df = pd.read_csv(filename)
    tweets = list(df['tweet_text'])
    print(len(tweets))
    res = []
    i = 0
    while i < len(tweets):
        res += get_output(tweets[i : i+BATCH_SIZE])
        i += BATCH_SIZE
    
    df["label"] = res
    df.to_csv(FILE_PATH + file + '_labeled.csv')
    
    df.label.value_

3676
3788
8385


In [15]:
list(df['tweet_text'])

['trump first ran president hard many imagine role dilbert creator',
 'trishregan left realize secret service always president even hospital room even ho',
 'dbongino good luck buddy',
 'jaltucher feel like need restraining order guy wish would address actual problems gotten',
 'msm might want look actual stats cdc next time insist realdonaldtrump americ',
 'mccormickprof idea social tolerance experimentone would also make good reality tv would film volunteer wearin',
 'rickygervais remembered life insignificant blip eternal void nothingness',
 'people think answer unelected get away longer',
 'thebabylonbee monster releases exciting new energy drink infused trump antibodies',
 'thank mzhemingway need names details seem important missing',
 'peoples champion',
 'medias mad trump endangering secret service driving around block oh come onthat',
 'kaysmythe please read listen share interview reflections tomilahren lavagrants larryoconnor drdrew foxnation r',
 'great paint expect law enfor

In [16]:
df = pd.read_csv('./data/analysis/Right_labeled.csv')

In [17]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,id,tweet_text,label
0,0,0,2020-10-05,1312907856165826565,trump first ran president hard many imagine ro...,Not Hate
1,1,1,2020-10-05,1312967247112134656,trishregan left realize secret service always ...,Hate
2,2,2,2020-10-05,1313113401120436226,dbongino good luck buddy,Not Hate
3,3,4,2020-10-05,1313167228897030144,jaltucher feel like need restraining order guy...,Not Hate
4,4,5,2020-10-05,1313175357864935425,msm might want look actual stats cdc next time...,Not Hate
