In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

import sys, os
from tqdm.auto import tqdm

In [2]:
# Switch every visible GPU to on-demand memory growth.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification

# The classifier and its tokenizer are both built from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "bert-base-uncased"

model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Replace the model's current weights with the ones stored in the local HDF5 file.
# NOTE(review): the model is created without an explicit label count — confirm the
# classifier head saved in this file has the same shape.
FINETUNED_WEIGHTS_PATH = 'bert_emo_finetuned_weights_only.h5'
model.load_weights(FINETUNED_WEIGHTS_PATH)

In [5]:
# Directory (relative to the working directory) that the CSV files are read from and written to.
DATA_DIR = '../data'

In [6]:
# Load the cleaned tweet tables, one CSV per candidate.
biden_csv = os.path.join(DATA_DIR, 'cleaned_1_to_1_biden.csv')
trump_csv = os.path.join(DATA_DIR, 'cleaned_1_to_1_trump.csv')

biden_df = pd.read_csv(biden_csv)
trump_df = pd.read_csv(trump_csv)

In [7]:
# Tag every row with the candidate whose file it was loaded from.
biden_df['who'] = 'biden'
trump_df['who'] = 'trump'

In [8]:
# Empty frame carrying biden_df's column layout (including the 'who' column added above).
balance_data_df = pd.DataFrame(columns=biden_df.columns)

In [9]:
# Display the frame to check its columns.
balance_data_df

Unnamed: 0,tweet,long,lat,state_code,sentiment,who


In [10]:
# Every state code seen for either candidate, minus a fixed set of excluded codes.
excluded_codes = {'MP', 'GU', 'PR', 'VI', 'AS', 'UM'}
codes_seen = set(biden_df['state_code']) | set(trump_df['state_code'])
states_list = codes_seen - excluded_codes

In [11]:
# Balance the two candidates inside every state by down-sampling the larger side,
# so each state contributes the same number of tweets for Trump and for Biden.
SAMPLE_SEED = 42  # fixed seed: a fresh kernel run draws the same tweets every time

balanced_parts = []
for state in states_list:
    trump_df_state = trump_df[trump_df['state_code'] == state]
    biden_df_state = biden_df[biden_df['state_code'] == state]

    # Both candidates end up with as many tweets as the smaller side has.
    n_sample = min(len(trump_df_state), len(biden_df_state))

    # The smaller side is kept whole (and listed first); n_sample rows are drawn
    # from the other one. On a tie Biden's rows are the ones kept whole.
    if len(trump_df_state) < len(biden_df_state):
        balanced_parts.append(trump_df_state)
        balanced_parts.append(biden_df_state.sample(n_sample, random_state=SAMPLE_SEED))
    else:
        balanced_parts.append(biden_df_state)
        balanced_parts.append(trump_df_state.sample(n_sample, random_state=SAMPLE_SEED))

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration. Rebuilding from scratch also means re-running this cell no longer
# appends a second copy of every row.
if balanced_parts:
    balance_data_df = pd.concat(balanced_parts)
else:
    balance_data_df = pd.DataFrame(columns=biden_df.columns)

In [12]:
# Inspect the balanced frame built by the per-state loop.
balance_data_df

Unnamed: 0,tweet,long,lat,state_code,sentiment,who
1234,I will never give up fighting for you and our ...,-105.573285,36.407238,NM,0,trump
1235,"From the ""you couldn't make this shit up if yo...",-105.573285,36.407238,NM,0,trump
1237,"Today, the US set a single day record of 88,45...",-105.573285,36.407238,NM,0,trump
1239,"""I was putting too much trust in Americans . ....",-105.573285,36.407238,NM,0,trump
1240,Trump says nobody gets hacked but forgot his h...,-105.573285,36.407238,NM,0,trump
...,...,...,...,...,...,...
2109,I am a moderate democrat yet I am a trillion t...,-100.540737,47.620146,ND,0,biden
44097,Such a powerful song. Heres to the land that m...,-96.900362,46.874967,ND,0,biden
39844,It almost would be fitting that the king of la...,-100.540737,47.620146,ND,0,biden
19393,"EVERYONE GET OUT AND VOTE TOMORROW, PRESIDENTI...",-100.540737,47.620146,ND,0,biden


In [13]:
# Number of tweets tokenized and passed through the model at a time by emotion_classify.
batch_size = 100

In [14]:
def emotion_classify(tweet_list):
    """Predict a class label for every tweet with the module-level BERT model.

    The tweets are tokenized and run through ``model`` in consecutive batches
    of ``batch_size`` (module-level) items. Each tweet's label is the index of
    the largest value in the first model output (the classification logits).

    Args:
        tweet_list: List of tweet strings. It is sliced per batch and passed
            straight to the module-level ``tokenizer``.

    Returns:
        1-D ``float64`` NumPy array holding one label per tweet, in input
        order. It is empty when ``tweet_list`` is empty.
    """
    label_chunks = []
    # tqdm sees a range, so the progress bar still counts batches.
    for start in tqdm(range(0, len(tweet_list), batch_size)):
        tf_batch = tokenizer(
            tweet_list[start:start + batch_size],
            max_length=280,
            padding=True,
            truncation=True,
            return_tensors='tf',
        )
        logits = model(tf_batch)[0]
        # Softmax is monotonic, so the argmax of the raw logits picks the same
        # class; the extra normalisation pass is not needed.
        label_chunks.append(tf.argmax(logits, axis=1).numpy())

    if not label_chunks:
        return np.array([])
    # Join once at the end rather than re-copying the whole array per batch.
    # The float64 dtype matches what this function has always returned.
    return np.concatenate(label_chunks).astype(np.float64)

In [15]:
# Classify every balanced tweet and store the predicted label in 'sentiment'.
tweet_texts = balance_data_df['tweet'].tolist()
balance_data_df['sentiment'] = emotion_classify(tweet_texts)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=768.0), HTML(value='')))




In [26]:
# biden_df.to_csv(os.path.join(DATA_DIR, 'classified_1_to_1_biden.csv'), index=False, header=True)
# trump_df.to_csv(os.path.join(DATA_DIR, 'classified_1_to_1_trump.csv'), index=False, header=True)

In [27]:
# Number of state codes left after the exclusions.
len(states_list)

51

In [28]:
# Fixed party leaning per state code; every remaining code counts as a swing state.
dem_states = {
    'DC', 'VT', 'HI', 'MA', 'MD', 'CA', 'NY', 'RI', 'DE',
    'CT', 'WA', 'ME', 'NJ', 'OR', 'IL',
}
rep_states = {
    'OH', 'IA', 'TX', 'MT', 'SC', 'AK', 'MO', 'IN', 'NE',
    'KS', 'UT', 'MS', 'TN', 'SD', 'KY', 'LA', 'AL', 'ND',
    'ID', 'AR', 'OK', 'WV', 'WY',
}
swing_states = states_list.difference(dem_states, rep_states)

In [29]:
# Show which state codes ended up in neither the Democratic nor the Republican set.
swing_states

{'AZ', 'CO', 'FL', 'GA', 'MI', 'MN', 'NC', 'NH', 'NM', 'NV', 'PA', 'VA', 'WI'}

In [67]:
# Empty per-state results table; only the column layout is fixed here.
result_columns = ['state_code', 'biden', 'trump', 'type', 'winner']
states_results = pd.DataFrame(columns=result_columns)

In [68]:
# One row per state code; the other columns stay empty until the scoring loop fills them.
states_results['state_code'] = list(states_list)

In [69]:
# Score every state from the summed sentiment labels of each candidate's tweets
# and record the scores, the state's type and the called winner in states_results.
MIN_TWEETS_FOR_CALL = 1000  # with fewer balanced tweets the state's type decides the winner
WIN_RATIO = 1.1             # a score must exceed the other by this factor to win

type_vs_candidate = {'blue': 'biden', 'red': 'trump', 'purple': 'contentious'}


def _state_type(state):
    """Return 'blue', 'red' or 'purple' depending on which state set contains `state`."""
    if state in dem_states:
        return 'blue'
    if state in rep_states:
        return 'red'
    return 'purple'


def _call_winner(trump_score, biden_score, n_tweets, state_type):
    """Return 'trump', 'biden' or 'contentious' for a single state."""
    if n_tweets < MIN_TWEETS_FOR_CALL:
        # Too little data: fall back to the candidate implied by the state's type.
        return type_vs_candidate[state_type]
    if trump_score / biden_score > WIN_RATIO:
        return 'trump'
    if biden_score / trump_score > WIN_RATIO:
        return 'biden'
    return 'contentious'


# The candidate masks do not depend on the state, so compute them once.
is_trump = balance_data_df['who'] == 'trump'
is_biden = balance_data_df['who'] == 'biden'

for state in states_list:
    in_state = balance_data_df['state_code'] == state
    trump_slice = balance_data_df[in_state & is_trump]
    biden_slice = balance_data_df[in_state & is_biden]

    # The +1 offset keeps a zero sum from producing a division by zero below.
    trump_score = np.sum(trump_slice['sentiment']) + 1
    biden_score = np.sum(biden_slice['sentiment']) + 1

    state_type = _state_type(state)
    winner = _call_winner(trump_score, biden_score,
                          len(trump_slice) + len(biden_slice), state_type)

    states_results.loc[states_results['state_code'] == state, ['biden', 'trump', 'type', 'winner']] = \
        [biden_score, trump_score, state_type, winner]

In [74]:
# Write the per-state table to DATA_DIR, with a header row and without the index.
results_csv = os.path.join(DATA_DIR, 'state_results.csv')
states_results.to_csv(results_csv, index=False, header=True)

In [71]:
# Blue states whose recorded winner is Trump.
states_results[(states_results['type'] == 'blue') & (states_results['winner'] == 'trump')]

Unnamed: 0,state_code,biden,trump,type,winner


In [72]:
# Red states whose recorded winner is Biden.
states_results[(states_results['type'] == 'red') & (states_results['winner'] == 'biden')]

Unnamed: 0,state_code,biden,trump,type,winner


In [73]:
# States where neither candidate was called the winner.
states_results[states_results['winner'] == 'contentious']

Unnamed: 0,state_code,biden,trump,type,winner
0,NM,100,87,purple,contentious
11,NH,76,67,purple,contentious
15,NV,376,356,purple,contentious
20,AZ,516,474,purple,contentious
28,OH,618,566,red,contentious
29,TN,413,399,red,contentious
31,WI,277,247,purple,contentious
32,MN,292,252,purple,contentious
42,FL,2033,1924,purple,contentious
43,TX,2449,2281,red,contentious
