In [4]:
import json
import re, string
import matplotlib.pyplot as plt
from collections import defaultdict
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

from textblob import TextBlob

In [5]:
# Single tokenizer instance reused for every tweet.
tokenizer = TweetTokenizer()

# Begin with NLTK's English stop-word list, then take the negation and
# opposition words back out of it because they may contribute to sentiment.
stop_words = stopwords.words('english')
for sentiment_word in ('no', 'not', 'against', "couldn't", "aren't", "won't"):
    # list.remove raises ValueError if the word is absent from the list.
    stop_words.remove(sentiment_word)
            
def read_jsonl(filename):
    '''
    Iterates through a JSONL file, yielding one decoded JSON value per line.

    filename: path of a UTF-8 encoded file holding one JSON document per line.
    Blank (whitespace-only) lines are skipped rather than passed to the JSON
    parser, so a stray empty line does not abort the whole read.
    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    '''
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            # strip() removes the line terminator and surrounding whitespace.
            record = line.strip()
            if record:
                yield json.loads(record)
            

def remove_noise(tweet_tokens, stop_words = ()):
    '''
    Cleans the tweet tokens, removing links and special characters,
    tags part of speech of words, and lemmatizes

    tweet_tokens: sequence of token strings; it is handed to pos_tag as a whole.
    stop_words: collection of lower-case words to drop (default: drop none).
    Returns a list of the surviving tokens, lemmatized and lower-cased.
    '''
    cleaned_tokens = []
    # Code point ranges for emoji/pictographs, transport symbols,
    # miscellaneous symbols and dingbats.
    myre = re.compile(u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u26FF\u2700-\u27BF]+', 
    re.UNICODE)
    # Raw strings keep the pattern text identical while avoiding the invalid
    # '\(' / '\)' escape sequences of a normal string literal.
    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_re = re.compile(r"(@[A-Za-z0-9_]+)")
    # One lemmatizer for the whole call instead of a new one per token.
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tweet_tokens):
        # Strip links, @mentions, emoji and typographic quotes/ellipsis.
        token = url_re.sub('', token)
        token = mention_re.sub('', token)
        token = myre.sub('', token)
        token = token.replace('“','').replace('”','').replace('…','')

        # Map the POS tag onto the lemmatizer's part-of-speech codes:
        # nouns -> 'n', verbs -> 'v', everything else -> 'a'.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # Drop tokens emptied by the cleaning above, tokens that occur inside
        # string.punctuation (a substring test), and stop words.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [6]:
### Reading tweet data from json file
datareader = read_jsonl('election_filter_10-26_10-29.jsonl')

# Gather one dict per kept tweet and build the DataFrame once after the loop.
# Appending to the frame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.0+.
records = []
count = 0
for tweet in datareader:
    # Only tweets whose location is not the empty string are kept.
    if tweet['location'] != '':
        count += 1
        # Read in data and preprocess
        tokenized_tweet = tokenizer.tokenize(tweet['text'])
        cleaned_text = ' '.join(remove_noise(tokenized_tweet, stop_words))
        records.append({
            'username': tweet['username'],
            'location': tweet['location'],
            'longitude': tweet['longitude'],
            'latitude': tweet['latitude'],
            'tweets': cleaned_text,
        })

        # Uncomment to stop early while debugging:
        #if count == 100:
        #    break

# Passing columns= fixes the column order and keeps the columns present
# even when no tweet passed the filter.
all_data = pd.DataFrame(records, columns = ['username','location','longitude','latitude','tweets'])

display(all_data)

Unnamed: 0,username,location,longitude,latitude,tweets
0,TRTWorldNow,"Istanbul, Turkey",,,us president donald trump rival candidate joe ...
1,VicOlsenHolt,United States,,,great ad #watch #pennsylvania #michigan #flori...
2,Artistwhogives,"Headingley, Leeds, UK",,,dumpy donnie candidate rioter looter arsonist ...
3,BevAzevedo,Canada,,,time come #vote #election2020 #raiseyourvoice ...
4,benpowell321,Singapore,,,blackrock investment institute update global o...
...,...,...,...,...,...
36379,latinojustice,New York,,,#cadavotocuenta volunteer #ga make sure commun...
36380,RebekahWriter,Maryland,,,got plan 11/3 no worried idea #election2020 #e...
36381,MxdWrstlgAsscNY,"New York, USA",,,vote matter make voting plan #election2020 fin...
36382,CSAC_Counties,Sacramento,,,vote add voice chorus form opinion basis actio...


In [8]:
# One row per (username, location) pair, with that account's cleaned tweets
# gathered into a list.
compiled = (
    all_data
    .groupby(['username','location'])['tweets']
    .apply(list)
    .reset_index(name='all_tweets')
)

# Substrings whose presence assigns a tweet to a party bucket. A tweet can
# land in both buckets, or in neither.
dem_keywords = ('biden', 'kamala', 'harris')
rep_keywords = ('trump', 'pence')

# Row position -> list of that account's tweets mentioning each party.
dem_tweets = {}
rep_tweets = {}
for position, account_tweets in enumerate(compiled['all_tweets']):
    dem_tweets[position] = [
        tw for tw in account_tweets if any(key in tw for key in dem_keywords)
    ]
    rep_tweets[position] = [
        tw for tw in account_tweets if any(key in tw for key in rep_keywords)
    ]

compiled['NumTweets'] = compiled['all_tweets'].apply(len)
compiled['DemTweets'] = list(dem_tweets.values())
compiled['RepTweets'] = list(rep_tweets.values())
compiled = compiled.reset_index(drop=True)

# Drop the accounts whose Dem and Rep lists are both empty. After the reset
# above the index labels equal the row positions, so positions can be dropped
# as labels.
party_lists = zip(compiled['DemTweets'], compiled['RepTweets'])
indicies = [
    position
    for position, (dem, rep) in enumerate(party_lists)
    if not dem and not rep
]
compiled = compiled.drop(indicies).reset_index(drop=True)
display(compiled.head())
print(len(compiled))

Unnamed: 0,username,location,all_tweets,NumTweets,DemTweets,RepTweets
0,000HMY,"Las Vegas, NV",[#democracy life #vote #maga not #trump realit...,5,[],[#democracy life #vote #maga not #trump realit...
1,02deebo,"Oregon, USA",[something #trump truly take credit #trumplied...,1,[],[something #trump truly take credit #trumplied...
2,09072021,Close to Lincoln’s dead body👻,[one reason #trump many devoted supporter ⬇ ️ ...,1,[],[one reason #trump many devoted supporter ⬇ ️ ...
3,0bzerve,"Kentucky, USA",[’ #biden campaign bus travel good security li...,1,[’ #biden campaign bus travel good security li...,[]
4,0ldManStoneZone,SOME🌴BEACH🌅SOME👣WHERE❤,[#biden joe mind even work no mo #voteredtosav...,3,[#biden joe mind even work no mo #voteredtosav...,[get new trump bumper sticker nice huh #votere...


13009


In [104]:
# Forward slashes resolve on every operating system (Windows included) and
# avoid the invalid '\D', '\M' and '\s' escape sequences that a
# backslash-separated, non-raw string literal contains.
# Rows with any missing value are dropped right after loading.
states = pd.read_csv('./Data/Misc/states.csv').dropna(axis = 0)

# Flatten the table column by column: every value of the first column,
# then every value of the second column, and so on.
collected = [value for _, column in states.items() for value in column]
print(collected)
display(states.head())

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Dist. of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Virgin Islands', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'Ala.', 'Alaska', 'Ariz.', 'Ark.', 'Calif.', 'Colo.', 'Conn.', 'Del.', 'D.C.', 'Fla.', 'Ga.', 'Guam', 'Hawaii', 'Idaho', 'Ill.', 'Ind.', 'Iowa', 'Kans.', 'Ky.', 'La.', 'Maine', 'Md.', 'Mass.', 'Mich.', 'Minn.', 'Miss.', 'Mo.', 'Mont.', 'Nebr.', 'Nev.', 'N.H.', 'N.J.', 'N.M.', 'N.Y.', 'N.C.', 'N.D.', 'Ohio', 'Okla.

Unnamed: 0,State,Abbreviation,Postalcode
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
3,Arizona,Ariz.,AZ
4,Arkansas,Ark.,AR
5,California,Calif.,CA
