In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the presidential tweets, keep only the first copy of any fully
# duplicated row, and renumber the remaining rows from 0.
pres_tweets = pd.read_csv('pres_tweets.csv', index_col=0)
pres_tweets = (
    pres_tweets
    .drop_duplicates(subset=None, keep='first')
    .reset_index(drop=True)
)
pres_tweets

In [None]:
import re

In [None]:
def clean_tweets(x):
    """Strip handles and links from a tweet and split it into lowercase words.

    Parameters
    ----------
    x : object
        The raw tweet. Anything that is not a ``str`` (for example the NaN
        pandas uses for a missing value) is treated as an empty tweet.

    Returns
    -------
    list of str or str
        The lowercase a-z tokens of the tweet, in order of appearance. For
        non-string input the empty string ``''`` is returned, which iterates
        as "no tokens".
    """
    if not isinstance(x, str):
        return ''

    # Raw strings keep \w and \S as regex escapes rather than (invalid)
    # Python string escapes.
    x = re.sub(r'@\w*', ' ', x)      # @mentions
    x = re.sub(r'http\S+', ' ', x)   # http/https links
    x = re.sub(r'www\S+', ' ', x)    # bare www links

    # After lowercasing, only a-z can be a letter. The former class 'A-z'
    # also covered the ASCII symbols that sit between 'Z' and 'a'
    # ([ \ ] ^ _ `), so tokens such as 'hello_world' or '[vote]' leaked through.
    return re.findall(r'[a-z]+', x.lower())
    

In [None]:
# Tokenise every presidential tweet, in the row order of pres_tweets.
cleaned_tweets = [clean_tweets(tweet) for tweet in pres_tweets['tweet']]

In [None]:
# Full names of the presidential candidates (some names are listed twice).
c_names = [
    'Joe Biden',
    'Howie Hawkins',
    'Donald Trump',
    'Jo Jorgensen',
    'Gary Johnson',
    'Darrell Castle',
    'Evan McMullin',
    'Jill Stein',
    'Donald Trump',
    'Hillary Clinton',
    'Mitt Romney',
    'Gary Johnson',
    'Jill Stein',
    'Barack Obama',
    'John McCain',
    'Ralph Nader',
    'Bob Barr',
    'Barack Obama',
    'Chuck Baldwin',
    'Cynthia McKinney',
]
# Replace the full names with a flat list of their lowercase parts
# ('Joe Biden' -> 'joe', 'biden'), keeping the order and the repeats.
c_names = ' '.join(c_names).lower().split()

In [None]:
# Collect the words used in the tweets about one candidate.
def make_bag(candidate, tweets=None, tokenized=None, excluded=None):
    """Concatenate every token from one candidate's tweets into a single string.

    Parameters
    ----------
    candidate : str
        Value matched exactly against the 'candidate' column.
    tweets : pandas.DataFrame, optional
        Frame with a 'candidate' column. Defaults to the notebook-level
        ``pres_tweets``.
    tokenized : sequence of iterables of str, optional
        Token lists, one per row of ``tweets`` and in the same row order.
        Defaults to the notebook-level ``cleaned_tweets``.
    excluded : iterable of str, optional
        Tokens to leave out of the bag. Defaults to the notebook-level
        ``c_names``.

    Returns
    -------
    str
        Every kept token preceded by one space (so a non-empty result starts
        with a space); ``''`` when the candidate has no rows.
    """
    if tweets is None:
        tweets = pres_tweets
    if tokenized is None:
        tokenized = cleaned_tweets
    # A set makes the per-word exclusion test constant time.
    excluded = set(c_names if excluded is None else excluded)

    # Select exactly the candidate's rows, by position. Slicing from the first
    # to the last matching row would also sweep in other candidates' tweets
    # whenever the matching rows are not contiguous, and would raise for a
    # candidate that has no rows at all.
    positions = np.flatnonzero((tweets['candidate'] == candidate).values)

    # Build the string in one pass instead of growing it word by word.
    return ''.join(
        ' ' + word
        for pos in positions
        for word in tokenized[pos]
        if word not in excluded
    )

In [None]:
# Build the word bag for Joe Biden and show its length in characters.
biden_bag = make_bag('Joe Biden')
len(biden_bag)

In [None]:
# Save Biden's word bag; the with-block closes the file even if the write fails.
with open("biden.txt", "w") as text:
    text.write(biden_bag)

In [None]:
# Build Trump's word bag and save it; the with-block closes the file even if
# the write fails.
trump_bag = make_bag('Donald Trump')
with open("trump.txt", "w") as text:
    text.write(trump_bag)

# lemmatize

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

In [None]:
# Lemma of every distinct word seen so far. Each word is tagged on its own, so
# its lemma depends only on the word and can be reused across tweets.
_LEMMA_CACHE = {}


def _get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`, defaulting to noun."""
    # Tag the word in isolation and keep the first letter of its tag.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(l):
    """Lemmatize each word of a token list using a POS-aware WordNet lookup.

    Parameters
    ----------
    l : iterable of str
        The words of one tweet.

    Returns
    -------
    list of str
        One lemma per input word, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for word in l:
        # Tag and lemmatize a word only the first time it is encountered;
        # repeated words are served from the cache.
        if word not in _LEMMA_CACHE:
            _LEMMA_CACHE[word] = lemmatizer.lemmatize(word, _get_wordnet_pos(word))
        lemmatized.append(_LEMMA_CACHE[word])
    return lemmatized

In [None]:
# Peek at the first few token lists instead of rendering the whole corpus.
cleaned_tweets[:5]

In [None]:
# Benchmark: how long does lemmatizing the first 100 tweets take?
from time import perf_counter

temp = cleaned_tweets[:100]
start = perf_counter()
t = [lemmatize(tokens) for tokens in temp]
end = perf_counter()
execution_time = end - start
print(execution_time)  # author's measurement: .55 seconds for 100 tweets

In [None]:
# Lemmatize the whole presidential corpus (author's timing note: 12:15-12:26).
lemmatized_pres_tweets = [lemmatize(tokens) for tokens in cleaned_tweets]

In [None]:
#remove stopwords
from nltk.corpus import stopwords

def remove_stopwords(l, stop_words=None):
    """Join the words of `l` that are not stop words into one string.

    Parameters
    ----------
    l : iterable of str
        The words of one tweet.
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The kept words separated by single spaces ('' if none remain).
    """
    if stop_words is None:
        # Fetch the list once per call; the filter used to call
        # stopwords.words('english') again for every single word.
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)  # constant-time membership tests
    return ' '.join(word for word in l if word not in stop_words)

In [None]:
# Benchmark remove_stopwords on the 100 lemmatized sample tweets in `t`.
start = perf_counter()
a = [remove_stopwords(tokens) for tokens in t]
end = perf_counter()
execution_time = end - start
print(execution_time)

In [None]:
# English stop words, loaded the first time remove_stopwords2 needs them and
# then reused by every later call.
_ENGLISH_STOPWORDS = None


def remove_stopwords2(l, stop_words=None):
    """Join the words of `l` that are not stop words into one string.

    Parameters
    ----------
    l : iterable of str
        The words of one tweet.
    stop_words : container of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        fetched once and kept as a frozenset for all subsequent calls.

    Returns
    -------
    str
        The kept words separated by single spaces ('' if none remain).
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # Previously the list was rebuilt on every call (once per tweet)
            # and searched linearly for every word.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOPWORDS
    return ' '.join(word for word in l if word not in stop_words)

In [None]:
# Benchmark remove_stopwords2 on the 100 lemmatized sample tweets in `t`.
start = perf_counter()
a = [remove_stopwords2(tokens) for tokens in t]
end = perf_counter()
execution_time = end - start
print(execution_time)

In [None]:
# Store each tweet's lemmatized, stop-word-free text in a new column
# (author's timing note: 1:42-45?).
pres_tweets['tweet_processed'] = [
    remove_stopwords2(tokens) for tokens in lemmatized_pres_tweets
]

In [None]:
# Display the frame to inspect the new 'tweet_processed' column.
pres_tweets

# gubernatorial

In [None]:
# Read the gubernatorial tweets, keep only the first copy of any fully
# duplicated row, and renumber the remaining rows from 0.
gub_tweets = pd.read_csv('gub_tweets.csv', index_col=0)
gub_tweets = (
    gub_tweets
    .drop_duplicates(subset=None, keep='first')
    .reset_index(drop=True)
)
gub_tweets

In [None]:
# Groundwork for removing tweets that might not be about the candidate:
# load the candidate table and build a candidate -> state lookup.
gub = pd.read_csv('gub.csv', index_col=0)
# Pair the two columns row by row. Looking values up with gub[col][i] for
# i in 0..n-1 treats those integers as index labels, which only works when
# the index read from the CSV's first column is exactly that range.
state_dict = {
    name.strip(): state
    for name, state in zip(gub['candidate'], gub['state'])
}
# Manually added entry.
state_dict['Pat Quinn'] = 'Illinois'


In [None]:
def clean_tweets2(x):
    """Tokenise a tweet, rejecting tweets that lack the required key words.

    Reads a notebook-level ``key_words`` sequence and uses its first three
    entries.
    NOTE(review): ``key_words`` is not defined in the visible part of the
    notebook; confirm it exists before calling this with string input.

    Parameters
    ----------
    x : object
        The raw tweet.

    Returns
    -------
    list of str or float
        ``np.nan`` when `x` is not a ``str`` or when any of the first three
        key words is missing from the raw (not yet lowercased) text;
        otherwise the lowercase a-z tokens left after removing @mentions and
        http links.
    """
    if not isinstance(x, str):
        return np.nan
    # NOTE(review): the tweet is kept only if ALL three key words occur;
    # confirm that "all" rather than "any" is what is intended.
    if key_words[0] not in x or key_words[1] not in x or key_words[2] not in x:
        return np.nan

    # Raw strings keep \w and \S as regex escapes.
    x = re.sub(r'@\w*', ' ', x)      # @mentions
    x = re.sub(r'http\S+', ' ', x)   # http/https links
    # After lowercasing, only a-z can be a letter; the former class 'A-z'
    # also matched the ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `).
    return re.findall(r'[a-z]+', x.lower())

In [None]:
#drop null tweets from df
#gub_tweets.dropna(subset = ["tweet"], inplace=True)

In [None]:
# Tokenise every gubernatorial tweet, in the row order of gub_tweets.
cleaned_tweets_gub = [clean_tweets(tweet) for tweet in gub_tweets['tweet']]

In [None]:
# Lemmatize the whole gubernatorial corpus (author's timing note: 12:35-1:17).
lemmatized_tweets = [lemmatize(tokens) for tokens in cleaned_tweets_gub]

In [None]:
# Store each tweet's lemmatized, stop-word-free text in a new column
# (author's timing note: 1:34-42?).
gub_tweets['tweet_processed'] = [
    remove_stopwords2(tokens) for tokens in lemmatized_tweets
]

In [None]:
# Display the frame to inspect the new 'tweet_processed' column.
gub_tweets

In [None]:
# Write both frames, including their 'tweet_processed' column, to new CSV files.
pres_tweets.to_csv('pres_tweets_processed.csv')
gub_tweets.to_csv('gub_tweets_processed.csv')