In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob


In [2]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [3]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [4]:
df['trump'] = df['trump'].astype(int)
df['biden'] = df['biden'].astype(int)

In [6]:
df.sample(10)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
1013,1312904496507432961,1312835414873399297,2020-10-04 23:56:39 UTC,2020-10-04,23:56:39,0,1181366088099123202,oknyork1,OkNYork,,...,,,,"[{'screen_name': 'Back_dafucup', 'name': 'Jenn...",,,,,1,0
4247,1312538964826365952,1312494005758783488,2020-10-03 23:44:09 UTC,2020-10-03,23:44:09,0,1916249839,billy_cage,𝐙𝐢pp𝐢𝐧𝐠𝐫𝐨𝐜𝐤𝐬 (Halo CE MacWorld Remake),,...,,,,"[{'screen_name': 'Truxillogical', 'name': 'Tru...",,,,,1,0
8008,1317611310323015681,1317610975068119047,2020-10-17 23:39:51 UTC,2020-10-17,23:39:51,0,980824830613315584,datawookiee,ÐR Plissken ESQ (unpaid intern for @ResitsTrump),,...,,,,"[{'screen_name': 'ResitsTrump', 'name': 'Dr. K...",,,,,1,1
12281,1318334102097915904,1318334102097915904,2020-10-19 23:31:58 UTC,2020-10-19,23:31:58,0,898759943393816576,americafirstmg,The DC Patriot,,...,,,,[],,,,,1,0
242,1321240167664001027,1321240167664001027,2020-10-27 23:59:38 UTC,2020-10-27,23:59:38,0,356845993,brokenletter,brokenletter,,...,,,,[],,,,,0,1
10762,1314713465462849537,1314309141591138320,2020-10-09 23:44:51 UTC,2020-10-09,23:44:51,0,1494105091,wendywamsley,wendy wamsley,,...,,,,"[{'screen_name': 'NikkiHaley', 'name': 'Nikki ...",,,,,0,1
4549,1323050167319339013,1323050167319339013,2020-11-01 23:51:55 UTC,2020-11-01,23:51:55,0,29989542,teresaxlynn10,Teresa 💛💛,,...,,,,[],,,,,1,0
3844,1315439486873079808,1315417973591547906,2020-10-11 23:49:48 UTC,2020-10-11,23:49:48,0,27317649,leachtheteach,Andrew Leach,,...,,,,"[{'screen_name': 'realDonaldTrump', 'name': 'D...",,,,,1,1
3542,1315440201477521409,1315327978553958400,2020-10-11 23:52:38 UTC,2020-10-11,23:52:38,0,20778809,mimionthehoops,Mimi (wear 2 freakin' masks!),,...,,,,"[{'screen_name': 'aerotycoon', 'name': 'AeroTy...",,,,,0,1
2782,1313628906696605696,1313628906696605696,2020-10-06 23:55:12 UTC,2020-10-06,23:55:12,0,57366686,fbellavia,Frank B.,,...,,,,[],,,,,1,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [7]:
# keep only necessary columns
data = df.loc[:,['tweet', 'trump', 'biden']]
data.head(3)

Unnamed: 0,tweet,trump,biden
0,@capriaaf @JoeBiden Plenty of results for #Tru...,1,1
1,@Changinglenses @greger_mary @JRubinBlogger @G...,0,1
2,"Inside a Biden v. Trump marriage: ""you woke me...",1,1


In [8]:
no_links = lambda x: re.sub(r"https?:\/\/\S+", "", x)
no_handles = lambda x: re.sub(r"@[\d\w_]+", "", x)
alphanum = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [9]:
data['original'] = data.tweet

data['tweet'] = (data['tweet']
                 .map(no_handles)
                 .map(no_links)
                 .map(punc_lower)
                 .map(alphanum))


In [10]:
data.head(3)

Unnamed: 0,tweet,trump,biden,original
0,plenty of results for trumpcrimefamily and ...,1,1,@capriaaf @JoeBiden Plenty of results for #Tru...
1,he left washington before impeachment,0,1,@Changinglenses @greger_mary @JRubinBlogger @G...
2,inside a biden v trump marriage you woke me...,1,1,"Inside a Biden v. Trump marriage: ""you woke me..."


In [11]:
# remove tweets in which trump and biden are both named
mask = (data.trump==1) & (data.biden==1)
data = data[mask]

## spaCy

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
data.tweet.iloc[15]

'all of you are the worst sort of faux  journalists    biased to the point of bigoted and totally in the tank for biden   if you had a shred of integrity you would look at the bobulinski allegations   but you don t   a deplorable phd      trump  '

## NLTK Tweet Tokenizer

In [14]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]['tweet']

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]['tweet']


In [15]:
trump_tweets.shape

(351598,)

In [16]:
biden_tweets.shape

(352465,)

In [18]:
trump_tweets.head(4)

0    Not very surprising seeing as how she traveled...
1    @LindseyGrahamSC @HerschelWalker Trumps little...
2    @CalebPatriotKag 🚨 Trump is a Malta Mason serv...
3    @birdmonger @Cernovich @realDonaldTrump Lots o...
Name: tweet, dtype: object

In [19]:
biden_tweets.head(4)

1    @Changinglenses @greger_mary @JRubinBlogger @G...
6        Loving all these Republicans endorsing Biden.
7    WATCH: 'Ballot chaser' boasts she got $55,000 ...
8                   @JoeBiden  https://t.co/qTxhMODuIH
Name: tweet, dtype: object

In [20]:
puncs = [c for c in string.punctuation if c not in ["#", ":"]]
print(puncs)

['!', '"', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


ew - try pyspellchecker

In [23]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
def tweet_tokenize(tweets):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = [token for tweet in tweets for token in twt.tokenize(tweet)]
    # combine stop words and punctuation
    puncs = [c for c in string.punctuation if c not in ["#", ":"]]
    stop = stopwords.words("english") + puncs + ['”']
    stemmer = PorterStemmer()
    tokens = [ stemmer.stem(token) for tweet in tweets
              for token in twt.tokenize(tweet)
              if token.lower() not in stop]
#     spell = Speller(lang='en')
#     tokens = [spell(t) for t in tokens]

    
    
    return tokens

In [24]:
tokens = tweet_tokenize(data.tweet)

In [42]:
tokens.most_common(20)

AttributeError: 'list' object has no attribute 'most_common'

In [25]:
count = Counter(tokens)

Nice. Now we need to get these back into string form, and send them through a vectorizer.

In [34]:
v = CountVectorizer(stop_words='english', max_df=0.075)

In [35]:
strings = ' '.join(tokens)

In [36]:
sparse = v.fit_transform(tokens)

In [40]:
dense = pd.DataFrame(sparse.toarray(), columns=v.get_feature_names())

In [41]:
dense.to_pickle("dense_cv_all_0218.p")

OSError: [Errno 28] No space left on device

In [39]:
dense = pd.read_pickle("dense_cv_all_0218.p")

FileNotFoundError: [Errno 2] No such file or directory: 'dense_cv_all_0218.p'

In [None]:
t_words = ' '.join(trump_tokens)
b_words = ' '.join(biden_tokens)
t_words[:30]

In [None]:
v_trump = v.fit_transform(trump_tokens)
v_biden = v.fit_transform(biden_tokens)

In [None]:
va_trump = v_trump.toarray()
va_biden = v_biden.toarray()

In [None]:
df_tv = pd.DataFrame(va_trump, columns=v.get_feature_names())
df_bv = pd.DataFrame(va_biden, columns=v.get_feature_names())


In [None]:
doc_word = vectorizer.fit_transform(data.tweet)
doc_word[:20]

In [None]:
X_matrix = doc_word.toarray(1`)
pd.DataFrame(X_matrix, columns=vectorizer.get_feature_names())

**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers