In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np
import datetime
from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy
from copy import deepcopy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

# import contextualSpellCheck

# November 2 Tweet Analysis

In [2]:
df = pd.read_pickle("pickle/clean_nov2_tweets.pick")
df.head(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags
0,1323414586586968064,2020-11-02,23:59:59,1242220312676241409,kamalatoe,Trump is campaigning with steel and autoworker...,"['macon', 'gasen', 'savannah', 'riseup']"
1,1323414586511429634,2020-11-02,23:59:59,2324659466,didih2214,Trump is having unscalable fencing erected aro...,['votehimout2020']
2,1323414586503143424,2020-11-02,23:59:59,215506541,cuffy__,"They support trump like he cares about them, h...",[]


In [3]:
nlp = spacy.load('en_core_web_sm')

**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [4]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [5]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [6]:
df.head(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags
0,1323414586586968064,2020-11-02,23:59:59,1242220312676241409,kamalatoe,Trump is campaigning with steel and autoworkers today. We are closing with Lady Gaga and gun control in Philly Vote for our woke asses. @MaconBibb #Macon #GAsen #Savannah #RiseUp,"['macon', 'gasen', 'savannah', 'riseup']"
1,1323414586511429634,2020-11-02,23:59:59,2324659466,didih2214,"Trump is having unscalable fencing erected around White House grounds AFTER sewing civil unrest toward Democratic voters, scientists &amp; anyone against him. What I see is a coward who stirs the pot &amp; hides in the safety of his temporary home #VoteHimOut2020",['votehimout2020']
2,1323414586503143424,2020-11-02,23:59:59,215506541,cuffy__,"They support trump like he cares about them, he don’t like poor white folk either 🤦🏾‍♂️😂",[]


In [7]:
df.columns

Index(['id', 'date', 'time', 'user_id', 'username', 'tweet', 'hashtags'], dtype='object')

In [8]:
len(df.username.unique())

223199

Now let's create a subset, containing the same amount of Trump tweets as Biden tweets as tweets mentioning both candidates.

In [9]:
%%time
[t for t in nlp("My name is Elliot.")]

CPU times: user 8.37 ms, sys: 130 µs, total: 8.5 ms
Wall time: 7.13 ms


[My, name, is, Elliot, .]

In [17]:
# spacy practice

text = 'Hi, readers! My name is Elliot Wilens. I love mountains, especially Mt. Everest.'

for t in nlp(text):
    print(t, t.pos_, t.dep_)

Hi INTJ ROOT
, PUNCT punct
readers NOUN npadvmod
! PUNCT punct
My PRON poss
name NOUN nsubj
is AUX ROOT
Elliot PROPN compound
Wilens PROPN attr
. PUNCT punct
I PRON nsubj
love VERB ROOT
mountains NOUN dobj
, PUNCT punct
especially ADV advmod
Mt. PROPN compound
Everest PROPN appos
. PUNCT punct


In [11]:
nlp(text)

Hi, readers! My name is Elliot Wilens. I love mountains.

## Initiate Pipeline

In [12]:
df = df.copy()[-25000:]

In [13]:
df.head(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags
76902,1323391939270434816,2020-11-02,22:30:00,876863947617513473,punto_decorte,"#2Nov ""De ganar Biden, seguirá la misma política de exterior, pero quizás con un pequeño matiz, que no será un aliado directo de la figura de Juan Guaidó"", dijo el internacionalista @LuisAngaritaEEI en Entrevista #PDCTV 🌍 | #Internacionales https://t.co/Ho6SIYMVCF","['2nov', 'pdctv', 'internacionales']"
76906,1323391938729443328,2020-11-02,22:30:00,1282422325556572160,cyntonm,@Jessicam6946 @rocketmann55 @JoeBiden 🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏🙏,[]
76909,1323391938448531459,2020-11-02,22:30:00,330826792,rapplerdotcom,"In the United States, as the elections draw near, more stars come out in support of Democratic presidential candidate Joe Biden. Updates here: https://t.co/majhtqkIOK https://t.co/CJQk22jM7q",[]


In [14]:
df['spacy_doc'] = list(nlp.pipe(df.tweet))

## Now let's split our data into biden & trump tweets.

In [15]:
mask = (df.tweet.str.contains("[Tt][Rr][Uu][Mm][Pp]")) & (~df.tweet.str.contains("[Bb][Ii][Dd][Ee][Nn]"))
trump_tweets = df[mask]

mask = (~df.tweet.str.contains("[Tt][Rr][Uu][Mm][Pp]")) & (df.tweet.str.contains("[Bb][Ii][Dd][Ee][Nn]"))
biden_tweets = df[mask]


Now, let's use spaCy's `pipe` method in order to process multiple documents in one go.

In [16]:
biden_adj = [token.text.lower() for doc in biden_tweets.tweet.spacy_doc for token in doc if token.pos_=='ADJ']
trump_adj = [token.text.lower() for doc in trump_tweets.tweet.spacy_doc for token in doc if token.pos_=='ADJ']

biden_noun = [token.text.lower() for doc in biden_tweets.tweet.spacy_doc for token in doc if token.pos_=='NOUN']
trump_noun = [token.text.lower() for doc in trump_tweets.tweet.spacy_doc for token in doc if token.pos_=='NOUN']

AttributeError: 'Series' object has no attribute 'spacy_doc'