In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy
from copy import deepcopy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

# import contextualSpellCheck

In [2]:
nlp = spacy.load('en_core_web_sm')

**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [3]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [4]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [5]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [6]:
df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest', 'biden', 'trump'],
      dtype='object')

In [7]:
len(df.username.unique())

438106

Now let's create a subset, containing the same amount of Trump tweets as Biden tweets as tweets mentioning both candidates.

In [8]:
%%time
[t for t in nlp("My name is Elliot.")]

CPU times: user 8.53 ms, sys: 227 µs, total: 8.76 ms
Wall time: 7.4 ms


[My, name, is, Elliot, .]

In [9]:
# spacy practice

text = 'Hi, readers! My name is Elliot Wilens. I love mountains.'

for t in nlp(text):
    print(t, t.pos_, t.dep_)

Hi INTJ ROOT
, PUNCT punct
readers NOUN npadvmod
! PUNCT punct
My PRON poss
name NOUN nsubj
is AUX ROOT
Elliot PROPN compound
Wilens PROPN attr
. PUNCT punct
I PRON nsubj
love VERB ROOT
mountains NOUN dobj
. PUNCT punct


In [10]:
nlp(text)

Hi, readers! My name is Elliot Wilens. I love mountains.

## Initiate Pipeline

In [12]:
df = df.copy()[-25000]

KeyError: -25000

In [None]:
df['spacy_doc'] = list(nlp.pipe(df.tweet))

## Now let's split our data into biden & trump tweets.

In [None]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask].tail(500)

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask].tail(500)

# mask = (df.trump == 1) & (df.biden == 1)
# both_tweets = df[mask].tail(1000)

Now, let's use spaCy's `pipe` method in order to process multiple documents in one go.

In [None]:
biden_adj = [token.text.lower() for doc in biden_tweets.tweet.spacy_doc for token in doc if token.pos_=='ADJ']
trump_adj = [token.text.lower() for doc in trump_tweets.tweet.spacy_doc for token in doc if token.pos_=='ADJ']

biden_noun = [token.text.lower() for doc in biden_tweets.tweet.spacy_doc for token in doc if token.pos_=='NOUN']
trump_noun = [token.text.lower() for doc in trump_tweets.tweet.spacy_doc for token in doc if token.pos_=='NOUN']