In [44]:


# importing libraries
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud

import configparser
import os

import nltk
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['amp','biden','know','say','today','start','week','want','day','talk','new','thank','birthday','wish','happy','discuss']
stopwords.extend(new_stopwords)


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(new_stopwords)


import pyLDAvis
import pyLDAvis.sklearn

import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from tqdm import tqdm

In [4]:
senate = pd.read_excel(open('data/congress_twitter.xlsx', 'rb'),
              sheet_name='Senate')  
house = pd.read_excel(open('data/congress_twitter.xlsx', 'rb'),
              sheet_name='House')  

In [3]:
senate

Unnamed: 0,SENATORS,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Name,Link,State,Party
1,"Baldwin, Tammy",https://twitter.com/SenatorBaldwin,WI,D
2,"Barrasso, John",https://twitter.com/SenJohnBarrasso,WY,R
3,"Bennet, Michael F.",https://twitter.com/SenatorBennet,CO,D
4,"Blackburn, Marsha",https://twitter.com/MarshaBlackburn,TN,R
...,...,...,...,...
96,"Warren, Elizabeth",https://twitter.com/SenWarren,MA,D
97,"Whitehouse, Sheldon",https://twitter.com/SenWhitehouse,RI,D
98,"Wicker, Roger F.",https://twitter.com/SenatorWicker,MS,R
99,"Wyden, Ron",https://twitter.com/RonWyden,OR,D


In [5]:
house

Unnamed: 0,REPRESENTATIVES,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Name,Link,State,Party
1,"Adams, Alma",https://twitter.com/RepAdams,NC,D
2,"Aderholt, Robert",https://twitter.com/Robert_Aderholt,AL,R
3,"Aguilar, Pete",https://twitter.com/RepPeteAguilar,CA,D
4,"Allen, Rick",https://twitter.com/RepRickAllen,GA,R
...,...,...,...,...
436,"Wittman, Robert J.",https://twitter.com/RobWittman,VA,R
437,"Womack, Steve",https://twitter.com/rep_stevewomack,AR,R
438,"Yarmuth, John A.",https://twitter.com/RepJohnYarmuth,KY,D
439,"Young, Don",https://twitter.com/repdonyoung,AK,R


In [4]:
all_congress_tweets = pd.read_csv('data/cong_tweets.csv')
all_congress_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327747 entries, 0 to 327746
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   user    327747 non-null  object
 1   text    327747 non-null  object
 2   date    327747 non-null  object
 3   fav     327747 non-null  int64 
 4   rt      327747 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 12.5+ MB


## Restrict tweets to 2021-2022


In [5]:
## Filter dates from 2021-2022

start_date = '2021-01-01 00:00:00+00:00'
end_date   = '2022-01-20 00:00:00+00:00'
mask = (all_congress_tweets['date'] > start_date) & (all_congress_tweets['date'] <= end_date)

all_tweets= all_congress_tweets.loc[mask]
all_tweets.info

<bound method DataFrame.info of                 user                                               text  \
0           RepAdams  RT @WhiteHouse: Thanks to President Biden’s ec...   
1           RepAdams         RT @RepTeresaLF: Let's end the filibuster!   
2           RepAdams  RT @repjimcooper: More than ONE MILLION childr...   
3           RepAdams  RT @RepUnderwood: The evidence is clear: in si...   
4           RepAdams  RT @WhipClyburn: Having access to affordable, ...   
...              ...                                                ...   
327742  SenToddYoung  My heart breaks for the Bayh family. Susan was...   
327743  SenToddYoung  #ICYMI: The @Olympics should be a time when th...   
327744  SenToddYoung  As we have said, this will help alleviate the ...   
327745  SenToddYoung  I’m glad that my advocacy with @ChrisMurphyCT ...   
327746  SenToddYoung  This week, I met with Hoosier members of @leag...   

                             date   fav    rt  
0             2022-

In [35]:


def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def removeRT(text):
    RTless = lambda x: re.compile(r'\#').sub('', re.compile('RT @').sub('@', x, count=1).strip())
    return (RTless(text))

def clean_text(text):
    
    # Remove RT
    text = removeRT(text)
    
    # Remove emojis
    text = remove_emoji(text)
    
    # Remove mentions
    text = re.sub("@[A-Za-z0-9_]+","", text)
    
    # Remove Hastags
    text = re.sub("#[A-Za-z0-9_]+","", text)
    
    # Make lowercase   
    text = text.lower()
    
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove punctuation   
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) 
    
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Remove Stopwords
    text = text.split()
    text = [w for w in text if not w in stopwords.words('english')]
    text = " ".join(word for word in text)    
    
    
    return text

tweets_df_clean = pd.DataFrame(all_tweets.text.apply(lambda x: clean_text(x)))

In [10]:
tweets_df_clean

Unnamed: 0,text
0,thanks president biden’s economic plans us big...
1,lets end filibuster
2,one million children tennessee received monthl...
3,evidence clear six months expanded childtaxcre...
4,access affordable reliable health insurance sa...
...,...
327742,heart breaks bayh family susan accomplished at...
327743,icymi time world comes together unfortunately ...
327744,said help alleviate worst humanitarian crisis ...
327745,i’m glad advocacy amp remove houthi terrorist ...


## Reality Check: Size of Corpus?

In [34]:
word_corpus = tweets_df_clean.text

corpuslen = sum([len(d.split(' ')) for d in word_corpus]) 
print(f'Total words in corpus: {corpuslen}')

Total words in corpus: 3057294


In [11]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
def lemmatizer(text):        
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)
tweets_df_clean = pd.DataFrame(tweets_df_clean.text.apply(lambda x: lemmatizer(x)))
tweets_df_clean['text'] = tweets_df_clean['text'].str.replace('-PRON-', '')

  from ._conv import register_converters as _register_converters


In [57]:
## Functionalized NLP pipeline

from sklearn.decomposition import LatentDirichletAllocation

def getTopics(df, min_df, max_df, max_features):
    
    ## Vectorization
    vectorizer = CountVectorizer(
        analyzer='word',       
        min_df=min_df,# minimum required occurences of a word 
        #max_df=.7,# maximum required occurences of a word 
        stop_words=set(stop_words),# remove stop words
        lowercase=True,# convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',# num chars > 3
        max_features=max_features # max number of unique words
        )
    
    data_matrix = vectorizer.fit_transform(df.text)
    
    ## Modeling
    lda_model = LatentDirichletAllocation(
    n_components=10, # Number of topics
    learning_method='online',
    random_state=20,       
    n_jobs = -1  # Use all available CPUs
                                        )
    lda_output = lda_model.fit_transform(data_matrix)    
    
    return lda_model, vectorizer, data_matrix, lda_output


lda_model, vectorizer, data_matrix, lda_output = getTopics(tweets_df_clean, min_df=3, max_df=.7, max_features=5000)


## Visualize Topics

In [58]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer, mds='tsne')

  default_term_info = default_term_info.sort_values(


## List Top 10 Topics

In [61]:
for i,topic in enumerate(lda_model.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['government', 'way', 'federal', 'protect', 'help', 'families', 'health', 'right', 'care', 'need']


Top 10 words for topic #1:
['jobs', 'plan', 'house', 'bipartisan', 'infrastructure', 'democrats', 'proud', 'american', 'americans', 'act']


Top 10 words for topic #2:
['school', 'morning', 'county', 'republicans', 'hearing', 'continue', 'news', 'joined', 'national', 'great']


Top 10 words for topic #3:
['free', 'violence', 'nation', 'congressional', 'fight', 'security', 'office', 'members', 'service', 'administration']


Top 10 words for topic #4:
['business', 'leadership', 'hear', 'lost', 'long', 'watch', 'end', 'joe', 'inflation', 'crisis']


Top 10 words for topic #5:
['celebrate', 'law', 'businesses', 'colleagues', 'capitol', 'congress', 'states', 'live', 'honor', 'women']


Top 10 words for topic #6:
['better', 'build', 'veterans', 'family', 'like', 'years', 'working', 'country', 'join', 'year']


Top 10 words for topic #7:
['high', 'life', 'covid', 'wo

## Look at Unigrams, Bigrams and Trigrams

In [60]:
## Unigrams
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words=set(stop_words)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(tweets_df_clean.text, 10)
unigram = pd.DataFrame(common_words, columns = ['unigram' , 'count'])

In [48]:
unigram

Unnamed: 0,unigram,count
0,act,11692
1,american,10314
2,president,9935
3,people,8926
4,americans,8486
5,house,8386
6,proud,8096
7,democrats,7629
8,great,7551
9,time,7417


In [50]:
## bigrams
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(stop_words=set(stop_words),ngram_range=(2,2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in      vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(tweets_df_clean.text, 10)
bigram = pd.DataFrame(common_words, columns = ['bigram' , 'count'])

In [53]:
bigram

Unnamed: 0,bigram,count
0,bipartisan infrastructure,1952
1,health care,1934
2,build better,1839
3,united states,1721
4,american people,1674
5,years ago,1652
6,southern border,1554
7,small businesses,1467
8,men women,1343
9,voting rights,1305


In [54]:
## bigrams
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(stop_words=set(stop_words),ngram_range=(3,3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in      vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(tweets_df_clean.text, 10)
trigram = pd.DataFrame(common_words, columns = ['trigram' , 'count'])

In [56]:
trigram

Unnamed: 0,trigram,count
0,american rescue plan,1099
1,build better act,771
2,child tax credit,743
3,infrastructure investment jobs,566
4,investment jobs act,535
5,telephone town hall,410
6,brave men women,408
7,bipartisan infrastructure law,372
8,crisis southern border,350
9,john lewis voting,312
