# Using NLP to Determine the 2022 U.S. Midterm Elections Political Platforms 

In [1]:


# importing libraries

import configparser
import os

import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


import tweepy
from textblob import TextBlob
from wordcloud import WordCloud


import nltk
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['amp','biden','know','say','today','start','week','want','day','talk','new','thank','birthday','wish','happy','discuss']
stopwords.extend(new_stopwords)

import spacy 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(new_stopwords)


import pyLDAvis
import pyLDAvis.sklearn



from tqdm import tqdm

## Create Twitter API connection

In [None]:
# Load the Twitter developer credentials from a local config file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message rather than with a
# KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found; it must provide a [twitter] section")

api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']


# authenticate

auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)


# Shared API client used by every scraping cell
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
## Reality Check!
## Pull 200 HouseDemocrat tweets to test connection
posts = api.user_timeline(screen_name="HouseDemocrats", count=200, tweet_mode="extended")

columns = ['user', 'text', 'date', 'favs', 'retweets']

# One row per returned status, in the same order as `columns`
tweets = [
    [
        status.user.screen_name,
        status.full_text,
        status.created_at,
        status.favorite_count,
        status.retweet_count,
    ]
    for status in posts
]

dftweets = pd.DataFrame(tweets, columns=columns)
dftweets

## Read in list of congress twitter users

In [None]:
# Workbook with one sheet of Twitter accounts per chamber
congress_xlsx = 'data/congress_twitter.xlsx'

# Open the workbook once inside a context manager so the file handle is
# closed afterwards (the bare open(...) handles were never closed).
with pd.ExcelFile(congress_xlsx) as workbook:
    senate = pd.read_excel(workbook, sheet_name='Senate')
    house = pd.read_excel(workbook, sheet_name='House')

## Getting all Dem Accounts

In [None]:

# Democratic rows from each chamber; the Senate sheet's party column is
# named 'Party ' (with a trailing space), the House sheet's is 'Party'.
senate_dems = senate.loc[senate['Party '] == 'D']
house_dems = house.loc[house['Party'] == 'D']
all_dems_df = pd.concat([senate_dems, house_dems])

# Discard the columns at zero-based positions 2 through 5
dem_unused_cols = all_dems_df.columns[[2, 3, 4, 5]]
all_dems_df = all_dems_df.drop(columns=dem_unused_cols)
all_dems_df

In [None]:
# Every row of this frame was selected as a Democrat
all_dems_df['Party'] = 'D'
# Derive the handle by stripping the profile-URL prefix. regex=False treats
# the prefix literally; as a regex, each '.' would match any character.
all_dems_df['Acct'] = all_dems_df['Link'].str.replace('https://twitter.com/', '', regex=False)
all_dems_df

## Getting all GOP Accounts

In [None]:

# Republican rows from each chamber; the Senate sheet's party column is
# named 'Party ' (with a trailing space), the House sheet's is 'Party'.
senate_gop = senate.loc[senate['Party '] == 'R']
house_gop = house.loc[house['Party'] == 'R']
all_gop_df = pd.concat([senate_gop, house_gop])

# Discard the columns at zero-based positions 2 through 5
gop_unused_cols = all_gop_df.columns[[2, 3, 4, 5]]
all_gop_df = all_gop_df.drop(columns=gop_unused_cols)
all_gop_df

In [None]:
# Every row of this frame was selected as a Republican
all_gop_df['Party'] = 'R'
# Derive the handle by stripping the profile-URL prefix. regex=False treats
# the prefix literally; as a regex, each '.' would match any character.
all_gop_df['Acct'] = all_gop_df['Link'].str.replace('https://twitter.com/', '', regex=False)
all_gop_df

## Create combined list of accounts

In [None]:
## Create combined list of accounts
# Stack the GOP and Democratic account tables and save the result to CSV
all_congress_accounts = pd.concat([all_gop_df,all_dems_df])
all_congress_accounts.to_csv('data/cong_accounts.csv', encoding='utf-8', index=False)
all_congress_accounts

## Scrape Tweets

In [None]:
## Get 1000 recent tweets from user

def get_1k_Tweets(user):
    """Fetch up to 1000 of the most recent tweets from one account's timeline.

    Parameters
    ----------
    user : str
        Twitter screen name to read.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns User, Content, Date, Favs, RTs.
        Exceptions raised by the API call propagate to the caller.
    """
    columns = ['User', 'Content', 'Date', 'Favs', 'RTs']

    # Request extended mode and read `full_text`, as the connection check
    # earlier in this notebook does; the default mode's `text` attribute is
    # truncated. count=200 asks for larger pages, so fewer requests are made.
    timeline = tweepy.Cursor(
        api.user_timeline,
        screen_name=user,
        count=200,
        tweet_mode="extended",
    ).items(1000)

    rows = [
        [
            tweet.user.screen_name,
            tweet.full_text,
            tweet.created_at,
            tweet.favorite_count,
            tweet.retweet_count,
        ]
        for tweet in timeline
    ]
    return pd.DataFrame(rows, columns=columns)
    

In [None]:
# Scrape each account, keeping the per-account frames in a list and
# concatenating once at the end (concatenating inside the loop re-copies
# everything collected so far on every iteration).
# NOTE(review): the [214:] slice skips the first 214 accounts; it looks like a
# leftover from resuming an interrupted run. Confirm before a full re-scrape.
tweet_frames = []
no_accts = []
scrape_errors = {}  # account -> repr of the exception, for later diagnosis
for cong in tqdm(all_congress_accounts[214:].Acct):
    try:
        tweet_frames.append(get_1k_Tweets(cong))
    except Exception as err:  # not a bare except: Ctrl-C must still stop the loop
        no_accts.append(cong)
        scrape_errors[cong] = repr(err)
        print(f'{cong} account is not active or does not have tweets')

# pd.concat raises on an empty list, so fall back to an empty frame
all_congress_tweets = pd.concat(tweet_frames) if tweet_frames else pd.DataFrame()

In [None]:
# Accounts for which the scraping loop above recorded a failure
no_accts

In [None]:
# Column dtypes and non-null counts of the scraped tweets
all_congress_tweets.info()

In [None]:

# Save the raw scraped tweets to CSV (the DataFrame index is not written)
all_congress_tweets.to_csv('data/all_cong_tweets_01012021_01162022.csv', encoding='utf-8', index=False)

In [None]:
# Preview the first scraped tweets
all_congress_tweets.head()

In [None]:
# Preview the account table that the tweets are joined against below
all_congress_accounts.head()


In [None]:
## Get Political Party and Name of account 
# Join each tweet to its account row by matching the tweet's `User` against
# the account's `Acct` handle. pd.merge defaults to an inner join, so tweets
# whose User has no exactly matching Acct value are dropped.
full_Cong_df = pd.merge(all_congress_tweets,all_congress_accounts, left_on='User', right_on='Acct')

In [None]:
## Remove redundant columns (Link and Acct)
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
full_Cong_df = full_Cong_df.drop(columns=['Link', 'Acct'], errors='ignore')


In [None]:
# Column dtypes and non-null counts after the merge
full_Cong_df.info()

## Write out full dataframe

In [None]:
# Save the merged tweets (with Name and Party) to CSV; this file is read back in below
full_Cong_df.to_csv('data/all_party_tweets_01012021_01162022.csv', encoding='utf-8', index=False)

In [None]:
# Display the merged frame
full_Cong_df

In [2]:
# Reload the merged tweets from the CSV written above, so the analysis can
# start here without re-scraping.
# NOTE(review): the info() output recorded below shows fewer non-null `User`
# values than rows; possibly read_csv's default NA parsing is turning some
# handle into NaN. Confirm.
all_congress_tweets = pd.read_csv('data/all_party_tweets_01012021_01162022.csv')
all_congress_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318055 entries, 0 to 318054
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User     314055 non-null  object
 1   Content  318055 non-null  object
 2   Date     318055 non-null  object
 3   Favs     318055 non-null  int64 
 4   RTs      318055 non-null  int64 
 5   Name     318055 non-null  object
 6   Party    318055 non-null  object
dtypes: int64(2), object(5)
memory usage: 17.0+ MB


## Restrict tweets to 2021-2022


In [3]:
## Filter dates from 2021-2022

start_date = '2021-01-01 00:00:00+00:00'
end_date   = '2022-01-20 00:00:00+00:00'

# Window is exclusive of start_date and inclusive of end_date. `Date` is
# compared against the bound strings directly (the column is listed as
# object dtype in the info() output above).
tweet_dates = all_congress_tweets['Date']
mask = tweet_dates.gt(start_date) & tweet_dates.le(end_date)

all_tweets = all_congress_tweets[mask]
all_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266955 entries, 0 to 317693
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User     262955 non-null  object
 1   Content  266955 non-null  object
 2   Date     266955 non-null  object
 3   Favs     266955 non-null  int64 
 4   RTs      266955 non-null  int64 
 5   Name     266955 non-null  object
 6   Party    266955 non-null  object
dtypes: int64(2), object(5)
memory usage: 16.3+ MB


## Separate Dem and GOP Tweets


In [4]:
# Republican tweets only
is_gop = all_tweets['Party'] == 'R'
just_gop_df = all_tweets[is_gop]
just_gop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38782 entries, 0 to 53963
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   User     34782 non-null  object
 1   Content  38782 non-null  object
 2   Date     38782 non-null  object
 3   Favs     38782 non-null  int64 
 4   RTs      38782 non-null  int64 
 5   Name     38782 non-null  object
 6   Party    38782 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.4+ MB


In [5]:
# Democratic tweets only
is_dem = all_tweets['Party'] == 'D'
just_dem_df = all_tweets[is_dem]
just_dem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228173 entries, 54128 to 317693
Data columns (total 7 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User     228173 non-null  object
 1   Content  228173 non-null  object
 2   Date     228173 non-null  object
 3   Favs     228173 non-null  int64 
 4   RTs      228173 non-null  int64 
 5   Name     228173 non-null  object
 6   Party    228173 non-null  object
dtypes: int64(2), object(5)
memory usage: 13.9+ MB


In [None]:
#testdf = all_tweets.head()

## Cleaning Data

In [6]:
# Built once when the cell runs rather than on every call. The character
# class is the same as before except that an exact duplicate of the
# U+2702-U+27B0 range was dropped; the range comments were corrected.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flag pairs)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad: circled letters up to enclosed ideographs (also spans CJK, Hangul, ...)
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every code point above the Basic Multilingual Plane
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of emoji/symbol characters deleted.

    Characters are removed, not replaced, so the whitespace around a removed
    emoji is left as it was. ASCII text and typographic quotes such as
    U+2019 are outside the matched ranges and pass through unchanged.
    """
    return _EMOJI_PATTERN.sub('', string)


def removeRT(text):
    """Strip the retweet marker and all '#' characters from a tweet.

    The first 'RT @' becomes a bare '@', surrounding whitespace is trimmed,
    and then every '#' is deleted (the hashtag's word is kept).
    """
    without_marker = text.replace('RT @', '@', 1)
    trimmed = without_marker.strip()
    return trimmed.replace('#', '')

def clean_text(text):
    """Normalise one raw tweet.

    In order: strip the retweet marker and '#' characters, delete emojis,
    lowercase, drop text inside square brackets, delete ASCII punctuation,
    and delete every token that contains a digit.
    """
    # Retweet marker and emojis first, then case-fold
    text = remove_emoji(removeRT(text)).lower()

    # Text in square brackets (non-greedy, so each bracket pair separately)
    text = re.sub(r'\[.*?\]', '', text)

    # Delete each character listed in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Words containing numbers
    return re.sub(r'\w*\d\w*', '', text)



In [None]:
#testdf_clean = pd.DataFrame(testdf.Content.apply(lambda x: clean_text(x)))
#testdf_clean

In [7]:
## Clean Dem Tweets
# Apply clean_text to every tweet and keep the result as a one-column frame
dem_clean = just_dem_df['Content'].apply(clean_text).to_frame()
dem_clean

Unnamed: 0,Content
54128,one wisconsin senator me voted to deliver mil...
54129,this is a horrible tragedy that demands a thor...
54130,i am proud to announce that wisconsin is recei...
54131,reshadhudson senatorbaldwin reacts to supreme ...
54132,shout out to milwaukees bronzeville neighborho...
...,...
317689,is also bringing us the continued dominance o...
317690,congratulations to ukfootball on their taxslay...
317691,i feel compelled to note that the senate major...
317692,if mitch doesn’t want to send out checks to a...


In [8]:
## Clean Gop Tweets
# Apply clean_text to every tweet and keep the result as a one-column frame
gop_clean = just_gop_df['Content'].apply(clean_text).to_frame()
gop_clean

Unnamed: 0,Content
0,biden let illegal immigrants enter our country...
1,if youre in a position to donate blood please ...
2,the biden admin is sitting on covid tests whi...
3,biden had the audacity to go on a reckless spe...
4,gopoversight dr fauci shut down debate about t...
...,...
53959,pursuant to the us constitution state legislat...
53960,in battleground states signature verification ...
53961,usarmy selfless service \n\nsunrise at arling...
53962,richardgrenell washington is so out of touch


In [9]:
## Clean all combined tweets
# Apply clean_text to every tweet and keep the result as a one-column frame
all_tweets_clean = all_tweets['Content'].apply(clean_text).to_frame()
all_tweets_clean


Unnamed: 0,Content
0,biden let illegal immigrants enter our country...
1,if youre in a position to donate blood please ...
2,the biden admin is sitting on covid tests whi...
3,biden had the audacity to go on a reckless spe...
4,gopoversight dr fauci shut down debate about t...
...,...
317689,is also bringing us the continued dominance o...
317690,congratulations to ukfootball on their taxslay...
317691,i feel compelled to note that the senate major...
317692,if mitch doesn’t want to send out checks to a...


## Lemmatization to get word roots

In [10]:
## Lemmatization 

# lru_cache ships with the standard library on Python 3 (this notebook uses
# f-strings); the third-party functools32 package is a Python 2 backport.
from functools import lru_cache

from nltk.stem import WordNetLemmatizer

# NOTE(review): `wnl` and `lemmatize` are not referenced by any other cell in
# view (the cells below lemmatize with spaCy); confirm they are still needed.
wnl = WordNetLemmatizer()
lemmatize = lru_cache(maxsize=50000)(wnl.lemmatize)

# spaCy pipeline used by lemmatizer() below
nlp = spacy.load("en_core_web_sm")
def lemmatizer(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the lemmas
    are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))


In [11]:
# Lemmatize the combined tweets, then delete any '-PRON-' placeholder text
lemmatized_all = all_tweets_clean['Content'].apply(lemmatizer)
all_tweets_clean = lemmatized_all.to_frame()
all_tweets_clean['Content'] = all_tweets_clean['Content'].str.replace('-PRON-', '')
all_tweets_clean.head()

Unnamed: 0,Content
0,biden let illegal immigrant enter our country ...
1,if you re in a position to donate blood please...
2,the biden admin be sit on covid test while p...
3,biden have the audacity to go on a reckless sp...
4,gopoversight dr fauci shut down debate about t...


In [13]:
## Lemmatize Dem Tweet data
# Lemmatize each tweet, then delete any '-PRON-' placeholder text
lemmatized_dem = dem_clean['Content'].apply(lemmatizer)
dem_clean = lemmatized_dem.to_frame()
dem_clean['Content'] = dem_clean['Content'].str.replace('-PRON-', '')
dem_clean.head()

Unnamed: 0,Content
54128,one wisconsin senator I vote to deliver mill...
54129,this be a horrible tragedy that demand a thoro...
54130,I be proud to announce that wisconsin be recei...
54131,reshadhudson senatorbaldwin react to supreme c...
54132,shout out to milwaukee bronzeville neighborhoo...


In [12]:
## Lemmatize Gop Tweet data
# Lemmatize each tweet, then delete any '-PRON-' placeholder text
lemmatized_gop = gop_clean['Content'].apply(lemmatizer)
gop_clean = lemmatized_gop.to_frame()
gop_clean['Content'] = gop_clean['Content'].str.replace('-PRON-', '')
gop_clean.head()

Unnamed: 0,Content
0,biden let illegal immigrant enter our country ...
1,if you re in a position to donate blood please...
2,the biden admin be sit on covid test while p...
3,biden have the audacity to go on a reckless sp...
4,gopoversight dr fauci shut down debate about t...


In [14]:
## Write out post-lemmatization data
# index=False: the original row index is not saved with the text
dem_clean.to_csv('data/dem_clean.csv', encoding='utf-8', index=False)
gop_clean.to_csv('data/gop_clean.csv', encoding='utf-8', index=False)
all_tweets_clean.to_csv('data/all_tweets_clean.csv', encoding='utf-8', index=False)

# Modeling

In [18]:
## n-grams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_n_gram(corpus, ng, n=None):
    """Return the most frequent n-grams in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    ng : tuple of (int, int)
        ``ngram_range`` passed to CountVectorizer, e.g. ``(2, 2)`` for bigrams.
    n : int, optional
        How many n-grams to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, count) tuples, most frequent first. Stop words from the
    notebook-level ``stop_words`` collection are excluded.
    """
    # A list is the collection type documented for `stop_words`.
    vec = CountVectorizer(stop_words=list(stop_words), ngram_range=ng)
    # fit_transform learns the vocabulary and counts in one pass instead of
    # tokenising the corpus twice with fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]



In [22]:
# Republican Bigrams
common_words = get_n_gram(gop_clean['Content'], ng=(2, 2), n=10)
bigrams = pd.DataFrame(data=common_words, columns=['bigram', 'count'])
bigrams

Unnamed: 0,bigram,count
0,small business,365
1,southern border,321
2,american people,301
3,man woman,290
4,united states,289
5,vaccine mandate,261
6,year ago,235
7,proud join,234
8,border crisis,233
9,look forward,231


In [21]:
# Republican Trigrams
common_words = get_n_gram(gop_clean['Content'], ng=(3, 3), n=10)
trigram = pd.DataFrame(data=common_words, columns=['trigram', 'count'])
trigram

Unnamed: 0,trigram,count
0,brave man woman,99
1,crisis southern border,91
2,supply chain crisis,78
3,chinese communist party,69
4,colleague send letter,56
5,law enforcement officer,53
6,congressional art competition,53
7,high school student,52
8,look forward work,50
9,capitol police officer,49


In [24]:
# Democrat Bigrams
common_words = get_n_gram(dem_clean.Content,(2,2), 10)
# This cell counts bigrams, so the frame and its column are named to match
# (both were previously labelled "trigram").
bigrams = pd.DataFrame(common_words, columns = ['bigram' , 'count'])
bigrams

Unnamed: 0,trigram,count
0,bipartisan infrastructure,2592
1,health care,2504
2,year ago,2453
3,buildbackbetter act,1796
4,small business,1778
5,american rescue,1763
6,rescue plan,1759
7,make sure,1641
8,climate change,1585
9,child care,1581


In [23]:
# Democrat Trigrams
common_words = get_n_gram(dem_clean['Content'], ng=(3, 3), n=10)
trigram = pd.DataFrame(data=common_words, columns=['trigram', 'count'])
trigram

Unnamed: 0,trigram,count
0,american rescue plan,1707
1,child tax credit,1273
2,bipartisan infrastructure law,566
3,john lewis voting,496
4,infrastructure investment jobs,455
5,investment jobs act,455
6,lewis voting rights,421
7,telephone town hall,399
8,bipartisan infrastructure deal,383
9,look forward work,321


In [25]:
# All tweet Trigrams
common_words = get_n_gram(all_tweets_clean['Content'], ng=(3, 3), n=10)
trigram = pd.DataFrame(data=common_words, columns=['trigram', 'count'])
trigram

Unnamed: 0,trigram,count
0,american rescue plan,1729
1,child tax credit,1282
2,bipartisan infrastructure law,573
3,john lewis voting,503
4,infrastructure investment jobs,462
5,investment jobs act,462
6,telephone town hall,433
7,lewis voting rights,427
8,bipartisan infrastructure deal,390
9,look forward work,371


In [None]:


def remove_emoji(string):
    """Delete every run of emoji/symbol characters from ``string``.

    The matched characters are removed outright; nothing is inserted in
    their place.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    # Assemble one character class out of all the ranges above
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', string)

def removeRT(text):
    """Replace the first 'RT @' with '@', trim whitespace, then delete all '#' characters."""
    head_fixed = re.sub('RT @', '@', text, count=1)
    return head_fixed.strip().replace('#', '')

def clean_text(text):
    """Normalise one raw tweet and drop English stop words.

    In order: delete hashtags, strip the retweet marker, delete emojis and
    @mentions, lowercase, drop [bracketed] text, punctuation and tokens that
    contain a digit, then remove NLTK's English stop words. The surviving
    words are returned joined by single spaces.
    """
    # Remove hashtags. This has to happen before removeRT(), which deletes
    # every '#' and would leave this pattern with nothing to match.
    text = re.sub("#[A-Za-z0-9_]+", "", text)

    # Remove RT
    text = removeRT(text)

    # Remove emojis
    text = remove_emoji(text)

    # Remove mentions
    text = re.sub("@[A-Za-z0-9_]+", "", text)

    # Make lowercase
    text = text.lower()

    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove stop words. The import cell rebinds the bare name `stopwords` to
    # a list, so `stopwords.words(...)` raises AttributeError; go through
    # nltk.corpus instead, and build the set once rather than once per word.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in english_stopwords)



In [None]:
# NOTE(review): the only assignment to `testdf` in view is commented out
# earlier in this notebook, so this cell would raise NameError on a fresh
# kernel. Confirm.
testdf.Content

In [None]:
import re
import string

def remove_emoji(string):
    """Return ``string`` with all runs of emoji/symbol characters removed."""
    emoji_class = (
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"
        "\u3030"
        "]+"
    )
    # Substitute the empty string for every match of the class
    return re.sub(emoji_class, '', string, flags=re.UNICODE)

def removeRT(text):
    """Drop the leading retweet marker and every '#' from ``text``.

    Only the first 'RT @' is rewritten (to '@'); the result is stripped of
    surrounding whitespace before the '#' characters are deleted.
    """
    rewritten = text.replace('RT @', '@', 1).strip()
    return "".join(ch for ch in rewritten if ch != '#')

def clean_text(text):
    """Normalise one raw tweet and drop English stop words.

    In order: delete hashtags, strip the retweet marker, delete emojis and
    @mentions, lowercase, drop [bracketed] text, punctuation and tokens that
    contain a digit, then remove NLTK's English stop words. The surviving
    words are returned joined by single spaces.
    """
    # Remove hashtags. This has to happen before removeRT(), which deletes
    # every '#' and would leave this pattern with nothing to match.
    text = re.sub("#[A-Za-z0-9_]+", "", text)

    # Remove RT
    text = removeRT(text)

    # Remove emojis
    text = remove_emoji(text)

    # Remove mentions
    text = re.sub("@[A-Za-z0-9_]+", "", text)

    # Make lowercase
    text = text.lower()

    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove stop words. The import cell rebinds the bare name `stopwords` to
    # a list, so `stopwords.words(...)` raises AttributeError; go through
    # nltk.corpus instead, and build the set once rather than once per word.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in english_stopwords)

# Clean the `Content` column of `testdf` into a one-column frame.
# NOTE(review): the only assignment to `testdf` in view is commented out
# earlier in this notebook. Confirm it is defined before running this.
testdf_clean = pd.DataFrame(testdf.Content.apply(lambda x: clean_text(x)))

In [None]:
## Cleaning tweets data
# NOTE(review): this cleans `testdf`, not the combined GOP and Dem tweets that
# the original heading described, and it repeats the last line of the
# previous cell. Confirm which frame was intended.
testdf_clean = pd.DataFrame(testdf.Content.apply(lambda x: clean_text(x)))

In [None]:
## Cleaning GOP tweets data
# Run clean_text over every GOP tweet; keep the result as a one-column frame
clean_gop_tweets = just_gop_df['Content'].apply(clean_text).to_frame()

In [None]:
## Cleaning Dem tweets data
# Run clean_text over every Dem tweet; keep the result as a one-column frame
clean_dem_tweets = just_dem_df['Content'].apply(clean_text).to_frame()

In [None]:
# NOTE(review): no cell in view assigns `tweets_df_clean` before this point,
# and the cells below read a `text` column while the frames above use
# `Content`. Confirm where this frame comes from.
tweets_df_clean

## Reality Check: Size of Corpus?

In [None]:
word_corpus = tweets_df_clean.text

# split() with no argument collapses runs of whitespace, so repeated spaces
# are not counted as empty "words" the way split(' ') counts them.
corpuslen = sum(len(doc.split()) for doc in word_corpus)
print(f'Total words in corpus: {corpuslen}')

In [None]:
import spacy
import en_core_web_sm

# spaCy pipeline loaded from the installed en_core_web_sm package
nlp = en_core_web_sm.load()


def lemmatizer(text):
    """Run ``text`` through ``nlp`` and return its lemmas joined by spaces."""
    lemmas = [token.lemma_ for token in nlp(text)]
    return " ".join(lemmas)
# Lemmatize every document, then delete any '-PRON-' placeholder text
tweets_df_clean = tweets_df_clean['text'].apply(lemmatizer).to_frame()
tweets_df_clean['text'] = tweets_df_clean['text'].str.replace('-PRON-', '')

In [None]:
## Functionalized NLP pipeline

from sklearn.decomposition import LatentDirichletAllocation

def getTopics(df, min_df, max_df, max_features, n_topics=10, random_state=20):
    """Vectorise ``df.text`` into word counts and fit an LDA topic model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the documents as ``df.text``.
    min_df, max_df : int or float
        Document-frequency bounds forwarded to CountVectorizer.
    max_features : int
        Upper bound on the vocabulary size.
    n_topics : int, default 10
        Number of LDA components.
    random_state : int, default 20
        Seed for the LDA fit.

    Returns
    -------
    tuple
        ``(lda_model, vectorizer, data_matrix, lda_output)``: the fitted
        model, the fitted vectorizer, the document-term matrix and the
        document-topic matrix returned by ``fit_transform``.
    """
    ## Vectorization
    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=min_df,                    # drop terms found in too few documents
        max_df=max_df,                    # drop terms found in too many documents (was accepted but never passed on)
        stop_words=list(stop_words),      # a list is the documented collection type
        lowercase=True,                   # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric characters
        max_features=max_features,        # max number of unique words
    )

    data_matrix = vectorizer.fit_transform(df.text)

    ## Modeling
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,      # Number of topics
        learning_method='online',
        random_state=random_state,
        n_jobs=-1,                  # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_matrix)

    return lda_model, vectorizer, data_matrix, lda_output


# Fit the topic model on the lemmatized tweets and keep all four results for
# the visualisation and topic-listing cells below
lda_model, vectorizer, data_matrix, lda_output = getTopics(tweets_df_clean, min_df=3, max_df=.7, max_features=5000)


## Visualize Topics

In [None]:
# Render the fitted topics inline using pyLDAvis' scikit-learn adapter
# NOTE(review): newer pyLDAvis releases may expose this adapter under a
# different module name. Confirm against the installed version.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer, mds='tsne')

## List Top 10 Words for Each Topic

In [None]:
# Fetch the vocabulary once; calling get_feature_names() inside the
# comprehension rebuilt the full list for every printed word. Prefer
# get_feature_names_out() where the installed scikit-learn provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(lda_model.components_):
    print(f'Top 10 words for topic #{topic_idx}:')
    # argsort is ascending, so the highest-weighted word is printed last
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')

## Look at Unigrams, Bigrams and Trigrams

In [None]:
## Unigrams
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``.

    Delegates to get_n_gram (defined in the n-grams cell earlier in this
    notebook) with ``ngram_range=(1, 1)`` instead of repeating its body.
    Returns a list of ``(word, count)`` tuples, most frequent first.
    """
    return get_n_gram(corpus, (1, 1), n)


common_words = get_top_n_words(tweets_df_clean.text, 10)
unigram = pd.DataFrame(common_words, columns = ['unigram' , 'count'])

In [None]:
# Display the top unigram counts
unigram

In [None]:
## bigrams
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus``.

    Delegates to get_n_gram (defined in the n-grams cell earlier in this
    notebook) with ``ngram_range=(2, 2)`` instead of repeating its body.
    Returns a list of ``(bigram, count)`` tuples, most frequent first.
    """
    return get_n_gram(corpus, (2, 2), n)


common_words = get_top_n_bigram(tweets_df_clean.text, 10)
bigram = pd.DataFrame(common_words, columns = ['bigram' , 'count'])

In [None]:
# Display the top bigram counts
bigram

In [None]:
## trigrams
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams in ``corpus``.

    Delegates to get_n_gram (defined in the n-grams cell earlier in this
    notebook) with ``ngram_range=(3, 3)`` instead of repeating its body.
    Returns a list of ``(trigram, count)`` tuples, most frequent first.
    """
    return get_n_gram(corpus, (3, 3), n)


common_words = get_top_n_trigram(tweets_df_clean.text, 10)
trigram = pd.DataFrame(common_words, columns = ['trigram' , 'count'])

In [None]:
# Display the top trigram counts
trigram