In [117]:
import os
import re
import json
import datetime
from datetime import date
from contextlib import contextmanager
import contractions

import snscrape.modules.twitter as sntwitter
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [136]:
nltk.download('stopwords')

# stopwords
stopwords_set = set(stopwords.words('english'))

# update stopwords set
stopwords_set.update(['s', 'will', 'amp',
                      'must', 'rt', 'american',
                      'americans', 're', '000'
                      ])
sid = SIA()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/denniesbor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Collection

Snscrape library is utilized to collect data from Twitter. 
A JSON file is loaded consisting of the two main categories in the study: geopolitical and social policies. 
Furthermore, in each category tree are the sub-groups and their associated keywords applied in text filtering.

In [206]:
# opening JSON file
with open('data.json') as json_file:
    categories = json.load(json_file)
categories

{'Social Policies': {'All': 'all',
  'LGBTQ': ['transphobia',
   'sexual identity',
   'homophobia',
   'trans',
   'LGBTQ',
   'biphobia',
   'lgbtq'],
  'internet': ['broadband', 'internet'],
  'women health, and rights': ['abortion',
   'maternal',
   'momnibus',
   'reproductive',
   'birth',
   'breast',
   'pregnancy',
   'pregnant'],
  'substance and mental health': ['mental',
   'suicide',
   'restoringhope',
   'marijuana',
   'substance'],
  'guns and assault weapons': ['guns',
   'arms',
   'shooter',
   'assault',
   'weapons',
   'gun'],
  'immigration': ['southern',
   'immigration',
   'migrant',
   'border',
   'immigrants'],
  'energy': ['gallon',
   'fossil',
   'electric',
   'gas',
   'fuel',
   'oil',
   'pump',
   'renewable',
   'energy',
   'petroleum'],
  'climate change': ['greenhouse',
   'fossil',
   'climate',
   'gas',
   'emissions',
   'carbon']},
 'Geo Political Policies': {'All': 'all',
  'china': ['china', 'ccp', 'communist', 'chinese'],
  'chips scie

#### Get Twitter Handles of Legislators 

In [145]:
# link us reps website
url = "https://pressgallery.house.gov/member-data/members-official-twitter-handles"


def get_house_reps():
    '''A function which scrapes US representatives and their Twitter handles

    ------------
    attributes

    return: list
    list of tuples of each house rep and the party
    ------------
    '''

    # read the housereps and pass into a dataframe
    print('***Fetching house reps ***')
    dfs = pd.read_html(url)
    print('***House reps response received***')
    house_reps = dfs[0]

    # make the first row as columns
    house_reps.columns = house_reps.iloc[1]

    df = house_reps.drop(index=0, inplace=False)[['Twitter Handle', 'Party']]
    df['Twitter Handle'] = df['Twitter Handle'].str.replace('@', '')

    # create list of tuples from the columns of dataframes
    house_rep_lists = list(zip(df['Twitter Handle'], df.Party))

    return house_rep_lists

In [146]:
get_house_reps()

***Fetching house reps ***
***House reps response received***


[('Twitter Handle', 'Party'),
 ('RepAdams', 'D'),
 ('Robert_Aderholt', 'R'),
 ('RepPeteAguilar', 'D'),
 ('RepRickAllen', 'R'),
 ('RepColinAllred', 'D'),
 ('MarkAmodeiNV2', 'R'),
 ('RepArmstrongND', 'R'),
 ('RepArrington', 'R'),
 ('RepAuchincloss', 'D'),
 ('RepCindyAxne', 'D'),
 ('RepBrianBabin', 'R'),
 ('RepDonBacon', 'R'),
 ('RepJimBaird', 'R'),
 ('RepBalderson', 'R'),
 ('RepJimBanks', 'R'),
 ('RepAndyBarr', 'R'),
 ('RepBarragan', 'D'),
 ('RepKarenBass', 'D'),
 ('RepBeatty', 'D'),
 ('RepBentz', 'R'),
 ('RepBera', 'D'),
 ('RepJackBergman', 'R'),
 ('RepDonBeyer', 'D'),
 ('RepBice', 'R'),
 ('RepAndyBiggsAZ', 'R'),
 ('RepGusBilirakis', 'R'),
 ('RepDanBishop', 'R'),
 ('SanfordBishop', 'D'),
 ('repblumenauer', 'D'),
 ('RepLBR', 'D'),
 ('RepBoebert', 'R'),
 ('RepBonamici', 'D'),
 ('RepBost', 'R'),
 ('RepBourdeaux', 'D'),
 ('RepBowman', 'D'),
 ('CongBoyle', 'D'),
 ('RepKevinBrady', 'R'),
 ('RepMoBrooks', 'R'),
 ('RepAnthonyBrown', 'D'),
 ('RepShontelBrown', 'D'),
 ('RepBrownley', 'D'),
 ('Ver

In [147]:
def get_time_delta(update: int) -> int:
    ''' Get the time delta of the tweets to be scraped. Initializing the database
    is set to the first Jan of 2021. A user can specify the time delta to fetch the tweets since today

    input update: int

    returns -> int
        time delta in unix
    '''
    
    # date time
    date_from = datetime.datetime(2021, 1, 1)
    date_now = datetime.datetime.now()
    delta = (date_now - date_from).days

    if(update):
        delta = update

    time_delta1 = datetime.timedelta(days=delta)
    date_since = date_now-time_delta1

    # extract unix time
    unix = datetime.datetime.timestamp(date_since)
    
    return unix

In [148]:
def contractors(text):
    '''Cleaning the texts, non-alphanumeric letters are removed
    including those in shortened words such as can't, won't, etc.
    This function expands these words.
    '''

    # creating an empty list
    expanded_words = []
    
    for word in text.split():
        # using contractions.fix to expand the shortened words
        expanded_words.append(contractions.fix(word))

    expanded_text = ' '.join(expanded_words)
    return expanded_text


def clean_tweets(tweet:str):
    """ This function cleans the tweets
    Attrs
    ---------
    input: str
    tweet
    Returns
    ---------
    output: str
    clean tweet 
    """

    tweet = contractors(tweet)
    tweet = tweet.lower()
    tweet = re.sub('@[^\s]+', '', tweet)  # remove twitter handlers
    # tweet = re.sub(r'\B#\S+','',tweet)  # remove hashtags
    tweet = re.sub(r"http\S+", "", tweet)  # Remove URLS
    tweet = re.sub(r'\s+', ' ', tweet, flags=re.I)  # substitute multiple spaces with single space
    tweet = ' '.join(re.findall(r'\w+', tweet))  # remove all the special characters
    tweet = re.sub(r'(^| ).(( ).)*( |$)', ' ', tweet)  #remove all single characters

    return tweet

In [188]:
def get_policy_cat(text):
    '''This function searches through a tweet text and categorizes it into
    their respective categories, e.g., geo-political or social, and further
    break down into sub-categories, e.g., climate change, etc.
    '''

    social_policy = ''
    geopolitical_policy = ''
    policies = ''
    policy_cat = ''

    for policy_type in categories:

        for policy in categories[policy_type]:
            if policy != 'All':
                search = '|'.join([f"{p} " for p in categories[policy_type][policy]])
                regexp = re.search(r'%s' % search, text, re.I)

                if regexp:
                    policies += policy+'|'

                    if policy_type in policy_cat:
                        pass
                    else:
                        policy_cat += policy_type+' '
                        if(policy_type == 'Social Policies'):
                            social_policy += policy_type
                        if(policy_type == 'Geo Political Policies'):
                            geopolitical_policy += policy_type
            else:
                pass

    return pd.Series([social_policy, geopolitical_policy, policies.split('|')])

In [189]:
def compute_sentiments(df):
    """Function which computes the sentiments of a dataframe texts. 
    """
    
    df['sentiments'] = df['clean_text'].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+', x.lower()))))

    # extract scores of sentiments. 0.00001 added incase of a score of 0
    df['positive_sentiment'] = df['sentiments'].apply(lambda x: x['pos']+1*(10**-6))
    df['neutral_sentiment'] = df['sentiments'].apply(lambda x: x['neu']+1*(10**-6))
    df['negative_sentiment'] = df['sentiments'].apply(lambda x: x['neg']+1*(10**-6))
    df['compound_sentiment'] = df['sentiments'].apply(lambda x: x['compound']+1*(10**-6))
    df['sentiment_text'] = df['compound_sentiment'].apply(lambda x: 'positive' if x > 0.05 else ('negative' if x < -0.05 else 'neutral'))
    df.drop(columns=['sentiments'], inplace=True)

    print('Finished computing sentiment analysis \n')

    return df

In [190]:
def fetch_tweets(username, party, update=False):
    ''' A function that fetch tweets from a user and return as pandas DF
    '''

    unix = get_time_delta(update)

    tweet_list = []
    compile = re.compile(r'^RT ')

    print(f"Fetching tweets of {username}")
    # get tweets
    for tweet_obj in sntwitter.TwitterSearchScraper(f'from:{username}').get_items():

        created_at = tweet_obj.date  # utc time tweet created
        tweet = tweet_obj.rawContent  # tweet
        unix_created = datetime.datetime.timestamp(created_at)

        if (not re.search(compile, tweet)) and (unix_created >= unix):
            tweet_list.append(
                dict(
                    tweet_id=tweet_obj.id,
                    username=tweet_obj.user.username,
                    party=party,
                    tweet=tweet,
                    favorite_count=tweet_obj.likeCount,
                    retweet_count=tweet_obj.retweetCount,
                    created_at=created_at,
                    source=tweet_obj.sourceLabel
                    ))
        else:
            break

    if tweet_list == []:
        print('Empty Tweets')
        return
    else:

        # create dataframe
        df = pd.DataFrame(tweet_list)
        df['clean_text'] = df['tweet'].apply(clean_tweets)
        print('finished cleaning tweets')

        df[['social_policy', 'geopolitical_policy', 'policies']] = df.clean_text.apply(get_policy_cat)

        # drop empty policies
        df = df[df['policies'].map(lambda text: len(text)) > 1]

        df = compute_sentiments(df)
        return df

In [196]:
def tweets(update_time: int) -> pd.DataFrame:
    
    '''export tweets into a pandas dataframe for analysis
    '''
    
    dfs = []
    house_reps = get_house_reps()
    for user, party in house_reps[0:]:

        df = fetch_tweets(user, party, update=update_time)

        try:
            if df:
                pass
        except ValueError:
          dfs.append(df)
    if len(dfs)>1:
        df = pd.concat(dfs)
        df.reset_index(inplace=True, drop=True)
        return df

In [197]:
# fetch tweets from the past 90 days

df = tweets(90)

df.to_csv('tweets.csv', index=False)

df = pd.read_csv('tweets.csv')

# evaluate string into list
df.policies.apply(eval)

***Fetching house reps ***
***House reps response received***
Fetching tweets of Twitter Handle
Empty Tweets
Fetching tweets of RepAdams
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of Robert_Aderholt
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepPeteAguilar
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepRickAllen
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepColinAllred
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of MarkAmodeiNV2
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepArmstrongND
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepArrington
finished cleaning tweets
Finished computing sentiment analysis 

Fetching tweets of RepAuchincloss
finished cleaning tweets
Finished computing sentiment analysis 

Fetching t

0                              [LGBTQ, ]
1             [energy, climate change, ]
2           [women health, and rights, ]
3           [guns and assault weapons, ]
4           [women health, and rights, ]
                      ...               
13509                    [immigration, ]
13510                    [immigration, ]
13511                    [immigration, ]
13512    [substance and mental health, ]
13513                    [immigration, ]
Name: policies, Length: 13514, dtype: object

In [198]:
df.shape

(13514, 17)

In [199]:
df.head(5)

Unnamed: 0,tweet_id,username,party,tweet,favorite_count,retweet_count,created_at,source,clean_text,social_policy,geopolitical_policy,policies,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment,sentiment_text
0,1594716768500686849,RepAdams,D,My statement on the tragic events in Colorado ...,9,5,2022-11-21 15:38:00+00:00,Twitter Web App,my statement on the tragic events in colorado ...,Social Policies,,"['LGBTQ', '']",0.039001,0.563001,0.399001,-0.972899,negative
1,1587829863980883968,RepAdams,D,I am fighting Big Oil by taking on unlawful pr...,13,7,2022-11-02 15:31:54+00:00,Twitter Web App,am fighting big oil by taking on unlawful pri...,Social Policies,,"['energy', 'climate change', '']",1e-06,0.885001,0.115001,-0.490199,negative
2,1583131771671318528,RepAdams,D,"""Oversight Committee and Black Maternal Health...",21,16,2022-10-20 16:23:22+00:00,Twitter Web App,oversight committee and black maternal health ...,Social Policies,,"['women health, and rights', '']",0.045001,0.955001,1e-06,0.177901,positive
3,1577015599234183168,RepAdams,D,"Some companies introduced ""shoot now, pay late...",14,6,2022-10-03 19:19:53+00:00,Twitter Web App,some companies introduced shoot now pay later ...,Social Policies,,"['guns and assault weapons', '']",0.119001,0.796001,0.084001,0.381801,positive
4,1575523426848342016,RepAdams,D,Members of the @HouseGOP are doubling down on ...,11,8,2022-09-29 16:30:31+00:00,TweetDeck,members of the are doubling down on the extrem...,Social Policies,,"['women health, and rights', '']",0.157001,0.700001,0.143001,0.128001,positive


In [200]:
df.loc[4, 'tweet']

"Members of the @HouseGOP are doubling down on the extreme MAGA agenda:\n\n🚨 Criminalizing reproductive care\n🚨 Raising seniors' drug prices and slashing Social Security &amp; Medicare\n🚨 Attacking our Democracy &amp; Government"

### To Do List -  BoxPlot

In [204]:
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'seaborn'

ModuleNotFoundError: No module named 'matplotlib'