In [1]:
import datetime
import re

import pandas as pd
import snscrape
import snscrape.modules.twitter as sntwitter

### Get Twitter Handles of Legislators

In [2]:
# link us reps website
url = "https://pressgallery.house.gov/member-data/members-official-twitter-handles"


def get_house_reps():
    """A function which scrapes US representatives and their Twitter handles

    ------------
    attributes

    return: list
    list of tuples of each house rep and the party
    ------------
    """

    # read the housereps and pass into a dataframe
    print("***Fetching house reps ***")
    dfs = pd.read_html(url)
    print("***House reps response received***")
    house_reps = dfs[0]

    # make the first row as columns
    house_reps.columns = house_reps.iloc[1]

    df = house_reps.drop(index=0, inplace=False)[["Twitter Handle", "Party"]]
    df["Twitter Handle"] = df["Twitter Handle"].str.replace("@", "")

    # create list of tuples from the columns of dataframes
    house_rep_lists = list(zip(df["Twitter Handle"], df.Party))

    return house_rep_lists

### Clean the Tweets

In [3]:
def contractors(text):
    """Cleaning the texts, non-alphanumeric letters are removed
    including those in shortened words such as can't, won't, etc.
    This function expands these words.
    """

    # creating an empty list
    expanded_words = []

    for word in text.split():
        # using contractions.fix to expand the shortened words
        expanded_words.append(contractions.fix(word))

    expanded_text = " ".join(expanded_words)
    return expanded_text


def clean_tweets(tweet: str):
    """This function cleans the tweets
    Attrs
    ---------
    input: str
    tweet
    Returns
    ---------
    output: str
    clean tweet
    """

    tweet = contractors(tweet)
    tweet = tweet.lower()
    tweet = re.sub("@[^\s]+", "", tweet)  # remove twitter handlers
    # tweet = re.sub(r'\B#\S+','',tweet)  # remove hashtags
    tweet = re.sub(r"http\S+", "", tweet)  # Remove URLS
    tweet = re.sub(
        r"\s+", " ", tweet, flags=re.I
    )  # substitute multiple spaces with single space
    tweet = " ".join(re.findall(r"\w+", tweet))  # remove all the special characters
    tweet = re.sub(r"(^| ).(( ).)*( |$)", " ", tweet)  # remove all single characters

    return tweet

### Period to collect the tweets

In [4]:
def get_time_delta(start_date: str) -> int:
    """Get the time delta of the tweets to be scraped. Initializing the database
    is set to the first Jan of 2021. A user can specify the time delta to fetch the tweets since today

    input update: int

    returns -> int
        time delta in unix
    """

    # date time
    if start_date:
        date_from = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    else:
        date_from = datetime.datetime(2021, 1, 1)
    date_now = datetime.datetime.now()
    delta = (date_now - date_from).days

    time_delta1 = datetime.timedelta(days=delta)
    date_since = date_now - time_delta1

    # extract unix time
    unix = datetime.datetime.timestamp(date_since)

    return unix

In [5]:
def fetch_tweets(username, party, start_date=False):
    """A function that fetch tweets from a user and return as pandas DF"""

    unix = get_time_delta(start_date)

    tweet_list = []
    remove_rt = re.compile(r"^RT ")

    print(f"Fetching tweets of {username}")
    # get tweets
    for tweet_obj in sntwitter.TwitterSearchScraper(f"from:{username}").get_items():

        created_at = tweet_obj.date  # utc time tweet created
        tweet = tweet_obj.rawContent  # tweet
        unix_created = datetime.datetime.timestamp(created_at)

        if (not re.search(remove_rt, tweet)) and (unix_created >= unix):
            tweet_list.append(
                dict(
                    tweet_id=tweet_obj.id,
                    username=tweet_obj.user.username,
                    party=party,
                    tweet=tweet,
                    favorite_count=tweet_obj.likeCount,
                    retweet_count=tweet_obj.retweetCount,
                    created_at=created_at,
                    source=tweet_obj.sourceLabel,
                )
            )
        else:
            break

    if tweet_list == []:
        print("Empty Tweets")
        return
    else:

        # create dataframe
        df = pd.DataFrame(tweet_list)
        #         df["clean_text"] = df["tweet"].apply(clean_tweets)
        #         print("finished cleaning tweets")

        #         df[["social_policy", "geopolitical_policy", "policies"]] = df.clean_text.apply(
        #             get_policy_cat
        #         )

        #         # drop empty policies
        #         df = df[df["policies"].map(lambda text: len(text)) > 1]

        #         df = compute_sentiments(df)
        return df

### Export Raw Tweets

In [8]:
def tweets(start_date: str) -> pd.DataFrame:

    """export tweets into a pandas dataframe for analysis"""

    dfs = []
    house_reps = get_house_reps()
    for user, party in house_reps[0:]:
        try:

            df = fetch_tweets(user, party, start_date=start_date)

            try:
                if df:
                    pass
            except ValueError:
                dfs.append(df)
        except snscrape.base.ScraperException:
            break
            
    if len(dfs) > 1:
        df = pd.concat(dfs)
        df.reset_index(inplace=True, drop=True)
        return df

In [None]:
# fetch tweets from the past 90 days and export to a csv file

df = tweets(start_date="2021-01-01")

df.to_csv("tweetsRaw.csv", index=False)

***Fetching house reps ***
***House reps response received***
Fetching tweets of Twitter Handle
Empty Tweets
Fetching tweets of RepAdams
Fetching tweets of Robert_Aderholt
Fetching tweets of RepPeteAguilar
Fetching tweets of RepRickAllen
Fetching tweets of RepColinAllred
Fetching tweets of MarkAmodeiNV2
Fetching tweets of RepArmstrongND
Fetching tweets of RepArrington
Fetching tweets of RepAuchincloss
Fetching tweets of RepCindyAxne
Fetching tweets of RepBrianBabin
Fetching tweets of RepDonBacon
Fetching tweets of RepJimBaird
Fetching tweets of RepBalderson
Fetching tweets of RepJimBanks
Fetching tweets of RepAndyBarr
Fetching tweets of RepBarragan
Fetching tweets of RepKarenBass
Fetching tweets of RepBeatty
Fetching tweets of RepBentz
Fetching tweets of RepBera
Fetching tweets of RepJackBergman
Fetching tweets of RepDonBeyer
Fetching tweets of RepBice
Fetching tweets of RepAndyBiggsAZ
Fetching tweets of RepGusBilirakis
Fetching tweets of RepDanBishop
Fetching tweets of SanfordBishop
F