In [10]:
import json
import re

import contractions
import pandas as pd

In [31]:
# load raw tweets

df = pd.read_csv("raw_tweets.csv")

In [32]:
# open json file consisting of policies and its sub-groups

with open("data.json") as json_file:
    categories = json.load(json_file)
categories

{'Social Policies': {'All': 'all',
  'LGBTQ': ['transphobia',
   'sexual identity',
   'homophobia',
   'trans',
   'LGBTQ',
   'biphobia',
   'lgbtq'],
  'internet': ['broadband', 'internet'],
  'women health, and rights': ['abortion',
   'maternal',
   'momnibus',
   'reproductive',
   'birth',
   'breast',
   'pregnancy',
   'pregnant'],
  'substance and mental health': ['mental',
   'suicide',
   'restoringhope',
   'marijuana',
   'substance'],
  'guns and assault weapons': ['guns',
   'arms',
   'shooter',
   'assault',
   'weapons',
   'gun'],
  'immigration': ['southern',
   'immigration',
   'migrant',
   'border',
   'immigrants'],
  'energy': ['gallon',
   'fossil',
   'electric',
   'gas',
   'fuel',
   'oil',
   'pump',
   'renewable',
   'energy',
   'petroleum'],
  'climate change': ['greenhouse',
   'fossil',
   'climate',
   'gas',
   'emissions',
   'carbon']},
 'Geo Political Policies': {'All': 'all',
  'china': ['china', 'ccp', 'communist', 'chinese'],
  'chips scie

### Clean the Tweets

In [33]:
def contractors(text):
    """Cleaning the texts, non-alphanumeric letters are removed
    including those in shortened words such as can't, won't, etc.
    This function expands these words.
    """

    # creating an empty list
    expanded_words = []

    for word in text.split():
        # using contractions.fix to expand the shortened words
        expanded_words.append(contractions.fix(word))

    expanded_text = " ".join(expanded_words)
    return expanded_text


def clean_tweets(tweet: str):
    """This function cleans the tweets
    Attrs
    ---------
    input: str
    tweet
    Returns
    ---------
    output: str
    clean tweet
    """

    tweet = contractors(tweet)
    tweet = tweet.lower()
    tweet = re.sub("@[^\s]+", "", tweet)  # remove twitter handlers
    # tweet = re.sub(r'\B#\S+','',tweet)  # remove hashtags
    tweet = re.sub(r"http\S+", "", tweet)  # Remove URLS
    tweet = re.sub(
        r"\s+", " ", tweet, flags=re.I
    )  # substitute multiple spaces with single space
    tweet = " ".join(re.findall(r"\w+", tweet))  # remove all the special characters
    tweet = re.sub(r"(^| ).(( ).)*( |$)", " ", tweet)  # remove all single characters

    return tweet

In [34]:
# clean the tweets

df["clean_text"] = df["tweet"].apply(clean_tweets)

### Identify Tweet Categories

In [35]:
def get_policy_cat(text):
    """This function searches through a tweet text and categorizes it into
    their respective categories, e.g., geo-political or social, and further
    break down into sub-categories, e.g., climate change, etc.
    """

    social_policy = ""
    geopolitical_policy = ""
    policies = ""
    policy_cat = ""

    for policy_type in categories:

        for policy in categories[policy_type]:
            if policy != "All":
                search = "|".join([f"{p} " for p in categories[policy_type][policy]])
                regexp = re.search(r"%s" % search, text, re.I)

                if regexp:
                    policies += policy + "|"

                    if policy_type in policy_cat:
                        pass
                    else:
                        policy_cat += policy_type + " "
                        if policy_type == "Social Policies":
                            social_policy += policy_type
                        if policy_type == "Geo Political Policies":
                            geopolitical_policy += policy_type
            else:
                pass

    return pd.Series([social_policy, geopolitical_policy, policies.split("|")])

In [36]:
# get_policy_cat will create three columns in the dataframe
df[["social_policy", "geopolitical_policy", "policies"]] = df.clean_text.apply(
    get_policy_cat
)

# drop empty policies
df = df[df["policies"].map(lambda text: len(text)) > 1]

In [37]:
df.head(5)

Unnamed: 0,tweet_id,username,party,tweet,clean_text,favorite_count,retweet_count,created_at,source,social_policy,geopolitical_policy,policies
3,1583131771671318528,RepAdams,D,"""Oversight Committee and Black Maternal Health...",oversight committee and black maternal health ...,25,16,2022-10-20 16:23:22+00:00,Twitter Web App,Social Policies,,"[women health, and rights, ]"
16,1577015599234183168,RepAdams,D,"Some companies introduced ""shoot now, pay late...",some companies introduced shoot now pay later ...,14,6,2022-10-03 19:19:53+00:00,Twitter Web App,Social Policies,,"[guns and assault weapons, ]"
21,1575523426848342016,RepAdams,D,Members of the @HouseGOP are doubling down on ...,members of the are doubling down on the extrem...,11,8,2022-09-29 16:30:31+00:00,TweetDeck,Social Policies,,"[women health, and rights, ]"
30,1573300633343508480,RepAdams,D,"Earlier this year, I spoke with @WRAL about th...",earlier this year spoke with about the black m...,17,6,2022-09-23 13:17:56+00:00,Twitter Web App,Social Policies,,"[women health, and rights, ]"
35,1571959264276520960,RepAdams,D,Today is National Stillbirth Prevention Day - ...,today is national stillbirth prevention day an...,30,14,2022-09-19 20:27:48+00:00,Twitter Web App,Social Policies,,"[women health, and rights, ]"


### Compute Sentiments
**Vader**

Vader Sentiment Analysis returns probabilities of given sentences as positive, negative, or neutral and according to developers it's attuned to sentiments expressed in social media


In [38]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [39]:
sid = SIA()

In [40]:
def compute_sentiments(df):
    """Function which computes the sentiments of a dataframe texts."""

    df["sentiments"] = df["clean_text"].apply(
        lambda x: sid.polarity_scores(" ".join(re.findall(r"\w+", x.lower())))
    )

    # extract scores of sentiments. 0.00001 added incase of a score of 0
    df["positive_sentiment"] = df["sentiments"].apply(
        lambda x: x["pos"] + 1 * (10**-6)
    )
    df["neutral_sentiment"] = df["sentiments"].apply(
        lambda x: x["neu"] + 1 * (10**-6)
    )
    df["negative_sentiment"] = df["sentiments"].apply(
        lambda x: x["neg"] + 1 * (10**-6)
    )
    df["compound_sentiment"] = df["sentiments"].apply(
        lambda x: x["compound"] + 1 * (10**-6)
    )
    df["sentiment_text"] = df["compound_sentiment"].apply(
        lambda x: "positive" if x > 0.05 else ("negative" if x < -0.05 else "neutral")
    )
    df.drop(columns=["sentiments"], inplace=True)

    print("Finished computing sentiment analysis \n")

    return df

In [41]:
# calculate sentiment scores
df = compute_sentiments(df)


Finished computing sentiment analysis 



In [44]:
df.shape

(118538, 17)

In [45]:
df.to_csv('clean_tweets.csv', index=False)