In [21]:
import json
import re
import warnings

import contractions
import pandas as pd

# Blanket filter: every warning category is silenced for the rest of the
# session, including pandas warnings raised by later cells.
warnings.filterwarnings("ignore")

In [22]:
# load raw tweets
# Reads "raw_tweets.csv" from the current working directory; later cells
# use its "tweet" column as the text to clean.

df = pd.read_csv("raw_tweets.csv")

In [23]:
# Load the policy taxonomy from data.json: a nested mapping of
# policy type -> policy name -> list of keywords (plus an "All" placeholder).

# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open("data.json", encoding="utf-8") as json_file:
    categories = json.load(json_file)
categories

{'Social Policies': {'All': 'all',
  'LGBTQ Community': ['transphobia',
   'sexual identity',
   'homophobia',
   'trans',
   'LGBTQ',
   'biphobia',
   'lgbtq'],
  'Broadband Internet': ['broadband', 'internet'],
  'Abortion': ['abortion'],
  'Substance Abuse and Mental Health': ['mental',
   'suicide',
   'marijuana',
   'substance'],
  'Gun Control': ['guns', 'arms', 'shooter', 'assault', 'weapons', 'gun'],
  'Immigration and Border Control': ['southern',
   'immigration',
   'migrant',
   'border',
   'immigrants'],
  'Fossil Fuels': ['fossil', 'oil', 'petroleum'],
  'Climate Change': ['greenhouse', 'climate', 'emissions', 'carbon']},
 'Geo Political Policies': {'All': 'all',
  'Chinese Communist Party': ['china', 'ccp', 'communist', 'chinese'],
  'CHIPS and Science Act': ['science act', 'chips act', 'chips'],
  'Taiwan': ['taiwan'],
  'Ukraine-Russia': ['vladimir',
   'invasion',
   'putin',
   'zelensky',
   'russia',
   'ukraine',
   'ukrainian',
   'russian',
   'nord stream',


### Clean the Tweets

In [24]:
def contractors(text):
    """Expand contractions (e.g. can't, won't) found in ``text``.

    The text is split on whitespace, each token is passed through
    ``contractions.fix``, and the results are re-joined with single spaces.
    Running this before punctuation is stripped keeps shortened words from
    being broken apart at the apostrophe.
    """
    return " ".join(contractions.fix(token) for token in text.split())


def clean_tweets(tweet: str):
    """Normalize a raw tweet into lowercase, space-separated word tokens.

    Steps, in order: expand contractions, lowercase, remove @handles,
    remove http(s) URLs, collapse whitespace, keep only word-character
    tokens, and finally remove single-character tokens.

    Attrs
    ---------
    tweet: str
        raw tweet text
    Returns
    ---------
    output: str
        cleaned tweet. Where a single-character token was removed at the
        start or end of the text, a leading/trailing space is left behind.
    """

    tweet = contractors(tweet)
    tweet = tweet.lower()
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    tweet = re.sub(r"@[^\s]+", "", tweet)  # remove twitter handles
    # Hashtags are not removed; the word extraction below strips the "#"
    # but keeps the tag text.
    tweet = re.sub(r"http\S+", "", tweet)  # remove URLs
    tweet = re.sub(r"\s+", " ", tweet)  # collapse whitespace runs to one space
    tweet = " ".join(re.findall(r"\w+", tweet))  # drop all special characters
    # Each match (a run of single-character tokens plus its surrounding
    # spaces) is replaced by one space.
    tweet = re.sub(r"(^| ).(( ).)*( |$)", " ", tweet)

    return tweet

In [25]:
# clean the tweets
# Stores the output of clean_tweets for every raw "tweet" in "clean_text"
# (replacing that column if it already exists).

df["clean_text"] = df["tweet"].apply(clean_tweets)

### Identify Tweet Categories

In [26]:
def get_policy_cat(text, policy_categories=None):
    """Tag a tweet with the policy types and policies whose keywords it mentions.

    Matching is case-insensitive and on whole keywords: a keyword only counts
    when it is delimited by word boundaries, so "arms" does not match inside
    "farms", and a keyword at the very end of the text is still found.

    Parameters
    ----------
    text : str
        Tweet text to search.
    policy_categories : dict, optional
        Mapping ``{policy_type: {policy_name: [keyword, ...]}}``. When
        omitted, the notebook-level ``categories`` mapping is used. The
        ``"All"`` entry of every policy type is skipped.

    Returns
    -------
    pd.Series
        Three values: ``"Social Policies"`` or ``""``,
        ``"Geo Political Policies"`` or ``""``, and the list of matched policy
        names followed by one trailing ``""``. The trailing empty string is
        kept on purpose: the notebook filters rows with ``len(policies) > 1``.
    """
    if policy_categories is None:
        policy_categories = categories

    matched_types = []
    matched_policies = []

    for policy_type, policy_keywords in policy_categories.items():
        for policy, keywords in policy_keywords.items():
            if policy == "All":
                continue
            if isinstance(keywords, str):
                keywords = [keywords]
            if not keywords:
                # An empty alternation would match at any word boundary.
                continue

            # Group the alternatives so both \b anchors apply to every
            # keyword, and escape them so they are matched literally.
            alternatives = "|".join(re.escape(keyword) for keyword in keywords)
            if re.search(r"\b(?:%s)\b" % alternatives, text, re.I):
                matched_policies.append(policy)
                if policy_type not in matched_types:
                    matched_types.append(policy_type)

    social_policy = "Social Policies" if "Social Policies" in matched_types else ""
    geopolitical_policy = (
        "Geo Political Policies" if "Geo Political Policies" in matched_types else ""
    )

    return pd.Series([social_policy, geopolitical_policy, matched_policies + [""]])

In [27]:
# get_policy_cat returns three values per tweet; they become three columns
policy_columns = ["social_policy", "geopolitical_policy", "policies"]
df[policy_columns] = df["clean_text"].apply(get_policy_cat)

# drop tweets without a policy: their "policies" list holds a single
# (empty-string) element, so only lists longer than one are kept
has_policy = df["policies"].map(len) > 1
df = df[has_policy]

### Flattening the DataFrame

Some tweets belong to more than one policy, e.g., a tweet may discuss both Ukraine-Russia and fossil fuels.
The code below emits one row per (tweet, policy) pair, so a tweet that matches several policies appears several times and the dataframe grows accordingly.

In [28]:
# Words that keep their casing as written (i.e. are not capitalized) if
# policy names are title-cased.
articles = ["and", "LGBTQ", "the", "of"]

In [29]:
# Preview the first five tagged tweets.
df.head(5)

Unnamed: 0,tweet_id,username,party,tweet,clean_text,favorite_count,retweet_count,created_at,source,social_policy,geopolitical_policy,policies
16,1577015599234183168,RepAdams,D,"Some companies introduced ""shoot now, pay late...",some companies introduced shoot now pay later ...,14,6,2022-10-03 19:19:53+00:00,Twitter Web App,Social Policies,,"[Gun Control, ]"
40,1570042526119219201,RepAdams,D,MAGA Republicans want a nationwide abortion ba...,maga republicans want nationwide abortion ban ...,12,4,2022-09-14 13:31:22+00:00,Twitter for iPhone,Social Policies,,"[Abortion, ]"
42,1569760198784421889,RepAdams,D,The #InflationReductionAct is historic legisla...,the inflationreductionact is historic legislat...,11,2,2022-09-13 18:49:30+00:00,Twitter Web App,Social Policies,,"[Climate Change, ]"
51,1565390497111281667,RepAdams,D,Two simple things Congress should do:\n\nRaise...,two simple things congress should do raise the...,27,5,2022-09-01 17:25:52+00:00,Twitter for iPhone,Social Policies,,"[Gun Control, ]"
61,1562467900803653632,RepAdams,D,Beginning Soon: our town hall will explain rec...,beginning soon our town hall will explain rece...,4,5,2022-08-24 15:52:31+00:00,Twitter Web App,Social Policies,,"[Abortion, ]"


In [30]:
# Policy names of each policy type, without the "All" placeholder entry.
social_policies = [name for name in categories["Social Policies"] if name != "All"]
geo_political_policies = [
    name for name in categories["Geo Political Policies"] if name != "All"
]

# Fixed seed so the final shuffle gives the same row order on every run.
SHUFFLE_SEED = 42

# Stringify the list-valued column once (not once per policy) so that the
# vectorized string search below can be used.
df["policies"] = df["policies"].apply(str)

policies_list = []
cats = [social_policies, geo_political_policies]
for cat in cats:
    for policy in sorted(cat, key=str.lower):
        # repr(policy) is the quoted form the name takes inside the
        # stringified list, so this is a literal, exact-element match rather
        # than a regex/substring match on the bare name.
        has_policy = df["policies"].str.contains(repr(policy), regex=False)
        # assign() returns a new frame, so no column is written onto a slice.
        cat_df = df[has_policy].assign(policy=policy)
        policies_list.append(cat_df)

# One row per (tweet, policy) pair, in shuffled order.
dfs = pd.concat(policies_list)
dfs = dfs.sample(frac=1, random_state=SHUFFLE_SEED)

# twitter_data = dfs[
#     [
#         "username",
#         "party",
#         "clean_text",
#         "compound_sentiment",
#         "govtrack_class",
#         "policy",
#     ]
# ]
# twitter_data.to_csv('/home/denniesbor/Dropbox/twitter_nlp_project2/data/twitter_data.csv', index=False)

In [31]:
# Write the flattened, shuffled frame to "clean_tweets.csv" in the current
# working directory, omitting the index column.
dfs.to_csv("clean_tweets.csv", index=False)