In [1]:
import pandas as pd
import json
import emoji

# Loading Data

In [2]:
# Data directories used throughout the notebook, expressed relative to the
# notebook's working directory. The sample folder sits inside the created one.
CREATED_DIR = "../data/created/"
CREATED_SAMPLE_DIR = CREATED_DIR + "sample/"
SOURCE_DIR = "../data/source/"

In [3]:
# Column labels assigned to the raw tweet file, in file order (passed to
# read_csv via `names=`).
headers = [
    "twitter_handle",
    "tweet_content",
    "tweet_id",
    "geolocation_of_tweet",
    "language_of_tweet",
    "date_and_time",
]

raw_tweets_path = CREATED_DIR + "congress_tweets_raw.csv"
congress_tweets = pd.read_csv(raw_tweets_path, names=headers, low_memory=False)

# Cleaning

## Dropping NA Values

In [5]:
# Discard rows missing either the tweet id or the tweet text, then renumber
# the index from zero.
required_columns = ["tweet_id", "tweet_content"]
congress_tweets = (
    congress_tweets
    .dropna(axis="index", subset=required_columns)
    .reset_index(drop=True)
)

## Missing Twitter Handle

In [6]:
# Keep only rows whose handle is a string shorter than 30 characters.
# NOTE(review): presumably longer values come from malformed rows where other
# text landed in the handle column -- confirm against the raw file.
# The isinstance guard matters because the earlier dropna does not cover
# `twitter_handle`: a NaN (float) handle would make a bare len() raise
# TypeError. Such rows are dropped instead.
has_valid_handle = (
    congress_tweets["twitter_handle"]
    .map(lambda handle: isinstance(handle, str) and len(handle) < 30)
    .astype(bool)  # keeps the mask boolean even when the frame is empty
)
congress_tweets = congress_tweets[has_valid_handle].reset_index(drop=True)

## Filtering Out Twitter Handles Containing Emojis

In [7]:
# A handle is kept only if emoji.demojize() returns it unchanged, i.e. the
# handle contained nothing for demojize to rewrite.
handle_is_emoji_free = congress_tweets["twitter_handle"].map(
    lambda handle: handle == emoji.demojize(handle)
)
congress_tweets = congress_tweets[handle_is_emoji_free].reset_index(drop=True)

## Removing Duplicate Tweet IDs

In [8]:
# Keep a single row per tweet id (the first one encountered) and renumber
# the index from zero.
congress_tweets = (
    congress_tweets
    .drop_duplicates(subset=["tweet_id"], keep="first")
    .reset_index(drop=True)
)

# Adding Congress Member Metadata

In [9]:
# Reference table of congress members; the first CSV column is used as the
# row index.
member_data_path = SOURCE_DIR + "congress_member_data.csv"
congress_member_data = pd.read_csv(member_data_path, index_col=0)

# Column labels of the member table as a plain list.
cols = congress_member_data.columns.tolist()

In [10]:
# Pull three member columns out as parallel lists: position i in each list
# refers to the same row of the member table.
twitter, full_name, abbreviated_state = (
    list(congress_member_data[column])
    for column in ("twitter", "full_name", "state")
)
assert len(twitter) == len(full_name) == len(abbreviated_state)

# Author handle of every tweet, in row order. The lookups in later cells
# iterate over this list to build one metadata value per tweet row.
handles = list(congress_tweets["twitter_handle"])

## List of Full Names

In [11]:
# Lookup from twitter handle to the member's full name. If a handle appears
# more than once in the member table, the last row wins.
twitter_full_name = dict(zip(twitter, full_name))

# One full name per tweet row; a handle absent from the member table raises
# KeyError here.
full_names_tweets = list(map(twitter_full_name.__getitem__, handles))

congress_tweets = congress_tweets.copy()
congress_tweets["full_name_of_member"] = full_names_tweets

## Abbreviated State Name

In [12]:
# Lookup from twitter handle to the member's state abbreviation.
twitter_abbreviated_state = {
    member_handle: state_code
    for member_handle, state_code in zip(twitter, abbreviated_state)
}

# Resolve the state for every tweet row; an unknown handle raises KeyError.
abbreviated_state_tweets = []
for tweet_handle in handles:
    abbreviated_state_tweets.append(twitter_abbreviated_state[tweet_handle])

congress_tweets = congress_tweets.copy()
congress_tweets["abbreviated_state"] = abbreviated_state_tweets

## Unabbreviated State Name

In [13]:
# Load the state lookup, which is indexed below by state abbreviation.
# The `with` block closes the file as soon as it has been parsed; passing a
# bare open() to json.load would leave the handle open.
with open(SOURCE_DIR + "states.json", encoding="utf-8") as states_file:
    states = json.load(states_file)

# One unabbreviated state name per tweet row; an abbreviation missing from
# states.json raises KeyError.
unabbreviated_state_tweets = [states[x] for x in abbreviated_state_tweets]

congress_tweets = congress_tweets.copy()
congress_tweets["unabbreviated_state"] = unabbreviated_state_tweets

In [14]:
# Translate each member's type code into a readable title and attach it to
# every tweet row.
full_name = list(congress_member_data["full_name"])
type_of_member = list(congress_member_data["type"])

# Title for each recognised type code; members with any other code get no
# entry in the lookup.
title_by_code = {"sen": "Senator", "rep": "Representative"}

# NOTE(review): this lookup is keyed by full name, while the other metadata
# lookups in this notebook key by twitter handle; two members sharing a full
# name would collide here (last row wins).
full_name_type = {}
if len(full_name) == len(type_of_member):
    full_name_type.update(
        (name, title_by_code[code])
        for name, code in zip(full_name, type_of_member)
        if code in title_by_code
    )

member_name = list(congress_tweets["full_name_of_member"])

# A name without an entry in the lookup raises KeyError here.
type_of_member_tweets = [full_name_type[name] for name in member_name]

congress_tweets = congress_tweets.copy()
congress_tweets["type_of_member"] = type_of_member_tweets

## Adding Full Language Names

In [15]:
# Load the language lookup. The `with` block closes the file once parsed;
# passing a bare open() to json.load would leave the handle open.
with open(SOURCE_DIR + "languages.json", encoding="utf-8") as languages_file:
    languages = json.load(languages_file)

# Keys view of the lookup, used for membership tests on language codes.
all_language_codes = languages.keys()

In [16]:
# Replace each tweet's language code with the language's "name" entry from
# the lookup; codes not present in the lookup become "Undetermined".
language_of_tweet = list(congress_tweets["language_of_tweet"])

twitter_language = [
    languages[code]["name"] if code in all_language_codes else "Undetermined"
    for code in language_of_tweet
]

# NOTE(review): this overwrites the code column in place, so re-running the
# cell on already-converted values would likely map them to "Undetermined".
congress_tweets["language_of_tweet"] = twitter_language

## Party

In [17]:
# Lookup from twitter handle to party, built from the member table. `twitter`
# is the handle list extracted from the same table in an earlier cell.
party = list(congress_member_data["party"])
twitter_party = dict(zip(twitter, party))

# One party value per tweet row; an unknown handle raises KeyError.
party_of_member = [twitter_party[tweet_handle] for tweet_handle in handles]

congress_tweets = congress_tweets.copy()
congress_tweets["party_of_member"] = party_of_member

In [18]:
# Preview the first rows to eyeball the metadata columns added above.
congress_tweets.head()

Unnamed: 0,twitter_handle,tweet_content,tweet_id,geolocation_of_tweet,language_of_tweet,date_and_time,full_name_of_member,abbreviated_state,unabbreviated_state,type_of_member,party_of_member
0,RepEspaillat,Pass the #CASHAct Mitch! This has been a devas...,1344299700539764736,,English,2020-12-30 15:09:59+00:00,Adriano Espaillat,NY,New York,Representative,Democrat
1,RepEspaillat,News of the loss of Congressman-Elect Luke Let...,1344282084521684992,,English,2020-12-30 13:59:59+00:00,Adriano Espaillat,NY,New York,Representative,Democrat
2,RepEspaillat,#WearAMask wash your hands practice social dis...,1344115508933783554,,English,2020-12-30 02:58:04+00:00,Adriano Espaillat,NY,New York,Representative,Democrat
3,RepEspaillat,Can someone tell me why @senatemajldr continue...,1344075603742515201,,English,2020-12-30 00:19:30+00:00,Adriano Espaillat,NY,New York,Representative,Democrat
4,RepEspaillat,Delighted to join friends during the #NY13 Kwa...,1344014037588324352,,English,2020-12-29 20:14:51+00:00,Adriano Espaillat,NY,New York,Representative,Democrat


# Writing to CSV

In [19]:
# Write the processed tweets next to the other generated files. The path is
# built from CREATED_DIR rather than a user-specific absolute path so the
# notebook runs on any checkout of the project.
congress_tweets_processed_unlabeled = CREATED_DIR + "congress_tweets_processed_unlabeled.csv"

# index=False keeps the positional row index out of the CSV.
congress_tweets.to_csv(congress_tweets_processed_unlabeled, index=False)

In [20]:
# dtype name for each column of the processed tweet table, saved as JSON
# alongside the CSV.
d = {
    'twitter_handle': 'string',
    'tweet_content': 'string',
    'tweet_id': 'string',
    'geolocation_of_tweet': 'object',
    'language_of_tweet': 'string',
    'date_and_time': 'object',
    'full_name_of_member': 'string',
    'abbreviated_state': 'string',
    'unabbreviated_state': 'string',
    'type_of_member': 'string',
    'party_of_member': 'string'
}

# Built from CREATED_DIR rather than a user-specific absolute path so the
# notebook runs on any checkout of the project.
dtypes_json_path = CREATED_DIR + "tweet_column_data_types.json"
d_json = json.dumps(d)

# Plain write mode is enough: the file is only written, never read back here.
with open(dtypes_json_path, "w", encoding="utf-8") as dtypes_file:
    dtypes_file.write(d_json)