# Political Influencers

A look at how connected legislators are to one another through the tweets they post on Twitter.

***

In [1]:
import pandas as pd
import numpy as np
import json
import sys
import os

In [2]:
# Base path from which the src/ and data/ locations are built
# (presumably the repository root relative to this notebook — TODO confirm).
ROOT_DIR = "../../"

In [4]:
# Put the project's src/ directory at the front of the import path, then load the helpers.
sys.path.insert(0, os.path.abspath(os.path.join(ROOT_DIR, "src")))
# NOTE(review): star import kept as-is; explore_data is the only helper used in the cells shown here.
from utils import *

## Look at data

In [5]:
# All input files live under <ROOT_DIR>data/.
DATA_DIR = f"{ROOT_DIR}data/"

# Tabular inputs: current legislators (CSV) and a US newspaper sample (TSV).
legislators_fname = f"{DATA_DIR}legislators-current.csv"
newspaper_fname = f"{DATA_DIR}usnewspaperSample.tsv"

# Twitter inputs. The users file is probably unnecessary: the tweets file
# already carries enough per-user information to make up for it.
users_fname = f"{DATA_DIR}users.json"
tweets_fname = f"{DATA_DIR}tweets.json"

In [6]:
# Summarise the legislators CSV; the output lists each column's name, dtype and a sample value.
explore_data(legislators_fname)

Column name                        Type           Sample
------------------------------------------------------------
last_name--------------------------object---------Stabenow
first_name-------------------------object---------Debbie
middle_name------------------------object---------Ann
suffix-----------------------------object---------nan
nickname---------------------------object---------nan
full_name--------------------------object---------Debbie Stabenow
birthday---------------------------object---------1950-04-29
gender-----------------------------object---------F
type-------------------------------object---------sen
state------------------------------object---------MI
district---------------------------float64--------nan
senate_class-----------------------int64----------1
party------------------------------object---------Democrat
url--------------------------------object---------https://www.stabenow.senate.gov
address----------------------------object---------731 Hart Senate Offic

In [7]:
# Summarise the newspaper sample TSV; the output lists each column's name, dtype and a sample value.
explore_data(newspaper_fname)

Column name                        Type           Sample
------------------------------------------------------------
id---------------------------------int64----------4230319
publishdate------------------------object---------2020-09-08
title------------------------------object--------- As Brexit Deadlines Loom, the Posturing and Bic
news-------------------------------object---------LONDON — Bellicose threats to walk away from the


In [8]:
# Summarise the users JSON; the output lists each column's name, dtype and a sample value.
explore_data(users_fname)

Column name                        Type           Sample
------------------------------------------------------------
contributors_enabled---------------bool-----------False
created_at-------------------------datetime64[ns]-2009-04-10 13:38:03
default_profile--------------------bool-----------False
default_profile_image--------------bool-----------False
description------------------------object---------Proudly serving California's San Fernando Valley
entities---------------------------object---------{'description': {'urls': []}, 'url': {'urls': [{'display_url': 'BradSherman.house.gov', 'expanded_url': 'http://BradSherman.house.gov', 'indices': [0, 23], 'url': 'https://t.co/SyFUwfPtR8'}]}}
favourites_count-------------------int64----------0
follow_request_sent----------------bool-----------False
followers_count--------------------int64----------23126
following--------------------------bool-----------False
friends_count----------------------int64----------512
geo_enabled-----------------

In [9]:
# Summarise the tweets JSON; the output lists each column's name, dtype and a sample value.
explore_data(tweets_fname)

Column name                        Type           Sample
------------------------------------------------------------
contributors-----------------------float64--------nan
coordinates------------------------float64--------nan
created_at-------------------------datetime64[ns]-2008-08-28 18:38:20
display_text_range-----------------object---------[0, 80]
entities---------------------------object---------{'hashtags': [], 'symbols': [], 'urls': [], 'user_mentions': []}
favorite_count---------------------int64----------0
favorited--------------------------bool-----------False
geo--------------------------------float64--------nan
id---------------------------------int64----------901977122
id_str-----------------------------int64----------901977122
in_reply_to_screen_name------------float64--------nan
in_reply_to_status_id--------------float64--------nan
in_reply_to_status_id_str----------float64--------nan
in_reply_to_user_id----------------float64--------nan
in_reply_to_user_id_str----------

***

# v0.1

## Create jupyter notebook for retrieving twitter handles from legislators
[Issue](https://github.com/codencoding/political-influencers/issues/1)

In [6]:
# Load the legislators CSV into a DataFrame.
legis_df = pd.read_csv(legislators_fname)

In [7]:
# Peek at the first few Twitter handles.
legis_df["twitter"].head()

0    SenSherrodBrown
1    SenatorCantwell
2      SenatorCardin
3      SenatorCarper
4        SenBobCasey
Name: twitter, dtype: object

In [8]:
# Percentage of legislator rows that have no Twitter handle on record.
missing_handles = legis_df["twitter"].isna().sum()
na_perc = missing_handles / len(legis_df) * 100
print(f"{round(na_perc, 3)}% of legislator twitter handles are na.")

2.235% of legislator twitter handles are na.


### Load tweets

In [7]:
# Read only the first 5 records of the line-delimited tweets JSON as a quick sample.
tweet_df = pd.read_json(tweets_fname, lines=True, nrows=5)

In [8]:
# Display the sample frame.
tweet_df

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,favorite_count,favorited,geo,id,id_str,...,is_quote_status,lang,place,retweet_count,retweeted,screen_name,source,text,truncated,user_id
0,,,2008-08-04 17:28:51,"[0, 74]","{'hashtags': [], 'symbols': [], 'urls': [], 'u...",0,False,,877418565,877418565,...,False,en,,0,False,JohnBoozman,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",On a conference call about a weekend trip to I...,False,5558312
1,,,2008-08-06 19:04:45,"[0, 25]","{'hashtags': [], 'symbols': [], 'urls': [], 'u...",0,False,,879618172,879618172,...,False,nl,,0,False,JohnBoozman,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Being interviewed by KTHV,False,5558312
2,,,2008-08-06 20:35:36,"[0, 65]","{'hashtags': [], 'symbols': [], 'urls': [], 'u...",0,False,,879695803,879695803,...,False,en,,0,False,JohnBoozman,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Being interviewed by KARN on his Arkansas Worl...,False,5558312
3,,,2008-08-07 13:52:52,"[0, 37]","{'hashtags': [], 'symbols': [], 'urls': [], 'u...",0,False,,880393665,880393665,...,False,en,,0,False,JohnBoozman,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","On KWHN in Fort Smith, that's 1320 AM",False,5558312
4,,,2008-08-07 15:12:05,"[0, 90]","{'hashtags': [], 'symbols': [], 'urls': [], 'u...",0,False,,880474266,880474266,...,False,en,,0,False,JohnBoozman,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Attending a Military of the Purple Heart Cerem...,False,5558312


### Fetch unique twitter handles from tweets dataset

In [68]:
# Stream the tweets file in 10,000-row chunks and accumulate every distinct screen_name.
unique_handles = set()
for tweet_chunk in pd.read_json(tweets_fname, lines=True, chunksize=10000):
    unique_handles.update(tweet_chunk["screen_name"].unique())

In [80]:
# Share of legislator rows whose Twitter handle appears among the tweet authors.
# NOTE(review): the numerator walks the *unique* handles while the denominator is the
# row count, so this is a per-legislator ratio only if no handle is repeated.
handles_found = sum(handle in unique_handles for handle in legis_df["twitter"].unique())
handles_found / len(legis_df)

0.5661080074487895

**Note:** only about 57% of legislators (0.566) are present in the tweets dataset.

### Create tweets subset for only legislators present in the dataset
~~That way, the data can be uploaded to github, and later used in neo4j.~~

The subset is stored locally in the neo4j data directory.

In [61]:
# Collect every tweet authored by a known legislator handle as a list of dicts
# (one dict per tweet), streaming the tweets file in 1,000-row chunks.
legis_tweets = []
# Drop missing handles first: a NaN left in the set could match tweets whose
# screen_name is itself missing.
legislator_names = set(legis_df.twitter.dropna().unique())
loaded_rows = 0

# Single-pass character cleanup for tweet text: straight and curly quotes are
# removed and backslashes become forward slashes (presumably so the text can be
# loaded into neo4j without escaping issues — TODO confirm).
_TEXT_CLEANUP = str.maketrans({"'": None, '"': None, "’": None, "\\": "/"})

for chunk in pd.read_json(tweets_fname, lines=True, chunksize=1000):
    loaded_rows += chunk.shape[0]
    # '\r' keeps the progress counter on a single, continuously overwritten line.
    print(f"Loaded {loaded_rows} rows", end='\r')

    # Filter the whole chunk at once instead of calling chunk.iloc[...] per row.
    matched = chunk[chunk.screen_name.isin(legislator_names)]
    for tweet in matched.to_dict(orient="records"):
        tweet["text"] = tweet["text"].translate(_TEXT_CLEANUP)
        # Hoist the nested entity lists to top-level keys.
        tweet["htags"] = tweet["entities"]["hashtags"]
        tweet["user_mentions"] = tweet["entities"]["user_mentions"]

        legis_tweets.append(tweet)

Loaded 1243370 rows

In [62]:
# Build one DataFrame from the collected tweet dicts (one row per dict).
legis_tweets_df = pd.DataFrame(legis_tweets)

In [65]:
# Persist the legislator-only tweet subset as CSV, without the index column.
# This cell previously used `data_dir`, which no visible cell defines (NameError on
# a fresh kernel); it now uses the DATA_DIR constant from the configuration cell.
# NOTE(review): the markdown above mentions the neo4j data directory — if that is
# the intended destination, point a dedicated constant at it instead.
legis_tweets_df.to_csv(DATA_DIR + "legis_tweets.csv", index=False)

## Create a legislator schema for neo4j
[Issue](https://github.com/codencoding/political-influencers/issues/2)

### Schema

- The tweets and legislators datasets join on `tweets.screen_name = legislators.twitter`.
- From the matched tweet rows, extract the `entities`, `in_reply_to_screen_name`, `created_at`, and `text` columns.

### Join legislators and tweets via SQL
```sql
SELECT entities, in_reply_to_screen_name, text, created_at
FROM tweets AS tw
RIGHT JOIN legislators AS lg
ON tw.screen_name = lg.twitter
```