In [7]:
from TwitterAPI import TwitterAPI
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import process
import os
from bs4 import BeautifulSoup as bs
import requests

# Import a list of states
from states import states as all_states
all_states = [ii.lower() for ii in all_states.values()]

## Congresspeople Twitter
First we'll pull the twitter handles for active congresspeople from a list curated by [CSPAN](http://twitter.com/CSPAN)

In [2]:
# Connect to twitter
keys = dict(consumer_key=os.environ['TWITTER_KEY'],
            consumer_secret=os.environ['TWITTER_SECRET'],
            access_token_key=os.environ['TWITTER_ACCESS_TOKEN_KEY'],
            access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
api = TwitterAPI(**keys)

In [3]:
# Find the ID for this list
slug = "members-of-congress"
handle = "cspan"
resp = api.request('lists/ownerships', params=dict(screen_name=handle))
this_list = [ii for ii in resp.json()['lists'] if ii['slug'] == slug][0]
this_list_id = this_list['id']

In [4]:
# Now pull the actual list
resp = api.request("lists/members",
                   params=dict(list_id=this_list_id, count=1000))

In [5]:
# Remove text relating just to the job title
jobs = {'senator': ['senate', 'sen'],
        'representative': ['representative', 'rep']}

# Prep names for matching
congresspeople_twitter = []
for person in resp:
    handle = person['screen_name']
    name = person['name']

    congresspeople_twitter.append(dict(name=name.lower(),
                                  handle=handle))

# Collect and save    
congresspeople_twitter = pd.DataFrame(congresspeople_twitter)
congresspeople_twitter.to_csv('../data/congressperson_twitter.csv')

## Congresspeople Information
Now we'll pull information about each congressperson's state / party affiliation. We'll use the information stored in the (excellent) website [ballotpedia](https://ballotpedia.org/List_of_current_members_of_the_U.S._Congress)

In [8]:
url = "https://ballotpedia.org/List_of_current_members_of_the_U.S._Congress"
congress = requests.get(url)

In [9]:
# Use beautifulsoup to parse the output and collect information we want
html = bs(congress.text, 'html5lib')
tables = html.find_all('table', attrs={'class': 'wikitable'})

congresspeople_info = []
for body, table in zip(['senate', 'house'], tables):
    people = table.find_all('tr') 
    for person in people[1:]:
        name, yrs, party, state, end = person.find_all('td')
        name, yrs, party, state, end = [ii.text.strip().lower()
                                        for ii in (name, yrs, party, state, end)]
        congresspeople_info.append(dict(name=name, yrs=yrs, body=body,
                                        party=party, state=state, end=end))
congresspeople_info = pd.DataFrame(congresspeople_info)

# Now save to disk
congresspeople_info.to_csv('../data/congressperson_info.csv')

## Putting them together

In [25]:
twitter = pd.read_csv('../data/congressperson_twitter.csv', index_col=0)
info = pd.read_csv('../data/congressperson_info.csv', index_col=0)

# Split state and district
for ii, row in info.iterrows():
    if row['body'] == 'house':
        state = [istate for istate in all_states if istate in row['state']]
        state = None if len(state) == 0 else state[0]
        
        district = row['state'].split('district')[-1].split(' ')[-1]
        info.loc[ii, 'state'] = state
        info.loc[ii, 'district'] = district

# Remove unnecessary text
remove_text = ['senator', 'rep.', 'u.s.']
for txt in remove_text:
    twitter['name'] = twitter['name'].str.replace(txt, '')
twitter['name'] = twitter['name'].str.strip()

In [27]:
# Find and merge the twitter information and the rep information
# We'll use the `fuzzywuzzy` package for this.
twitter_iter = twitter.copy()
person_names = info['name'].values

# On each step, fuzzywuzzy will pick the 'best' name match
# We'll on keep it if we're very confident in the match
# On each iteration, we'll loosen this confidence level
confidence_requirements = [90, 85, 80, 75]
matches = []
for conf in confidence_requirements:
    for name in tqdm(person_names):
        best_name, best_conf = process.extractBests(name, twitter_iter['name'].values)[0]
        if best_conf > conf:
            # Find the handle for this name
            handle = twitter_iter.query('name == @best_name')['handle'].values[0]
            
            # Remove that entry from names
            twitter_names = twitter_iter[twitter_iter['name'].values != best_name]
            person_names = person_names[person_names != name]
            matches.append({'name': name,
                            'twitter_name': best_name,
                            'conf': best_conf,
                            'handle': handle})
matches = pd.DataFrame(matches)

100%|██████████| 533/533 [00:32<00:00, 16.38it/s]
100%|██████████| 86/86 [00:06<00:00,  5.92it/s]
100%|██████████| 36/36 [00:05<00:00,  6.76it/s]
100%|██████████| 28/28 [00:01<00:00, 16.25it/s]


In [28]:
# Put it all together
info_with_twitter = info.merge(matches, how='outer', on='name').sort_values('conf')

# Make sure missing states are `None`
for ii, person in info_with_twitter.iterrows():
    if person['state'] not in all_states:
        info_with_twitter.loc[ii, 'state'] = None

In [29]:
# Now save to disk
percent_matches = 1 - pd.isnull(info_with_twitter['handle']).sum() / float(info_with_twitter.shape[0])
print('Percent people matched: {:.2f}%'.format(percent_matches * 100))

info_with_twitter.to_csv('../data/congressperson_all_info.csv')

Percent people matched: 95.31%
