__Subject:__ Topics of Tweets of US Congress through the Lens of Campaign Finance

__Date:__ 09/12/2018

__Author:__ Edmund D. Chitwood
***

__Summary:__<br>
<br>This notebook
- loads metadata for each member of Congress from a previously downloaded CSV file,
- resolves discrepancies between the Twitter usernames listed in the Congress metadata and those in the Tweet data.
***

In [34]:
import pandas as pd
import pickle

In [35]:
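# The CSV file read below contains geographic, party, social media, etc. data
# about members of the US Congress. Download it manually and save it as
# 'legislators-current.csv' in the working directory before running. Source:
# https://theunitedstates.io/congress-legislators/legislators-current.csv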

In [44]:
# Load the legislators CSV (expected in the working directory) into a DataFrame
congress_metadata = pd.read_csv(filepath_or_buffer='legislators-current.csv')

# List the available columns
congress_metadata.columns

Index(['last_name', 'first_name', 'birthday', 'gender', 'type', 'state',
       'district', 'senate_class', 'party', 'url', 'address', 'phone',
       'contact_form', 'rss_url', 'twitter', 'facebook', 'youtube',
       'youtube_id', 'bioguide_id', 'thomas_id', 'opensecrets_id', 'lis_id',
       'fec_ids', 'cspan_id', 'govtrack_id', 'votesmart_id', 'ballotpedia_id',
       'washington_post_id', 'icpsr_id', 'wikipedia_id'],
      dtype='object')

In [45]:
# Mark Congress members for whom no Twitter username was listed with the
# sentinel string 'no twitter'.
# The filled column is assigned back to the DataFrame rather than calling
# `fillna(..., inplace=True)` on `congress_metadata.twitter`: that accessor can
# hand back a temporary Series, in which case (e.g. with pandas copy-on-write)
# the in-place fill never reaches `congress_metadata`.
congress_metadata['twitter'] = congress_metadata['twitter'].fillna('no twitter')

In [53]:
# Display the members whose Twitter username is the 'no twitter' placeholder
congress_metadata.loc[congress_metadata['twitter'].eq('no twitter')]

Unnamed: 0,last_name,first_name,birthday,gender,type,state,district,senate_class,party,url,...,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
37,Amash,Justin,1980-04-18,M,rep,MI,3.0,,Republican,https://amash.house.gov,...,N00031938,,H0MI03126,1033767.0,412438,105566.0,Justin Amash,,21143.0,Justin Amash
52,Bordallo,Madeleine,1933-05-31,F,rep,GU,0.0,,Democrat,https://bordallo.house.gov,...,N00024866,,H2GU00033,1003568.0,400041,1751.0,,,,Madeleine Bordallo
66,Cassidy,Bill,1957-09-28,M,sen,LA,,2.0,Republican,https://www.cassidy.senate.gov,...,N00030245,S373,"H8LA00017,S4LA00107",1030546.0,412269,69494.0,Bill Cassidy,,20919.0,Bill Cassidy
72,Clay,Wm.,1956-07-27,M,rep,MO,1.0,,Democrat,https://lacyclay.house.gov,...,N00012460,,H0MO01066,88332.0,400074,8967.0,William Lacy Clay,,20147.0,Lacy Clay
223,Peterson,Collin,1944-06-29,M,rep,MN,7.0,,Democrat,https://collinpeterson.house.gov,...,N00004558,,H2MN07014,23978.0,400316,26926.0,Collin Peterson,,29127.0,Collin Peterson
386,Kaine,Timothy,1958-02-26,M,sen,VA,,1.0,Democrat,https://www.kaine.senate.gov,...,N00033177,S362,S2VA00142,49219.0,412582,50772.0,Tim Kaine,,41305.0,Tim Kaine
463,Comer,James,1972-08-19,M,rep,KY,1.0,,Republican,https://comer.house.gov,...,N00038260,,H6KY01110,76619.0,412676,35169.0,James Comer Jr.,,21565.0,James Comer (politician)
523,Gianforte,Greg,1961-04-17,M,rep,MT,0.0,,Republican,https://gianforte.house.gov,...,N00040733,,H8MT01182,104895.0,412736,168594.0,Greg Gianforte,,,Greg Gianforte
533,Cloud,Michael,1975-05-13,M,rep,TX,27.0,,Republican,https://cloud.house.gov/,...,N00041882,,H8TX27049,10322.0,412746,177350.0,Michael Cloud (Texas),,,Michael Cloud
534,Kyl,Jon,1942-04-25,M,sen,AZ,,3.0,Republican,,...,N00006406,S243,S4AZ00030,,300062,26721.0,,,15429.0,Jon Kyl


In [46]:
# Lowercase Twitter usernames in order to join with Tweet data.
# The vectorized `.str.lower()` is used instead of `apply(lambda x: x.lower())`,
# which raises AttributeError as soon as one value is missing (NaN);
# `.str.lower()` leaves missing values as they are.
congress_metadata['twitter'] = congress_metadata['twitter'].str.lower()

In [47]:
# Import Tweets to compare usernames to Congress metadata Twitter usernames.
# NOTE(review): unpickling can execute arbitrary code, so only load
# 'preprocessed_tweets.pkl' from a trusted source. Presumably it is written by
# an earlier preprocessing step of this project — confirm.
tweets = pd.read_pickle('preprocessed_tweets.pkl')

In [48]:
# Preview the first rows of the Tweet data
tweets.head()

Unnamed: 0,username,date,retweets,favorites,text,mentions,hashtags,id,permalink,length,year,tokens,clean_text
0,amyklobuchar,2018-09-06 16:15:52,264,947,"“A popular Government, without popular informa...",,,1037736146019729409,https://twitter.com/amyklobuchar/status/103773...,199,2018,"[popular, government, without, popular, inform...",popular government without popular information...
2,amyklobuchar,2018-09-05 21:31:45,975,2657,Yes..Hidden in all the legal density are 3 key...,,#Pattern,1037453252495855616,https://twitter.com/amyklobuchar/status/103745...,340,2018,"[yeshidden, legal, density, key, consumer, iss...",yeshidden legal density key consumer issues fo...
3,amyklobuchar,2018-09-05 16:19:55,908,2522,Re Supreme Court: The 4 “confidential” 2002 do...,,,1037374780725440513,https://twitter.com/amyklobuchar/status/103737...,229,2018,"[supreme, court, confidential, docs, got, made...",supreme court confidential docs got made publi...
6,amyklobuchar,2018-09-04 22:38:52,109,880,A little break in my day to celebrate this...t...,,,1037107756409479168,https://twitter.com/amyklobuchar/status/103710...,238,2018,"[little, break, day, celebrate, thisthe, bigge...",little break day celebrate thisthe biggest fai...
8,amyklobuchar,2018-09-04 19:29:45,235,1611,"Thanks @FullFrontalSamB for watching, for spea...",@FullFrontalSamB,,1037060164673839104,https://twitter.com/amyklobuchar/status/103706...,209,2018,"[thanks, watching, speaking, making, hearing, ...",thanks watching speaking making hearing real d...


In [49]:
# Lowercase Twitter usernames in order to join with Congress metadata.
# The vectorized `.str.lower()` is used instead of `apply(lambda x: x.lower())`,
# which raises AttributeError as soon as one value is missing (NaN);
# `.str.lower()` leaves missing values as they are.
tweets['username'] = tweets['username'].str.lower()

In [50]:
# Gather the distinct Twitter usernames found in each data source
cm_usernames = set(congress_metadata['twitter'])
tweets_usernames = set(tweets['username'].unique())

# Usernames that appear in one source but not in the other
in_cm_not_tweets = cm_usernames.difference(tweets_usernames)
in_tweets_not_cm = tweets_usernames.difference(cm_usernames)

In [51]:
# Usernames listed in the Congress metadata that have no Tweets in the Tweet data
in_cm_not_tweets

{'blumenauermedia',
 'kilili_sablan',
 'no twitter',
 'repanthonybrown',
 'repbenraylujan',
 'repclayhiggins',
 'repconorlamb',
 'repdavidrouzer',
 'repdevinnunes',
 'repdlesko',
 'repgregorymeeks',
 'repguthrie',
 'repjimmygomez',
 'repjohncurtis',
 'repkhandel',
 'replbr',
 'reploucorrea',
 'repmarshall',
 'repmaxinewaters',
 'repmikejohnson',
 'repralphnorman',
 'reprobbishop',
 'reprohrabacher',
 'reprutherfordfl',
 'repscotttaylor',
 'repstevepearce',
 'reptomgarrett',
 'roslehtinen',
 'senamyklobuchar',
 'sendougjones',
 'senhydesmith',
 'senkamalaharris',
 'sentinasmith'}

In [52]:
# Usernames in the Tweet data that are not listed in the Congress metadata
in_tweets_not_cm

{'amyklobuchar',
 'anthonybrownmd4',
 'billcassidy',
 'brettguthrie',
 'danarohrabacher',
 'davidrouzer',
 'devinnunes',
 'farenthold',
 'gregorymeeks',
 'jasoninthehouse',
 'justinamash',
 'kamalaharris',
 'kycomer',
 'louiseslaughter',
 'lutherstrange',
 'maxinewaters',
 'patmeehanpa',
 'pattiberi',
 'repbecerra',
 'repblumenauer',
 'repcharliedent',
 'repjbridenstine',
 'repjohnconyers',
 'rogermarshallmd',
 'scotttaylorva',
 'senfranken',
 'senjohnmccain',
 'senthadcochran',
 'timkaine'}

In [54]:
# Usernames taken from the Twitter username seed list
# (https://gwu-libraries.github.io/sfm-ui/posts/2017-05-23-congress-seed-list)
# whose owners had left Congress by 09/12/18
former_congress_members = [
    'farenthold',
    'jasoninthehouse',
    'louiseslaughter',
    'lutherstrange',
    'patmeehanpa',
    'pattiberi',
    'repbecerra',
    'repcharliedent',
    'repjbridenstine',
    'repjohnconyers',
    'senfranken',
    'senjohnmccain',
    'senthadcochran',
]

In [56]:
# Number of distinct usernames in the Tweet data before former members are removed
tweets['username'].nunique()

522

In [57]:
# Drop Tweets from preprocessed Tweets if Tweeters are no longer in Congress.
# A single boolean filter replaces the per-username loop, which rebuilt the
# index and re-scanned the whole frame once for every former member and left
# gaps in the final index. The result here always has a clean 0..n-1 index,
# and re-running the cell gives the same frame.
tweets = tweets[~tweets['username'].isin(former_congress_members)].reset_index(drop=True)
    

In [58]:
# Save the Tweet data, with former members' Tweets removed, for later use
tweets.to_pickle('updated_preprocessed_tweets.pkl')

In [59]:
# The two data sources list different usernames for the members below.
# The lists are parallel: entry k of `twitter_usernames_tweets` and entry k of
# `twitter_usernames_cm` refer to the same member.

# Usernames as they appear in the Tweet data
twitter_usernames_tweets = [
    'maxinewaters',
    'kamalaharris',
    'gregorymeeks',
    'devinnunes',
    'davidrouzer',
    'danarohrabacher',
    'brettguthrie',
    'anthonybrownmd4',
    'scotttaylorva',
    'rogermarshallmd',
    'repblumenauer',
    'amyklobuchar',
]

# Usernames as they appear in the Congress metadata
twitter_usernames_cm = [
    'repmaxinewaters',
    'senkamalaharris',
    'repgregorymeeks',
    'repdevinnunes',
    'repdavidrouzer',
    'reprohrabacher',
    'repguthrie',
    'repanthonybrown',
    'repscotttaylor',
    'repmarshall',
    'blumenauermedia',
    'senamyklobuchar',
]


In [60]:
# Replace each username in the Congress metadata (`twitter_usernames_cm`) with
# the corresponding one used in the Tweet data (`twitter_usernames_tweets`) so
# that Tweet data can be joined to Congress metadata on the username.
# `.loc` with a boolean mask is used because `.at` accepts only a single scalar
# row label; handing it an Index of matching rows is rejected by current pandas.
for cm_name, tweet_name in zip(twitter_usernames_cm, twitter_usernames_tweets):
    congress_metadata.loc[congress_metadata['twitter'] == cm_name, 'twitter'] = tweet_name


In [61]:
# The Congress metadata source was missing Twitter usernames for
# several current members of Congress who have Twitter accounts.
# Add Twitter usernames for those members where applicable.

def add_username(last_name, username, metadata=None):
    """Set the Twitter username of the member(s) with the given last name.

    Parameters
    ----------
    last_name : str
        Compared for exact equality against the `last_name` column.
    username : str
        Value written to the `twitter` column of every matching row.
    metadata : pandas.DataFrame, optional
        Frame to update in place. When omitted, the notebook-level
        `congress_metadata` DataFrame is used.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If no row has the given last name, so that a misspelled name does not
        silently leave the metadata unchanged.

    Notes
    -----
    Rows are matched on last name only, so every member sharing that last name
    receives the same username.
    """
    if metadata is None:
        metadata = congress_metadata

    matches = metadata['last_name'] == last_name
    if not matches.any():
        raise ValueError('no Congress member with last name %r' % (last_name,))

    # `.loc` takes the boolean mask directly; `.at` only accepts one scalar label
    metadata.loc[matches, 'twitter'] = username

In [62]:
# Fill in the usernames of members whose Twitter accounts were missing from
# the Congress metadata
for member_last_name, member_username in [
    ('Cassidy', 'billcassidy'),
    ('Amash', 'justinamash'),
    ('Comer', 'kycomer'),
    ('Kaine', 'timkaine'),
]:
    add_username(member_last_name, member_username)

In [63]:
# Recompute the username comparison now that both DataFrames were updated
cm_usernames = set(congress_metadata['twitter'])
tweets_usernames = set(tweets['username'].unique())

# Usernames that appear in one source but not in the other
in_cm_not_tweets = cm_usernames.difference(tweets_usernames)
in_tweets_not_cm = tweets_usernames.difference(cm_usernames)

In [64]:
# Discrepancies in Tweets DataFrame resolved: every username in the Tweet data
# is now also present in the Congress metadata, so this set should be empty
in_tweets_not_cm

set()

In [65]:
# Unresolved discrepancies in Congress metadata: usernames (and the
# 'no twitter' placeholder) that still have no counterpart in the Tweet data
in_cm_not_tweets

{'kilili_sablan',
 'no twitter',
 'repbenraylujan',
 'repclayhiggins',
 'repconorlamb',
 'repdlesko',
 'repjimmygomez',
 'repjohncurtis',
 'repkhandel',
 'replbr',
 'reploucorrea',
 'repmikejohnson',
 'repralphnorman',
 'reprobbishop',
 'reprutherfordfl',
 'repstevepearce',
 'reptomgarrett',
 'roslehtinen',
 'sendougjones',
 'senhydesmith',
 'sentinasmith'}

In [31]:
# Drop the Congress members not accounted for in Tweets DataFrame from Congress
# metadata (rows carrying the 'no twitter' placeholder are removed too whenever
# it is in `in_cm_not_tweets`).
# One vectorized filter replaces the loop over the set, which reset the index
# and re-scanned the frame once per username; because set iteration order is
# not fixed, that loop also left a final index that could differ between runs.
# The result here always has a clean 0..n-1 index.
congress_metadata = congress_metadata[
    ~congress_metadata['twitter'].isin(in_cm_not_tweets)
].reset_index(drop=True)
    

In [66]:
# Number of distinct usernames remaining in the Tweet data
tweets['username'].nunique()

509

In [32]:
# Number of distinct Twitter usernames left in the Congress metadata; it should
# equal the number of distinct usernames in the Tweets DataFrame
congress_metadata['twitter'].nunique()

509

In [69]:
# Save the reconciled Congress metadata for later use
congress_metadata.to_pickle('congress_metadata.pkl')