In [100]:
import numpy as np
import pandas as pd

In [101]:
# Read every raw CSV extract. Paths and target names are paired by position:
# the n-th name below is bound to the frame read from the n-th path.
raw_csv_paths = (
    'data/bill_summaries.csv',
    'data/bioinfo.csv',
    'data/fec_ids.csv',
    'data/ideology.csv',
    'data/sponsored_legislation.csv',
    'data/terms.csv',
    'data/vote_compare.csv',
    'data/contrib.csv',
)
(
    bill_summaries,
    bio_info,
    fec_ids,
    ideology,
    sponsored_legislation,
    terms,
    vote_compare,
    contrib,
) = (pd.read_csv(csv_path) for csv_path in raw_csv_paths)


1. Does each of the CSVs have a primary key? If so, what is it?

In [102]:
# Candidate key: (bill.type, bill.number, versionCode). If the tally contains
# only `False`, no two rows share the same combination.
bill_summaries.duplicated(subset=['bill.type', 'bill.number', 'versionCode']).value_counts()

False    2751
Name: count, dtype: int64

In [103]:
# Candidate key: bioguide_id. A tally with only `False` means it is unique.
bio_info.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [104]:
# Candidate key: bioguide_id. A tally with only `False` means it is unique.
fec_ids.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [105]:
# Candidate key: bioguide_id. A tally with only `False` means it is unique.
ideology.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [106]:
# Candidate key: url. A tally with only `False` means it is unique.
sponsored_legislation.duplicated(subset=['url']).value_counts()

False    14379
Name: count, dtype: int64

In [107]:
# Candidate key: (bioguide_id, chamber, congress). A tally with only `False`
# means no combination repeats.
terms.duplicated(subset=['bioguide_id', 'chamber', 'congress']).value_counts()

False    3257
Name: count, dtype: int64

In [108]:
# Candidate key: (bioname, comparison_member). A tally with only `False`
# means no pair repeats.
vote_compare.duplicated(subset=['bioname', 'comparison_member']).value_counts()

False    206040
Name: count, dtype: int64

In [109]:
# Collapse fully identical rows, keeping the first occurrence of each.
contrib = contrib.drop_duplicates(keep='first')

2. Do any of the tables need to be split into multiple tables, or combined into one table?

    * bioinfo, fec_ids, and ideology should be combined into one table, because they share the same primary key (bioguide_id).

In [110]:
# Join the FEC ids onto the biographical rows via the shared bioguide_id key.
# The outer join plus the 'matched' indicator records, per row, which side(s)
# supplied it; validate='one_to_one' makes pandas reject repeated keys.
members = bio_info.merge(
    fec_ids,
    on='bioguide_id',
    how='outer',
    validate='one_to_one',
    indicator='matched',
)

In [111]:
# Tally the merge indicator: anything other than 'both' marks an unmatched key.
members.loc[:, 'matched'].value_counts()

matched
both          545
left_only       0
right_only      0
Name: count, dtype: int64

In [112]:
# The indicator has been inspected; remove it so the next merge can add its own.
members = members.drop(columns=['matched'])

In [113]:
# Attach the ideology columns on the shared bioguide_id key. As with the
# previous join, an outer merge with a 'matched' indicator is used and
# validate='one_to_one' makes pandas reject repeated keys on either side.
members = members.merge(
    ideology,
    on='bioguide_id',
    how='outer',
    validate='one_to_one',
    indicator='matched',
)

In [114]:
# List the merged frame's columns to spot overlapping attributes before pruning.
members.columns

Index(['bioguide_id', 'Full name', 'Chamber', 'State', 'Party', 'District',
       'birthYear', 'image', 'Office address', 'Phone', 'Website', 'fec_id',
       'bioname', 'chamber', 'left_right_ideology', 'state_abbrev',
       'district_code', 'icpsr', 'party', 'matched'],
      dtype='object')

In [115]:
# Remove the title-case Chamber/Party/District/State columns. They presumably
# repeat the lower-case chamber/party/district_code/state_abbrev columns that
# are kept -- TODO confirm the values agree before relying on this.
members = members.drop(columns=['Chamber', 'Party', 'District', 'State'])

In [116]:
# Normalise headers: lower-case them and turn spaces into underscores.
members.columns = members.columns.map(lambda name: name.lower().replace(' ', '_'))

In [117]:
# Display the combined members table to review the result of the merges and renames.
members 

Unnamed: 0,bioguide_id,full_name,birthyear,image,office_address,phone,website,fec_id,bioname,chamber,left_right_ideology,state_abbrev,district_code,icpsr,party,matched
0,A000055,Robert B. Aderholt,1965.0,https://www.congress.gov/img/member/a000055_20...,"272 Cannon House Office Building, Washington, ...",(202) 225-4876,https://aderholt.house.gov/,H6AL04098,"ADERHOLT, Robert",House,0.405,AL,4,29701,Republican,both
1,A000148,Jake Auchincloss,1988.0,https://www.congress.gov/img/member/67817e391f...,"1524 Longworth House Office Building, Washingt...",(202) 225-5931,https://auchincloss.house.gov,H0MA04192,"AUCHINCLOSS, Jake",House,-0.288,MA,4,22100,Democrat,both
2,A000369,Mark E. Amodei,1958.0,https://www.congress.gov/img/member/a000369_20...,"104 Cannon House Office Building, Washington, ...",(202) 225-6155,https://amodei.house.gov,H2NV02395,"AMODEI, Mark E.",House,0.384,NV,2,21196,Republican,both
3,A000370,Alma S. Adams,1946.0,https://www.congress.gov/img/member/a000370_20...,"2436 Rayburn House Office Building, Washington...",(202) 225-1510,https://adams.house.gov,H4NC12100,"ADAMS, Alma",House,-0.462,NC,12,21545,Democrat,both
4,A000371,Pete Aguilar,1979.0,https://www.congress.gov/img/member/a000371_20...,"108 Cannon House Office Building, Washington, ...",(202) 225-3201,https://aguilar.house.gov/,H2CA31125,"AGUILAR, Peter Rey",House,-0.324,CA,33,21506,Democrat,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,W000830,George Whitesides,1974.0,https://www.congress.gov/img/member/68dc43db19...,"1504 Longworth House Office Building, Washingt...",(202) 225-1956,https://whitesides.house.gov/,H4CA27111,"WHITESIDES, George",House,-0.189,CA,27,22559,Democrat,both
541,W000831,James R. Walkinshaw,1982.0,https://www.congress.gov/img/member/68c1bd4ca9...,"2265 Rayburn House Office Building, Washington...",(202) 225-1492,https://walkinshaw.house.gov/,H6VA11066,"WALKINSHAW, James R.",House,-0.503,VA,11,22564,Democrat,both
542,Y000064,Todd Young,1972.0,https://www.congress.gov/img/member/y000064_20...,185 Dirksen Senate Office Building Washington...,(202) 224-5623,https://www.young.senate.gov,S0IN00194,"YOUNG, Todd",Senate,0.438,IN,0,21133,Republican,both
543,Y000067,Rudy Yakym III,1984.0,https://www.congress.gov/img/member/y000067_20...,"349 Cannon House Office Building, Washington, ...",(202) 225-3915,https://yakym.house.gov,H2IN02295,"YAKYM, Rudy, III",House,0.513,IN,2,22171,Republican,both


In [118]:
# Write the combined members table to disk.
# 'matched' is only the merge indicator left over from joining in `ideology`;
# it is not a member attribute, so it is removed from the exported copy.
# errors='ignore' keeps this cell re-runnable if the column is already gone.
members.drop(columns=['matched'], errors='ignore').to_csv('data/thirdNF/members.csv', index=False)

In [119]:
# For every column, count how many distinct values it takes within a single
# bill (bill.type, bill.number), then tabulate those counts. A column whose
# only tally is 1 is constant per bill; any other tally means it varies.
bill_key = ['bill.type', 'bill.number']
for column in bill_summaries.columns:
    distinct_per_bill = bill_summaries.groupby(bill_key)[column].nunique()
    print(distinct_per_bill.value_counts())

actionDate
1    2725
2       9
4       1
Name: count, dtype: int64
actionDesc
1    2722
2      12
5       1
Name: count, dtype: int64
currentChamber
1    2728
2       6
3       1
Name: count, dtype: int64
currentChamberCode
1    2728
2       6
3       1
Name: count, dtype: int64
lastSummaryUpdateDate
1    2722
2      12
5       1
Name: count, dtype: int64
text
1    2730
2       4
4       1
Name: count, dtype: int64
updateDate
1    2722
2      12
5       1
Name: count, dtype: int64
versionCode
1    2722
2      12
5       1
Name: count, dtype: int64
bill.congress
1    2735
Name: count, dtype: int64
bill.number
1    2735
Name: count, dtype: int64
bill.originChamber
1    2735
Name: count, dtype: int64
bill.originChamberCode
1    2735
Name: count, dtype: int64
bill.title
1    2735
Name: count, dtype: int64
bill.type
1    2735
Name: count, dtype: int64
bill.updateDateIncludingText
1    2735
Name: count, dtype: int64
bill.url
1    2735
Name: count, dtype: int64


In [120]:
# Bill-level table: keep only the bill.* columns, collapse the repeated rows
# down to distinct combinations, normalise the headers, and write it out.
bill_level_columns = [
    'bill.type',
    'bill.number',
    'bill.congress',
    'bill.originChamber',
    'bill.originChamberCode',
    'bill.title',
    'bill.updateDateIncludingText',
    'bill.url',
]
bills = (
    bill_summaries
    .loc[:, bill_level_columns]
    .drop_duplicates()
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)
bills.to_csv('data/thirdNF/bills.csv', index=False)

In [121]:
# Version-level table: remove the bill-level attributes, keeping bill.type and
# bill.number alongside the remaining columns.
bill_only_columns = [
    'bill.congress',
    'bill.originChamber',
    'bill.originChamberCode',
    'bill.title',
    'bill.updateDateIncludingText',
    'bill.url',
]
bill_versions = bill_summaries.drop(columns=bill_only_columns)

In [122]:
# Normalise headers (lower-case, spaces to underscores), then write the table out.
bill_versions.columns = bill_versions.columns.map(lambda name: name.lower().replace(' ', '_'))
bill_versions.to_csv(path_or_buf='data/thirdNF/bill_versions.csv', index=False)

In [123]:
# vote_compare is written out as loaded, without the row index.
vote_compare.to_csv(path_or_buf='data/thirdNF/vote_compare.csv', index=False)