In [1]:
import numpy as np
import pandas as pd
import os
# Read the database password from the environment so it is never hard-coded
# in the notebook. The value is None when POSTGRES_PASSWORD is not set.
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD')
import psycopg2 
from sqlalchemy import create_engine

In [56]:
# Load each raw CSV extract from the Data/ directory into its own DataFrame.
# Paths are relative to the notebook's working directory.
bills = pd.read_csv('Data/bills.csv')
committee_members = pd.read_csv('Data/committee_members.csv')
# NOTE: the variable is spelled `comittees` (single "m") throughout this notebook.
comittees = pd.read_csv('Data/committees.csv')
congress_members = pd.read_csv('Data/congress_members.csv')
leadership = pd.read_csv('Data/leadership.csv')
os_contributions = pd.read_csv('Data/os_contributions.csv')
os_legislators = pd.read_csv('Data/os_legislators.csv')
terms = pd.read_csv('Data/terms.csv')
votingaffinity = pd.read_csv('Data/votingaffinity.csv')

# Bills

In [3]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
bills.columns = [name.lower().replace('.', '_') for name in bills.columns]
# Display the resulting column index.
bills.columns

Index(['congress', 'introduceddate', 'number', 'title', 'type', 'url',
       'latestaction_actiondate', 'latestaction_text', 'policyarea_name',
       'amendmentnumber', 'latestaction', 'latestaction_actiontime'],
      dtype='object')

In [4]:
# Preview the first three rows, transposed so each column is shown as a row.
bills.head(3).transpose()

Unnamed: 0,0,1,2
congress,118,118,118
introduceddate,2023-10-04,2023-09-29,2023-09-27
number,405.0,2998.0,373.0
title,A resolution expressing support for the design...,Land-Grant Research Equity and Accountability Act,A resolution designating the week of September...
type,SRES,S,SRES
url,https://api.congress.gov/v3/bill/118/sres/405?...,https://api.congress.gov/v3/bill/118/s/2998?fo...,https://api.congress.gov/v3/bill/118/sres/373?...
latestaction_actiondate,2023-10-04,2023-09-29,2023-09-27
latestaction_text,"Submitted in the Senate, considered, and agree...",Read twice and referred to the Committee on Ag...,"Submitted in the Senate, considered, and agree..."
policyarea_name,Armed Forces and National Security,Agriculture and Food,
amendmentnumber,,,


The primary key is the composite of `congress`, `number`, and `type`.

# Congress Members

In [5]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
congress_members.columns = [
    name.lower().replace('.', '_') for name in congress_members.columns
]
# Display the resulting column index.
congress_members.columns

Index(['bioguideid', 'birthyear', 'currentmember', 'directordername',
       'firstname', 'honorificname', 'invertedordername', 'lastname',
       'officialwebsiteurl', 'partyhistory', 'state', 'terms', 'updatedate',
       'addressinformation_city', 'addressinformation_district',
       'addressinformation_officeaddress', 'addressinformation_phonenumber',
       'addressinformation_zipcode', 'cosponsoredlegislation_count',
       'cosponsoredlegislation_url', 'depiction_attribution',
       'depiction_imageurl', 'sponsoredlegislation_count',
       'sponsoredlegislation_url', 'middlename', 'suffixname', 'nickname',
       'leadership', 'district'],
      dtype='object')

In [6]:
# Drop the non-atomic columns ('terms', 'leadership', 'partyhistory').
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising KeyError.
congress_members = congress_members.drop(
    columns=['terms', 'leadership', 'partyhistory'],
    errors='ignore',
)

In [7]:
# Preview the first three rows, transposed so each column is shown as a row.
congress_members.head(3).transpose()

Unnamed: 0,0,1,2
bioguideid,B000944,C000127,C000141
birthyear,1952,1958,1943
currentmember,True,True,True
directordername,Sherrod Brown,Maria Cantwell,Benjamin L. Cardin
firstname,Sherrod,Maria,Ben
honorificname,Mr.,Ms.,Mr.
invertedordername,"Brown, Sherrod","Cantwell, Maria","Cardin, Benjamin L."
lastname,Brown,Cantwell,Cardin
officialwebsiteurl,https://www.brown.senate.gov/,https://www.cantwell.senate.gov,https://www.cardin.senate.gov/
state,Ohio,Washington,Maryland


Primary key is `bioguideid`.
The `partyhistory`, `terms`, and `leadership` columns are not atomic, so they are dropped. We already have separate `terms` and `leadership` dataframes, though.

## Terms

In [8]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
terms.columns = [name.lower().replace('.', '_') for name in terms.columns]
# Display the resulting column index.
terms.columns

Index(['chamber', 'congress', 'endyear', 'membertype', 'startyear',
       'statecode', 'statename', 'bioguideid', 'district'],
      dtype='object')

In [9]:
# Preview the first three rows, transposed so each column is shown as a row.
terms.head(3).transpose()

Unnamed: 0,0,1,2
chamber,Senate,Senate,Senate
congress,107,108,109
endyear,2003.0,2005.0,2007.0
membertype,Senator,Senator,Senator
startyear,2002,2003,2005
statecode,TX,TX,TX
statename,Texas,Texas,Texas
bioguideid,C001056,C001056,C001056
district,,,


Primary key is bioguideid and congress

## Leadership

In [10]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
leadership.columns = [name.lower().replace('.', '_') for name in leadership.columns]
# Display the resulting column index.
leadership.columns

Index(['congress', 'type', 'bioguideid', 'current'], dtype='object')

In [11]:
# Preview the first three rows, transposed so each column is shown as a row.
leadership.head(3).transpose()

Unnamed: 0,0,1,2
congress,113,114,115
type,Assistant Democratic Leader,Assistant Majority Leader,Majority Whip
bioguideid,C001056,C001056,C001056
current,,,


# os_legislators

In [12]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
# NOTE(review): several of the resulting names look truncated (e.g. 'lastnam',
# 'oguide_id'). This cell only lower-cases and replaces '.', so the truncation
# presumably comes from the CSV headers themselves — confirm upstream.
os_legislators.columns = [
    name.lower().replace('.', '_') for name in os_legislators.columns
]
# Display the resulting column index.
os_legislators.columns

Index(['cid', 'firstl', 'lastnam', 'party', 'offic', 'gend', 'first_elected',
       'xit_cod', 'commen', 'phon', 'fax', 'w', 'webform', 'congress_offic',
       'oguide_id', 'votesmart_id', 'feccandid', 'witter_id', 'youtube_url',
       'facebook_id', 'hd'],
      dtype='object')

In [17]:
# Keep only the identifier and party columns, exposing 'oguide_id' under the
# same name ('bioguideid') that congress_members uses.
# The rename runs BEFORE the selection so the cell is idempotent: on a re-run
# 'oguide_id' is already gone, the rename is a no-op, and the selection still
# succeeds (selecting 'oguide_id' first would raise KeyError).
# NOTE(review): 'oguide_id' looks like a truncated 'bioguide_id' header —
# confirm against the export that produced the CSV.
os_legislators = os_legislators.rename(columns={'oguide_id': 'bioguideid'})[
    ['cid', 'party', 'bioguideid']
]

In [19]:
# Preview the first three rows.
os_legislators.iloc[:3]

Unnamed: 0,cid,party,bioguideid
0,N00050780,D,
1,N00035774,R,S001198
2,N00026050,R,M001153


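The primary key could be either `cid` or `bioguideid`, although `bioguideid` is missing for some rows (see the NaN rows listed below). Because `bioguideid` is shared with `congress_members`, merge the two tables on it.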

In [35]:
# Outer-join congress_members with os_legislators on bioguideid so unmatched
# rows from either side are kept.
# validate='one_to_many' makes pandas raise if bioguideid is not unique in
# congress_members; the 'matched' indicator column records whether each row
# came from the left table, the right table, or both.
members = congress_members.merge(
    os_legislators,
    how='outer',
    on='bioguideid',
    validate='one_to_many',
    indicator='matched',
)

In [37]:
# Count how many rows matched on both sides versus only one side of the merge.
members.loc[:, 'matched'].value_counts()

matched
both          506
left_only      31
right_only     31
Name: count, dtype: int64

In [38]:
# Final join, to be used once the issue has been fixed (presumably the
# unmatched bioguideid rows counted above — confirm): the inner join keeps
# only rows whose bioguideid appears in both tables.
members = congress_members.merge(
    os_legislators,
    how='inner',
    on='bioguideid',
)

In [28]:
# Working towards fixing the NA values: look up the bioguideid of the first
# congress_members row whose last name is 'Peltola'.
# NOTE(review): `bio` is not used in any of the visible cells.
bio = (
    congress_members
    .query("lastname == 'Peltola'")
    .reset_index()
    .loc[0, 'bioguideid']
)

In [34]:
# List the os_legislators rows that have no bioguideid.
os_legislators.loc[os_legislators['bioguideid'].isnull()]

Unnamed: 0,cid,party,bioguideid
0,N00050780,D,
55,N00044298,R,
106,N00051369,R,
117,N00051178,R,
122,N00043504,D,
125,N00050596,D,
175,N00049102,D,
190,N00037427,R,
208,N00025766,D,
211,N00047972,R,


# os_contributions

In [41]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
os_contributions.columns = [
    name.lower().replace('.', '_') for name in os_contributions.columns
]
# Display the resulting column index.
os_contributions.columns

Index(['org_nam', 'otal', 'pac', 'ndiv'], dtype='object')

In [43]:
# TODO: this table needs a cid column — none of its current columns identifies
# which legislator a contribution row belongs to.
os_contributions.iloc[:3]

Unnamed: 0,org_nam,otal,pac,ndiv
0,State of Alaska,19541,0,19541
1,"Sonosky, Chambers et al",16050,0,16050
2,General Communication Inc,15150,2500,12650


# comittees

In [44]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
comittees.columns = [name.lower().replace('.', '_') for name in comittees.columns]
# Display the resulting column index.
comittees.columns

Index(['chamber', 'committeetypecode', 'name', 'systemcode', 'url',
       'parent_name', 'parent_systemcode', 'parent_url', 'subcommittees'],
      dtype='object')

In [50]:
# Drop the non-atomic 'subcommittees' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
comittees = comittees.drop(columns=['subcommittees'], errors='ignore')

In [51]:
# Preview the first three rows, transposed so each column is shown as a row.
comittees.head(3).transpose()

Unnamed: 0,0,1,2
chamber,House,House,House
committeetypecode,Other,Standing,Select
name,Bicentenary Committee,Energy (Ad Hoc) Committee,U.S. Role in Iranian Arms Committee
systemcode,hcza00,hhah00,hlbz00
url,https://api.congress.gov/v3/committee/house/hc...,https://api.congress.gov/v3/committee/house/hh...,https://api.congress.gov/v3/committee/house/hl...
parent_name,,,
parent_systemcode,,,
parent_url,,,


Primary key is `systemcode`. The `subcommittees` column is not atomic, but its information is already included elsewhere, so it can be dropped.

# committee_members

In [57]:
# Normalise the column names in one pass: lower-case them and replace '.' with '_'.
committee_members.columns = [
    name.lower().replace('.', '_') for name in committee_members.columns
]
# Display the resulting column index.
committee_members.columns

Index(['committee_code', 'rank', 'title', 'bioguide'], dtype='object')

In [58]:
# Preview the first three rows, transposed so each column is shown as a row.
committee_members.head(3).transpose()

Unnamed: 0,0,1,2
committee_code,HSII,HSII,HSII
rank,1.0,1.0,2.0
title,Chair,Ranking Member,
bioguide,W000821,G000551,L000564


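Primary key is the composite of `committee_code` and `bioguide` (the member's bioguide ID; note that this table names the column `bioguide`, not `bioguideid`).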