In [1]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

from collections import defaultdict
import pandas as pd
import numpy as np

# Load data


In [2]:
engine = create_engine('postgresql://localhost:5432/wa_leg_raw')

In [3]:
bill_df = pd.read_sql_query('select * from "bill_api"',con=engine)
sponsor_df = pd.read_sql_query('select * from "sponsor_api"',con=engine)

In [4]:
def create_secondary_sponsor_column(sponsor_df):
    '''Create a column named secondary_sponsors that contains a list of secondary sponsors
    
    Input:
    sponsor_df: pandas dataframe retrieved from wa_leg raw database, sponsor_api table
    '''
    # Create dictionary with bill_id and biennium as keys and list of secondary sponsors as values
    s = defaultdict(list)
    for s_id, s_type, biennium, bill_id in zip(sponsor_df['sponsor_id'], 
                                               sponsor_df['sponsor_type'],
                                               sponsor_df['biennium'], 
                                               sponsor_df['bill_id']):
        if s_type == 'Secondary':
            s[(bill_id, biennium)].append(s_id)
            
    for k, v in s.items():
        s[k] = list(set(v))

    # Create a matrix that can be turned in to a dataframe. Column one is bill_id, column two is 
    # biennium, column three is the list of secondary sponsors
    sponsor_matrix = []
    for k, v in s.items():
        row = []
        row.append(k[0])
        row.append(k[1])
        row.append(v)
        sponsor_matrix.append(row)
        
    secondary_sponsors_df = pd.DataFrame(sponsor_matrix)
    secondary_sponsors_df.columns = ['bill_id', 'biennium', 'secondary_sponsors']
            
    sponsor_df_reformatted = sponsor_df[sponsor_df['sponsor_type'] == 'Primary']
    sponsor_df_reformatted['bill_num'] = sponsor_df_reformatted['bill_id'].apply(lambda x: x.split()[1])
    sponsor_df_reformatted['bill_num_unique'] = sponsor_df_reformatted['biennium'] + ' ' + sponsor_df_reformatted['bill_num']
    sponsor_df_reformatted = sponsor_df_reformatted.rename(index=str, columns={"sponsor_id": "primary_sponsor_id"})
    sponsor_df_reformatted = sponsor_df_reformatted.drop(['sponsor_type', 'sponsor_order', 'sponsor_last_name', 
                                 'sponsor_long_name', 'sponsor_first_name', 'sponsor_name'], axis = 1)
    
    sponsor_df_merged = sponsor_df_reformatted.merge(secondary_sponsors_df, how='outer', on=['bill_id', 'biennium'])
    
    # filter out repeating bills
    unique_bill_nums = []
    unique_bill_rows = []
    for i, row in sponsor_df_merged.iterrows():
        if row['bill_num_unique'] not in unique_bill_nums:
            unique_bill_nums.append(row['bill_num_unique'])
            unique_bill_rows.append(row)
            
    return pd.DataFrame(unique_bill_rows)

In [5]:
def create_staging_bill_df(bill_df, sponsor_df):
    '''Join sponsor_df to bill_df and output the merged pandas dataframe.
    Input
    bill_df: pandas dataframe retrieved from wa_leg raw database, bill_api table
    sponsor_df: pandas dataframe retrieved from wa_leg raw database, sponsor_api table
    '''
    sponsor_df_unique = create_secondary_sponsor_column(sponsor_df)
    bill_df['bill_num'] = bill_df['bill_id'].apply(lambda x: x.split()[1] if type(x) == str else x)
    bill_df['bill_num_unique'] = bill_df['biennium'] + ' ' + bill_df['bill_num']
    
    MERGED = bill_df.merge(sponsor_df_unique, how='left', on=['bill_num_unique', 'biennium'], suffixes=['', '_sp'])
    MERGED = MERGED.drop(['bill_id_sp', 'bill_num_sp'], axis=1)
    MERGED = MERGED[MERGED['primary_sponsor_id'].notnull()]
    return MERGED

In [6]:
staging_bill_df = create_staging_bill_df(bill_df, sponsor_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
staging_bill_df

Unnamed: 0,biennium,bill_id,class,description,htm_create_date,htm_last_modified_date,htm_url,long_friendly_name,name,type,bill_num,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors
0,1991-92,HB 1001,Bills,,1991-08-30T00:00:00,2006-07-10T17:13:53.543,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1001,1001,House Bills,1001,1991-92 1001,House,251,"[295, 478, 72, 11, 406, 481, 254, 328, 188, 17..."
1,1991-92,SHB 1001,Bills,,1991-02-01T00:00:00,2006-07-10T17:13:54.903,http://app.leg.wa.gov/documents/billdocs/1991-...,Substitute House Bill 1001,1001-S,House Bills,1001,1991-92 1001,House,251,"[295, 478, 72, 11, 406, 481, 254, 328, 188, 17..."
2,1991-92,HB 1002,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.637,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1002,1002,House Bills,1002,1991-92 1002,House,251,"[320, 332, 23, 110, 474, 394, 7, 227, 481, 180..."
3,1991-92,HB 1003,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.747,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1003,1003,House Bills,1003,1991-92 1003,House,311,"[110, 474, 54]"
4,1991-92,SHB 1003,Bills,,1991-02-21T00:00:00,2006-07-10T17:14:07.357,http://app.leg.wa.gov/documents/billdocs/1991-...,Substitute House Bill 1003,1003-S,House Bills,1003,1991-92 1003,House,311,"[110, 474, 54]"
5,1991-92,HB 1004,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.53,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1004,1004,House Bills,1004,1991-92 1004,House,311,"[110, 54, 304, 475, 32]"
6,1991-92,HB 1005,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:12.747,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1005,1005,House Bills,1005,1991-92 1005,House,311,"[475, 23]"
7,1991-92,HB 1006,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.857,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1006,1006,House Bills,1006,1991-92 1006,House,311,[110]
8,1991-92,HB 1007,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:12.373,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1007,1007,House Bills,1007,1991-92 1007,House,311,"[110, 276, 48, 180, 472, 285, 475, 443]"
9,1991-92,HB 1008,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:12.183,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1008,1008,House Bills,1008,1991-92 1008,House,311,"[475, 110, 474]"
