# State Political Party Data

1. Create dataframe from NCSL pdfs
2. Export to csv files

Notes:
- This data was going to be used with a map to indicate party control per state along with hate crime rates and counts for different biases.
- Unfortunately, the only parseable data we could find were PDFs, one for each year from 2009 to 2021.
- We attempted to use tabula-py to read in the PDFs but did not have sufficient time to address all of the import and cleaning issues.

In [16]:
# Install tabula-py, the package that provides the `tabula` module imported below
!pip install tabula-py



In [28]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import tabula
import requests
import os
from pathlib import Path

In [37]:
# Function to convert pdf to csv and create/return a dataframe of the csv

def get_party_data(year):
    
    pdf_path = '../resources/ncsl_legislative_control_' + str(year) + '.pdf'
    csv_path = 'data/ncsl_temp_data.csv'
    
    # Read and convert pdf to temporary csv file
    # Code ref: https://tabula-py.readthedocs.io/en/latest/faq.html#faq
    tabula.io.convert_into(pdf_path, csv_path, output_format='csv', stream=True, pages='all')
    
    # Create dataframe from csv file
    source_df = pd.read_csv(csv_path)
    
    # Delete temp file
    # Code ref: https://stackoverflow.com/questions/10840533/most-pythonic-way-to-delete-a-file-which-may-not-exist
    try:
        os.remove(csv_path)
    except OSError:
        pass
    
    # Drop non-state rows
    source_df.drop(source_df.tail(10).index, inplace = True)
    
    # Create empty dataframe
    df = pd.DataFrame(columns=['state', 'senate_seats', 'senate_dem', 'senate_rep', 'house_seats', 'house_dem', 'house_rep', 'temp'])

    # Get info from second row to identify data in each column
    header_row = source_df.iloc[0,:]
    column_names = header_row.index
    
    # Used to distinguish identical columns for senate and house
    house_flag = False
    
    # Loop through series and fill data in empty dataframe
    for i in range(len(header_row)):
        value = header_row[i]
        column = column_names[i]

        if source_df[column].dtype == 'object':
            #df.loc[ df[“column_name”] == “some_value”, “column_name”] = “value”
            source_df.loc[source_df[column].str.count(' ') > 1, column] = np.nan
            # print(source_df.loc[source_df[column].str.count(' ') > 1, column])

        if house_flag == False:
            match value:
                case 'STATE':
                    df['state'] = source_df[column].str.replace('*','')
                case 'Seats Senate':
                    df[['temp', 'senate_seats']] = source_df[column].str.split(' ',expand=True)
                case 'Senate':
                    df['senate_seats'] = source_df[column]
                case 'Senate Dem.':
                    df[['senate_seats','senate_dem']] = source_df[column].str.split(' ',expand=True)
                case 'Dem.':
                    df['senate_dem'] = source_df[column]
                case 'Dem. Rep.':
                    df[['senate_dem','senate_rep']] = source_df[column].str.split(' ',expand=True)
                case 'Rep.':
                    df['senate_rep'] = source_df[column]
                case 'House':
                    df['house_seats'] = source_df[column]
                    house_flag = True
                case 'House Dem.':
                    df[['house_seats','house_dem']] = source_df[column].str.split(' ',expand=True)
                    house_flag = True
                case "other House":
                    df[['temp', 'house_seats']] = source_df[column].str.split(' ',expand=True)
                    house_flag = True
        else:
            match value:
                case 'Dem.':
                    df['house_dem'] = source_df[column]
                case 'Rep.':
                    df['house_rep'] = source_df[column]
                case 'Dem. Rep.':
                    df[['house_dem','house_rep']] = source_df[column].str.split(' ',expand=True)
    

    
    # Drop unneeded columns and rows
    df.drop(columns=['temp'], inplace = True)
    df.drop(df[df['state'] == 'STATE'].index, inplace = True)
    df.drop(df[df['state'] == 'Total States'].index, inplace = True)
    
    # Add column for the year
    df['year'] = year
   
    return df

In [32]:
# Collect one party-data dataframe per year, 2009 through 2021 inclusive
start_year = 2009
end_year = 2021
year_df = []

for year in range(start_year, end_year + 1):
    print(f'Getting data for year: {year}')
    year_df.append(get_party_data(year))

Getting data for year: 2009
Getting data for year: 2010
Getting data for year: 2011
Getting data for year: 2012
Getting data for year: 2013
Getting data for year: 2014
Getting data for year: 2015
Getting data for year: 2016
Getting data for year: 2017
Getting data for year: 2018
Getting data for year: 2019
Getting data for year: 2020
Getting data for year: 2021


In [38]:
# Stack the per-year dataframes row-wise into a single dataframe
party_df = pd.concat(year_df)

# Clean up columns: house_seats becomes numeric, unparseable values become NaN
party_df = party_df.assign(
    house_seats=pd.to_numeric(party_df['house_seats'], errors='coerce')
)

party_df

# Count the non-null entries per column for each state
test = party_df.groupby('state').count()
test

Unnamed: 0_level_0,senate_seats,senate_dem,senate_rep,house_seats,house_dem,house_rep,year
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,13,12,13,13,13,13,13
Alaska,13,12,13,12,13,13,13
Arizona,13,12,13,12,13,13,13
Arkansas,13,12,13,13,13,13,13
California,13,12,13,13,13,13,13
Colorado,13,12,13,12,13,13,13
Connecticut,13,12,13,13,13,13,13
Delaware,13,12,13,12,13,13,13
Florida,13,12,13,12,13,13,13
Georgia,13,12,13,12,13,13,13


## Tabula Wrapper Testing

In [None]:
# Tabula test - reading from an online pdf
# Earlier issue: <urlopen error no host given>
# Cause: the URL was wrapped in Path(), which collapses the '//' after the
# scheme ('https:/documents.ncsl.org/...') so the URL no longer has a host.
# Fix: pass the URL as a plain string.
# NOTE(review): the user_agent argument is kept from the earlier attempt;
# this cell has not been re-run against the live server.

path = 'https://documents.ncsl.org/wwwncsl/Elections/LegisControl_2009.pdf'

test_df = tabula.read_pdf(path, lattice=True, user_agent='python-requests')
test_df

In [None]:
# Tabula test - converting online pdf to a csv file
# Earlier issue: URLError: <urlopen error no host given>
# Cause: the URL was wrapped in Path(), which collapses the '//' after the
# scheme ('https:/documents.ncsl.org/...') so the URL no longer has a host.
# Fix: pass the URL as a plain string.
# NOTE(review): this cell has not been re-run against the live server.

path = 'https://documents.ncsl.org/wwwncsl/Elections/LegisControl_2009.pdf'

# Import and convert pdf to csv file
# Code ref: https://tabula-py.readthedocs.io/en/latest/faq.html#faq
tabula.convert_into(path, 'data/party_control_2009.csv', output_format='csv', lattice=True, pages='all')

# Read csv file into a dataframe
party_data_df = pd.read_csv('data/party_control_2009.csv')
party_data_df