## Public Finance: Data Cleaning
The following script performs the data cleaning for the Annual Survey of State and Local Government Finances. The data is provided in a standardised format by Willamette University. It is available at the county level from 1967 to 2020, with improved coverage in more recent years.

In [1]:
import pandas as pd
import numpy as np
import pyreadr


In [2]:
# Load the raw county-level survey extract. The identifier columns are read
# as strings so that codes are not parsed as numbers (e.g. the FIPS code
# "02130" keeps its leading zero).
id_like_columns = ['GOVSid', 'FIPSid', 'Name', 'FIPS_Code_State',
                   'FIPS_County', 'FIPS_Combined']
temp_cty = pd.read_csv("../data/raw/CountyData.csv",
                       dtype=dict.fromkeys(id_like_columns, str))

# Discard the variables that do not provide unique distinguishing
# characteristics.
redundant_cols = [
    'GOVSid', 'FIPSid', 'County', 'State_Code', 'Type_Code', 'FIPS_County',
    'FIPS_Place', 'FYEndDate', 'YearPop', 'SchLevCode', 'FunctionCode',
    'Enrollment',
]
temp_cty.drop(columns=redundant_cols, inplace=True)

# Give the retained identifier columns short, lower-case names.
new_names = {
    'Year4': 'year',
    'Name': 'county_name',
    'FIPS_Code_State': 'fips_state',
    'FIPS_Combined': 'fips',
}
temp_cty.rename(columns=new_names, inplace=True)


# Fixes inconsistent FIPS code for KETCHIKAN GATEWAY BOROUGH; the FIPS code
# changes in 2007 - verified that the population total is consistent at the
# time of the switch (i.e. unlikely there is a regrouping of counties).
#
# The value is assigned through .loc on the DataFrame itself. A chained
# `temp_cty['fips'].mask(..., inplace=True)` operates on an intermediate
# Series: it triggers a chained-assignment warning on pandas 2.x and leaves
# temp_cty unchanged once Copy-on-Write is active (default from pandas 3.0).
is_ketchikan = temp_cty['county_name'] == "KETCHIKAN GATEWAY BOROUGH"
temp_cty.loc[is_ketchikan, 'fips'] = "02130"

# Reshape from wide to long: every column that is not an identifier becomes
# one row, with the category name in "item_formal" and its value in "amount".
id_columns = ['year', 'county_name', 'fips_state', 'fips', 'Population']
temp_cty = pd.melt(temp_cty,
                   id_vars=id_columns,
                   var_name='item_formal',
                   value_name='amount')

#temp_cty.to_csv('../data/temp/county_py_test.csv', index = False)


In [3]:
# Item-code lookup table used to categorise the survey items.
# The workbook is created in R by willamette_cleaning_master.Rmd.
item_codes = pd.read_excel(
    '../data/out/item_codes.xlsx',
    dtype={'retain_total': str},
)

# Preview of the rows that have a value in 'retain_total'. The result is only
# displayed; item_codes itself is left unfiltered.
# NOTE(review): confirm that the filter is not meant to be assigned back.
item_codes[item_codes['retain_total'].notna()]

Unnamed: 0,item,rev_exp_debt,Unsure,will_category,large_category,retain_total,sum_category,Notes,Unnamed: 8
126,total_expenditure,expenditure,,total expenditure,Total,True,,,
127,total_ig_expenditure,expenditure,,total IG expenditure,Total,True,,,
128,direct_expenditure,expenditure,,total direct expenditure,Total,True,,,
129,total_current_expend,expenditure,,total current expenditure,Total,True,,,
130,total_current_oper,expenditure,,total current operations,Total,True,,,
...,...,...,...,...,...,...,...,...,...
400,elec_util_total_exp,expenditure,,electric utilities,Electricity,True,,,
405,gas_util_total_exp,expenditure,,gas utilities,Gas Utilities,True,,,
410,trans_util_total_exp,expenditure,,transit utilities,Transit Utilities,True,,,
415,emp_ret_total_expend,expenditure,,employee retirement,Retirement,True,Retirement,,


### Short time series
Expenditure time series from 2000-2021.

In [4]:
# Join key: the lower-cased item name, matched against item_codes['item'].
temp_cty['item'] = temp_cty['item_formal'].str.lower()

# Left join keeps every survey row, including items without a lookup entry.
temp_merged = pd.merge(temp_cty, item_codes, how="left", on="item")
# NOTE(review): "X9" is presumably the name this column has in the R output -
# confirm.
temp_merged.rename(columns={"Unnamed: 8": "X9"}, inplace=True)

# Short series: expenditure items from the year 2000 onwards.
from_2000 = temp_merged['year'] >= 2000
is_expenditure = temp_merged['rev_exp_debt'] == "expenditure"
temp_short = temp_merged.loc[from_2000 & is_expenditure]
    


In [5]:
# Read the short expenditure series stored as an RDS file.
# NOTE(review): presumably this is the R-produced counterpart of temp_short,
# loaded for comparison - confirm. The commented-out line below would write
# the Python version to a separate "_py" file.
#pyreadr.write_rds('../data/temp/county_short_expenditure_py.RDS', temp_short)
cty_r = pyreadr.read_r('../data/temp/county_short_expenditure.RDS')



In [6]:
# Pull the data frame out of the read_r result, which is indexed by key.
# NOTE(review): per the pyreadr docs an RDS file holds a single unnamed
# object stored under the key None - confirm.
cty_r_pd = cty_r[None]

#temp_short.sort_values(by = ['fips', 'year']).reset_index(drop = True).to_csv('../data/temp/county_short_expenditure_py.csv', index = False)


False

### Full time series
Expenditure time series from 1967-2021.

In [6]:
# Full series: every expenditure item, with no restriction on year.
is_expenditure = temp_merged['rev_exp_debt'] == "expenditure"
temp_full = temp_merged.loc[is_expenditure]

# Export ordered by county and year, with a fresh 0..n-1 index that is not
# written to the file.
(
    temp_full
    .sort_values(by=['fips', 'year'])
    .reset_index(drop=True)
    .to_csv('../data/temp/county_full_expenditure_py.csv', index=False)
)

