In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root data folder (relative to the notebook) and the sub-folder with the IMPLAN files.
data_dir = '../data/'
implan_dir = f'{data_dir}implan/'

## Import Data

In [3]:
# Files sitting directly inside implan_dir. Only the first os.walk entry (the
# top-level directory) is needed, so take it lazily instead of walking the
# whole tree; the default covers a directory that does not exist.
state_ind_sum_files = next(os.walk(implan_dir), (implan_dir, [], []))[2]

# Keep only the summary files, sorted so the load order does not depend on
# the order in which the filesystem happens to list them.
state_ind_sum_files = sorted(x for x in state_ind_sum_files if 'Summary' in x)

# Fail here with a clear message rather than later with an opaque error.
if not state_ind_sum_files:
    raise FileNotFoundError("No files containing 'Summary' found in " + implan_dir)

In [4]:
# One DataFrame per summary file. The first row of each CSV is skipped, and a
# 'State Name' column is added from the file name: every space-separated token
# except the last two, re-joined with spaces and upper-cased.
state_df_list = [
    pd.read_csv(implan_dir + summary_file, skiprows = 1).assign(
        **{'State Name': ' '.join(summary_file.split(' ')[:-2]).upper()}
    )
    for summary_file in state_ind_sum_files
]

In [5]:
# Stack the per-state frames row-wise into one table with a fresh 0..n-1 index.
state_ind_sum = pd.concat(objs = state_df_list, axis = 0, ignore_index = True)

### Keys

In [6]:
# MSN Codes Key
msn_codes_file_loc = f'{data_dir}keys/MSN_codes.csv'
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS Codes
state_fips_file_loc = f'{data_dir}keys/state_FIPS.csv'
state_fips_key = pd.read_csv(state_fips_file_loc)
# Label columns from the FIPS key that are dropped again after it has been merged in.
state_fips_indicators = ['State Abbreviation', 'State Name']

# State NCDC Codes (state names upper-cased, as is done for the IMPLAN state names)
state_ncdc_file_loc = f'{data_dir}keys/state_NCDC_codes.csv'
state_ncdc_key = pd.read_csv(state_ncdc_file_loc)
state_ncdc_key = state_ncdc_key.assign(
    **{'State Name': state_ncdc_key['State Name'].str.upper()}
)

# Merged State Codes Key: no `on=` is given, so pandas joins on the columns
# the two key tables have in common.
state_codes_key = pd.merge(state_ncdc_key, state_fips_key)

# Month Key: English month name -> month number (January = 1 ... December = 12)
month_key = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start = 1,
    )
}

The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


## Clean Data

In [7]:
# Industry codes kept from the summary data and the output column each one
# becomes. This mapping drives both the column selection and the rename below,
# so the two can no longer drift apart.
industry_cost_columns = {
    41: 'hydro_tot_input_costs',
    42: 'ff_tot_input_costs',
    43: 'nuclear_tot_input_costs',
    44: 'solar_tot_input_costs',
    45: 'wind_tot_input_costs',
    46: 'geothermal_tot_input_costs',
    47: 'biomass_tot_input_costs',
}

# Add input costs (output minus total value added)
state_ind_sum['TotalInputCosts'] = state_ind_sum['Output'] - state_ind_sum['TotalValueAdded']

# Pivot Industry Codes: one row per state, one column per industry code.
# pivot raises if a (state, industry code) pair occurs more than once.
state_ind_sum = (
    state_ind_sum[['IndustryCode', 'State Name', 'TotalInputCosts']]
    .pivot(index = 'State Name', columns = 'IndustryCode', values = 'TotalInputCosts')
    .reset_index()
)

# Grab relevant industry columns and give them descriptive names.
# rename_axis drops the leftover 'IndustryCode' label that pivot puts on the column axis.
state_ind_sum = (
    state_ind_sum[['State Name'] + list(industry_cost_columns)]
    .rename(columns = industry_cost_columns)
    .rename_axis(columns = None)
)

# Add months: repeat every state row once per month number (1-12), with the
# same cost values in each copy. ignore_index gives the stacked frame a unique
# 0..n-1 index instead of twelve repeats of the same row labels.
state_ind_sum = pd.concat(
    [state_ind_sum.assign(month = month) for month in range(1, 13)],
    ignore_index = True,
)

In [8]:
# Preview the first five rows of the cleaned table.
state_ind_sum.head(n = 5)

Unnamed: 0,State Name,hydro_tot_input_costs,ff_tot_input_costs,nuclear_tot_input_costs,solar_tot_input_costs,wind_tot_input_costs,geothermal_tot_input_costs,biomass_tot_input_costs,month
0,ALABAMA,65607740.0,2463815000.0,672021500.0,0.0,7708770.0,0.0,90660610.0,1
1,ALASKA,85075010.0,393825300.0,0.0,0.0,5515352.0,0.0,10288720.0,1
2,ARIZONA,0.0,1470722000.0,300014.9,55583350.0,191145.1,0.0,348915.5,1
3,ARKANSAS,2236884.0,307277000.0,410285700.0,0.0,0.0,6248546.0,0.0,1
4,CALIFORNIA,513446800.0,8126158000.0,1495902000.0,155175300.0,204331800.0,238420300.0,484058700.0,1


## Merge Data

### Add FIPS codes

In [9]:
# Attach the FIPS key (no `on=` given, so the join uses the columns both frames
# share), then drop the state label columns listed in state_fips_indicators.
state_ind_sum = pd.merge(state_ind_sum, state_fips_key).drop(columns = state_fips_indicators)

## Export Data

In [10]:
# Write the processed table to the processed-data folder, without the row index.
state_ind_sum.to_csv(path_or_buf = data_dir + 'processed/implan_data.csv', index = False)