In [1]:
import pandas as pd
import numpy as np
import urllib.request

In [2]:
# Root folder holding every input and output file, relative to this notebook.
data_dir = '../data/'
# Sub-folder with the degree-day source files; also receives the exported CSV.
tmpr_dir = data_dir + 'controls/'

### State Codes

In [3]:
# Locations of the three lookup tables, all stored under <data_dir>/keys/.
msn_codes_file_loc = data_dir + 'keys/MSN.csv'
state_fips_file_loc = data_dir + 'keys/FIPS.csv'
state_ncdc_file_loc = data_dir + 'keys/NCDC.csv'

# MSN code lookup table.
msn_codes_key = pd.read_csv(msn_codes_file_loc)

# State FIPS lookup table, plus the names of its two descriptive columns.
state_fips_key = pd.read_csv(state_fips_file_loc)
state_fips_indicators = ['State Abbreviation', 'State Name']

# State NCDC lookup table. State names are upper-cased before the merge below
# (presumably to match the spelling used in the FIPS key -- confirm).
state_ncdc_key = pd.read_csv(state_ncdc_file_loc).assign(
    **{'State Name': lambda key: key['State Name'].str.upper()}
)

# Combined key: inner merge on every column name the two tables share.
state_codes_key = state_ncdc_key.merge(state_fips_key)

In [4]:
# Lookup from NCDC code to lower-case state name, used to label the
# degree-day records. The vectorised `.str.lower()` mirrors the `.str.upper()`
# used when the key is loaded, and `to_dict()` always yields one scalar name per
# code (the last one wins if a code is repeated).
ncdc_state_dict = (
    state_codes_key
    .set_index('NCDC Code')['State Name']
    .str.lower()
    .to_dict()
)

### Cooling Degree Days (CDD) and Heating Degree Days (HDD)

In [5]:
def _load_degree_days(file_loc, value_name):
    """Read one fixed-width degree-day file and reshape it to long format.

    Parameters
    ----------
    file_loc : str
        Path of the fixed-width text file to read.
    value_name : str
        Name of the value column in the result (e.g. 'CDD' or 'HDD').

    Returns
    -------
    pandas.DataFrame
        One row per record and monthly column, with columns 'NCDC Code' (int),
        'year' (str), 'month' (the label of the source column) and `value_name`.
    """
    # Column 0 is the record identifier; keep it as text so it can be sliced
    # by character position.
    raw = pd.read_fwf(file_loc, header=None, converters={0: str})
    record_id = raw[0].astype(str)

    # The first twelve columns after the identifier hold the monthly values.
    month_cols = [col for col in raw.columns if col != 0][:12]

    wide = raw.drop(columns=[0])
    wide['NCDC Code'] = record_id.str[:3].astype(int)  # leading three characters
    wide['year'] = record_id.str[-4:]                   # trailing four characters

    return wide.melt(
        id_vars=['NCDC Code', 'year'],
        value_vars=month_cols,
        var_name='month',
        value_name=value_name,
    )


# CDD
cdd_file_loc = tmpr_dir + 'CDD_State.txt'
data_cdd = _load_degree_days(cdd_file_loc, 'CDD')

# HDD
hdd_file_loc = tmpr_dir + 'HDD_State.txt'
data_hdd = _load_degree_days(hdd_file_loc, 'HDD')

# Merge the two measures on their shared keys, then replace the NCDC code with
# the state name. Codes missing from the lookup get a null state.
data_dd = data_cdd.merge(data_hdd, on=['year', 'month', 'NCDC Code'])
data_dd['state'] = data_dd['NCDC Code'].map(ncdc_state_dict.get)
data_dd = data_dd.drop('NCDC Code', axis=1)

# Remove undefined
# NOTE(review): this also drops any month whose CDD is exactly 99 or 999 --
# confirm the missing-value marker against the source file's documentation.
data_dd = data_dd.query('CDD != 99 & CDD != 999 & HDD != 9999')

In [27]:
# Preview the raw CDD file layout: column 0 is the record identifier (kept as
# text), the remaining columns are the values. `index` is not a documented
# read_fwf argument, so it is omitted.
pd.read_fwf(cdd_file_loc, header=None, converters={0: str}).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,10261895,5.0,0.0,18.0,49.0,156.0,360.0,420.0,437.0,366.0,29.0,4.0,1.0
1,10261896,4.0,3.0,12.0,118.0,320.0,355.0,488.0,522.0,306.0,61.0,20.0,2.0
2,10261897,4.0,11.0,56.0,39.0,144.0,465.0,497.0,415.0,309.0,121.0,8.0,3.0
3,10261898,19.0,2.0,45.0,13.0,269.0,446.0,457.0,421.0,298.0,37.0,1.0,1.0
4,10261899,5.0,0.0,24.0,32.0,331.0,436.0,466.0,490.0,226.0,105.0,12.0,2.0


In [91]:
# Write the cleaned monthly table into the configured controls directory
# (the same folder the source files were read from), without the row index.
data_dd.to_csv(tmpr_dir + 'state_monthly_hdd_cdd.csv', index=False)