In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read every column as a string (dtype=str); the first CSV column is the index.
all_shutoffs = pd.read_csv(
    '../data/processed/temp-shutoffs.csv', index_col=0, dtype=str
)
# Keep the shutoffs whose ZIP code is present. The explicit .copy() makes
# `nonmissing` an independent frame, so the column assignments below are not
# chained assignments on a slice of `all_shutoffs`.
nonmissing = all_shutoffs[all_shutoffs.zip_code.notnull()].copy()
# Parse both timestamp columns in one vectorized call each
for time_col in ['deenergize_time', 'restoration_time']:
    nonmissing[time_col] = pd.to_datetime(
        nonmissing[time_col], format='%Y-%m-%d %H:%M:%S'
    )

# Read the separately maintained file of shutoffs (judging by the file name,
# the rows whose ZIP codes were filled in externally -- confirm) and index it
# by its 'index' column.
updated_missing = pd.read_csv(
    '../data/processed/updated-missing-zips.csv', index_col=0, dtype=str
).set_index('index')
# Convert latitude/longitude to signed floats: strip the 'W'/'N' letter, then
# negate longitude and keep latitude positive.
updated_missing['longitude'] = -(
    updated_missing['longitude'].str.replace('W', '', regex=False).astype(float)
)
updated_missing['latitude'] = (
    updated_missing['latitude'].str.replace('N', '', regex=False).astype(float)
)

# Convert to datetime (this file uses a different timestamp layout)
for time_col in ['deenergize_time', 'restoration_time']:
    updated_missing[time_col] = pd.to_datetime(
        updated_missing[time_col], format='%m/%d/%y %H:%M'
    )

In [3]:
# Stack both sources under a fresh 0..n-1 index, then replace the substation
# name with a boolean flag: True unless the name is the literal 'MISSING'.
psps_shutoffs = (
    pd.concat([nonmissing, updated_missing], ignore_index=True)
    .assign(substn_present=lambda frame: frame['Substation Name'].ne('MISSING'))
    .drop(columns='Substation Name')
)

In [4]:
# Columns that were read as strings but hold numeric values
float_cols = [
    'time_out_min', 'hftd_tier', 'total_affected',
    'residential_affected', 'longitude', 'latitude',
]
# Cast them all to float in a single astype call
psps_shutoffs = psps_shutoffs.astype({name: float for name in float_cols})

In [5]:
# Add census data by ZIP code/ZCTA
census_data = pd.read_csv('../data/raw/2020-population.csv')
census_data = census_data[
    [column for column in census_data.columns if re.search('E$', column)]
]
# ZCTA, total population cols, median age, white population
census_pop = census_data[['NAME', 'DP05_0001E', 'DP05_0018E', 'DP05_0037E']] 
census_pop.columns = ['name', 'total_pop', 'median_age', 'white_pop']
census_pop.drop(index=0, inplace=True)
census_pop['ZCTA'] = [re.findall('\d{5}', obs)[0] for obs in census_pop.name]

income_data = pd.read_csv('../data/raw/2020-median-income.csv')
# ZCTA, median HH income
census_income = income_data[['NAME', 'S1901_C01_012E']]
census_income.columns = ['name', 'median_income']
census_income.drop(index=0, inplace=True)
census_income['ZCTA'] = [re.findall('\d{5}', obs)[0] for obs in census_income.name]

census_pop_income = pd.merge(census_pop, census_income, how='inner', on='ZCTA')

# Read in converter between ZCTA (census) and ZIP
zip_zcta = pd.read_excel(
    '../data/raw/zip-code-zcta.xlsx', dtype='str'
)[['ZIP_CODE', 'ZCTA']]

def convert_to_float(x):
    """Clean a raw value so that it can be cast to float.

    Despite its name, this function does not return a float for string
    input: it returns a cleaned *string*, and the caller is expected to
    perform the actual cast (e.g. ``.astype(float)``).

    Parameters
    ----------
    x : object
        A raw cell value. Strings (including ``str`` subclasses) are
        cleaned; any other value is passed through untouched.

    Returns
    -------
    object
        ``'nan'`` if ``x`` is exactly ``'-'``; otherwise, for strings, ``x``
        with every ``+``, ``,`` and ``-`` character removed (so
        ``'250,000+'`` becomes ``'250000'``). Note that this also removes a
        leading minus sign. Non-string input is returned as is.
    """
    # isinstance also accepts str subclasses, which `type(x) == str` skipped
    if not isinstance(x, str):
        return x
    # A lone dash becomes the string 'nan' so that a later float cast yields NaN
    if x == '-':
        return 'nan'
    # Strip plus signs, thousands separators and dashes
    return re.sub('[+,-]', '', x)
# Attach ZIP codes to the census figures through the shared ZCTA key
zip_census = census_pop_income.merge(zip_zcta, how='inner', on='ZCTA')
# Clean each census measure with convert_to_float, then cast it to float
census_measures = ['total_pop', 'median_age', 'white_pop', 'median_income']
zip_census = zip_census.assign(**{
    measure: zip_census[measure].apply(convert_to_float).astype(float)
    for measure in census_measures
})
# white_pop as a fraction of total_pop
zip_census['white_pct'] = zip_census['white_pop'].div(zip_census['total_pop'])

In [7]:
# Left-join the census columns onto every shutoff by ZIP code; shutoffs with
# no matching ZIP_CODE are kept, with missing values in the census columns.
census_cols = ['ZIP_CODE', 'total_pop', 'median_age', 'median_income', 'white_pct']
shutoffs_pop = psps_shutoffs.merge(
    zip_census[census_cols],
    how='left',
    left_on='zip_code',
    right_on='ZIP_CODE',
)
# The join key from the right-hand table duplicates zip_code, so drop it
shutoffs_pop = shutoffs_pop.drop(columns='ZIP_CODE')
# Save the result without writing the row index
shutoffs_pop.to_csv('../data/processed/processed-shutoffs.csv', index=False)
shutoffs_pop

Unnamed: 0,circuit_name,deenergize_time,restoration_time,time_out_min,key_communities,hftd_tier,total_affected,residential_affected,zip_code,longitude,latitude,substn_present,total_pop,median_age,median_income,white_pct
0,APPLE HILL-1103,2018-10-14 21:00:00,2018-10-15 16:29:00,1169.0,"Camino, Placerville",3.0,1809.0,1488.0,95709,-120.677165,38.739683,True,5634.0,50.9,71659.0,0.941605
1,APPLE HILL-1104,2018-10-14 21:17:00,2018-10-15 18:37:00,1280.0,"Camino, Placerville, Pollock Pines,",3.0,2261.0,2043.0,95709,-120.677165,38.739683,True,5634.0,50.9,71659.0,0.941605
2,APPLE HILL-2102,2018-10-14 21:05:00,2018-10-16 18:04:00,2699.0,"Camino, Grizzly Flats, Mount Aukum, Placervill...",3.0,4489.0,4013.0,95709,-120.677165,38.739683,True,5634.0,50.9,71659.0,0.941605
3,BONNIE NOOK-1102,2018-10-14 21:54:00,2018-10-15 18:04:00,1210.0,Alta,3.0,533.0,454.0,95701,-120.809292,39.214708,True,1064.0,58.2,85294.0,0.922932
4,CALISTOGA-1101,2018-10-14 20:34:00,2018-10-15 21:27:00,1493.0,Calistoga,3.0,1596.0,1216.0,94515,-122.578610,38.592088,True,7525.0,47.7,85263.0,0.791894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,VACA DIXON,2021-08-17 18:26:00,2021-08-18 16:36:00,1330.0,SOLANO,2.0,384.0,336.0,95687,-121.935900,38.328600,False,69073.0,38.0,92653.0,0.611860
2133,SCE TEHACHAPI,2021-09-21 01:09:00,2021-09-21 16:06:00,897.0,KERN,2.0,3.0,2.0,93561,-118.439700,35.077600,False,35133.0,41.0,65401.0,0.822702
2134,VACA DIXON,2021-10-11 07:01:00,2021-10-12 14:58:00,1917.0,SOLANO,2.0,6.0,4.0,95687,-121.935900,38.328600,False,69073.0,38.0,92653.0,0.611860
2135,VACA DIXON,2021-10-11 07:33:00,2021-10-12 14:31:00,1858.0,SOLANO,2.0,61.0,54.0,95687,-121.935900,38.328600,False,69073.0,38.0,92653.0,0.611860
