In [1]:
import datetime
from fuzzywuzzy import fuzz
import pandas as pd
import re



In [2]:
substations = pd.read_csv('../data/California_Electric_Substations.csv',
                          usecols=['Name', 'ZIP_CODE', 'COUNTY'], dtype='str')
pge_substations = substations.sort_values('Name').reset_index(drop=True)
pge_substations[pge_substations.COUNTY == 'Tehachapi County']

Unnamed: 0,Name,ZIP_CODE,COUNTY


In [55]:
def clean_df(file_path):
    psps = pd.read_csv(file_path, dtype='str')
    if 'Distribution / Transmission' in psps.columns:
        psps['Distribution / Transmission'] = [
            x.upper() for x in psps['Distribution / Transmission']
        ]
        psps = psps[psps['Distribution / Transmission'] == 'DISTRIBUTION']
        psps.drop(columns='Distribution / Transmission', inplace=True)
    cleaned_circuit = [
        re.findall('.*?(?=\s\d{4,}\*?)', circuit)[0].strip()
        for circuit in psps['Circuit Name']
    ]
    psps['Circuit Name'] = cleaned_circuit
    cleaned_hftd = [
        int(max(re.findall('\d', hftd)))
        if len(re.findall('\d', hftd)) > 0 else 0
        for hftd in psps['HFTD Tier(s)']
    ]
    psps['HFTD Tier'] = cleaned_hftd
    psps.columns = [col_name.strip() for col_name in psps.columns]
    shorter_names = [
        re.sub(' Customers$', '', col_name) for col_name in psps.columns
    ]
    psps.columns = shorter_names
    psps.rename(columns={
        'De-Energization Date and Time (PDT)': 'DeEnergization Date and Time',
        'De-Energization Date and Time': 'DeEnergization Date and Time',
        'Restoration Date and Time (PDT)': 'Restoration Date and Time',
        'Counties': 'Key Communities',
        'Commercial/Industrial': 'Commercial / Industrial'
        },
        inplace=True
    )
    fixed_communities = [
        re.sub('[\r\n\s]', ' ', comm) for comm in psps['Key Communities']
    ]
    def get_times(str_time):
        try:
            time = pd.to_datetime(str_time, format='%d/%m/%y %H:%M')
        except:
            time = pd.to_datetime(str_time)
        return time
    start_time = [
        get_times(time) for time in psps['DeEnergization Date and Time']
    ]
    end_time = [get_times(time) for time in psps['Restoration Date and Time']]
    psps['deenergize_time'] = start_time
    psps['restoration_time'] = end_time
    psps['time_out_min'] = (
        (psps.restoration_time - psps.deenergize_time) / pd.Timedelta('1m')
    )
    psps['Key Communities'] = fixed_communities
    for col in ['HFTD Tier', 'Total', 'Residential', 'Commercial / Industrial',
        'Medical Baseline', 'Other']:
        psps[col] = psps[col].astype(int)
    psps = psps[[    
        'Circuit Name', 'deenergize_time', 'restoration_time', 'time_out_min',
        'Key Communities', 'HFTD Tier', 'Total', 'Residential',
        'Commercial / Industrial', 'Medical Baseline', 'Other'
    ]]
    return psps

In [56]:
file_names = [
    '../data/PSPS-{}-circuits.csv'.format(dt)
    for dt in ['01.19.21', '08.17.21', '09.20.21', '10.11.21', '10.14.21']
]
file_names

data = pd.concat([clean_df(file) for file in file_names], axis=0).\
    reset_index(drop=True)

In [57]:
data

Unnamed: 0,Circuit Name,deenergize_time,restoration_time,time_out_min,Key Communities,HFTD Tier,Total,Residential,Commercial / Industrial,Medical Baseline,Other
0,AUBERRY,2021-01-19 01:06:00,2021-01-20 12:55:00,2149.0,"AUBERRY, SHAVER LAKE, TOLLHOUSE",3,923,827,94,61,2
1,BALCH NO 1,2021-01-19 05:53:00,2021-01-20 09:32:00,1659.0,FRESNO,2,25,13,12,1,0
2,CAL WATER,2021-01-19 06:49:00,2021-01-20 08:00:00,1511.0,BAKERSFIELD,2,13,0,10,0,3
3,DUNLAP,2021-01-19 05:46:00,2021-01-20 16:20:00,2074.0,"BADGER, KINGS CANYON, MIRAMONTE",2,663,597,56,28,10
4,DUNLAP,2021-01-19 02:46:00,2021-01-20 16:47:00,2281.0,"DUNLAP, HUME, KINGS CANYON, MIRAMONTE",2,477,407,65,19,5
...,...,...,...,...,...,...,...,...,...,...,...
226,LAMONT,2021-10-15 01:09:00,2021-10-15 15:10:00,841.0,KERN,2,2,0,1,0,1
227,MAGUNDEN,2021-10-15 01:05:00,2021-10-15 15:12:00,847.0,KERN,2,38,20,9,0,9
228,SCE TEHACHAPI,2021-10-15 01:16:00,2021-10-15 14:26:00,790.0,KERN,2,3,2,1,0,0
229,TEJON,2021-10-15 01:07:00,2021-10-16 16:32:00,2365.0,KERN,2,595,479,103,34,13


In [123]:
def most_similar_station(circuit, stns, thresh=80):
    sim_scores =[
        (
            fuzz.token_sort_ratio(circuit, stn.Name), stn.Name,
            stn.ZIP_CODE, stn.COUNTY
        )
        for _, stn in stns.iterrows()
    ]
    max_score = max([scores[0] for scores in sim_scores])
    if max_score < thresh:
        return (0, 'Default', None, None)
    return [scores for scores in sim_scores if scores[0] == max_score][0]

In [124]:
most_similar_station('CA', pge_substations)

(0, 'Default', None, None)

In [64]:
census_data = pd.read_csv('../data/ACSDP5Y2020.DP05-Data.csv')
census_data = census_data[
    [column for column in census_data.columns if re.search('E$', column)]
]
census_pop = census_data[['NAME', 'DP05_0001E']] # ZCTA, total population cols
census_pop.columns == ['name', 'total_pop']
census_pop.drop(index=0, inplace=True)
census_pop['ZCTA'] = [re.findall('\d{5}', obs)[0] for obs in census_pop.NAME]

zip_zcta = pd.read_excel(
    '../data/zip-code-zcta.xlsx', dtype='str'
)[['ZIP_CODE', 'ZCTA']]

# Join the two data sets
zip_census = pd.merge(census_pop, zip_zcta, how='inner', on='ZCTA')
zip_census.DP05_0001E = zip_census.DP05_0001E.astype(float)

  census_data = pd.read_csv('../data/ACSDP5Y2020.DP05-Data.csv')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census_pop.drop(index=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census_pop['ZCTA'] = [re.findall('\d{5}', obs)[0] for obs in census_pop.NAME]


In [65]:
zip_census.dtypes

NAME           object
DP05_0001E    float64
ZCTA           object
ZIP_CODE       object
dtype: object