# Notebook for extracting data for CEP-related analysis

In [3]:
import pandas as pd
import requests
import numpy as np
import os
import glob
import re
from datetime import datetime
from zipfile import ZipFile
from io import BytesIO
import helper_functions as hf
from importlib import reload

### Extract and store all raw Excel (XLS/XLSX) files from the EIA-861 page

In [6]:
# Folders under the current working directory: raw downloads and prepared outputs
data_dir, process_dir = (
    os.path.join(os.getcwd(), folder) for folder in ('raw_data', 'prepared_data')
)

In [147]:
# Download the EIA-861 sales workbooks from the EIA web page.
# NOTE(review): 2012 and 2022 look like a start/end year range and data_dir like
# the download target -- confirm both (and whether the end year is inclusive)
# against hf.download_eia_861.
hf.download_eia_861(2012, 2022, data_dir)

Extracted Sales_Ult_Cust_2012.xlsx
Extracted Sales_Ult_Cust_2013.xls
Extracted Sales_Ult_Cust_2014.xls
Extracted Sales_Ult_Cust_2015.xlsx
Extracted Sales_Ult_Cust_2016.xlsx
Extracted Sales_Ult_Cust_2017.xlsx
Extracted Sales_Ult_Cust_2018.xlsx
Extracted Sales_Ult_Cust_2019.xlsx
Extracted Sales_Ult_Cust_2020.xlsx
Extracted Sales_Ult_Cust_2021.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2017.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2021.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2020.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2016.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2013.xls
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2012.xlsx
Reading in /Users/Darren/git-clones/data-projec

In [73]:
# Import to dataframes and merge
# NOTE(review): presumably reads every raw workbook found in data_dir and returns
# one combined frame; what the helper writes to process_dir is not visible here --
# confirm in helper_functions.process_and_merge_861.
eia_df = hf.process_and_merge_861(data_dir=data_dir, process_dir=process_dir)

Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2017.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2021.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2020.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2016.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2013.xls
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2012.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2014.xls
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2015.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_2019.xlsx
Reading in /Users/Darren/git-clones/data-projects/CEPs/etl_scripts/raw_data/Sales_Ult_Cust_20

## Extract and process: reshape sales data to one row per customer type

Unnamed: 0,YEAR,UTILITY_NAME,TESTING_VAR_NAME,TESTING_VALUE_NAME
0,2017,City of Aberdeen - (MS),UTILITY_NUMBER,55.0
1,2017,City of Abbeville - (LA),UTILITY_NUMBER,59.0
2,2017,A & N Electric Coop,UTILITY_NUMBER,84.0
3,2017,A & N Electric Coop,UTILITY_NUMBER,84.0
4,2017,Adams Electric Coop,UTILITY_NUMBER,97.0
...,...,...,...,...
75,2017,City of Aberdeen - (MS),INDUSTRIAL_CUSTOMERS,2
76,2017,City of Abbeville - (LA),INDUSTRIAL_CUSTOMERS,27
77,2017,A & N Electric Coop,INDUSTRIAL_CUSTOMERS,0
78,2017,A & N Electric Coop,INDUSTRIAL_CUSTOMERS,8


AttributeError: 'str' object has no attribute 'contains'

In [47]:
# Preview eia_df without the COMMERCIAL* / INDUSTRIAL* columns: the negative
# lookahead keeps only column names that do not start with either word.
exclusion = 'COMMERCIAL|INDUSTRIAL'
exclude = '^(?!{})'.format(exclusion)

eia_df.filter(regex=exclude)


Unnamed: 0,YEAR,UTILITY_NUMBER,UTILITY_NAME,PART,SERVICE_TYPE,DATA_TYPE,STATE,OWNERSHIP,BA_CODE,RESIDENTIAL_REVENUE,RESIDENTIAL_SALES_MWH,RESIDENTIAL_CUSTOMERS
0,2017,55.0,City of Aberdeen - (MS),A,Bundled,O,MS,Municipal,TVA,3644,32158,2595
1,2017,59.0,City of Abbeville - (LA),A,Bundled,O,LA,Municipal,MISO,5279.9,52746,4464
2,2017,84.0,A & N Electric Coop,A,Bundled,O,MD,Cooperative,PJM,235.3,2047,274
3,2017,84.0,A & N Electric Coop,A,Bundled,O,VA,Cooperative,PJM,38308.7,344508,31309
4,2017,97.0,Adams Electric Coop,A,Bundled,O,IL,Cooperative,MISO,16341,109301,8603
...,...,...,...,...,...,...,...,...,...,...,...,...
34831,2018,99999.0,Adjustment 2018,B,Energy,I,PA,,NYIS,606.4,7805,1241
34832,2018,99999.0,Adjustment 2018,B,Energy,I,PA,,PJM,44993.9,513574,11885
34833,2018,99999.0,Adjustment 2018,B,Energy,I,RI,,ISNE,9796,83136,4842
34834,2018,99999.0,Adjustment 2018,B,Energy,I,VA,,PJM,0,0,0


In [70]:
tiers = ['RESIDENTIAL', 'COMMERCIAL', 'INDUSTRIAL']
# Measure suffixes that follow each tier prefix in the wide frame. Not referenced
# in this cell; kept because other cells may rely on the name.
measures = ['REVENUE', 'SALES_MWH', 'CUSTOMERS']


def _tier_long_frame(wide_df, tier, all_tiers):
    """Return ``wide_df`` restricted to a single customer tier.

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Frame with shared columns plus one group of ``'<TIER>_...'`` columns
        per customer tier.
    tier : str
        The tier to keep (e.g. ``'COMMERCIAL'``).
    all_tiers : list of str
        Every tier prefix present in ``wide_df``, including ``tier``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column that does not start with one of
        the *other* tiers, a constant ``CUSTOMER_TYPE`` column equal to
        ``tier``, and with the leading ``'<tier>_'`` removed from column names.
    """
    other_tiers = [re.escape(name) for name in all_tiers if name != tier]

    if other_tiers:
        # Negative lookahead: keep columns that do NOT start with another tier.
        alternatives = '|'.join(other_tiers)
        tier_df = wide_df.filter(regex=f'^(?!{alternatives})')
    else:
        # With nothing to exclude the pattern would be '^(?!)', which never
        # matches and would silently drop every column -- keep them all instead.
        tier_df = wide_df

    tier_df = tier_df.assign(CUSTOMER_TYPE=tier)

    # Strip the tier only where it is a true prefix, mirroring the anchored
    # column selection above (a plain str.replace would also rewrite the text
    # if it appeared in the middle of a column name).
    prefix = f'{tier}_'
    tier_df.columns = [
        col[len(prefix):] if col.startswith(prefix) else col
        for col in tier_df.columns
    ]
    return tier_df


# One frame per tier, in the same order as `tiers`, all sharing the same columns.
dfs = [_tier_long_frame(eia_df, tier, tiers) for tier in tiers]

# Stack the per-tier frames into one long frame with a fresh 0..n-1 index.
pivot_df = pd.concat(dfs, axis=0).reset_index(drop=True)

Unnamed: 0,YEAR,UTILITY_NUMBER,UTILITY_NAME,PART,SERVICE_TYPE,DATA_TYPE,STATE,OWNERSHIP,BA_CODE,REVENUE,SALES_MWH,CUSTOMERS,CUSTOMER_TYPE
72477,2017,59313.0,"Palmco Power MA, LLC",B,Energy,O,MA,Retail Power Marketer,ISNE,0,0,0,INDUSTRIAL
87834,2014,3839.0,Coahoma Electric Power Assn,A,Bundled,O,MS,Cooperative,MISO,2770,29034,11,INDUSTRIAL
27910,2019,59557.0,RGS Energy,A,Bundled,O,CA,Behind the Meter,CISO,,75.2,372,RESIDENTIAL
82941,2013,10019.0,Kaw Valley Electric Coop Inc,A,Bundled,O,KS,Cooperative,WR,2372,20698,23,INDUSTRIAL
31021,2022,61131.0,Freepoint Energy Solutions LLC,B,Energy,O,NJ,Retail Power Marketer,PJM,0,0,0,RESIDENTIAL
101738,2018,7484.0,"Grand Electric Coop, Inc",A,Bundled,O,MT,Cooperative,SWPP,.,.,.,INDUSTRIAL
46833,2016,59624.0,"Vivint Solar, Inc.",A,Bundled,O,CA,Behind the Meter,LDWP,.,.,.,COMMERCIAL
54516,2014,18447.0,Tallahatchie Valley E P A,A,Bundled,O,MS,Cooperative,TVA,23834,185507,5865,COMMERCIAL
100562,2022,59943.0,Spruce Finance,A,Bundled,O,NV,Behind the Meter,NEVP,0,0,0,INDUSTRIAL
4368,2021,14006.0,Ohio Power Co,C,Delivery,O,OH,Investor Owned,PJM,486782,6092770,555097,RESIDENTIAL


In [68]:
# Spot-check the second per-tier frame (index 1 of dfs) after the column rename.
dfs[1].head()

Unnamed: 0,YEAR,UTILITY_NUMBER,UTILITY_NAME,PART,SERVICE_TYPE,DATA_TYPE,STATE,OWNERSHIP,BA_CODE,REVENUE,SALES_MWH,CUSTOMERS,CUSTOMER_TYPE
0,2017,55.0,City of Aberdeen - (MS),A,Bundled,O,MS,Municipal,TVA,4980.0,47295,661,COMMERCIAL
1,2017,59.0,City of Abbeville - (LA),A,Bundled,O,LA,Municipal,MISO,4082.7,37537,1032,COMMERCIAL
2,2017,84.0,A & N Electric Coop,A,Bundled,O,MD,Cooperative,PJM,40.0,346,27,COMMERCIAL
3,2017,84.0,A & N Electric Coop,A,Bundled,O,VA,Cooperative,PJM,15960.0,156370,3992,COMMERCIAL
4,2017,97.0,Adams Electric Coop,A,Bundled,O,IL,Cooperative,MISO,5729.0,55057,222,COMMERCIAL


In [54]:
# Preview the column names with the literal 'RESIDENTIAL_' text removed.
eia_df.columns.str.replace('RESIDENTIAL_', '', regex=False).tolist()

['YEAR',
 'UTILITY_NUMBER',
 'UTILITY_NAME',
 'PART',
 'SERVICE_TYPE',
 'DATA_TYPE',
 'STATE',
 'OWNERSHIP',
 'BA_CODE',
 'REVENUE',
 'SALES_MWH',
 'CUSTOMERS',
 'COMMERCIAL_REVENUE',
 'COMMERCIAL_SALES_MWH',
 'COMMERCIAL_CUSTOMERS',
 'INDUSTRIAL_REVENUE',
 'INDUSTRIAL_SALES_MWH',
 'INDUSTRIAL_CUSTOMERS']

In [72]:
# Re-import helper_functions so edits to the module take effect without
# restarting the kernel.
reload(hf)

<module 'helper_functions' from '/Users/Darren/git-clones/data-projects/CEPs/etl_scripts/helper_functions.py'>