# Load Data Prep

In [5]:
#importing packages needed for analysis
import os
import numpy as np
import pandas as pd
import math
from pandas import DataFrame

#working directory, useful when checking where the relative paths below resolve
path = os.getcwd()
#print(path)

#raw hourly load table: one row per region and day, one column per hour
load_raw = pd.read_csv('inputs/load_duration_curves_raw_data.csv')

#quick look at the first two rows, a blank line, then the overall row count
print(load_raw.head(2), end='\n\n')
print('number of rows in dataset =', len(load_raw))

     Region  Month  Day    Hour 1    Hour 2    Hour 3    Hour 4    Hour 5  \
0  ERC_REST      1    1   34,807    34,551    34,788    35,531    36,633    
1  ERC_REST      1    2   34,716    34,719    35,076    35,891    37,091    

     Hour 6    Hour 7  ...   Hour 15   Hour 16   Hour 17   Hour 18   Hour 19  \
0   37,780    38,831   ...   38,507    40,084    41,198    40,959    40,549    
1   38,207    38,720   ...   33,211    34,968    37,573    38,213    38,257    

    Hour 20   Hour 21   Hour 22   Hour 23   Hour 24  
0   39,766    38,510    37,012    35,811    35,061   
1   37,911    36,743    35,379    34,598    34,444   

[2 rows x 27 columns]

number of rows in dataset = 27010


In [6]:
#Organizing regional data

#work on a copy so the raw table loaded earlier stays untouched
load_org = load_raw.copy()
print('number of rows in dataset (including CN) =',len(load_org))

#Regional IDs: split each region name at its first underscore into
#a group part (column 0) and a subgroup part (column 1); column 2 keeps the full name
unique_r = pd.Series(load_org['Region'].unique()).dropna()
rl = unique_r.str.split("_",n=1,expand=True)
rl[2] = unique_r
#print(rl)
print('number of regions in dataset (including CN) =',len(unique_r))

#Filling in the subgroup for the regions that came out of the split without one
#print(rl[rl.isna().any(axis=1)])
rl.loc[rl[0] == 'NENGREST', 1] = 'REST'
rl.loc[rl[0] == 'FRCC', 1] = 'FRCC'

#Renaming the misnamed groups
#unique_g = pd.Series(rl[0].unique()).dropna()
#print(unique_g)
rl[0] = rl[0].replace('NENGREST','NENG').replace('WECC','WEC')
unique_g = pd.Series(rl[0].unique()).dropna()
print('number of regional groups in dataset (including CN) =',len(unique_g))
rl = rl.rename(columns={0 : "R_Group", 1: 'R_Subgroup', 2:'Region'})
#print(rl.head())

#Attaching the group / subgroup labels to every load row
#(how='right' keeps every row of the load table, in its original order)
load_org = rl.merge(load_org,on='Region',how='right')
print()
print(load_org.head(2))

#Dropping the Canadian regions (group "CN")
load_org = load_org[load_org['R_Group'].ne("CN")]
print()
print('number of rows in dataset after removing CN =',len(load_org))
unique_r = pd.Series(load_org['Region'].unique()).dropna()
print('number of regions in dataset (excluding CN) =',len(unique_r))
unique_g = pd.Series(load_org['R_Group'].unique()).dropna()
print('number of regional groups in dataset (excluding CN) =',len(unique_g))

#for testing only, otherwise keep the lines below commented out
#NOTE: use FRCC for one region, ERC for two regions
#load_org = load_org[load_org['R_Group']=="FRCC"]
#print('number of rows in dataset for testing =',load_org.shape[0])
#Next step: organize temporal data

number of rows in dataset (including CN) = 27010
number of regions in dataset (including CN) = 74
number of regional groups in dataset (including CN) = 10

  R_Group R_Subgroup    Region  Month  Day    Hour 1    Hour 2    Hour 3  \
0     ERC       REST  ERC_REST      1    1   34,807    34,551    34,788    
1     ERC       REST  ERC_REST      1    2   34,716    34,719    35,076    

     Hour 4    Hour 5  ...   Hour 15   Hour 16   Hour 17   Hour 18   Hour 19  \
0   35,531    36,633   ...   38,507    40,084    41,198    40,959    40,549    
1   35,891    37,091   ...   33,211    34,968    37,573    38,213    38,257    

    Hour 20   Hour 21   Hour 22   Hour 23   Hour 24  
0   39,766    38,510    37,012    35,811    35,061   
1   37,911    36,743    35,379    34,598    34,444   

[2 rows x 29 columns]

number of rows in dataset after removing CN = 22995
number of regions in dataset (excluding CN) = 63
number of regional groups in dataset (excluding CN) = 9


In [8]:
#Organize temporal data: reshape the hourly columns into long format
#(one row per region, day and hour) and attach the season of each month

#rename hour titles to just the value ('Hour 1' --> 1)
#NOTE: this renames the columns of load_org itself, not of a copy
load_org.columns = load_org.columns.str.replace('Hour ', '')
#print(load_org.head(2))

#melt function converts values in wide format to long format
load_dur = pd.melt(load_org,id_vars=['R_Group','R_Subgroup','Region','Month','Day'],var_name='Hour',value_name='Load')

#print(load_dur.dtypes)

#days are counted 1 to 365, not 1 to 31
unique_d = pd.Series(load_dur['Day'].unique()).dropna()
#print(unique_d.tail(2))

#turn hour values to numeric 
load_dur['Hour'] = pd.to_numeric(load_dur['Hour'],errors='coerce')
unique_h = pd.Series(load_dur['Hour'].unique()).dropna()
#print(unique_h.tail(2))

#turn load values to numeric (the raw values are text with thousands separators)
load_text = load_dur['Load'].str.replace(",","")
load_dur['Load'] = pd.to_numeric(load_text,errors='coerce')
#errors='coerce' turns unparseable text into NaN without complaint, so report
#any value that was present in the raw data but did not survive the conversion
n_coerced = int((load_dur['Load'].isna() & load_text.notna()).sum())
if n_coerced > 0:
    print('WARNING: number of load values that could not be parsed (set to NaN) =',n_coerced)
#print(load_dur.head(2))

#attach the season of each month; validate='many_to_one' raises a MergeError if
#a month is listed more than once in season_months.csv, which would otherwise
#silently duplicate load rows
season_month = pd.read_csv('inputs/season_months.csv')
load_dur = pd.merge(load_dur,season_month, on='Month', how='left', validate='many_to_one')

#organized long format data to new csv file
#(the row index is written too, as the first, unnamed column)
load_dur = load_dur[['Region','R_Group','R_Subgroup','Season','Month','Day','Hour','Load']]
load_dur.to_csv('outputs/load_long_format.csv')
print(load_dur.tail(2))
print('number of rows in dataset =',load_dur.shape[0])

         Region R_Group R_Subgroup  Season  Month  Day  Hour  Load
551878  WECC_WY     WEC         WY  winter     12  364    24  1791
551879  WECC_WY     WEC         WY  winter     12  365    24  1834
number of rows in dataset = 551880


# Solar Data Prep