In [1]:
import pandas as pd, tarfile

Since the source datasets are formatted in different ways, this notebook creates a single file that unifies the environment data.

### 1. Skip first hour

As illustrated below, the solar irradiance and rate consumption datasets start at hour 1, not at hour 0 as the other datasets do. Thus, we skip the first hour in all other datasets so that every dataset starts at the same time.

In [2]:
# Solar irradiance: keep only the year, hour and irradiance columns.
# The notebook treats this dataset (and rate consumption) as starting at hour 1, ...
solar_irradiance = pd.read_csv("../data/SolarIrradiance.csv")
solar_irradiance = solar_irradiance[["Year", "HOUR-PST", "Avg Global Horizontal [W/m^2]"]]
solar_irradiance.head(n=2)

Unnamed: 0,Year,HOUR-PST,Avg Global Horizontal [W/m^2]
0,2016,0,0.1291
1,2016,1,0.1592


In [3]:
# ... yet an hour 0 also shows up further down: display positional rows 22-24.
solar_irradiance.iloc[22:25, :]

Unnamed: 0,Year,HOUR-PST,Avg Global Horizontal [W/m^2]
22,2016,22,0.068
23,2016,23,0.0742
24,2016,0,0.0417


In [4]:
# However, this dataset (and, per the notebook narrative, the load data) starts at hour 0, and ...
wind_speed = pd.read_csv("../data/WindSpeed.csv")
wind_speed = wind_speed[["Year", "Hour", "Wind Speed  "]]  # the speed header carries two trailing spaces
# Convert m/s to km/h (factor 3.6), like in the given loading code.
wind_speed["Wind Speed  "] = 3.6 * wind_speed["Wind Speed  "]
wind_speed.head(n=2)

Unnamed: 0,Year,Hour,Wind Speed
0,2016.0,0.0,43.128
1,2016.0,1.0,45.9


In [5]:
# ... it also contains an hour 0 for the following days: display positional rows 23-25.
wind_speed.iloc[23:26, :]

Unnamed: 0,Year,Hour,Wind Speed
23,2016.0,23.0,23.904
24,2016.0,0.0,18.432
25,2016.0,1.0,14.04


Therefore, we skip the first row in all datasets other than the solar irradiance and rate consumption datasets.

In [6]:
# Grid price: select the year, hour and price columns, then give the price
# column (misspelled, with full-width parentheses, in the CSV header) a plain name.
rate_cons = (
    pd.read_csv("../data/rate_consumption_charge.csv")
    [['Year', 'HOUR-PST', 'Grid Elecricity Price（$/kWh）']]
    .rename(columns={'Grid Elecricity Price（$/kWh）': "Grid Electricity Price"})
)
rate_cons.head()

Unnamed: 0,Year,HOUR-PST,Grid Electricity Price
0,2016,0,0.06
1,2016,1,0.06
2,2016,2,0.06
3,2016,3,0.06
4,2016,4,0.06


In [7]:
import os

# Load the hourly facility load of every household file in the folder.
# `households` maps a running integer id (0, 1, ...) to that household's frame.
path = r"../data/residential_load_data_base"
households = {}

# os.listdir returns entries in arbitrary order, so sort them: household i
# (and therefore column i of the exported file) then always refers to the
# same file on every machine and every run. Hidden entries and
# sub-directories (e.g. .ipynb_checkpoints, .DS_Store) are skipped because
# read_csv would fail on them.
load_files = sorted(
    entry
    for entry in os.listdir(path)
    if not entry.startswith(".") and os.path.isfile(os.path.join(path, entry))
)

for household_id, file in enumerate(load_files):
    file_path = os.path.join(path, file)
    households[household_id] = pd.read_csv(file_path)[['Date/Time', 'Electricity:Facility [kW](Hourly)']]

# Keep `count` available with its previous meaning: the number of loaded households.
count = len(households)


In [8]:
# Preview the first three rows of household 0.
households[0].iloc[:3]

Unnamed: 0,Date/Time,Electricity:Facility [kW](Hourly)
0,01/01 01:00:00,0.793771
1,01/01 02:00:00,0.635087
2,01/01 03:00:00,0.579323


In [9]:
# Preview the first three rows of household 1.
households[1].iloc[:3]

Unnamed: 0,Date/Time,Electricity:Facility [kW](Hourly)
0,01/01 01:00:00,1.001662
1,01/01 02:00:00,0.823123
2,01/01 03:00:00,0.763152


In [10]:
def _drop_first_row(frame):
    """Return `frame` without its first row, re-indexed from 0."""
    return frame.iloc[1:].reset_index(drop=True)


# Drop the first row of wind_speed and of every household frame so that their
# beginnings are synced with the other datasets; solar_irradiance and rate_cons
# are left unchanged.
# Caution: this cell rebinds its own inputs, so re-running it drops one more
# row each time -- restart the kernel and run all cells instead.
# NOTE(review): the previews above show HOUR-PST 0 as the first solar/rate row
# and 01:00:00 as the first household row -- confirm the intended alignment.
wind_speed = _drop_first_row(wind_speed)
households = {key: _drop_first_row(load) for key, load in households.items()}


### 2. Cap data

Furthermore, the different datasets are of different lengths. Therefore, we pick the length of the shortest dataset to have features of equal lengths.

Note that most datasets contain about 8,640 hourly rows (360 days), slightly less than the full year (8,760 rows) that is available for the load data.

In [11]:
# Preview the first five rows of the grid price data.
rate_cons.head()

Unnamed: 0,Year,HOUR-PST,Grid Electricity Price
0,2016,0,0.06
1,2016,1,0.06
2,2016,2,0.06
3,2016,3,0.06
4,2016,4,0.06


In [12]:
# Row counts in a fixed order: wind, solar, grid price, then every household.
dataset_lengths = tuple(
    len(dataset)
    for dataset in (wind_speed, solar_irradiance, rate_cons, *households.values())
)
dataset_lengths

(8641, 8640, 8640, 8759, 8759)

In [13]:
# The shortest dataset determines the common length of all features.
min_length = min(dataset_lengths)
min_length

8640

In [14]:
# Cap every dataset to its first `min_length` rows so all features have equal length.
wind_speed, solar_irradiance, rate_cons = (
    dataset.iloc[:min_length]
    for dataset in (wind_speed, solar_irradiance, rate_cons)
)
households = {key: load.iloc[:min_length] for key, load in households.items()}

In [15]:
# Sanity check: every dataset should now report the same number of rows.
tuple(map(len, (wind_speed, solar_irradiance, rate_cons, *households.values())))

(8640, 8640, 8640, 8640, 8640)

### 3. Export

In [16]:
# Assemble one frame: the three environment features first, followed by one
# load column per household (named by the household's integer id).
environment_columns = {
    "Wind Speed": wind_speed["Wind Speed  "],
    "Solar Irradiance": solar_irradiance["Avg Global Horizontal [W/m^2]"],
    "Grid Electricity Price": rate_cons["Grid Electricity Price"],
}
household_columns = {
    key: load["Electricity:Facility [kW](Hourly)"] for key, load in households.items()
}
df = pd.DataFrame({**environment_columns, **household_columns})
df

Unnamed: 0,Wind Speed,Solar Irradiance,Grid Electricity Price,0,1
0,45.900,0.1291,0.06,0.635087,0.823123
1,46.872,0.1592,0.06,0.579323,0.763152
2,47.340,0.0850,0.06,0.569831,0.757195
3,40.608,0.0000,0.06,0.604316,0.805053
4,42.732,0.0000,0.06,0.817850,1.068934
...,...,...,...,...,...
8635,16.920,0.1145,0.09,2.339000,2.501663
8636,15.840,0.0155,0.09,2.090589,2.277813
8637,17.064,0.1539,0.09,1.654266,1.867344
8638,11.808,0.0348,0.06,1.223062,1.471720


In [17]:
# Export everything except the last row, without the index column.
# The slice is taken on the fly instead of dropping the row in place, so `df`
# is left untouched and re-running this cell always writes the same file
# (an in-place drop would remove one more row on every re-run).
df.iloc[:-1].to_csv("../data/cropped.csv", index=False)