In [73]:
import glob
import json
import os
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil import tz

# Input locations, given relative to the notebook's working directory.
PATH_ELECTRICITY = "dataset/electricity_data_all/*"  # glob pattern; every match is read as a CSV
PATH_WEATHER = "dataset/melbourne-weather/melbourne-weather.json"
PATH_WEATHER2 = "dataset/melbourne-weather/melbourne-2020.csv"
# Columns listed by an earlier `df.columns` call (presumably on an electricity CSV):
# REGION, SETTLEMENTDATE, TOTALDEMAND, RRP, PERIODTYPE

In [74]:
# READ ELECTRICITY

# Per-day accumulators: each maps a day string to the list of readings
# collected for that day (RRP and TOTALDEMAND respectively).
day2rrp, day2demand = defaultdict(list), defaultdict(list)

def get_day_month(day_str):
    """Split a timestamp string into its day and its leading two date fields.

    The day is the first whitespace-separated token of ``day_str``.  The
    month is that token's first two '/'-separated fields joined back with
    '/'.  For example ``"2020/09/22 00:30:00"`` yields
    ``("2020/09/22", "2020/09")``.

    Returns:
        tuple[str, str]: ``(day, month)``.
    """
    date_token = day_str.split(maxsplit=1)[0]
    leading_fields = date_token.split('/')[:2]
    return date_token, '/'.join(leading_fields)

# Gather every TOTALDEMAND / RRP reading under its day key.  Files are visited
# in sorted order so the accumulation order does not depend on the filesystem,
# and the three columns are zipped directly instead of building a Series per
# row with iterrows().
for file in sorted(glob.glob(PATH_ELECTRICITY)):
    df = pd.read_csv(file)
    for stamp, demand, rrp in zip(df['SETTLEMENTDATE'], df['TOTALDEMAND'], df['RRP']):
        day, _ = get_day_month(stamp)  # the month part is not needed here
        day2demand[day].append(demand)
        day2rrp[day].append(rrp)

# Daily Average
# Replace each day's list of readings with its mean.  Both dicts are filled
# in lockstep above, so they share the same keys; snapshot the keys because
# the values are reassigned while looping.
for day in list(day2rrp):
    day2rrp[day] = np.mean(day2rrp[day])
    day2demand[day] = np.mean(day2demand[day])

In [75]:
# READ WEATHER

def is_holiday(time, holidays=None):
    """Return 1 if ``time`` is a Saturday, a Sunday or a listed holiday, else 0.

    Args:
        time: A date/datetime-like object providing ``weekday()`` and
            ``strftime()``.
        holidays: Optional collection of ``"MM/DD"`` strings to treat as
            holidays.  When omitted, the built-in list below is used.

    Returns:
        int: 1 for an off day, 0 otherwise.

    NOTE(review): several built-in entries (e.g. 03/09, 04/10, 06/08, 10/23)
    look like movable holidays pinned to a single year, yet the list is
    matched on month/day only and therefore applies to every year — confirm
    this is intended.
    """
    if holidays is None:
        # month/day
        holidays = ('01/01', '01/27', '03/09', '04/10', '04/25', '06/08',
                    '10/23', '12/25', '12/26', '12/27', '12/28', '12/29',
                    '12/30', '12/31')
    # weekday() is locale-independent (Monday == 0 ... Sunday == 6), unlike
    # comparing strftime('%A') against English day names.
    on_weekend = time.weekday() >= 5
    return int(on_weekend or time.strftime("%m/%d") in holidays)
    
def read_date(str_date):
    """Convert a UTC timestamp string to a Melbourne-local day and off-day flag.

    Args:
        str_date: Timestamp formatted as ``"YYYY-MM-DD HH:MM:SS +0000 UTC"``.

    Returns:
        tuple: ``(day, is_offday)`` where ``day`` is the local date formatted
        as ``"YYYY/MM/DD"`` and ``is_offday`` is the result of
        ``is_holiday`` for the local datetime.
    """
    source_zone, target_zone = tz.gettz('UTC'), tz.gettz('Australia/Melbourne')
    parsed = datetime.strptime(str_date, "%Y-%m-%d %H:%M:%S +0000 UTC")
    # The parsed value is naive; tag it as UTC before shifting to local time.
    local = parsed.replace(tzinfo=source_zone).astimezone(target_zone)
    return local.strftime("%Y/%m/%d"), is_holiday(local)

class Weather:
    """One weather observation unpacked from a nested record.

    ``dic`` must provide a ``'main'`` mapping with ``temp_min``, ``temp_max``,
    ``feels_like``, ``pressure`` and ``humidity``, and a ``'wind'`` mapping
    with ``speed`` and ``deg``.  Each value is stored on the instance as-is.
    """

    def __init__(self, dic):
        readings = dic['main']
        # These attributes carry the same name as their key under 'main'.
        for field in ('temp_min', 'temp_max', 'feels_like', 'pressure', 'humidity'):
            setattr(self, field, readings[field])
        wind = dic['wind']
        self.wind_speed = wind['speed']
        self.wind_deg = wind['deg']
        
# day -> list of Weather observations recorded on that (Melbourne-local) day
day2weather = defaultdict(list)
# day -> 1 if the day is a weekend/holiday according to is_holiday, else 0
day2offDay = defaultdict(int)

# First source: JSON records with nested 'main' / 'wind' sections.  The file
# is opened in a context manager so the handle is closed once parsing ends.
with open(PATH_WEATHER) as weather_file:
    data = json.load(weather_file)
for datum in data:
    day, is_offday = read_date(datum['dt_iso'])
    day2offDay[day] = is_offday
    day2weather[day].append(Weather(datum))

# Second source: a flat CSV.  Each row is reshaped into the nested layout
# that Weather expects so both sources end up in the same structures.
data2020 = pd.read_csv(PATH_WEATHER2)
for row in data2020.itertuples(index=False):
    day, is_offday = read_date(row.dt_iso)
    day2offDay[day] = is_offday
    dic = {
        'main': {
            'temp_min': row.temp_min,
            'temp_max': row.temp_max,
            'feels_like': row.feels_like,
            'pressure': row.pressure,
            'humidity': row.humidity,
        },
        'wind': {'speed': row.wind_speed, 'deg': row.wind_deg},
    }
    day2weather[day].append(Weather(dic))

In [76]:
def get_average(weather_list):
    """Average each weather attribute over a collection of observations.

    Args:
        weather_list: Iterable of objects exposing ``temp_min``, ``temp_max``,
            ``feels_like``, ``pressure``, ``humidity``, ``wind_speed`` and
            ``wind_deg``.

    Returns:
        tuple: The seven ``np.mean`` results, in the attribute order above.
    """
    fields = ('temp_min', 'temp_max', 'feels_like', 'pressure',
              'humidity', 'wind_speed', 'wind_deg')
    samples = {name: [] for name in fields}
    # Single pass over the input so one-shot iterables are handled too.
    for observation in weather_list:
        for name in fields:
            samples[name].append(getattr(observation, name))
    return tuple(np.mean(samples[name]) for name in fields)

In [77]:
cols = ['day', 'temp_min', 'temp_max', 'feels_like', 'pressure', 'humidity',
        'wind_speed', 'wind_deg', 'is_offday', 'demand', 'rrp']

# Build one record per day that has weather data, then create the frame in a
# single call.  DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration.
records = []
for day in day2weather.keys():
    avgs = get_average(day2weather[day])
    records.append({
        'day': day,
        'temp_min': avgs[0],
        'temp_max': avgs[1],
        'feels_like': avgs[2],
        'pressure': avgs[3],
        'humidity': avgs[4],
        'wind_speed': avgs[5],
        'wind_deg': avgs[6],
        'is_offday': day2offDay[day],
        # .get() avoids the defaultdict(list) fallback: a day with no
        # electricity data becomes NaN instead of an empty list in the frame.
        'demand': day2demand.get(day, np.nan),
        'rrp': day2rrp.get(day, np.nan),
    })

df = pd.DataFrame(records, columns=cols)

In [80]:
# Show the final ten rows of the combined frame as a quick sanity check.
df.iloc[-10:]

Unnamed: 0,day,temp_min,temp_max,feels_like,pressure,humidity,wind_speed,wind_deg,is_offday,demand,rrp
2091,2020/09/22,11.447917,15.277917,7.94125,1005.958333,61.166667,6.983333,327.916667,0,4300.026667,23.526875
2092,2020/09/23,9.339583,13.177917,6.275417,1011.583333,70.541667,6.1625,282.916667,0,4683.851875,32.785833
2093,2020/09/24,8.31625,12.05375,5.530417,1011.083333,64.083333,5.191667,322.5,0,4897.321458,50.574375
2094,2020/09/25,5.482,9.6028,2.7892,1002.76,83.48,5.324,278.4,0,5264.778333,52.275833
2095,2020/09/26,7.172083,11.1575,3.08625,1012.291667,78.083333,7.5875,249.166667,1,4445.623542,31.090625
2096,2020/09/27,7.632917,11.917917,5.662083,1024.416667,72.25,4.370833,203.125,1,4237.645417,53.05875
2097,2020/09/28,7.25375,11.949583,6.875417,1027.041667,74.416667,2.401667,187.166667,0,4777.130833,56.345417
2098,2020/09/29,10.448333,14.254167,8.205,1020.958333,66.416667,4.604167,219.583333,0,4669.8525,40.691875
2099,2020/09/30,12.7996,16.2588,10.7692,1010.4,71.92,5.688,308.8,0,4734.175417,37.216667
2100,2020/10/01,8.038,11.724,6.399,1013.0,77.6,4.01,300.0,0,4803.82,40.3


In [81]:
# Write the combined daily table to disk.  The target directory is created
# first so the cell also succeeds on a fresh checkout where it does not exist.
output_path = 'preprocessed/daily_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)