In [1]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
from datetime import datetime
from pytz import timezone
import pytz

In [2]:
RAW_DATA_DIR = "../data/raw"

print('Loading init weather data...')
# Explicit dtypes keep the frame small: the site id fits in one byte and
# every measurement column is stored as 32-bit float.
weather_dtypes = {'site_id': np.uint8}
weather_dtypes.update(
    dict.fromkeys(
        [
            'air_temperature',
            'cloud_coverage',
            'dew_temperature',
            'precip_depth_1_hr',
            'sea_level_pressure',
            'wind_direction',
            'wind_speed',
        ],
        np.float32,
    )
)

# Read the train and test weather files (train first) and stack them into a
# single frame with a fresh 0..n-1 index. The per-file frames are never bound
# to a name, so nothing has to be deleted afterwards.
weather = pd.concat(
    [
        pd.read_csv(
            pjoin(RAW_DATA_DIR, file_name),
            dtype=weather_dtypes,
            parse_dates=['timestamp'],
        )
        for file_name in ('weather_train.csv', 'weather_test.csv')
    ],
    ignore_index=True,
)

Loading init weather data...


In [3]:
# Preview the first rows of the combined (train + test) weather frame.
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [4]:
# Per-site lookup table (timezone name, country code, location); the file is
# semicolon-separated rather than comma-separated.
site_info = pd.read_csv('../data/site_info.csv', sep=';')
site_info

Unnamed: 0,site_id,timezone,country_code,location
0,0,US/Eastern,US,"Orlando, FL"
1,1,Europe/London,UK,"UK, Southhampton"
2,2,US/Mountain,US,"Tempe, AZ"
3,3,US/Eastern,US,"Washington, WA"
4,4,US/Pacific,US,"San Francisco, CA"
5,5,Europe/London,UK,"UK, London"
6,6,US/Eastern,US,Philadelphia
7,7,Canada/Eastern,CA,Montreal/Ottawa
8,8,US/Eastern,US,"Orlando, FL"
9,9,US/Central,US,"Austin, TX"


In [5]:
# Left join: every weather row is kept and gains the columns of its site.
weather = pd.merge(weather, site_info, how='left', on='site_id')

In [6]:
# Check that the site columns (timezone, country_code, location) were attached.
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL"
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL"
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL"
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL"
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL"


In [7]:
# strftime-style format: date, time, zone abbreviation (%Z) and UTC offset (%z)
# NOTE(review): fmt is not referenced by any of the visible cells - confirm it is still needed
fmt = '%Y-%m-%d %H:%M:%S %Z%z'
# pytz's UTC tzinfo object, used by the following cells to mark timestamps as UTC
utc = pytz.utc

In [8]:
# Sanity check on a single value (index label 1): localize() attaches the UTC
# zone to the naive timestamp without shifting the clock time.
utc.localize(weather.timestamp[1])

Timestamp('2016-01-01 01:00:00+0000', tz='UTC')

In [9]:
# Mark every (naive) weather timestamp as UTC. The clock values are unchanged;
# only the zone is attached. The vectorized .dt accessor does this for the whole
# column at once instead of calling utc.localize() once per row in Python.
# Like the per-row version, this raises if the column is already timezone-aware,
# so the cell must not be run twice on the same frame.
weather['timestamp'] = weather['timestamp'].dt.tz_localize(utc)

In [10]:
# The timestamp column should now display an explicit +00:00 offset.
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location
0,0,2016-01-01 00:00:00+00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL"
1,0,2016-01-01 01:00:00+00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL"
2,0,2016-01-01 02:00:00+00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL"
3,0,2016-01-01 03:00:00+00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL"
4,0,2016-01-01 04:00:00+00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL"


In [11]:
# Turn the timezone names (e.g. 'US/Eastern') into pytz tzinfo objects so they
# can be handed straight to astimezone() in the next step.
weather['timezone'] = weather['timezone'].map(timezone)

In [12]:
# Convert each UTC timestamp into the local time of its site. Only two columns
# are needed, so iterate over them directly: DataFrame.apply(axis=1) would build
# a full Series object for every row, which is far slower on a frame this size
# and does not return a Series at all when the frame is empty.
weather['timestamp_local'] = [
    ts.astimezone(tz)
    for ts, tz in zip(weather['timestamp'], weather['timezone'])
]

In [13]:
# Compare the UTC timestamp column with the new timestamp_local column.
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location,timestamp_local
0,0,2016-01-01 00:00:00+00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 19:00:00-05:00
1,0,2016-01-01 01:00:00+00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL",2015-12-31 20:00:00-05:00
2,0,2016-01-01 02:00:00+00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 21:00:00-05:00
3,0,2016-01-01 03:00:00+00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 22:00:00-05:00
4,0,2016-01-01 04:00:00+00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL",2015-12-31 23:00:00-05:00


In [15]:
# Persist the converted frame as a pickle file. At this point the timezone
# column holds tzinfo objects rather than plain strings.
weather.to_pickle("../data/weather_converted.pkl")

In [19]:
# use this function for weather data
# all site_info columns are kept after the merge (location may be useful later)
def convert_timestamps(data, site_info):
    """Attach site metadata and add each timestamp converted to site-local time.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``site_id`` column and a timezone-aware ``timestamp``
        column (e.g. UTC); ``astimezone`` is called on every value.
    site_info : pandas.DataFrame
        Lookup table with a ``site_id`` column and a ``timezone`` column holding
        timezone names accepted by ``pytz.timezone`` (e.g. ``'US/Eastern'``).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` left-joined with ``site_info``, where ``timezone``
        holds tzinfo objects and ``timestamp_local`` holds the same instant as
        ``timestamp`` expressed in the site's zone. The input frame is not
        modified.
    """
    data = data.merge(site_info, on="site_id", how="left")
    data['timezone'] = data['timezone'].map(pytz.timezone)
    # Walk the two needed columns in lockstep instead of apply(axis=1), which
    # would construct a Series for every single row.
    data['timestamp_local'] = [
        ts.astimezone(tz)
        for ts, tz in zip(data['timestamp'], data['timezone'])
    ]
    # The original ended with a bare `data` expression and therefore returned
    # None, discarding all of the work above.
    return data
  

In [22]:
# use this function for meter readings
def localize_timestamps(data, site_info):
    """Attach site metadata and tag each naive local timestamp with its site's zone.

    Unlike a conversion, this keeps the clock time as-is and only attaches the
    site's timezone to it via the tz object's ``localize`` method.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``site_id`` column and a naive ``timestamp`` column.
    site_info : pandas.DataFrame
        Lookup table with a ``site_id`` column and a ``timezone`` column holding
        timezone names accepted by ``pytz.timezone`` (e.g. ``'US/Eastern'``).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` left-joined with ``site_info``, where ``timezone``
        holds tzinfo objects and ``timestamp_local`` holds the localized
        timestamps. The input frame is not modified.
    """
    # Merge against the site_info argument; the original referenced an
    # undefined global named `timezones` and failed with a NameError.
    data = data.merge(site_info, on="site_id", how="left")
    data['timezone'] = data['timezone'].map(pytz.timezone)
    # Walk the two needed columns in lockstep instead of apply(axis=1), which
    # would construct a Series for every single row.
    data['timestamp_local'] = [
        tz.localize(ts)
        for ts, tz in zip(data['timestamp'], data['timezone'])
    ]
    # The original ended with a bare `data` expression and returned None.
    return data