In [1]:
# s/o https://www.kaggle.com/kernels/scriptcontent/22622838/download
# alignment but also improved correlation with log(target)

### Load raw weather data

In [2]:
from os.path import join as pjoin
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Directory that holds the raw weather CSV files.
RAW_DATA_DIR = "../data/raw"

print('Loading init weather data...')

# Explicit dtypes for the numeric weather columns: the site id is read as an
# unsigned 8-bit integer and every measurement as a 32-bit float.
weather_dtypes = dict(
    site_id=np.uint8,
    air_temperature=np.float32,
    cloud_coverage=np.float32,
    dew_temperature=np.float32,
    precip_depth_1_hr=np.float32,
    sea_level_pressure=np.float32,
    wind_direction=np.float32,
    wind_speed=np.float32,
)

# Read the train and test weather files with identical settings and stack them
# (train first, then test) under a fresh 0..n-1 index. The per-file frames only
# live inside the comprehension, so no separate cleanup step is required.
weather = pd.concat(
    [
        pd.read_csv(
            pjoin(RAW_DATA_DIR, csv_name),
            dtype=weather_dtypes,
            parse_dates=['timestamp'],
        )
        for csv_name in ('weather_train.csv', 'weather_test.csv')
    ],
    ignore_index=True,
)

Loading init weather data...


In [4]:
# Columns that together identify a single weather observation.
weather_key = ['site_id', 'timestamp']

# One air_temperature row per (site_id, timestamp): the first occurrence of a
# duplicated key is kept, rows are ordered by site and then by time, and the
# result is an independent copy that keeps the row labels of `weather`.
temp_skeleton = (
    weather.loc[:, [*weather_key, 'air_temperature']]
    .drop_duplicates(subset=weather_key, keep='first')
    .sort_values(by=weather_key, ascending=True)
    .copy()
)

In [5]:
# Hour of day at which the daily temperature peak is taken as the reference.
# Assumption carried over from the original analysis: the (1, 5, 12) site tuple
# has the most correct temperature peaks, at 14:00.
REFERENCE_PEAK_HOUR = 14

# Rank every hourly air_temperature within its (site_id, calendar date) group;
# tied temperatures share the average of their ranks.
temp_skeleton['temp_rank'] = temp_skeleton.groupby(
    ['site_id', temp_skeleton.timestamp.dt.date],
)['air_temperature'].rank(method='average')

# 2D frame: one row per site_id, one column per hour of day, holding the mean
# within-day temperature rank observed at that hour.
df_2d = temp_skeleton.groupby(
    ['site_id', temp_skeleton.timestamp.dt.hour]
)['temp_rank'].mean().unstack(level=1)

# Align scale: divide each row by its own maximum, so the peak of every site
# becomes 1 and the rows are comparable.
df_2d = df_2d.div(df_2d.max(axis=1), axis=0)

# Column position of the peak mean rank for every row (site).
peak_positions = df_2d.to_numpy().argmax(axis=1)

# Row positions ordered by where the peak sits, so that sites with similar
# peak hours end up next to each other.
site_ids_argmax_maxtemp = pd.Series(peak_positions).sort_values().index

# Offset (in hours) of each site's peak from the reference hour.
# The peak position is translated to its hour label through the column index,
# and the result is labelled by the real site_id, so the mapping stays correct
# even if an hour column is missing or the site ids are not exactly 0..n-1.
peak_hours = df_2d.columns.to_numpy()[peak_positions]
site_ids_offsets = pd.Series(
    peak_hours.astype(np.int64) - REFERENCE_PEAK_HOUR,
    index=df_2d.index,
)

# Reorder the rows by peak position and give them sortable display labels.
# enumerate() is used instead of a fixed range so any number of sites works.
df_2d = df_2d.iloc[site_ids_argmax_maxtemp]
df_2d.index = [
    f'idx={i:02d}_site_id={s:02d}' for i, s in enumerate(df_2d.index)
]

In [6]:
# check what offsets (in hours) we have
# Label the index of the offsets as site_id, then show the offsets (in hours)
# in ascending order.
site_ids_offsets.index = site_ids_offsets.index.set_names('site_id')
site_ids_offsets.sort_values(ascending=True)

site_id
1     0
5     0
12    0
0     5
8     5
3     6
6     6
7     6
11    6
14    6
15    6
9     7
13    7
4     8
10    8
2     9
dtype: int64

In [7]:
# Attach to every row the hour offset of its site, looked up by site_id.
temp_skeleton['offset'] = temp_skeleton.site_id.map(site_ids_offsets)

# Subtract the offset from the raw timestamp to obtain the aligned timestamp
# (an offset of 5 turns 2016-01-01 00:00 into 2015-12-31 19:00).
# The lowercase 'h' unit is used because the uppercase 'H' alias is deprecated
# in recent pandas releases.
temp_skeleton['timestamp_aligned'] = (
    temp_skeleton.timestamp
    - pd.to_timedelta(temp_skeleton.offset, unit='h')
)

temp_skeleton.head()

Unnamed: 0,site_id,timestamp,air_temperature,temp_rank,offset,timestamp_aligned
0,0,2016-01-01 00:00:00,25.0,16.0,5,2015-12-31 19:00:00
1,0,2016-01-01 01:00:00,24.4,15.0,5,2015-12-31 20:00:00
2,0,2016-01-01 02:00:00,22.799999,13.0,5,2015-12-31 21:00:00
3,0,2016-01-01 03:00:00,21.1,9.0,5,2015-12-31 22:00:00
4,0,2016-01-01 04:00:00,20.0,3.5,5,2015-12-31 23:00:00
