In this notebook we create our train and test sets. The original dataset is from the 2014 Global Energy Forecasting Competition, and is split into 15 Tasks. For our capstone project we concatenate the 15 Tasks to1 Task.

In [196]:
# Import Libraries
import pandas as pd
import numpy as np

In [197]:
# Import Train data and concatenate all zones (wind farms)
df_train = pd.read_csv('../data/GEFCom2014Data/Wind/Task15/Task15_W_Zone1_10/Task15_W_Zone1.csv')

for zone in range(2,11):
    df_zone = pd.read_csv('../data/GEFCom2014Data/Wind/Task15/Task15_W_Zone1_10/Task15_W_Zone{}.csv'.format(zone))
    df_train = pd.concat([df_train, df_zone], axis=0)


df_train.describe()

Unnamed: 0,ZONEID,TARGETVAR,U10,V10,U100,V100
count,168000.0,167888.0,168000.0,168000.0,168000.0,168000.0
mean,5.5,0.36048,0.818764,0.089091,1.282886,0.009308
std,2.87229,0.310831,3.034753,3.116356,4.823644,5.113098
min,1.0,0.0,-8.371644,-12.923576,-12.425352,-19.791997
25%,3.0,0.081003,-1.501748,-2.158184,-2.410327,-3.864192
50%,5.5,0.281302,0.54512,0.363812,1.05605,0.709303
75%,8.0,0.606016,2.805488,2.34191,4.753575,4.072244
max,10.0,1.0,14.837177,11.401396,21.399071,16.304799


In [198]:
# Import test data (explanatory variables) and concatenate all wind farms
x_test = pd.read_csv('../data/GEFCom2014Data/Wind/Task15/TaskExpVars15_W_Zone1_10/TaskExpVars15_W_Zone1.csv')

for zone in range(2,11):
    df_zone = pd.read_csv('../data/GEFCom2014Data/Wind/Task15/TaskExpVars15_W_Zone1_10/TaskExpVars15_W_Zone{}.csv'.format(zone))
    x_test = pd.concat([x_test, df_zone], axis=0)

x_test.describe()
    

Unnamed: 0,ZONEID,U10,V10,U100,V100
count,7440.0,7440.0,7440.0,7440.0,7440.0
mean,5.5,0.812514,0.837639,1.044339,1.16958
std,2.872474,2.964068,2.7552,4.640233,4.569751
min,1.0,-7.689974,-8.70571,-12.403897,-14.556674
25%,3.0,-1.409411,-0.897103,-2.251543,-1.501312
50%,5.5,0.604219,1.260099,1.024533,2.08319
75%,8.0,2.802365,2.730859,4.320772,4.396414
max,10.0,9.901569,8.53498,13.751663,13.195673


In [199]:
# Import target variable (Test data) 
y_test = pd.read_csv('../data/GEFCom2014Data/Wind/SolutiontoTask15/solution15_W.csv')
y_test.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR
0,1,20131201 1:00,0.844469
1,1,20131201 2:00,0.795038
2,1,20131201 3:00,0.809792
3,1,20131201 4:00,0.550418
4,1,20131201 5:00,0.496476


In [200]:
y_test.describe()

Unnamed: 0,ZONEID,TARGETVAR
count,7440.0,7377.0
mean,5.5,0.315537
std,2.872474,0.289284
min,1.0,0.0
25%,3.0,0.067648
50%,5.5,0.227238
75%,8.0,0.507317
max,10.0,0.997743


In [201]:
# Merge explanatory and target variables for test data

x_test['key'] = x_test['ZONEID'].astype('str') + x_test['TIMESTAMP']
x_test.reset_index()
y_test['key'] = y_test['ZONEID'].astype('str') + y_test['TIMESTAMP']

df_test = pd.merge(y_test[['TARGETVAR', 'key']], x_test, on = 'key')
df_test.drop('key', inplace=True, axis= 1)

df_test


Unnamed: 0,TARGETVAR,ZONEID,TIMESTAMP,U10,V10,U100,V100
0,0.844469,1,20131201 1:00,0.269660,-6.067845,0.504866,-8.772318
1,0.795038,1,20131201 2:00,0.825381,-5.940164,1.213620,-8.551939
2,0.809792,1,20131201 3:00,0.898262,-5.868724,1.309446,-8.474350
3,0.550418,1,20131201 4:00,1.247326,-5.579415,1.885008,-8.363509
4,0.496476,1,20131201 5:00,1.590636,-4.981543,2.664878,-8.107269
...,...,...,...,...,...,...,...
7435,,10,20131231 20:00,1.032363,-6.281558,2.041033,-11.220655
7436,,10,20131231 21:00,1.702361,-6.202448,2.846245,-10.486079
7437,,10,20131231 22:00,5.086629,-1.261378,7.382256,-3.097656
7438,,10,20131231 23:00,4.183751,-1.580172,5.789054,-2.116548


In [202]:
# Merge test and train data in one dataframe
df = pd.concat([df_train, df_test], axis = 0)
df.describe()

Unnamed: 0,ZONEID,TARGETVAR,U10,V10,U100,V100
count,175440.0,175265.0,175440.0,175440.0,175440.0,175440.0
mean,5.5,0.358588,0.818499,0.120835,1.272769,0.058513
std,2.87229,0.310085,3.031781,3.105553,4.816235,5.096587
min,1.0,0.0,-8.371644,-12.923576,-12.425352,-19.791997
25%,3.0,0.080344,-1.498648,-2.116981,-2.404248,-3.768786
50%,5.5,0.27867,0.54888,0.41002,1.053103,0.785671
75%,8.0,0.602033,2.805379,2.364026,4.734727,4.091509
max,10.0,1.0,14.837177,11.401396,21.399071,16.304799


In [203]:
# Save merged dataframe to csv
df.to_csv('../data/GEFCom2014Data/Wind/raw_data.csv', index = False)

## Add Features

In [204]:
# Cast TIMESTAMP as datetime
df.TIMESTAMP = pd.to_datetime(df.TIMESTAMP)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175440 entries, 0 to 7439
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   ZONEID     175440 non-null  int64         
 1   TIMESTAMP  175440 non-null  datetime64[ns]
 2   TARGETVAR  175265 non-null  float64       
 3   U10        175440 non-null  float64       
 4   V10        175440 non-null  float64       
 5   U100       175440 non-null  float64       
 6   V100       175440 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 10.7 MB


In [205]:
# Add Column for HOUR, MONTH, WEEKDAY
df['HOUR'] = df.TIMESTAMP.dt.hour
df['MONTH'] = df.TIMESTAMP.dt.month
df['WEEKDAY'] = df.TIMESTAMP.dt.weekday

In [206]:
# Import holiday csv
df_holidays = pd.read_csv('../data/holidays.csv', delimiter=';')
df_holidays.YEAR = df_holidays.YEAR.astype('str')
df_holidays.eval('TIMESTAMP = Date + YEAR', inplace=True)
df_holidays.TIMESTAMP = pd.to_datetime(df_holidays.TIMESTAMP)

In [207]:
# Add column IS_HOLIDAY
df['IS_HOLIDAY'] = df.TIMESTAMP.dt.date.isin(df_holidays.TIMESTAMP.dt.date) * 1
#df[df.TIMESTAMP.dt.date.isin(df_holidays.TIMESTAMP.dt.date)]['TIMESTAMP'].dt.date.unique()
df.head()

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,IS_HOLIDAY
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,1
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,1
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,1
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,1
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,1


In [208]:
# Add columns for windspeed at the two different heights
df.eval('WS10 = (U10 ** 2 + V10 ** 2) ** 0.5', inplace=True)
df.eval('WS100 = (U100 ** 2 + V100 ** 2) ** 0.5', inplace=True)

In [209]:
# Add columns for wind direction at the two different heights
def uv_to_winddir(u,v):
    return (180 + 180 / np.pi * np.arctan2(u,v)) % 360

df['WD10'] = uv_to_winddir(df.U10, df.V10)
df['WD100'] = uv_to_winddir(df.U100, df.V100)

In [210]:
# Add columns for cardinal wind directions
def degrees_to_cardinal(d):
    dirs = ["N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
            "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"]
    ix = int((d + 11.25)/22.5)
    return dirs[ix % 16]

df['WD100CARD'] = df.WD100.apply(lambda x: degrees_to_cardinal(x))
df['WD10CARD'] = df.WD10.apply(lambda x: degrees_to_cardinal(x))

In [211]:
# Add columns for normed wind vector components (normed by ws)
df.eval('U100NORM = U100 / WS100', inplace=True)
df.eval('V100NORM = V100 / WS100', inplace=True)

In [212]:
# Save dataframe including new features to csv
df.to_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', index = False)