In [13]:
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from solver.processing import read_datasets
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Identifier of the turbine to process; it is interpolated into the input
# and output file names ('turbine_<id>.csv').
TURBINE_ID = 'R80711'
# Directory that holds the weather CSV ('weather_parameters.csv').
WEATHER_DIR = '../datasets/weather_data/'
# Directory that holds the per-turbine CSV files read by this notebook.
TURBINE_DIR = '../datasets/after_analysis/'

In [15]:
# Locate the input files: the per-turbine measurements and the shared weather file.
turbine_filename = 'turbine_{}.csv'.format(TURBINE_ID)
data_link = os.path.join(TURBINE_DIR, turbine_filename)
weather_link = os.path.join(WEATHER_DIR, 'weather_parameters.csv')

# Load both frames. Note the ordering: read_datasets takes the turbine path
# first and the weather path second, but its result is unpacked as
# (weather, turbine).
weather_data, turbine_data = read_datasets(data_link, weather_link)

In [16]:
# Inspect the column names of the turbine dataset.
turbine_data.columns

Index(['Date_time', 'Ba_avg', 'Cm_avg', 'Cosphi_avg', 'DCs_avg', 'Db1t_avg',
       'Db2t_avg', 'Ds_avg', 'Dst_avg', 'Gb1t_avg', 'Gb2t_avg', 'Git_avg',
       'Gost_avg', 'Na_c_avg', 'Nf_avg', 'Nu_avg', 'Ot_avg', 'P_avg', 'Q_avg',
       'Rbt_avg', 'Rm_avg', 'Rs_avg', 'Rt_avg', 'S_avg', 'Va_avg', 'Wa_avg',
       'Wa_c_avg', 'Ws1_avg', 'Ws2_avg', 'Ws_avg', 'Ya_avg', 'Yt_avg'],
      dtype='object')

In [17]:
# Inspect the column names of the weather dataset.
weather_data.columns

Index(['date_time', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour',
       'uvIndex', 'moon_illumination', 'moonrise', 'moonset', 'sunrise',
       'sunset', 'DewPointC', 'FeelsLikeC', 'HeatIndexC', 'WindChillC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'tempC', 'visibility', 'winddirDegree', 'windspeedKmph', 'location',
       'Date_time'],
      dtype='object')

In [18]:
# --- Weather features -------------------------------------------------------
# Convert wind speed from km/h to m/s (1 m/s = 3.6 km/h).
# NOTE: the converted values are written back into the 'windspeedKmph' column,
# so despite its name that column holds m/s from here on. Because the column
# is overwritten in place, run this cell only once per kernel session:
# re-running it would divide by 3.6 a second time.
weather_data['windspeedKmph'] = weather_data['windspeedKmph'] / 3.6

# Min-max scale wind speed and temperature into new '<column>_scaled' columns.
# fit_transform() refits the scaler on every call, so a single instance can be
# reused; after this cell `scaler` holds the fit for 'tempC'.
scaler = MinMaxScaler()
for column in ('windspeedKmph', 'tempC'):
    weather_data[column + '_scaled'] = scaler.fit_transform(weather_data[[column]])

# --- Turbine angle features -------------------------------------------------
# Convert the angle columns from degrees to radians (same in-place caveat as
# above: run once per kernel session).
for angle_column in ('Wa_avg', 'Ya_avg', 'Ba_avg'):
    turbine_data[angle_column] = turbine_data[angle_column] * np.pi / 180

# --- Time-of-day / time-of-year features ------------------------------------
# Seconds since the Unix epoch for every record.
# pd.Timestamp.timestamp treats naive timestamps as UTC, whereas
# datetime.datetime.timestamp interprets them in the local time zone of the
# machine running the notebook, which made the periodic features below depend
# on where the notebook was executed.
timestamp_s = turbine_data['Date_time'].map(pd.Timestamp.timestamp)
day = 24 * 60 * 60         # seconds per day
year = 365.2425 * day      # seconds per mean Gregorian year

# Encode the position within the day and within the year as sin/cos pairs so
# that the features are continuous across midnight and across New Year.
turbine_data['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
turbine_data['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
turbine_data['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
turbine_data['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

In [19]:
# Downsample the turbine data to hourly means, indexed by 'Date_time'.
# Averaging smooths short-lived spikes; it does not remove outliers outright.
# The lowercase 'h' alias is used because the uppercase 'H' frequency alias is
# deprecated in recent pandas releases.
# NOTE(review): the angle columns (e.g. 'Wa_avg', 'Ya_avg') are averaged
# arithmetically here, which is inaccurate for directions that wrap around a
# full turn - consider averaging their sin/cos components instead.
# NOTE: this cell consumes the 'Date_time' column, so it can only be run once
# per kernel session.
turbine_data = turbine_data.set_index('Date_time').resample(rule='1h').mean()

In [20]:
# Combine the hourly turbine records with the weather records that share the
# same 'Date_time'. pd.merge performs an inner join by default, so rows
# without a match on both sides are dropped.
merged = pd.merge(turbine_data, weather_data, on='Date_time')

# The weather frame also carries a lower-case 'date_time' column; it is not
# needed once the frames are joined, so leave it out of the result.
data = merged.drop(columns=['date_time'])

In [21]:
# Extra feature engineering: express the wind as a vector, following the
# approach of the TensorFlow time-series tutorial.
# Take detached copies of the wind speed and wind direction columns
# ('Wa_avg' is in radians after the earlier degree-to-radian conversion).
ws = data['Ws_avg'].values.copy()
wa = data['Wa_avg'].values.copy()

# Project the wind speed onto the x and y axes.
data['Wx'] = np.cos(wa) * ws
data['Wy'] = np.sin(wa) * ws

In [22]:
# Inspect the column names of the merged dataset, including the new 'Wx'/'Wy' features.
data.columns

Index(['Date_time', 'Ba_avg', 'Cm_avg', 'Cosphi_avg', 'DCs_avg', 'Db1t_avg',
       'Db2t_avg', 'Ds_avg', 'Dst_avg', 'Gb1t_avg', 'Gb2t_avg', 'Git_avg',
       'Gost_avg', 'Na_c_avg', 'Nf_avg', 'Nu_avg', 'Ot_avg', 'P_avg', 'Q_avg',
       'Rbt_avg', 'Rm_avg', 'Rs_avg', 'Rt_avg', 'S_avg', 'Va_avg', 'Wa_avg',
       'Wa_c_avg', 'Ws1_avg', 'Ws2_avg', 'Ws_avg', 'Ya_avg', 'Yt_avg',
       'Day sin', 'Day cos', 'Year sin', 'Year cos', 'maxtempC', 'mintempC',
       'totalSnow_cm', 'sunHour', 'uvIndex', 'moon_illumination', 'moonrise',
       'moonset', 'sunrise', 'sunset', 'DewPointC', 'FeelsLikeC', 'HeatIndexC',
       'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM',
       'pressure', 'tempC', 'visibility', 'winddirDegree', 'windspeedKmph',
       'location', 'windspeedKmph_scaled', 'tempC_scaled', 'Wx', 'Wy'],
      dtype='object')

In [23]:
# Directory that receives the feature-engineered dataset.
PATH_TO_DIR = '../datasets/after_feature_engineering'

# Create the directory (and any missing parents) if it does not exist.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
os.makedirs(PATH_TO_DIR, exist_ok=True)

# Save the merged frame. The file is tab-separated despite its '.csv'
# extension, and the frame's index is not written.
PATH_TO_SAVE = os.path.join(PATH_TO_DIR, 'turbine_{}.csv'.format(TURBINE_ID))
data.to_csv(PATH_TO_SAVE, sep='\t', index=False, encoding='utf-8')