# Splitting Data

For consistency across notebooks, we generate the train and test sets once, up front, and save them as `.pkl` files.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

from constants import *

In [10]:
def train_test_split_by_turbine(df, test_size=0.2):
    """Split a frame positionally into a leading train part and a trailing test part.

    Rows are not shuffled or sorted: the first ``1 - test_size`` share of the
    rows (in their current order) becomes the train set and the remaining rows
    become the test set. Despite the name, no grouping happens here; the
    function is meant to be called once per turbine's sub-frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. NOTE(review): the split is only chronological if the
        rows are already ordered by time -- confirm upstream.
    test_size : float, default 0.2
        Fraction of rows assigned to the test set; must lie in ``[0, 1]``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``; together they contain every row of ``df``
        exactly once.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    # Without this guard, test_size > 1 yields a negative split index and
    # iloc would silently return overlapping train/test frames.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # int() truncates, so a fractional row always lands in the test part.
    split_index = int(len(df) * (1 - test_size))
    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]
    return train_df, test_df

In [11]:
# Read the full cleaned dataset from its pickle under DATA_PATH.
file_path = f"{DATA_PATH}/cleaned/full.pkl"
df_full = pd.read_pickle(file_path)

In [12]:
# Shape of the loaded frame as (rows, columns).
df_full.shape

(847078, 117)

In [13]:
# Number of columns in the loaded frame.
len(df_full.columns)

117

In [14]:
# Print one column name per line so the whole schema can be scanned.
for col in df_full.columns:
    print(col)

Date.time
Wind.dir.std
Power.me
Power.sd
Power.min
Power.max
Pot.Power.me
Wind.speed.me
Wind.speed.sd
Wind.speed.min
Wind.speed.max
Power.factor.cosphi.me
Power.factor.cosphi.sd
Power.factor.cosphi.min
Power.factor.cosphi.max
Front.bearing.temp.me
Front.bearing.temp.sd
Front.bearing.temp.min
Front.bearing.temp.max
Rear.bearing.temp.me
Rear.bearing.temp.sd
Rear.bearing.temp.min
Rear.bearing.temp.max
Stator1.temp.me
Stator1.temp.sd
Stator1.temp.min
Stator1.temp.max
Nacelle.ambient.temp.me
Nacelle.ambient.temp.sd
Nacelle.ambient.temp.min
Nacelle.ambient.temp.max
Nacelle.temp.me
Nacelle.temp.sd
Nacelle.temp.min
Nacelle.temp.max
Transformer.temp.me
Transformer.temp.sd
Transformer.temp.min
Transformer.temp.max
Gear.oil.inlet.temp.me
Gear.oil.inlet.temp.sd
Gear.oil.inlet.temp.min
Gear.oil.inlet.temp.max
Generator.bearing.rear.temp.me
Generator.bearing.rear.temp.sd
Generator.bearing.rear.temp.min
Generator.bearing.rear.temp.max
Generator.bearing.front.temp.me
Generator.bearing.front.temp.sd
Ge

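Split the data so that every turbine contributes the same proportion of its data points to the train and test sets: within each turbine, the first 80% of the rows go to the train set and the remaining 20% go to the test set.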

In [15]:
# Drop every row that has a NaN in any of the FEATURES, OUTPUT_FEATURE or
# DATETIME_COL columns (all three names come from `constants`).
df = df_full
print("Total data points before removing NaNs: ", len(df))
df = df.dropna(subset=FEATURES + [OUTPUT_FEATURE] + [DATETIME_COL])
print("Total data points after removing NaNs: ", len(df))
# drop=False keeps the pre-dropna row labels as an 'index' column in the saved frames.
df = df.reset_index(drop=False)

# Split each turbine's rows separately by iterating over the groups.
# groupby('turbine').apply(...) operated on the grouping column, which pandas
# has deprecated (it emitted a warning here); once that behaviour is removed the
# 'turbine' column would vanish from the saved splits. Iterating keeps every
# column and visits the groups in the same sorted-key order.
splits = [
    train_test_split_by_turbine(turbine_df)
    for _, turbine_df in df.groupby('turbine')
]

# ignore_index=True gives both frames a fresh 0..n-1 index.
df_full_train = pd.concat([train_part for train_part, _ in splits], ignore_index=True)
df_full_test = pd.concat([test_part for _, test_part in splits], ignore_index=True)

# groupby silently skips rows whose 'turbine' is NaN; fail loudly if any row was lost.
assert len(df_full_train) + len(df_full_test) == len(df), \
    "train + test row count differs from the cleaned frame (NaN in 'turbine'?)"

print("Full Training Set Size: ", df_full_train.shape[0])
print("Full Testing Set Size: ", df_full_test.shape[0])

df_full_train.to_pickle(DATA_PATH + '/cleaned_temp/train.pkl')
df_full_test.to_pickle(DATA_PATH + '/cleaned_temp/test.pkl')

print("Preprocessed datasets saved successfully.")

Total data points before removing NaNs:  847078
Total data points after removing NaNs:  846968


  splits = df.groupby('turbine').apply(train_test_split_by_turbine)


Full Training Set Size:  677572
Full Testing Set Size:  169396
Preprocessed datasets saved successfully.


In [16]:
# Preview the first rows of the saved test split.
df_full_test.head()

Unnamed: 0,index,Date.time,Wind.dir.std,Power.me,Power.sd,Power.min,Power.max,Pot.Power.me,Wind.speed.me,Wind.speed.sd,...,Tower.Acceleration.x,Tower.Acceleration.y,turbine,year,Wind.dir.sin.me,Wind.dir.cos.me,Wind.dir.sin.min,Wind.dir.cos.min,Wind.dir.sin.max,Wind.dir.cos.max
0,155070,2020-10-08 12:20:00,16.658822,187.858789,61.860316,81.08596,338.451843,84.91643,3.877249,0.934621,...,99.675149,40.89904,1,2020,-0.530778,0.847511,0.143456,0.989657,-0.218287,0.975885
1,155071,2020-10-08 12:30:00,14.96006,211.689046,105.694689,58.154499,509.492035,107.945353,4.125634,1.062345,...,116.788789,38.283415,1,2020,-0.436318,0.899793,0.073106,0.997324,-0.112625,0.993638
2,155072,2020-10-08 12:40:00,15.584726,192.010172,119.162212,65.784187,540.276917,93.2612,3.990016,1.073107,...,109.23962,32.009303,1,2020,-0.511287,0.85941,-0.930056,0.367417,-0.042261,0.999107
3,155073,2020-10-08 12:50:00,15.180244,166.048429,69.653627,38.46735,338.739929,115.919239,4.197471,0.924172,...,92.240239,44.811341,1,2020,-0.654641,0.75594,-0.925617,0.378462,-0.059721,0.998215
4,155074,2020-10-08 13:00:00,15.787337,174.402625,47.739957,97.08139,323.598419,118.323776,4.219133,0.999748,...,100.512641,41.369501,1,2020,-0.662168,0.749355,-0.899591,0.436733,-0.198743,0.980052
