# Splitting Data

For consistency across different jupyter notebooks, we pre-generate train and test set a priori and save them as .pkl files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from constants import *

In [2]:
def train_test_split_by_turbine(group, test_size=0.2):
    train_set, test_set = train_test_split(group, test_size=test_size, random_state=42)
    return train_set, test_set

In [4]:
file_path = DATA_PATH + '/cleaned/full.pkl'
df_full = pd.read_pickle(file_path)

In [14]:
df_full.shape

(1018494, 110)

Split the data to have the same proportion of data point from all turbines in both train and test sets.

In [5]:
df = df_full
print(f"Total data points before removing NaNs: ", len(df))
df = df.dropna(subset=FEATURES + [OUTPUT_FEATURE] + [DATETIME_COL])
print(f"Total data points after removing NaNs: ", len(df))
df = df.reset_index(drop=False)

splits = df.groupby('turbine').apply(train_test_split_by_turbine)

df_full_train = pd.concat([split[0] for split in splits.tolist()])
df_full_train.reset_index(drop=True, inplace=True)
df_full_test = pd.concat([split[1] for split in splits.tolist()])
df_full_test.reset_index(drop=True, inplace=True)
print("Full Training Set Size: ", df_full_train.shape[0])
print("Full Testing Set Size: ", df_full_test.shape[0])

df_full_train.to_pickle(DATA_PATH + '/cleaned/train.pkl')
df_full_test.to_pickle(DATA_PATH + '/cleaned/test.pkl')

print("Preprocessed datasets saved successfully.")

Total data points before removing NaNs:  1018494
Total data points after removing NaNs:  1009707


  splits = df.groupby('turbine').apply(train_test_split_by_turbine)


Full Training Set Size:  807764
Full Testing Set Size:  201943
Preprocessed datasets saved successfully.


In [16]:
df_full_test.head()

Unnamed: 0,index,Date.time,Power.me,Power.sd,Power.min,Power.max,Pot.Power.me,Wind.speed.me,Wind.speed.sd,Wind.speed.min,...,Blade.ang.pitch.pos.C.sd,Blade.ang.pitch.pos.C.min,Blade.ang.pitch.pos.C.max,Gear.oil.inlet.press.me,Gear.oil.pump.press.me,Drive.train.acceleration.me,Tower.Acceleration.x,Tower.Acceleration.y,turbine,year
0,153867,2020-09-30 03:50:00,385.860034,56.342456,295.566345,449.544281,338.06798,5.715419,0.44795,4.89388,...,0.0,0.0,0.0,89.851377,376.600291,4.389306,26.209076,14.347769,1,2020
1,88280,2019-06-29 01:30:00,75.980338,26.151646,25.22488,135.335602,25.878889,3.079444,0.437013,2.10208,...,0.341673,0.0,1.0,75.669812,355.064588,6.624571,84.897789,33.753158,1,2019
2,49198,2018-09-03 00:00:00,262.332092,4.282705,252.99028,274.033264,260.394271,5.297819,0.396049,4.433159,...,,,,80.716408,380.343781,3.440585,8.381617,7.973517,1,2018
3,182807,2021-04-21 07:30:00,693.568533,241.455054,262.049866,1142.049316,529.457246,6.545107,1.369241,3.045797,...,0.0,0.0,0.0,125.330433,484.002443,7.685233,40.745541,31.793805,1,2021
4,8230,2017-11-21 04:10:00,1747.494141,240.466431,1200.684692,2095.986084,1712.140026,9.802714,1.107425,8.151685,...,,,,204.65889,739.837952,155.046234,74.603737,48.718048,1,2017
