## Import libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

tqdm.pandas()
np.random.seed(42)

## Load source datasets

In [2]:
train = pd.read_csv("../input/mh-renew-data/train.csv")
print(f"train: {train.shape}")
train.head()

train: (909604, 16)


Unnamed: 0,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,reactive_power,wind_direction_raw,wind_speed_raw,wind_speed_turbulence,turbine_id,Target
0,2021-02-19 20:18:00,816.636759,834.917206,31.69438,1159.616602,65.954214,917.897085,31.881972,31.504713,141.457644,165.501518,280.864782,7.057,0.544082,Turbine_108,47.582787
1,2021-04-27 04:55:00,419.107829,421.050873,12.894948,928.747996,59.571319,445.55425,32.423705,32.75577,89.186457,113.835236,299.55246,5.474937,0.469031,Turbine_18,46.070328
2,2021-01-25 06:26:00,1303.530558,1337.566142,16.648388,1201.219775,61.270498,1364.716003,11.446849,18.332985,230.622309,281.452253,84.960106,8.092457,0.622318,Turbine_105,39.989236
3,2021-10-30 03:47:00,61.494872,53.481008,28.388141,769.806122,40.674348,14.324897,34.253204,32.662889,66.211015,75.017531,87.261119,4.071032,0.760719,Turbine_15,46.056587
4,2021-03-15 00:39:00,593.514364,611.659108,31.519527,1046.916768,64.341763,599.020172,32.405586,31.466387,137.163938,160.202421,313.724818,6.357943,0.346068,Turbine_01,54.346095


In [3]:
test = pd.read_csv("../input/mh-renew-data/test.csv")
print(f"test: {test.shape}")
test.head()

test: (303202, 15)


Unnamed: 0,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,reactive_power,wind_direction_raw,wind_speed_raw,wind_speed_turbulence,turbine_id
0,2021-05-06 15:02:00,15.507537,19.956882,40.233264,410.538834,57.357366,84.008399,48.826597,45.363477,2.419202,1.468876,156.399396,2.438629,0.790435,Turbine_14
1,2021-12-04 06:11:00,472.577255,478.373881,12.306687,965.077563,51.796727,492.531362,27.457511,24.045565,96.15207,119.378235,56.443802,5.900371,0.544198,Turbine_19
2,2021-08-31 19:03:00,447.829615,451.158264,29.106771,948.035899,60.493098,393.151576,39.410553,35.715697,90.251529,117.192635,224.540263,5.924923,0.513481,Turbine_14
3,2021-09-19 19:01:00,254.014336,249.438965,27.0749,804.963776,55.741899,178.715101,29.476841,31.151895,89.273838,105.086257,182.151426,4.409707,0.448881,Turbine_120
4,2021-04-04 02:21:00,1174.930566,1202.654077,32.148434,1199.754858,67.272313,1115.850317,31.920549,30.864277,189.731989,236.05332,261.67218,7.822326,0.561465,Turbine_158


## Feature Engineering

In [4]:
train['turbine_id'] = train['turbine_id'].progress_apply(lambda x: int(x.split('_')[1]))
train['active_power_avg'] = (train['active_power_raw'] + train['active_power_calculated_by_converter'])/2.0
train['active_power_ratio'] = train['active_power_raw']/train['active_power_calculated_by_converter']

train['reactice_power_avg'] = (train['reactive_power'] + train['reactice_power_calculated_by_converter'])/2.0
train['reactice_power_ratio'] = train['reactive_power']/train['reactice_power_calculated_by_converter']

train['power_avg'] = (train['active_power_raw'] + train['active_power_calculated_by_converter'] + \
                      train['reactive_power'] + train['reactice_power_calculated_by_converter'])/4.0

train['power_ratio1'] = train['active_power_calculated_by_converter']/train['grid_power10min_average']
train['power_ratio2'] = train['active_power_calculated_by_converter']/train['reactice_power_calculated_by_converter']
train['power_ratio3'] = train['active_power_raw']/train['reactive_power']
train['power_ratio4'] = train['reactive_power']/train['grid_power10min_average']
train['power_ratio5'] = train['active_power_avg']/train['grid_power10min_average']
train['power_ratio6'] = train['reactice_power_avg']/train['grid_power10min_average']
train['power_ratio7'] = train['power_avg']/train['grid_power10min_average']

train['nacelle_temp_ratio'] = train['nacelle_temp']/train['nc1_inside_temp']
train['nacelle_temp_diff'] = train['nacelle_temp'] - train['nc1_inside_temp']
train['nacelle_ambient_temp_diff'] = train['nacelle_temp'] - train['ambient_temperature']
train['nacelle_ambient_temp_ratio'] = train['nacelle_temp']/train['ambient_temperature']
train['temp_avg'] = (train['ambient_temperature'] + train['generator_winding_temp_max'] + \
                     train['nc1_inside_temp'] + train['nacelle_temp'])/4.0

train['wind_direction_sin'] = np.sin(2 * np.pi * train['wind_direction_raw']/360)
train['wind_direction_cos'] = np.cos(2 * np.pi * train['wind_direction_raw']/360)

train.head()

100%|██████████| 909604/909604 [00:01<00:00, 563671.56it/s]


Unnamed: 0,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,...,power_ratio5,power_ratio6,power_ratio7,nacelle_temp_ratio,nacelle_temp_diff,nacelle_ambient_temp_diff,nacelle_ambient_temp_ratio,temp_avg,wind_direction_sin,wind_direction_cos
0,2021-02-19 20:18:00,816.636759,834.917206,31.69438,1159.616602,65.954214,917.897085,31.881972,31.504713,141.457644,...,0.89964,0.167208,0.533424,0.988167,-0.377259,-0.189667,0.994016,40.25882,-0.982075,0.188492
1,2021-04-27 04:55:00,419.107829,421.050873,12.894948,928.747996,59.571319,445.55425,32.423705,32.75577,89.186457,...,0.942824,0.22783,0.585327,1.010241,0.332065,19.860822,2.540202,34.411435,-0.869904,0.49322
2,2021-01-25 06:26:00,1303.530558,1337.566142,16.648388,1201.219775,61.270498,1364.716003,11.446849,18.332985,230.622309,...,0.967636,0.187612,0.577624,1.601575,6.886136,1.684597,1.101187,26.92468,0.996134,0.087849
3,2021-10-30 03:47:00,61.494872,53.481008,28.388141,769.806122,40.674348,14.324897,34.253204,32.662889,66.211015,...,4.013149,4.929479,4.471314,0.953572,-1.590315,4.274748,1.150582,33.994646,0.998858,0.047784
4,2021-03-15 00:39:00,593.514364,611.659108,31.519527,1046.916768,64.341763,599.020172,32.405586,31.466387,137.163938,...,1.005954,0.248211,0.627082,0.971017,-0.939198,-0.05314,0.998314,39.933316,-0.722668,0.691196


In [5]:
test['turbine_id'] = test['turbine_id'].progress_apply(lambda x: int(x.split('_')[1]))
test['active_power_avg'] = (test['active_power_raw'] + test['active_power_calculated_by_converter'])/2.0
test['active_power_ratio'] = test['active_power_raw']/test['active_power_calculated_by_converter']

test['reactice_power_avg'] = (test['reactive_power'] + test['reactice_power_calculated_by_converter'])/2.0
test['reactice_power_ratio'] = test['reactive_power']/test['reactice_power_calculated_by_converter']

test['power_avg'] = (test['active_power_raw'] + test['active_power_calculated_by_converter'] + \
                      test['reactive_power'] + test['reactice_power_calculated_by_converter'])/4.0

test['power_ratio1'] = test['active_power_calculated_by_converter']/test['grid_power10min_average']
test['power_ratio2'] = test['active_power_calculated_by_converter']/test['reactice_power_calculated_by_converter']
test['power_ratio3'] = test['active_power_raw']/test['reactive_power']
test['power_ratio4'] = test['reactive_power']/test['grid_power10min_average']
test['power_ratio5'] = test['active_power_avg']/test['grid_power10min_average']
test['power_ratio6'] = test['reactice_power_avg']/test['grid_power10min_average']
test['power_ratio7'] = test['power_avg']/test['grid_power10min_average']

test['nacelle_temp_ratio'] = test['nacelle_temp']/test['nc1_inside_temp']
test['nacelle_temp_diff'] = test['nacelle_temp'] - test['nc1_inside_temp']
test['nacelle_ambient_temp_diff'] = test['nacelle_temp'] - test['ambient_temperature']
test['nacelle_ambient_temp_ratio'] = test['nacelle_temp']/test['ambient_temperature']
test['temp_avg'] = (test['ambient_temperature'] + test['generator_winding_temp_max'] + \
                     test['nc1_inside_temp'] + test['nacelle_temp'])/4.0

test['wind_direction_sin'] = np.sin(2 * np.pi * test['wind_direction_raw']/360)
test['wind_direction_cos'] = np.cos(2 * np.pi * test['wind_direction_raw']/360)

test.head()

100%|██████████| 303202/303202 [00:00<00:00, 556042.45it/s]


Unnamed: 0,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,...,power_ratio5,power_ratio6,power_ratio7,nacelle_temp_ratio,nacelle_temp_diff,nacelle_ambient_temp_diff,nacelle_ambient_temp_ratio,temp_avg,wind_direction_sin,wind_direction_cos
0,2021-05-06 15:02:00,15.507537,19.956882,40.233264,410.538834,57.357366,84.008399,48.826597,45.363477,2.419202,...,0.211077,0.023141,0.117109,0.929073,-3.46312,5.130212,1.127512,47.945176,0.400359,-0.916359
1,2021-12-04 06:11:00,472.577255,478.373881,12.306687,965.077563,51.796727,492.531362,27.457511,24.045565,96.15207,...,0.965371,0.218799,0.592085,0.875737,-3.411947,11.738878,1.953862,28.901623,0.833344,0.552755
2,2021-08-31 19:03:00,447.829615,451.158264,29.106771,948.035899,60.493098,393.151576,39.410553,35.715697,90.251529,...,1.14331,0.263822,0.703566,0.906247,-3.694856,6.608926,1.227058,41.18153,-0.70141,-0.712758
3,2021-09-19 19:01:00,254.014336,249.438965,27.0749,804.963776,55.741899,178.715101,29.476841,31.151895,89.273838,...,1.408536,0.543771,0.976153,1.056826,1.675053,4.076995,1.150582,35.861384,-0.037541,-0.999295
4,2021-04-04 02:21:00,1174.930566,1202.654077,32.148434,1199.754858,67.272313,1115.850317,31.920549,30.864277,189.731989,...,1.065369,0.19079,0.628079,0.966909,-1.056273,-1.284158,0.960055,40.551393,-0.989456,-0.144837


In [6]:
train.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
test.replace([np.nan, np.inf, -np.inf], 0, inplace=True)

In [7]:
train.drop(['timestamp','active_power_raw','reactive_power','wind_direction_raw'], axis=1, inplace=True)
test.drop(['timestamp','active_power_raw','reactive_power','wind_direction_raw'], axis=1, inplace=True)

In [8]:
features = test.columns.tolist()
cat_cols = ['turbine_id']
num_cols = [col for col in features if col not in cat_cols]

In [9]:
scaler = MinMaxScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])
print(f"train: {train.shape} \ntest:{test.shape}")

train: (909604, 31) 
test:(303202, 30)


## Save processed datasets

In [10]:
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [11]:
## Good Day!!