In [6]:
# data libraries
import numpy as np 
import pandas as pd 

# visualization libraries
import matplotlib.pyplot as plt 
import seaborn as sns
#import plotly.express as px


# custom eda utils
from utils.eda_utils import get_data_dict 

In [7]:
# Both metadata tables are stored as semicolon-separated CSV files.
static_information = pd.read_csv("../data/static-information.csv", sep=';')
data_description = pd.read_csv("../data/data_description.csv", sep=';')

In [8]:
static_information

Unnamed: 0,Wind_turbine_name,Wind_turbine_long_name,Manufacturer,Model,Rated power (kW),Hub height (m),Rotor diameter (m),GPS,Altitude (m),Commissioning date,Department,"Region,"
0,R80736,FRHBO_E04_80736,Senvion,MM82,2050,80,82,"48.4461,5.5925",411,2009-01-15,Meuse,Grand Est
1,R80721,FRHBO_E03_80721,Senvion,MM82,2050,80,82,"48.4497,5.5869",411,2009-01-15,Meuse,Grand Est
2,R80711,FRHBO_E01_80711,Senvion,MM82,2050,80,82,"48.4569,5.5847",411,2009-01-15,Meuse,Grand Est
3,R80790,FRHBO_E02_80790,Senvion,MM82,2050,80,82,"48.4536,5.5875",411,2009-01-15,Meuse,Grand Est


In [4]:
data_description

Unnamed: 0,Variable_name,Variable_long_name,Unit_long_name,Comment
0,Q,Reactive_power,kVAr,
1,Ws,Wind_speed,m/s,Average wind speed
2,Va2,Vane_position_2,deg,Second wind vane on the nacelle
3,Git,Gearbox_inlet_temperature,deg_C,
4,Ot,Outdoor_temperature,deg_C,
5,Ws2,Wind_speed_2,m/s,Second anemometer on the nacelle
6,Nf,Grid_frequency,Hz,
7,Nu,Grid_voltage,V,
8,Dst,Generator_stator_temperature,deg_C,
9,Wa_c,Absolute_wind_direction_corrected,deg,


In [5]:
data_description.iloc[[0,26,27,28]]

Unnamed: 0,Variable_name,Variable_long_name,Unit_long_name,Comment
0,Q,Reactive_power,kVAr,
26,S,Apparent_power,kVA,Should be the square root of the sum of P squa...
27,P,Active_power,kW,
28,Cosphi,Power_factor,,Should equal P/S


In [6]:
static_information['Rated power (kW)']

0    2050
1    2050
2    2050
3    2050
Name: Rated power (kW), dtype: int64

Rated Power: 
- The maximum continuous electrical power that the wind turbine is designed to produce under ideal conditions.
- Usually expressed in kW or MW
- Given by the manufacturer
- Determined by turbine design, generator limits, and grid connection capacity
- Represents capacity at rated wind speed (e.g., 12–14 m/s)

Active Power: 
- The usable, real, or working power produced by the turbine that is fed into the grid and can do work.
- Measured in kW or MW
- Converts wind energy → rotational → electrical energy
- This is the power that counts for:
  - Energy production (kWh/MWh)
  - Revenue
  - Performance calculations

Reactive Power:
- Power that does not produce useful work but is necessary for maintaining the voltage and magnetic fields in the electrical system.
- Measured in kVAr or MVAr
- Supports voltage control in the grid
- Does not contribute to energy generation
- Is often required by grid codes
- Can be produced or absorbed by turbines

Power Factor:
- A measure of how effectively electrical power is being converted into useful work.
- $\text{Power Factor} = \dfrac{\text{Active Power } (P)}{\text{Apparent Power } (S)}$
- Power Factor = Active Power (P) / Apparent Power (S)
- Values range:
  - 1.0 (perfect) — all power is useful
  - <1.0 — some power is reactive
    - Lagging (consuming reactive power)
    - Leading (generating reactive power)
- In wind turbines, power factor:
  - Is controlled by adjusting reactive power
  - Must remain within limits specified by grid operators
  - Is often kept close to 1 except when voltage support is needed

Apparent Power:
- $S = \sqrt{P^2 + Q^2}$, i.e. `np.sqrt(active**2 + reactive**2)`


# Load Sensor Data

In [9]:
# Turbine identifiers come from the static metadata table.
names_list = static_information['Wind_turbine_name'].tolist()
# NOTE(review): get_data_dict is a project helper; it presumably returns one
# frame per turbine name — confirm against utils/eda_utils.py.
data_dict = get_data_dict(names_list)

In [4]:
data_dict[0].head(1)

Unnamed: 0,Date_time,Date_time_nr,Wind_turbine_name,Ba_avg,P_avg,Q_avg,Ya_avg,Yt_avg,Ws1_avg,Ws2_avg,...,Rs_avg,Rbt_avg,Rm_avg,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h
0,2013-01-01T00:00:00+01:00,1356994800,R80736,-1.0,801.22998,67.559998,286.0,20.129999,7.52,7.76,...,16.950001,26.049999,4298.0498,5.39,1011.0,75.0,5.66,180.0,0.0,0.0


# First EDA

In [10]:
data_dict.keys()

dict_keys([0, 1, 2, 3])

In [10]:
# Short aliases for the entries stored under keys 0..3 of data_dict.
t0, t1, t2, t3 = (data_dict[key] for key in range(4))

In [40]:
#t0.info()
t1.info()
#t2.info()
#t3.info()



<class 'pandas.core.frame.DataFrame'>
Index: 261471 entries, 2013-01-01T00:00:00+01:00 to 2018-01-13T00:00:00+01:00
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date_time_nr       261471 non-null  int64  
 1   Wind_turbine_name  261471 non-null  object 
 2   Ba_avg             261471 non-null  float64
 3   P_avg              261471 non-null  float64
 4   Q_avg              261471 non-null  float64
 5   Ya_avg             261471 non-null  float64
 6   Yt_avg             261471 non-null  float64
 7   Ws1_avg            261471 non-null  float64
 8   Ws2_avg            261471 non-null  float64
 9   Ws_avg             261471 non-null  float64
 10  Wa_avg             261471 non-null  float64
 11  Va_avg             261471 non-null  float64
 12  Ot_avg             261471 non-null  float64
 13  Rs_avg             261471 non-null  float64
 14  Rbt_avg            261471 non-null  float64
 15  Rm_avg       

In [45]:
#t0.columns
#t1.columns
#t2.columns
t3.columns

Index(['Date_time_nr', 'Wind_turbine_name', 'Ba_avg', 'P_avg', 'Q_avg',
       'Ya_avg', 'Yt_avg', 'Ws1_avg', 'Ws2_avg', 'Ws_avg', 'Wa_avg', 'Va_avg',
       'Ot_avg', 'Rs_avg', 'Rbt_avg', 'Rm_avg', 'temp', 'pressure', 'humidity',
       'wind_speed', 'wind_deg', 'rain_1h', 'snow_1h'],
      dtype='object')

In [12]:
# Column groups used to slice the per-turbine frames.

# Identification columns.
var_index = ['Date_time_nr', 'Wind_turbine_name']

# Turbine (machine) measurements.
var_turb = [
    'Ba_avg', 'P_avg', 'Q_avg',
    'Ya_avg', 'Yt_avg', 'Va_avg',
    'Rs_avg', 'Rbt_avg', 'Rm_avg',
]

# Wind and temperature readings taken by the turbine's own sensors.
var_turb_sens = [
    'Ws1_avg', 'Ws2_avg', 'Ws_avg',
    'Wa_avg', 'Ot_avg',
]

# External weather columns.
var_ext = [
    'temp', 'pressure', 'humidity',
    'wind_speed', 'wind_deg',
    'rain_1h', 'snow_1h',
]

Index-type variables:
- Date_time_nr - timestamp (Unix epoch seconds)
- Wind_turbine_name - turbine identifier


Machine variables:
- Ba - pitch angle
- P - active power  !!!!!
- Q - reactive power
- Ya - nacelle angle
- Yt - nacelle temperature
- Va - vane position
- Rs - rotor_speed
- Rbt- Rotor_bearing_temperature
- Rm - Torque

Weather sensor variables:
- Ws1 - wind speed first sensor 
- Ws2 - wind speed second sensor 
- Ws - wind speed 
- Wa - wind angle
- Ot - outdoor temperature

Weather external:
- temp
- pressure
- humidity
- wind speed 
- wind deg
- rain_1h
- snow_1h

In [48]:
# Show the (rows, columns) shape of each turbine frame, in turbine order.
display(t0.shape, t1.shape, t2.shape, t3.shape)

(261138, 23)

(261471, 23)

(261812, 23)

(261414, 23)

In [52]:
#is nas
#t0.isna().sum()
#t1.isna().sum()
#t2.isna().sum()
t3.isna().sum()

Date_time_nr         0
Wind_turbine_name    0
Ba_avg               0
P_avg                0
Q_avg                0
Ya_avg               0
Yt_avg               0
Ws1_avg              0
Ws2_avg              0
Ws_avg               0
Wa_avg               0
Va_avg               0
Ot_avg               0
Rs_avg               0
Rbt_avg              0
Rm_avg               0
temp                 0
pressure             0
humidity             0
wind_speed           0
wind_deg             0
rain_1h              0
snow_1h              0
dtype: int64

In [13]:
t0[var_turb+var_ext]

Unnamed: 0,Ba_avg,P_avg,Q_avg,Ya_avg,Yt_avg,Va_avg,Rs_avg,Rbt_avg,Rm_avg,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h
0,-1.00,801.22998,67.559998,286.00000,20.129999,0.16,16.950001,26.049999,4298.0498,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
1,-1.00,943.16998,70.260002,286.00000,21.420000,2.33,17.139999,26.100000,5011.9600,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
2,-1.00,998.48999,75.330002,286.00000,22.049999,7.01,17.150000,26.219999,5303.4302,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
3,-1.00,837.96002,82.739998,286.00000,22.299999,8.05,16.910000,26.309999,4498.1299,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
4,-1.00,871.57001,82.349998,294.17999,22.600000,4.40,16.920000,26.389999,4674.0801,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261133,-0.64,52.09000,15.810000,49.63000,26.180000,-9.06,9.270000,24.720000,509.9100,1.86,1025.0,93.0,3.09,80.0,0.0,0.0
261134,0.23,7.08000,0.830000,49.63000,25.640000,-7.04,9.230000,24.840000,69.6500,1.86,1025.0,93.0,3.09,80.0,0.0,0.0
261135,11.96,0.00000,0.310000,49.63000,25.240000,-6.61,7.210000,24.840000,-48.3400,1.86,1025.0,93.0,3.09,80.0,0.0,0.0
261136,45.00,0.00000,0.000000,49.63000,25.240000,-8.58,0.830000,24.690000,-96.8900,1.86,1025.0,93.0,3.09,80.0,0.0,0.0


In [None]:
t0[var_ext].head()

Unnamed: 0_level_0,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h
Date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01T00:00:00+01:00,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
2013-01-01T00:10:00+01:00,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
2013-01-01T00:20:00+01:00,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
2013-01-01T00:30:00+01:00,5.39,1011.0,75.0,5.66,180.0,0.0,0.0
2013-01-01T00:40:00+01:00,5.39,1011.0,75.0,5.66,180.0,0.0,0.0


In [28]:
t0[var_ext].describe()

Unnamed: 0,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h
count,261138.0,261138.0,261138.0,261138.0,261138.0,261138.0,261138.0
mean,10.830234,1017.182348,79.102011,3.500007,172.146195,0.064361,0.001342
std,7.553452,8.240761,18.045931,2.163735,99.575915,0.315314,0.035095
min,-10.7,976.0,14.0,0.13,0.0,0.0,0.0
25%,5.13,1013.0,68.0,2.06,89.0,0.0,0.0
50%,10.38,1018.0,86.0,3.09,190.0,0.0,0.0
75%,16.0,1022.0,93.0,4.63,240.0,0.0,0.0
max,35.94,1044.0,100.0,19.03,360.0,8.06,2.65


In [29]:
t0[var_turb_sens].head(10)

Unnamed: 0_level_0,Ws1_avg,Ws2_avg,Ws_avg,Wa_avg,Ot_avg
Date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-01T00:00:00+01:00,7.52,7.76,7.64,286.19,5.44
2013-01-01T00:10:00+01:00,8.18,8.45,8.31,288.32999,5.74
2013-01-01T00:20:00+01:00,8.29,8.66,8.47,293.04001,6.09
2013-01-01T00:30:00+01:00,7.89,8.24,8.06,294.01999,6.35
2013-01-01T00:40:00+01:00,7.86,8.2,8.03,299.22,6.51
2013-01-01T00:50:00+01:00,7.62,7.93,7.78,294.82001,6.58
2013-01-01T01:00:00+01:00,7.92,8.2,8.06,297.51001,6.53
2013-01-01T01:10:00+01:00,7.83,8.14,7.99,298.57999,6.55
2013-01-01T01:20:00+01:00,7.09,7.33,7.21,296.01999,6.53
2013-01-01T01:30:00+01:00,6.95,7.16,7.05,294.09,6.44


In [15]:
t0.head(1)

Unnamed: 0,Date_time,Date_time_nr,Wind_turbine_name,Ba_avg,P_avg,Q_avg,Ya_avg,Yt_avg,Ws1_avg,Ws2_avg,...,Rs_avg,Rbt_avg,Rm_avg,temp,pressure,humidity,wind_speed,wind_deg,rain_1h,snow_1h
0,2013-01-01T00:00:00+01:00,1356994800,R80736,-1.0,801.22998,67.559998,286.0,20.129999,7.52,7.76,...,16.950001,26.049999,4298.0498,5.39,1011.0,75.0,5.66,180.0,0.0,0.0


In [60]:
def check_timestamp_consistency(df):
    """Summarise how regular the datetime index of ``df`` is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the timestamps to inspect.

    Returns
    -------
    str or dict
        The string ``"Index is not a DatetimeIndex"`` when ``df.index`` is not
        a ``pd.DatetimeIndex``. Otherwise a dict with the keys:

        - ``'is_sorted'``: whether the index is monotonically increasing.
        - ``'unique_diffs'``: distinct differences between consecutive
          timestamps.
        - ``'consistent_step'``: True when at most one distinct difference
          exists (vacuously True for fewer than two timestamps).
        - ``'missing_timestamps'``: timestamps of the regular grid (built from
          the most frequent positive step) that are absent from the index.
        - ``'duplicates'``: repeated timestamps (every occurrence after the
          first).
    """
    idx = df.index

    # 1. Ensure datetime index
    if not isinstance(idx, pd.DatetimeIndex):
        return "Index is not a DatetimeIndex"

    results = {}

    # 2. Check sorting
    results['is_sorted'] = idx.is_monotonic_increasing

    # 3. Calculate time differences (computed once and reused below)
    diffs = idx.to_series().diff().dropna()
    unique_diffs = diffs.unique()
    results['unique_diffs'] = unique_diffs

    # 4. Check if step length is constant
    results['consistent_step'] = len(unique_diffs) <= 1

    # 5. Check for missing timestamps.
    # Only strictly positive steps can define a sampling grid: a zero step
    # (duplicates) or a negative one (unsorted index) would make date_range
    # fail or produce nothing useful. With fewer than two timestamps there is
    # no step at all, so nothing can be reported as missing.
    positive_diffs = diffs[diffs > pd.Timedelta(0)]
    if positive_diffs.empty:
        missing = []
    else:
        step = positive_diffs.mode().iloc[0]
        expected = pd.date_range(start=idx.min(), end=idx.max(), freq=step)
        missing = list(expected.difference(idx))
    results['missing_timestamps'] = missing

    # 6. Check for duplicates
    results['duplicates'] = idx[idx.duplicated()].tolist()

    return results

result = check_timestamp_consistency(t0)
print(result)

Index is not a DatetimeIndex


In [17]:
time = t0['Date_time_nr'] 

In [18]:
time

0         1356994800
1         1356995400
2         1356996000
3         1356996600
4         1356997200
             ...    
261133    1515795600
261134    1515796200
261135    1515796800
261136    1515797400
261137    1515798000
Name: Date_time_nr, Length: 261138, dtype: int64

In [22]:
def give_time_discrepancies(data):
    """Return the distinct gaps between consecutive ``Date_time_nr`` values.

    The first row has no predecessor, so the result includes a NaN entry
    alongside the observed step sizes.
    """
    steps = data['Date_time_nr'].diff()
    return steps.unique()

In [24]:
# Distinct timestamp gaps for each turbine frame, in turbine order.
display(*map(give_time_discrepancies, (t0, t1, t2, t3)))

array([    nan,    600.,   1200.,  87000.,   4200.,   7800.,  28200.,
         2400.,  34800.,   6600.,   1800.,  19800.,  37200.,  42000.,
        17400.,  11400., 123600.,   3000.,  25200.,   6000.,  24600.,
        83400.,   4800.,  22800.,  27000.,  13800.,   8400.,   3600.,
         9600.,  16200.,  21600.,  42600.,  59400.,  97800.,  10800.,
        20400.,   5400.,  10200.,  12000.,   7200.,  54600.,  34200.,
        49800.,  12600.,   9000.,  15600.,  37800.,  18000.,  36000.,
       109200.,  43800.,  44400.])

array([    nan,    600.,   1200.,  87000.,   4200.,   7800.,  27600.,
        14400.,  34800.,  10800.,  15000.,   4800.,  35400.,   9000.,
       478800.,   6600.,   1800.,   3600.,   2400.,  17400.,  13200.,
         7200., 124200.,   8400.,  22200.,   6000.,  26400.,   5400.,
        20400.,  27000.,   3000.,  24600.,  21000.,  30600.,  58800.,
        12000.,  23400.,  10200.,  12600.,  37800.,  53400.])

array([    nan,    600.,   1200.,  87000.,   4200.,   9000.,   1800.,
        28200.,   2400.,  34800.,   7800.,   3000.,   4800.,  19800.,
        36000.,  14400.,  39000.,  16800.,   6600., 124200.,  18000.,
         6000.,   5400.,  22800.,  26400.,   3600.,  11400.,   9600.,
        45000.,  16200.,   8400.,  18600.,  48600.,  13200.,  54000.,
        43200.,  36600.,  54600.,  10200.,  27000.,  53400.,   7200.,
        15600.])

array([    nan,    600.,   1200.,  87000.,   4200.,   5400.,  28200.,
         2400.,  34800.,  37800.,   7800., 161400.,   7200.,  21000.,
        35400.,   1800.,   6000.,  39000.,  18600.,   4800., 124200.,
        20400.,   3600.,  23400.,  24600.,  21600.,  33000.,  12600.,
        89400.,  18000.,   6600.,   9600.,  16200.,   3000.,   9000.,
        10200.,   8400.,  52200.,  11400.,  17400.,  12000.,  27000.,
        27600.,  10800.,  14400.,  13800.,  26400.])

In [28]:
def give_time_interval(data):
    """Return ``(max, min, shape)`` of the ``Date_time_nr`` column.

    Note the order: the largest value comes first, then the smallest, then
    the column's shape tuple.
    """
    stamps = data['Date_time_nr']
    latest = stamps.max()
    earliest = stamps.min()
    return latest, earliest, stamps.shape

In [29]:
# (max, min, shape) of the Date_time_nr column for each turbine frame.
display(*map(give_time_interval, (t0, t1, t2, t3)))

(np.int64(1515798000), np.int64(1356994800), (261138,))

(np.int64(1515798000), np.int64(1356994800), (261471,))

(np.int64(1515798000), np.int64(1356994800), (261812,))

(np.int64(1515798000), np.int64(1356994800), (261414,))