In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

from xgboost import XGBRegressor

#import tensorflow as tf

from IPython.display import Image, display
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas' own warnings (e.g. about ambiguous date parsing). Consider
# narrowing the filter to specific categories/modules once the notebook is stable.
warnings.filterwarnings('ignore');

In [2]:
# Load the raw plant 1 generation readings (path is relative to the notebook's
# working directory) and preview the first rows.
df_plt1_gen = pd.read_csv('Plant_1_Generation_Data.csv')
df_plt1_gen.head()

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,15-05-2020 00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,15-05-2020 00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,15-05-2020 00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0
3,15-05-2020 00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0
4,15-05-2020 00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0


In [3]:
# Start from the raw frame each time so this cell is safe to re-run.
df_gen1 = df_plt1_gen.drop(columns='PLANT_ID')

# Replace each opaque SOURCE_KEY with a readable label INVERTER_1 .. INVERTER_N,
# numbered in order of first appearance. unique() is evaluated once here rather
# than once per inverter inside the comprehension.
source_keys = df_gen1.SOURCE_KEY.unique()
inverter_labels = {key: f'INVERTER_{i}' for i, key in enumerate(source_keys, start=1)}
df_gen1['INVERTER'] = df_gen1.SOURCE_KEY.map(inverter_labels)

# The label column now carries the same information as SOURCE_KEY.
df_gen1 = df_gen1.drop(columns='SOURCE_KEY')
inverters = df_gen1.INVERTER.unique()
inverters

array(['INVERTER_1', 'INVERTER_2', 'INVERTER_3', 'INVERTER_4',
       'INVERTER_5', 'INVERTER_6', 'INVERTER_7', 'INVERTER_8',
       'INVERTER_9', 'INVERTER_10', 'INVERTER_11', 'INVERTER_12',
       'INVERTER_13', 'INVERTER_14', 'INVERTER_15', 'INVERTER_16',
       'INVERTER_17', 'INVERTER_18', 'INVERTER_19', 'INVERTER_20',
       'INVERTER_21', 'INVERTER_22'], dtype=object)

In [4]:
# Column dtypes and non-null counts for the generation frame.
df_gen1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68778 entries, 0 to 68777
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATE_TIME    68778 non-null  object 
 1   DC_POWER     68778 non-null  float64
 2   AC_POWER     68778 non-null  float64
 3   DAILY_YIELD  68778 non-null  float64
 4   TOTAL_YIELD  68778 non-null  float64
 5   INVERTER     68778 non-null  object 
dtypes: float64(4), object(2)
memory usage: 3.1+ MB


In [5]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each original column becomes a row.
df_gen1.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
DATE_TIME,68778.0,3158.0,1/6/2020 12:45,22.0,,,,,,,
DC_POWER,68778.0,,,,3147.426211,4036.457169,0.0,0.0,429.0,6366.964286,14471.125
AC_POWER,68778.0,,,,307.802752,394.396439,0.0,0.0,41.49375,623.61875,1410.95
DAILY_YIELD,68778.0,,,,3295.968737,3145.178309,0.0,0.0,2658.714286,6274.0,9163.0
TOTAL_YIELD,68778.0,,,,6978711.760671,416271.982856,6183645.0,6512002.53575,7146685.0,7268705.90625,7846821.0
INVERTER,68778.0,22.0,INVERTER_11,3155.0,,,,,,,


In [6]:
# Number of non-null DATE_TIME readings per inverter, from fewest to most.
df_gen1.groupby('INVERTER')['DATE_TIME'].count().sort_values()

INVERTER
INVERTER_22    3104
INVERTER_7     3118
INVERTER_3     3118
INVERTER_10    3119
INVERTER_2     3119
INVERTER_20    3119
INVERTER_9     3123
INVERTER_5     3124
INVERTER_15    3124
INVERTER_16    3124
INVERTER_21    3124
INVERTER_12    3125
INVERTER_14    3125
INVERTER_17    3125
INVERTER_19    3126
INVERTER_18    3126
INVERTER_13    3130
INVERTER_8     3130
INVERTER_4     3133
INVERTER_6     3133
INVERTER_1     3154
INVERTER_11    3155
Name: DATE_TIME, dtype: int64

In [17]:
# The generation CSV mixes two timestamp layouts: 'dd-mm-YYYY HH:MM'
# (e.g. '15-05-2020 00:00') and a slash form (e.g. '1/6/2020 12:45').
# A bare pd.to_datetime() reads the slash form month-first (6 January rather than
# 1 June) or, on newer pandas, rejects the second layout outright.
# NOTE(review): the slash rows are assumed to be day-first like the dashed rows
# (the weather data for this plant covers May-June 2020) - confirm against the CSV.
# Normalising the separator lets one explicit day-first format cover both layouts;
# any row that does not fit raises instead of being silently misparsed.
if not pd.api.types.is_datetime64_any_dtype(df_gen1.DATE_TIME):  # no-op when the cell is re-run
    df_gen1['DATE_TIME'] = pd.to_datetime(
        df_gen1.DATE_TIME.str.replace('/', '-', regex=False),
        format='%d-%m-%Y %H:%M',
    )
df_gen1.dtypes

DATE_TIME      datetime64[ns]
DC_POWER              float64
AC_POWER              float64
DAILY_YIELD           float64
TOTAL_YIELD           float64
INVERTER               object
dtype: object

## Weather Sensor Data

In [18]:
# Load the raw plant 1 weather sensor readings and show a random sample of 10
# rows (random_state fixed so the same rows appear on every run).
df_plt1_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')
df_plt1_weather.sample(10, random_state=1)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
1194,5/28/2020 2:30,4135001,HmiyD2TTLFNqkNe,22.398088,20.53678,0.0
2385,6/9/2020 16:45,4135001,HmiyD2TTLFNqkNe,29.618705,37.964126,0.30209
99,5/16/2020 3:30,4135001,HmiyD2TTLFNqkNe,21.731421,20.351884,0.0
2779,6/13/2020 19:15,4135001,HmiyD2TTLFNqkNe,24.000767,21.948254,0.0
13,5/15/2020 3:15,4135001,HmiyD2TTLFNqkNe,24.985215,24.351508,0.0
2304,6/8/2020 20:30,4135001,HmiyD2TTLFNqkNe,24.091545,20.824284,0.0
442,5/19/2020 18:00,4135001,HmiyD2TTLFNqkNe,25.046377,25.426198,0.001575
2847,6/14/2020 12:15,4135001,HmiyD2TTLFNqkNe,24.93256,36.606858,0.472765
45,5/15/2020 11:15,4135001,HmiyD2TTLFNqkNe,30.216062,50.006989,0.585787
705,5/22/2020 22:45,4135001,HmiyD2TTLFNqkNe,23.214526,21.719092,0.0


In [19]:
# Column dtypes and non-null counts for the weather sensor frame.
df_plt1_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3182 entries, 0 to 3181
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DATE_TIME            3182 non-null   object 
 1   PLANT_ID             3182 non-null   int64  
 2   SOURCE_KEY           3182 non-null   object 
 3   AMBIENT_TEMPERATURE  3182 non-null   float64
 4   MODULE_TEMPERATURE   3182 non-null   float64
 5   IRRADIATION          3182 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 149.3+ KB


In [22]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each original column becomes a row.
df_plt1_weather.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
DATE_TIME,3182.0,3182.0,5/15/2020 0:00,1.0,,,,,,,
PLANT_ID,3182.0,,,,4135001.0,0.0,4135001.0,4135001.0,4135001.0,4135001.0,4135001.0
SOURCE_KEY,3182.0,1.0,HmiyD2TTLFNqkNe,3182.0,,,,,,,
AMBIENT_TEMPERATURE,3182.0,,,,25.531606,3.354856,20.398505,22.705182,24.613814,27.920532,35.252486
MODULE_TEMPERATURE,3182.0,,,,31.091015,12.261222,18.140415,21.090553,24.61806,41.30784,65.545714
IRRADIATION,3182.0,,,,0.228313,0.300836,0.0,0.0,0.024653,0.449588,1.221652


In [23]:
# PLANT_ID and SOURCE_KEY each hold a single value across the weather data, so
# drop them, and convert DATE_TIME from strings to datetimes.
df_weather = (
    df_plt1_weather
    .drop(columns=['PLANT_ID', 'SOURCE_KEY'])
    .assign(DATE_TIME=lambda frame: pd.to_datetime(frame.DATE_TIME))
)

In [25]:
# Filling in missing datetimes
# To fill in the missing datetimes in the generation and weather sensor data sets, let's build a
# datetime object covering the full range of expected datetimes.




DATE_TIME              0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64