In [9]:
!pip install flaml
!pip install suntime

In [10]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from suntime import Sun
import pytz
import warnings
from flaml import AutoML
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
warnings.filterwarnings(action='ignore')

# 1 - Preprocessing

In [12]:
# Read the semicolon-separated temperature file, then swap every comma for a dot
# in the string cells (decimal commas -> decimal points).
df_temp = pd.read_csv(
    '../input/enerjisa-enerji-veri-maratonu/temperature.csv',
    sep=';',
)
df_temp = df_temp.replace(',', '.', regex=True)
df_temp.head(2)

In [13]:
# Read the semicolon-separated generation file and swap every comma for a dot
# in the string cells (decimal commas -> decimal points).
df_gen = pd.read_csv(
    '../input/enerjisa-enerji-veri-maratonu/generation.csv',
    sep=';',
)
df_gen = df_gen.replace(',', '.', regex=True)
df_gen.head(2)

In [14]:
# The sample submission file is comma separated.
df_samp = pd.read_csv(
    '../input/enerjisa-enerji-veri-maratonu/sample_submission.csv',
    sep=',',
)
df_samp.head(2)

In [15]:
# Number of missing values in each column.
df_temp.isnull().sum()

In [16]:
# Convert the DateTime column from text to pandas timestamps.
df_temp['DateTime'] = pd.to_datetime(df_temp['DateTime'], infer_datetime_format=True)
df_temp.head(2)

In [17]:
# Cast every column except the first one (presumably DateTime) to float.
# The cast goes through DataFrame.astype on named columns instead of an assignment
# through .iloc[:, 1:]: that assignment writes the values into the existing columns
# and can leave them with their old object dtype, so later numeric steps would
# still be working on object columns.
float_cols = {col: 'float' for col in df_temp.columns[1:]}
df_temp = df_temp.astype(float_cols)
df_temp

In [18]:
# Summary statistics of df_temp's columns.
df_temp.describe()

In [19]:
#Check if date shift exists
# Displays the index object that belongs to the DateTime column.
df_temp.DateTime.index

In [20]:
#check continuity of nan values
# The rows with a missing DateTime must form one unbroken run of consecutive index
# labels. Comparing `.all()` of two arrays only compares two truth values and can
# never detect a gap, so the run is verified directly: at least one such row exists
# and every step between neighbouring labels is exactly 1.
nan_date_idx = df_temp.index[df_temp.DateTime.isna()].to_numpy()
bool(nan_date_idx.size) and bool((np.diff(nan_date_idx) == 1).all())

In [21]:
# Check that, once the missing-DateTime rows are dropped, the dataset covers whole days
# (no partial day such as 16 hours): show the timestamp at position 23 next to the
# timestamp right before the first missing DateTime.
# Note: despite its name, last_nan_index is the index label of the FIRST row whose
# DateTime is missing.
last_nan_index = df_temp.index[df_temp['DateTime'].isna()][0]
(df_temp['DateTime'].iloc[23], df_temp['DateTime'].iloc[last_nan_index - 1])

In [22]:
# Remove the rows that have no DateTime.
df_temp = df_temp.dropna(subset=['DateTime'])
df_temp

In [23]:
# Count the missing values of every column; the original author notes that WWCode still has some.
df_temp.isna().sum()

In [24]:
# Index labels of the rows whose WWCode is missing.
df_temp[df_temp.WWCode.isna()].index

In [25]:
#In this part, missing WWCode values are filled from the nearest valid value by date.
def _first_valid_wwcode(codes, positions):
    """Return ``(code, position)`` for the first position whose code is not -1.

    ``codes`` is a positional array of WWCode values in which -1 marks "missing";
    ``positions`` is an iterable of candidate positions, visited in order.
    Returns ``None`` when no candidate holds a valid code.
    """
    for pos in positions:
        if codes[pos] != -1:
            return codes[pos], pos
    return None


# Work on a positional float copy so index labels are never mistaken for positions
# and no value is written through chained indexing.
wwcode = df_temp['WWCode'].to_numpy(dtype=float, copy=True)
# NaN and 0 are both treated as "missing" and marked with the sentinel -1.
wwcode[np.isnan(wwcode) | (wwcode == 0)] = -1

for pos in np.flatnonzero(wwcode == -1).tolist():
    # Backward candidates: one row back, then every 24 rows further back (row 0 included).
    # Forward candidates: every 24 rows ahead. Rows already filled by this loop are valid.
    prev_hit = _first_valid_wwcode(wwcode, range(pos - 1, -1, -24))
    next_hit = _first_valid_wwcode(wwcode, range(pos + 24, len(wwcode), 24))

    if prev_hit is None and next_hit is None:
        # Nothing usable on either side: store 0, the value the later test-data
        # cell treats as "unseen".
        wwcode[pos] = 0
    elif next_hit is None:
        wwcode[pos] = prev_hit[0]
    elif prev_hit is None:
        wwcode[pos] = next_hit[0]
    else:
        # Both sides found: take the closer one (the forward value wins a tie).
        use_prev = (pos - prev_hit[1]) < (next_hit[1] - pos)
        wwcode[pos] = prev_hit[0] if use_prev else next_hit[0]

df_temp['WWCode'] = wwcode
    

In [26]:
    #Now, there is no any nan values anymore.
    df_temp.isna().sum()

In [27]:
# Bar chart of how often each WWCode value occurs.
wwcode_counts = df_temp.WWCode.value_counts()
plt.figure(figsize=(10, 5))
plt.bar(wwcode_counts.index, wwcode_counts)

# 2 - Feature Engineering

In [28]:
#Time features are added.
timestamps = df_temp['DateTime'].dt
df_temp['Hour'] = timestamps.hour
df_temp['Day'] = timestamps.day
df_temp['Dayofweek'] = timestamps.dayofweek
# ISO week number. Series.dt.weekofyear was deprecated and later removed from pandas;
# dt.isocalendar().week is its replacement. It is cast to int64 so the column is a
# plain integer column.
df_temp['Weekofyear'] = timestamps.isocalendar().week.astype('int64')
df_temp['Month'] = timestamps.month
df_temp['Quarter'] = timestamps.quarter
df_temp['Year'] = timestamps.year

In [29]:
# Show the first two rows of df_temp.
df_temp.head(2)

In [30]:
# WindDirection is treated as an angle and encoded as a point on the unit circle
# (sin and cos). The full turn is taken to be the largest WindDirection value in the data.
# NOTE(review): if the column is in degrees, a fixed 360 may be the intended period - confirm.
wind_dir = df_temp['WindDirection']
wind_phase = 2*np.pi*wind_dir/wind_dir.max()
df_temp['WindDirection_sin'] = np.sin(wind_phase)
df_temp['WindDirection_cos'] = np.cos(wind_phase)
df_temp.head(3)

In [31]:
# Objects shared by the sunrise/sunset look-ups below: the coordinate pair,
# the Sun helper built from it, and the local time zone.
ankara_coord = [40.239, 33.029]
sun = Sun(*ankara_coord)
tz = pytz.timezone('Europe/Istanbul')

In [32]:
#check if it is day or night
def is_day(x):
    """Return 1 when the row's hour lies between the sunrise hour and the sunset hour
    of the row's date (both ends inclusive), otherwise 0.

    ``x`` is a row exposing a ``DateTime`` timestamp. Sunrise and sunset are looked up
    with the module-level ``sun`` object and ``tz`` time zone.
    """
    day = x.DateTime.date()
    first_light_hour = Sun.get_local_sunrise_time(sun, day, tz).hour
    last_light_hour = Sun.get_local_sunset_time(sun, day, tz).hour
    return 1 if first_light_hour <= x.DateTime.hour <= last_light_hour else 0

In [33]:
# Evaluate is_day row by row and store the result in the is_day column.
df_temp['is_day'] = df_temp.apply(is_day, axis=1)

In [34]:
#Sun angle information is extracted according to sunrise and sunset time of current location.
def get_sun_angle(x):
    """Return sin(pi * (hour - sunrise_hour) / daylight_hours) for one row.

    ``x`` is a row exposing ``DateTime`` (timestamp) and ``Hour``. Sunrise and sunset
    hours are looked up with the module-level ``sun`` object and ``tz`` time zone.

    The elapsed time since sunrise is divided by the length of the daylight span
    (sunset hour minus sunrise hour), so the value is 0 at the sunrise hour, 1 halfway
    through the span and 0 again at the sunset hour. Dividing by the sunset hour itself
    would not map sunrise..sunset onto one half period of the sine.

    Returns 0.0 when the looked-up sunset hour is not later than the sunrise hour,
    because no daylight span can be formed in that case.
    """
    day = x.DateTime.date()
    sun_rise_time = Sun.get_local_sunrise_time(sun, day, tz).hour
    sun_set_time = Sun.get_local_sunset_time(sun, day, tz).hour
    daylight_hours = sun_set_time - sun_rise_time
    if daylight_hours <= 0:
        return 0.0
    return np.sin(np.pi*(x.Hour - sun_rise_time)/daylight_hours)

In [35]:
# Evaluate get_sun_angle row by row and store the result in the sun_angle column.
df_temp['sun_angle'] = df_temp.apply(get_sun_angle, axis=1)

In [36]:
# Show the first two rows of df_temp.
df_temp.head(2)

In [37]:
# Test rows are everything after 2021-11-30 23:00; display them to check the dates
# for inconsistencies.
# .copy() makes df_test an independent frame, so the column writes done on it later
# never act on a slice of df_temp.
df_test = df_temp[df_temp.DateTime > dt.datetime(2021, 11, 30, 23)].copy()
df_test

In [38]:
# Training rows are all rows whose DateTime does not occur in the test frame.
# .copy() makes df_train an independent frame instead of a slice of df_temp.
df_train = df_temp[~df_temp.DateTime.isin(df_test.DateTime)].copy()
df_train

In [39]:
# WWCode is the only column described as categorical, so check whether the test data
# contains a category that never occurs in the training data.
set(df_test['WWCode'].tolist()).difference(df_train['WWCode'].tolist())

In [40]:
# Number of missing values in each column of the test data.
df_test.isnull().sum()

In [41]:
#Test rows whose WWCode is 0 (the marker for unseen/missing data) get the most frequent
#WWCode of the test rows that share the same hour.
def fill_wwcode_for_test(x):
    """Return the WWCode to store for one test row.

    ``x`` is a row exposing ``WWCode`` and ``Hour``. A non-zero WWCode is returned
    unchanged. A WWCode of 0 is replaced by the first mode of WWCode among the rows
    of the module-level ``df_test`` that have the same ``Hour``.

    If no mode can be computed, a message is printed and the row's own WWCode is
    returned, so the column never receives a silent ``None``. Any other error is
    allowed to propagate instead of being swallowed.
    """
    if x.WWCode != 0:
        return x.WWCode
    same_hour_modes = df_test.loc[df_test.Hour == x.Hour, 'WWCode'].mode()
    if same_hour_modes.empty:
        print('something s wrong!')
        return x.WWCode
    return same_hour_modes.iloc[0]
df_test['WWCode'] = df_test.apply(fill_wwcode_for_test, axis=1)

In [42]:
# Test rows whose WWCode equals 84.
df_test[df_test.WWCode == 84]

In [43]:
# 84 occurs only in the test data, so it is replaced with 83, which the original author
# describes as the code with the nearest mean.
only_in_test = df_test['WWCode'].eq(84)
df_test.loc[only_in_test, 'WWCode'] = 83

In [44]:
#Correlation matrix of columns
import seaborn as sb
# Only numeric columns take part in the correlation. Selecting them explicitly keeps
# the result independent of how the installed pandas version treats non-numeric
# columns (such as DateTime) inside DataFrame.corr().
corr = df_train.select_dtypes(include='number').corr()
plt.figure(figsize=(15,15))
sb.heatmap(corr,annot=True)

In [45]:
# Highly correlated columns are treated as duplicates, so one column of each pair is
# removed from both the training and the test frame.
df_train = df_train.drop(columns=['Month', 'ComfortTemperature'])
df_test = df_test.drop(columns=['Month', 'ComfortTemperature'])

In [47]:
# Target (y label) data: read the generation file and swap commas for dots in one chain.
df_gen = pd.read_csv(
    '../input/enerjisa-enerji-veri-maratonu/generation.csv',
    sep=';',
).replace(',', '.', regex=True)
df_gen

In [48]:
# Check continuity of dates: the rows with a missing DateTime must be exactly the
# index labels 25560 .. 26303.
# np.array_equal also copes with a different number of rows (it returns False),
# whereas an element-wise == between sequences of different length raises.
nan_gen_idx = df_gen.index[df_gen.DateTime.isna()]
np.array_equal(nan_gen_idx, np.arange(25560, 26304))

In [49]:
# Remove the rows without a DateTime, then convert the column to pandas timestamps.
df_gen = df_gen.dropna(subset=['DateTime'])
df_gen['DateTime'] = pd.to_datetime(df_gen['DateTime'])
df_gen

In [50]:
# Join the features with the target values on DateTime; only timestamps present in
# both frames are kept (inner join).
df_train = pd.merge(df_train, df_gen, on='DateTime', how='inner')

In [51]:
# Column names of df_train after the merge with df_gen.
df_train.columns

In [52]:
# Display the merged training frame.
df_train

In [53]:
# Cast every column except the first one (presumably DateTime) to float.
# DataFrame.astype on named columns is used instead of an assignment through
# .iloc[:, 1:], which writes into the existing columns and can leave string columns
# (such as the merged-in Generation values) with object dtype.
df_train = df_train.astype({col: 'float' for col in df_train.columns[1:]})

# 3 - Modeling

In [54]:
# Split into training and validation parts (20% validation, fixed random_state).
# Features: every column except Generation and DateTime. Target: the Generation column.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop(columns=['Generation', 'DateTime']),
    df_train.loc[:, ['Generation']],
    test_size=0.2,
    random_state=40,
)

In [55]:
# AutoML searches for the best model for the given metric, task and time budget.
# After the search, the best estimator and its best configuration can be read from
# the fitted object.
automl3 = AutoML()
settings = dict(
    time_budget=500,
    metric='rmse',
    estimator_list=['lgbm', 'xgboost', 'catboost'],
    task='regression',
)
automl3.fit(X_train=X_train, y_train=y_train.values, **settings)

In [56]:
# Name of the estimator selected by the AutoML search.
# The original author reports xgboost for their run; another run may pick a different one.
automl3.best_estimator

In [57]:
# Take the best configuration found for xgboost specifically. automl3.best_config
# belongs to whichever estimator won the search, which is only the xgboost
# configuration when xgboost happens to be the overall best estimator.
xgboost_best_config = automl3.best_config_per_estimator['xgboost']
xgboost_best_config

In [58]:
from xgboost import XGBRegressor
# eval_metric is given to the constructor: passing it to fit() is deprecated and no
# longer accepted by newer XGBoost releases.
model_xgb = XGBRegressor(eval_metric='rmse', **xgboost_best_config)
# The validation split is used as the evaluation set during training.
model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])

In [59]:
# Predict on the validation split and report the root of the mean squared error.
val_pred = model_xgb.predict(X_val)
val_rmse = mean_squared_error(y_val, val_pred) ** 0.5
print('RMSE of model is: ', val_rmse)