## Import libraries

In [164]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
import category_encoders as ce

## Get the data

In [165]:
# Load the raw training set; the relative path is resolved against the
# notebook's current working directory.
train_csv_path = "../../../datasets/parte2/train.csv"
df = pd.read_csv(train_csv_path)

## Feature engineering

#### Split date

In [166]:
# Parse 'Data' (YYYY-MM-DD) as UTC timestamps, expand it into year / month /
# day columns (appended in that order), then discard the original column.
data_utc = pd.to_datetime(df['Data'], format="%Y-%m-%d", utc=True)
df['Ano'], df['Mês'], df['Dia'] = data_utc.dt.year, data_utc.dt.month, data_utc.dt.day
df.drop(columns=['Data'], inplace=True)

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),dt,dt_iso,city_name,temp,feels_like,...,diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),terrestrial_radiation (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²),Ano,Mês,Dia
0,0,0.0,0.0,0.0,,1632873600,2021-09-29 00:00:00+00:00,local,13.97,13.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
1,1,0.0,0.0,0.0,,1632877200,2021-09-29 01:00:00+00:00,local,13.48,13.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
2,2,0.0,0.0,0.0,,1632880800,2021-09-29 02:00:00+00:00,local,12.93,12.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
3,3,0.0,0.0,0.0,,1632884400,2021-09-29 03:00:00+00:00,local,12.61,12.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
4,4,0.0,0.0,0.0,,1632888000,2021-09-29 04:00:00+00:00,local,12.61,12.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29


#### Remove city_name

In [167]:
# Remove the 'city_name' column from the frame (in place).
df.drop(columns=['city_name'], inplace=True)

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),dt,dt_iso,temp,feels_like,temp_min,...,diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),terrestrial_radiation (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²),Ano,Mês,Dia
0,0,0.0,0.0,0.0,,1632873600,2021-09-29 00:00:00+00:00,13.97,13.54,11.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
1,1,0.0,0.0,0.0,,1632877200,2021-09-29 01:00:00+00:00,13.48,13.02,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
2,2,0.0,0.0,0.0,,1632880800,2021-09-29 02:00:00+00:00,12.93,12.47,12.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
3,3,0.0,0.0,0.0,,1632884400,2021-09-29 03:00:00+00:00,12.61,12.15,10.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
4,4,0.0,0.0,0.0,,1632888000,2021-09-29 04:00:00+00:00,12.61,12.17,9.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29


#### Remove day and year (month and hour are kept)

In [168]:
# Drop the day and year parts in one call. The 'Mês' drop is disabled, so the
# month stays in the frame as a feature.
df.drop(columns=['Dia', 'Ano'], inplace=True)

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),dt,dt_iso,temp,feels_like,temp_min,...,shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),terrestrial_radiation (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²),Mês
0,0,0.0,0.0,0.0,,1632873600,2021-09-29 00:00:00+00:00,13.97,13.54,11.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,1,0.0,0.0,0.0,,1632877200,2021-09-29 01:00:00+00:00,13.48,13.02,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,2,0.0,0.0,0.0,,1632880800,2021-09-29 02:00:00+00:00,12.93,12.47,12.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
3,3,0.0,0.0,0.0,,1632884400,2021-09-29 03:00:00+00:00,12.61,12.15,10.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,4,0.0,0.0,0.0,,1632888000,2021-09-29 04:00:00+00:00,12.61,12.17,9.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


#### Remove dt and dt_iso

In [169]:
# Remove both timestamp columns ('dt' and 'dt_iso') in a single in-place drop.
df.drop(columns=['dt', 'dt_iso'], inplace=True)

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),temp,feels_like,temp_min,temp_max,pressure,...,shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),terrestrial_radiation (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²),Mês
0,0,0.0,0.0,0.0,,13.97,13.54,11.45,14.04,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,1,0.0,0.0,0.0,,13.48,13.02,13.43,13.9,1028,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,2,0.0,0.0,0.0,,12.93,12.47,12.72,13.43,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
3,3,0.0,0.0,0.0,,12.61,12.15,10.34,12.93,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,4,0.0,0.0,0.0,,12.61,12.17,9.79,12.93,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


#### Remove radiation and wind-speed features (currently disabled)

In [170]:
# NOTE(review): every drop in this cell is commented out, so the cell currently
# does nothing. The radiation column names below omit the " (W/m²)" unit suffix
# that those columns carry in this frame (e.g. 'direct_radiation (W/m²)'), and no
# 'shortwave_radiation_instant' column appears in the null-count listing further
# down, so the radiation drops would raise KeyError if re-enabled as written.
#df.drop('direct_radiation', axis=1, inplace=True)
#df.drop('direct_radiation_instant', axis=1, inplace=True)
#df.drop('shortwave_radiation', axis=1, inplace=True)
#df.drop('shortwave_radiation_instant', axis=1, inplace=True)
#df.drop('diffuse_radiation', axis=1, inplace=True)
#df.drop('diffuse_radiation_instant', axis=1, inplace=True)
#df.drop('direct_normal_irradiance', axis=1, inplace=True)
#df.drop('direct_normal_irradiance_instant', axis=1, inplace=True)
#df.drop('terrestrial_radiation', axis=1, inplace=True)
#df.drop('terrestrial_radiation_instant', axis=1, inplace=True)

#df.drop('wind_speed', axis=1, inplace=True)

#### Remove feels_like, temp_min, temp_max

In [171]:
# Drop the other temperature columns; 'temp' itself remains in the frame.
temp_variant_cols = ['feels_like', 'temp_min', 'temp_max']
df.drop(columns=temp_variant_cols, inplace=True)

## Encode weather_description (label encoding)

In [172]:
# LABEL ENCODING
# Integer code assigned to each weather description. 'sky is clear' and
# 'clear sky' both map to 7.
replace_map = {
    'heavy intensity rain': 0,
    'moderate rain': 1,
    'few clouds': 2,
    'scattered clouds': 3,
    'broken clouds': 4,
    'light rain': 5,
    'overcast clouds': 6,
    'sky is clear': 7,
    'clear sky': 7,
}

weather = df['weather_description']
# Skip the recode when the column is already integer-coded, so re-running this
# cell stays a no-op (the replace()-based version it supersedes also left an
# already-encoded column unchanged).
if not pd.api.types.is_integer_dtype(weather):
    # map() is the direct tool for a full categorical recode: it avoids the
    # object -> int downcasting that replace() performs implicitly.
    encoded = weather.map(replace_map)
    # map() yields NaN for anything absent from replace_map (including missing
    # values); report the offending labels instead of failing later with an
    # opaque integer-cast error.
    unmapped = weather[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"weather_description has values missing from replace_map: {unmapped}"
        )
    df['weather_description'] = encoded.astype(int)

# ONEHOT
#lb = LabelBinarizer()

#lb_results = lb.fit_transform(df['weather_description'])

#lb_results_df = pd.DataFrame(lb_results, columns=lb.classes_)

#df = pd.concat([df, lb_results_df], axis=1)

#df.drop('weather_description', axis=1, inplace=True)

# BINARY ENCODING
#encoder = ce.BinaryEncoder(cols=['weather_description'])
#
#df = encoder.fit_transform(df)

## Missing Values

#### Remove sea_level, grnd_level and rain_1h

In [173]:
# Drop sea_level, grnd_level and rain_1h in a single in-place call.
# TODO: try imputing rain_1h with the mean instead of dropping it.
df.drop(columns=['sea_level', 'grnd_level', 'rain_1h'], inplace=True)

In [174]:
# Count the missing values remaining in each column and print the summary.
missing_per_column = df.isnull().sum()
print(missing_per_column)

Hora                                          0
Normal (kWh)                                  0
Horário Económico (kWh)                       0
Autoconsumo (kWh)                             0
Injeção na rede (kWh)                      7777
temp                                          0
pressure                                      0
humidity                                      0
wind_speed                                    0
clouds_all                                    0
weather_description                           0
shortwave_radiation (W/m²)                    0
direct_radiation (W/m²)                       0
diffuse_radiation (W/m²)                      0
direct_normal_irradiance (W/m²)               0
terrestrial_radiation (W/m²)                  0
direct_radiation_instant (W/m²)               0
diffuse_radiation_instant (W/m²)              0
direct_normal_irradiance_instant (W/m²)       0
terrestrial_radiation_instant (W/m²)          0
Mês                                     

## Write new csv

In [175]:
# Replace missing values in the grid-injection column with the literal string
# "None"; the step is skipped when the column is absent from the frame.
# NOTE(review): the literal "None" is among pandas' default NA markers for
# read_csv (confirm for the installed pandas version), so a consumer of the
# written CSV would need keep_default_na=False / explicit na_values to read
# these values back as strings rather than NaN.
injection_col = 'Injeção na rede (kWh)'
if injection_col in df.columns:
    df[injection_col] = df[injection_col].fillna("None")

In [176]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),temp,pressure,humidity,wind_speed,clouds_all,...,shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),terrestrial_radiation (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²),Mês
0,0,0.0,0.0,0.0,,13.97,1027,81,1.96,87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,1,0.0,0.0,0.0,,13.48,1028,82,1.83,91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,2,0.0,0.0,0.0,,12.93,1027,84,1.96,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
3,3,0.0,0.0,0.0,,12.61,1027,85,1.85,95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,4,0.0,0.0,0.0,,12.61,1027,86,1.83,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [177]:
# Persist the prepared frame; index=False keeps the row index out of the file.
prepared_csv_path = '../../../datasets/parte2/train_prepared.csv'
df.to_csv(prepared_csv_path, index=False)