## Import libraries

In [71]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer

## Get the data

In [72]:
# Read the raw test split into the working frame `df`.
test_csv_path = "../../../datasets/parte2/test.csv"
df = pd.read_csv(test_csv_path)

## Feature engineering

#### Split date

In [73]:
# Parse 'Data' (YYYY-MM-DD strings) as UTC timestamps once, append the
# year / month / day parts as 'Ano' / 'Mês' / 'Dia', then discard 'Data'.
data_utc = pd.to_datetime(df['Data'], format="%Y-%m-%d", utc=True)
df = df.assign(**{
    'Ano': data_utc.dt.year,
    'Mês': data_utc.dt.month,
    'Dia': data_utc.dt.day,
}).drop(columns='Data')

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),dt,dt_iso,city_name,temp,feels_like,temp_min,...,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,Ano,Mês,Dia
0,0,0.0,0.467,0.0,1672531200,2023-01-01 00:00:00+00:00,local,12.93,12.76,12.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
1,1,0.0,0.577,0.0,1672534800,2023-01-01 01:00:00+00:00,local,13.49,13.38,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
2,2,0.0,0.346,0.0,1672538400,2023-01-01 02:00:00+00:00,local,13.55,13.44,13.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
3,3,0.0,0.27,0.0,1672542000,2023-01-01 03:00:00+00:00,local,13.61,13.51,12.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
4,4,0.0,0.252,0.0,1672545600,2023-01-01 04:00:00+00:00,local,13.59,13.49,12.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1


#### Remove city_name

In [74]:
# Discard the 'city_name' column and preview the remaining frame.
df = df.drop(columns=['city_name'])

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),dt,dt_iso,temp,feels_like,temp_min,temp_max,...,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,Ano,Mês,Dia
0,0,0.0,0.467,0.0,1672531200,2023-01-01 00:00:00+00:00,12.93,12.76,12.72,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
1,1,0.0,0.577,0.0,1672534800,2023-01-01 01:00:00+00:00,13.49,13.38,13.43,13.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
2,2,0.0,0.346,0.0,1672538400,2023-01-01 02:00:00+00:00,13.55,13.44,13.48,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
3,3,0.0,0.27,0.0,1672542000,2023-01-01 03:00:00+00:00,13.61,13.51,12.01,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1
4,4,0.0,0.252,0.0,1672545600,2023-01-01 04:00:00+00:00,13.59,13.49,12.01,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,1,1


#### Remove day, month and year

In [75]:
# Remove the calendar-part columns in a single call so the cell is
# all-or-nothing: if any of them is missing, KeyError is raised before
# anything has been dropped (three separate in-place drops could leave the
# frame half-modified). 'Hora' stays in the frame.
df = df.drop(columns=['Dia', 'Mês', 'Ano'])

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),dt,dt_iso,temp,feels_like,temp_min,temp_max,...,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant
0,0,0.0,0.467,0.0,1672531200,2023-01-01 00:00:00+00:00,12.93,12.76,12.72,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.577,0.0,1672534800,2023-01-01 01:00:00+00:00,13.49,13.38,13.43,13.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.346,0.0,1672538400,2023-01-01 02:00:00+00:00,13.55,13.44,13.48,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.27,0.0,1672542000,2023-01-01 03:00:00+00:00,13.61,13.51,12.01,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.252,0.0,1672545600,2023-01-01 04:00:00+00:00,13.59,13.49,12.01,14.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Remove dt and dt_iso

In [76]:
# Remove both timestamp columns ('dt' and 'dt_iso') in one atomic call:
# a missing column raises KeyError before either is dropped.
df = df.drop(columns=['dt', 'dt_iso'])

df.head()

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),temp,feels_like,temp_min,temp_max,pressure,sea_level,...,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant
0,0,0.0,0.467,0.0,12.93,12.76,12.72,13.43,1019,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.577,0.0,13.49,13.38,13.43,13.9,1018,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.346,0.0,13.55,13.44,13.48,14.82,1017,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.27,0.0,13.61,13.51,12.01,14.82,1016,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.252,0.0,13.59,13.49,12.01,14.82,1015,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Remove the instantaneous radiation columns (`*_instant`)

In [77]:
# Drop only the "*_instant" radiation variants, in a single atomic call
# (a missing column raises KeyError before anything is removed).
# The non-instant counterparts (direct_radiation, shortwave_radiation,
# diffuse_radiation, direct_normal_irradiance, terrestrial_radiation) are kept.
instant_radiation_cols = [
    'direct_radiation_instant',
    'shortwave_radiation_instant',
    'diffuse_radiation_instant',
    'direct_normal_irradiance_instant',
    'terrestrial_radiation_instant',
]
df = df.drop(columns=instant_radiation_cols)

#### Remove feels_like, temp_min, temp_max

In [78]:
# Keep only 'temp'; the other temperature readings are removed.
temp_variant_cols = ['feels_like', 'temp_min', 'temp_max']
df = df.drop(columns=temp_variant_cols)

## Label Encoding

In [79]:
# Label encoding: map each weather description string to an integer code.
# 'sky is clear' and 'clear sky' share code 7.
# Alternatives previously tried in this cell and left disabled: one-hot
# encoding via LabelBinarizer and binary encoding via `ce.BinaryEncoder`.
replace_map = {
    'heavy intensity rain': 0,
    'moderate rain': 1,
    'few clouds': 2,
    'scattered clouds': 3,
    'broken clouds': 4,
    'light rain': 5,
    'overcast clouds': 6,
    'sky is clear': 7,
    'clear sky': 7,
}

weather_raw = df['weather_description']
if pd.api.types.is_numeric_dtype(weather_raw):
    # Column already holds codes (e.g. the cell was re-run): just normalise the dtype.
    df['weather_description'] = weather_raw.astype(int)
else:
    # Series.map (unlike Series.replace) turns every label that is absent from
    # replace_map into NaN, so unknown or missing labels can be reported
    # explicitly instead of failing inside astype(int) with an opaque message.
    # It also avoids the deprecated implicit downcasting done by replace().
    weather_codes = weather_raw.map(replace_map)
    unknown_labels = weather_raw[weather_codes.isna()].unique().tolist()
    if unknown_labels:
        raise ValueError(
            f"weather_description values missing from replace_map: {unknown_labels}"
        )
    df['weather_description'] = weather_codes.astype(int)

## Missing Values

#### Remove sea_level, grnd_level and rain_1h

In [80]:
# Handle these columns by removing them, in one atomic call (a missing
# column raises KeyError before anything is dropped).
# TODO (original note on rain_1h): try the mean instead — presumably mean
# imputation rather than dropping the column; confirm.
df = df.drop(columns=['sea_level', 'grnd_level', 'rain_1h'])

In [81]:
# Report how many missing values remain in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Hora                        0
Normal (kWh)                0
Horário Económico (kWh)     0
Autoconsumo (kWh)           0
temp                        0
pressure                    0
humidity                    0
wind_speed                  0
clouds_all                  0
weather_description         0
shortwave_radiation         0
direct_radiation            0
diffuse_radiation           0
direct_normal_irradiance    0
terrestrial_radiation       0
dtype: int64


## Write new csv

In [82]:
# When the grid-injection column is present, replace its missing entries with
# the literal string "None"; when the column is absent this cell does nothing.
# NOTE(review): presumably restores a "None" label that read_csv parsed as
# missing — confirm.
injection_col = 'Injeção na rede (kWh)'
if injection_col in df.columns:
    df[injection_col] = df[injection_col].fillna("None")

In [83]:
# Final preview of the prepared frame (first five rows) before it is written out.
df.head(n=5)

Unnamed: 0,Hora,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),temp,pressure,humidity,wind_speed,clouds_all,weather_description,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation
0,0,0.0,0.467,0.0,12.93,1019,95,2.02,100,1,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.577,0.0,13.49,1018,95,2.18,100,1,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.346,0.0,13.55,1017,95,2.88,100,1,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.27,0.0,13.61,1016,95,3.63,100,1,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.252,0.0,13.59,1015,95,4.58,100,1,0.0,0.0,0.0,0.0,0.0


In [84]:
# Persist the prepared frame next to the raw file; the row index is not written.
prepared_csv_path = '../../../datasets/parte2/test_prepared.csv'
df.to_csv(prepared_csv_path, index=False)