# Dataset de competição

## Fase 0: Imports e Leitura dos Dados

In [342]:
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re # Used in Data Processing
from sklearn import preprocessing

# Show up to 21 columns when pandas renders a DataFrame
pd.options.display.max_columns = 21

The following configuration prevents pandas from treating the 'None' value in the column 'Injeçao na rede (kWh)' as NaN.

In [343]:
# Strings the CSV parser should read as missing values. 'None' is intentionally
# not listed, so that value in 'Injeçao na rede (kWh)' stays ordinary text.
allowed_nans = [
    '', '#N/A', '#N/A N/A', '#NA',
    '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', '<NA>',
    'N/A', 'NA', 'NULL', 'NaN',
    'n/a', 'nan', 'null',
]

# keep_default_na=False makes the list above the only source of NA markers.
treino = pd.read_csv(
    'datasets/treino.csv',
    keep_default_na=False,
    na_values=allowed_nans,
)

## Fase 1: Exploração Simples dos Dados

In [344]:
# Size of the training frame as (rows, columns).
treino.shape

(11688, 21)

In [345]:
# Column labels available in the training data.
treino.columns

Index(['Data', 'Hora', 'Normal (kWh)', 'Horario Economico (kWh)',
       'Autoconsumo (kWh)', 'Injeçao na rede (kWh)', 'dt', 'dt_iso',
       'city_name', 'temp', 'feels_like', 'temp_min', 'temp_max', 'pressure',
       'sea_level', 'grnd_level', 'humidity', 'wind_speed', 'rain_1h',
       'clouds_all', 'weather_description'],
      dtype='object')

In [346]:
# Preview the first rows of the training data.
treino.head()

Unnamed: 0,Data,Hora,Normal (kWh),Horario Economico (kWh),Autoconsumo (kWh),Injeçao na rede (kWh),dt,dt_iso,city_name,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,wind_speed,rain_1h,clouds_all,weather_description
0,,,,,,,1630454400,2021-09-01 00:00:00 +0000 UTC,local,18.74,18.84,15.72,20.34,1015,,,83,1.18,,78,broken clouds
1,,,,,,,1630458000,2021-09-01 01:00:00 +0000 UTC,local,18.73,18.83,15.72,20.34,1014,,,83,1.46,,92,overcast clouds
2,,,,,,,1630461600,2021-09-01 02:00:00 +0000 UTC,local,17.84,17.98,16.72,20.34,1014,,,88,1.05,,91,overcast clouds
3,,,,,,,1630465200,2021-09-01 03:00:00 +0000 UTC,local,18.27,18.4,16.68,20.34,1014,,,86,0.46,0.14,94,light rain
4,,,,,,,1630468800,2021-09-01 04:00:00 +0000 UTC,local,17.81,17.97,16.12,20.34,1013,,,89,0.93,1.26,95,moderate rain


In [347]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
treino.describe()

Unnamed: 0,Hora,Normal (kWh),Horario Economico (kWh),Autoconsumo (kWh),dt,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,wind_speed,rain_1h,clouds_all
count,11016.0,11016.0,11016.0,11016.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,0.0,0.0,11688.0,11688.0,2435.0,11688.0
mean,11.5,0.202278,0.159714,0.117314,1651491000.0,16.406638,16.076164,14.433353,17.50299,1018.304073,,,76.619011,2.647688,0.88476,54.258214
std,6.922501,0.349478,0.271792,0.176762,12147050.0,5.715977,6.217605,4.96059,6.112344,6.109727,,,16.157421,1.458574,1.073453,40.443374
min,0.0,0.0,0.0,0.0,1630454000.0,0.32,-2.19,-0.64,1.33,994.0,,,19.0,0.06,0.1,0.0
25%,5.75,0.0,0.0,0.0,1640973000.0,12.35,11.76,10.9,12.93,1015.0,,,66.0,1.61,0.21,7.0
50%,11.5,0.0,0.0,0.0,1651491000.0,15.76,15.675,14.54,16.72,1018.0,,,81.0,2.38,0.45,60.0
75%,17.25,0.314,0.288,0.227,1662009000.0,19.54,19.56,17.72,20.7,1022.0,,,91.0,3.4,1.07,98.0
max,23.0,3.251,6.978,1.192,1672528000.0,40.85,41.33,36.72,41.45,1034.0,,,100.0,11.1,7.45,100.0


In [348]:
# True if at least one row is an exact duplicate of an earlier row.
treino.duplicated().any()

False

In [349]:
# Per column: does it contain at least one missing value?
treino.isna().any()

Data                        True
Hora                        True
Normal (kWh)                True
Horario Economico (kWh)     True
Autoconsumo (kWh)           True
Injeçao na rede (kWh)       True
dt                         False
dt_iso                     False
city_name                  False
temp                       False
feels_like                 False
temp_min                   False
temp_max                   False
pressure                   False
sea_level                   True
grnd_level                  True
humidity                   False
wind_speed                 False
rain_1h                     True
clouds_all                 False
weather_description        False
dtype: bool

In [350]:
# Data type pandas assigned to each column.
treino.dtypes

Data                        object
Hora                       float64
Normal (kWh)               float64
Horario Economico (kWh)    float64
Autoconsumo (kWh)          float64
Injeçao na rede (kWh)       object
dt                           int64
dt_iso                      object
city_name                   object
temp                       float64
feels_like                 float64
temp_min                   float64
temp_max                   float64
pressure                     int64
sea_level                  float64
grnd_level                 float64
humidity                     int64
wind_speed                 float64
rain_1h                    float64
clouds_all                   int64
weather_description         object
dtype: object

In [352]:
# Share of rows per city. With normalize=True pandas divides by the number of
# non-missing entries, so these proportions cannot be turned back into counts
# by multiplying with len(treino) if the column ever contains NaN.
contagem_relativa = treino['city_name'].value_counts(normalize=True)

# Ask pandas for the absolute counts directly instead of rebuilding them from
# the proportions (avoids the NaN mismatch and a float round-trip).
contagem_absoluta = treino['city_name'].value_counts()
print(contagem_absoluta)

city_name
local    11688
Name: proportion, dtype: int64


'city_name' is a useless column because it has only one distinct value ('local').

In [353]:
# Number of missing values in 'dt_iso'.
treino['dt_iso'].isna().sum()

0

'dt_iso' has 0 missing values, so it can replace the columns 'Data' and 'Hora'.

In [354]:
# Number of missing values in 'rain_1h'.
treino['rain_1h'].isna().sum()

9253

## Fase 2: Tratamento dos Dados

The column 'city_name' has only one possible value, so it is useless and must be removed from the dataset.

In [355]:
# Remove the constant 'city_name' column. Reassigning (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
treino = treino.drop(columns='city_name', errors='ignore')

The column 'dt_iso' will replace the columns 'Data' and 'Hora'. The 'dt_iso' column is going to also be split into two new columns: 'Date' and 'Hour'.

In [356]:
def toDate(d):
    """Extract the date portion of a timestamp string.

    Returns the first ``digits-digits-digits`` run found in ``d``, e.g.
    ``'2021-09-01'`` for ``'2021-09-01 00:00:00 +0000 UTC'``.
    Raises ``AttributeError`` when ``d`` contains no such run.
    """
    found = re.search(r'\d+\-\d+\-\d+', d)
    return found.group()

def toHour(h):
    """Extract the time-of-day portion of a timestamp string.

    Returns the first ``digits:digits:digits`` run found in ``h``, e.g.
    ``'04:00:00'`` for ``'2021-09-01 04:00:00 +0000 UTC'``.
    Raises ``AttributeError`` when ``h`` contains no such run.
    """
    found = re.search(r'\d+\:\d+\:\d+', h)
    return found.group()

# Replace 'Data'/'Hora' with 'Date'/'Hour' derived from 'dt_iso' and move
# 'dt', 'Date', 'Hour' to the front of the frame.
# The guard makes the cell re-runnable: once 'dt_iso' has been consumed the
# transformation is skipped instead of failing with a KeyError on 'Data'.
if 'dt_iso' in treino.columns:
    timestamps = treino['dt_iso']
    date_col = timestamps.apply(toDate)
    hour_col = timestamps.apply(toHour)

    # drop() returns a new frame, so the inserts below do not touch the input.
    treino = treino.drop(['Data', 'Hora', 'dt_iso'], axis=1)

    # Insert in reverse so the final leading order is: dt, Date, Hour.
    treino.insert(0, 'Hour', hour_col)
    treino.insert(0, 'Date', date_col)
    treino.insert(0, 'dt', treino.pop('dt'))

treino

Unnamed: 0,dt,Date,Hour,Normal (kWh),Horario Economico (kWh),Autoconsumo (kWh),Injeçao na rede (kWh),temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,wind_speed,rain_1h,clouds_all,weather_description
0,1630454400,2021-09-01,00:00:00,,,,,18.74,18.84,15.72,20.34,1015,,,83,1.18,,78,broken clouds
1,1630458000,2021-09-01,01:00:00,,,,,18.73,18.83,15.72,20.34,1014,,,83,1.46,,92,overcast clouds
2,1630461600,2021-09-01,02:00:00,,,,,17.84,17.98,16.72,20.34,1014,,,88,1.05,,91,overcast clouds
3,1630465200,2021-09-01,03:00:00,,,,,18.27,18.40,16.68,20.34,1014,,,86,0.46,0.14,94,light rain
4,1630468800,2021-09-01,04:00:00,,,,,17.81,17.97,16.12,20.34,1013,,,89,0.93,1.26,95,moderate rain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11683,1672513200,2022-12-31,19:00:00,1.693,0.000,0.0,,15.70,15.57,15.57,15.72,1018,,,86,5.56,3.58,100,moderate rain
11684,1672516800,2022-12-31,20:00:00,1.327,0.000,0.0,,15.55,15.43,12.72,15.70,1018,,,87,4.47,4.20,100,heavy intensity rain
11685,1672520400,2022-12-31,21:00:00,0.757,0.000,0.0,,13.45,13.28,12.23,13.99,1019,,,93,3.29,4.23,100,heavy intensity rain
11686,1672524000,2022-12-31,22:00:00,0.000,0.675,0.0,,12.93,12.73,12.23,13.43,1019,,,94,1.49,3.90,100,moderate rain


## Fase 3: Exploração Detalhada

## Fase 4: Construção e avaliação de modelos ML