## Import libraries

In [1039]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
import category_encoders as ce

## Get the data

In [1040]:
# Load the raw training set; the path is relative to this notebook's folder.
train_csv_path = "../../../datasets/parte2/train.csv"
df = pd.read_csv(train_csv_path)

## Feature engineering

#### Split date

In [1041]:
# Parse the "YYYY-MM-DD" strings as UTC timestamps, expose the calendar parts
# as separate integer columns, then discard the original date column.
parsed_dates = pd.to_datetime(df['date'], format="%Y-%m-%d", utc=True)
for part in ('year', 'month', 'day'):
    df[part] = getattr(parsed_dates.dt, part)
df.drop(columns=['date'], inplace=True)

df.head()

Unnamed: 0,hour,normal,economic_schedule,self-consumption,injection,dt,city_name,temp,feels_like,temp_min,...,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,sunshine_duration,year,month,day
0,0,0.0,0.0,0.0,,1632873600,local,13.97,13.54,11.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
1,1,0.0,0.0,0.0,,1632877200,local,13.48,13.02,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
2,2,0.0,0.0,0.0,,1632880800,local,12.93,12.47,12.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
3,3,0.0,0.0,0.0,,1632884400,local,12.61,12.15,10.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
4,4,0.0,0.0,0.0,,1632888000,local,12.61,12.17,9.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29


#### Remove city_name

In [1042]:
# Remove the city_name column in place.
df.drop(columns=['city_name'], inplace=True)

df.head()

Unnamed: 0,hour,normal,economic_schedule,self-consumption,injection,dt,temp,feels_like,temp_min,temp_max,...,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,sunshine_duration,year,month,day
0,0,0.0,0.0,0.0,,1632873600,13.97,13.54,11.45,14.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
1,1,0.0,0.0,0.0,,1632877200,13.48,13.02,13.43,13.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
2,2,0.0,0.0,0.0,,1632880800,12.93,12.47,12.72,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
3,3,0.0,0.0,0.0,,1632884400,12.61,12.15,10.34,12.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29
4,4,0.0,0.0,0.0,,1632888000,12.61,12.17,9.79,12.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021,9,29


#### Remove day, month and year (hour is kept)

In [1043]:
# Discard the calendar parts derived from the date; `hour` stays in the frame.
calendar_columns = ['day', 'month', 'year']
df.drop(columns=calendar_columns, inplace=True)

df.head()

Unnamed: 0,hour,normal,economic_schedule,self-consumption,injection,dt,temp,feels_like,temp_min,temp_max,...,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,sunshine_duration
0,0,0.0,0.0,0.0,,1632873600,13.97,13.54,11.45,14.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,,1632877200,13.48,13.02,13.43,13.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,,1632880800,12.93,12.47,12.72,13.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,,1632884400,12.61,12.15,10.34,12.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,,1632888000,12.61,12.17,9.79,12.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Remove dt

In [1044]:
# Remove the dt column in place.
df.drop(columns=['dt'], inplace=True)

df.head()

Unnamed: 0,hour,normal,economic_schedule,self-consumption,injection,temp,feels_like,temp_min,temp_max,pressure,...,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,terrestrial_radiation_instant,sunshine_duration
0,0,0.0,0.0,0.0,,13.97,13.54,11.45,14.04,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,,13.48,13.02,13.43,13.9,1028,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,,12.93,12.47,12.72,13.43,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,,12.61,12.15,10.34,12.93,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,,12.61,12.17,9.79,12.93,1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Remove instantaneous radiation columns and other unused weather features

In [1045]:
# Drop the instantaneous radiation readings together with a handful of other
# weather features, all in a single call.
# Left in the frame for now (their individual drops are disabled):
# direct_radiation, shortwave_radiation, diffuse_radiation,
# direct_normal_irradiance, terrestrial_radiation and weather_description.
columns_to_remove = [
    'direct_radiation_instant',
    'shortwave_radiation_instant',
    'diffuse_radiation_instant',
    'direct_normal_irradiance_instant',
    'terrestrial_radiation_instant',
    'sunshine_duration',
    'sealevel_pressure',
    'surface_pressure',
    'rain',
    'wind_speed',
]
df.drop(columns=columns_to_remove, inplace=True)

#### Remove feels_like, temp_min, temp_max (currently disabled)

In [1046]:
#df.drop(['feels_like', 'temp_min', 'temp_max'], axis=1, inplace=True)

## Categorical Encoding (weather_description)

In [1047]:
# Treat 'clear sky' as the same category as 'sky is clear'.
synonyms = {'clear sky': 'sky is clear'}
df['weather_description'] = df['weather_description'].replace(synonyms)

In [1048]:
# LABEL ENCODING
# Integer code assigned to each weather description.
# NOTE(review): the codes look like arbitrary identifiers rather than an
# ordering by severity - confirm before treating the column as ordinal.
replace_map = {
    'overcast clouds': 5,
    'broken clouds': 0,
    'few clouds': 1,
    'scattered clouds': 6,
    'sky is clear': 7,
    'light rain': 3,
    'moderate rain': 4,
    'heavy intensity rain': 2,
}

# Fail early, naming the offending values, when a description (or a missing
# value) has no code; otherwise the integer cast below would fail with a much
# less helpful message.
unmapped = df.loc[
    ~df['weather_description'].isin(list(replace_map)), 'weather_description'
]
if not unmapped.empty:
    raise ValueError(
        "weather_description values without a label code: "
        f"{unmapped.unique().tolist()}"
    )

# Every value is now known to be a key of replace_map, so a direct dictionary
# lookup followed by an integer cast is safe.
df['weather_description'] = df['weather_description'].map(replace_map).astype(int)

# ONEHOT (disabled alternative)
#lb = LabelBinarizer()
#
#lb_results = lb.fit_transform(df['weather_description'])
#
#lb_results_df = pd.DataFrame(lb_results, columns=lb.classes_)
#
#df = pd.concat([df, lb_results_df], axis=1)
#
#df.drop('weather_description', axis=1, inplace=True)

# BINARY ENCODING (disabled alternative)
#encoder = ce.BinaryEncoder(cols=['weather_description'])
#
#df = encoder.fit_transform(df)

## Missing Values

#### Remove sea_level, grnd_level and rain_1h

In [1049]:
# Remove sea_level, grnd_level and rain_1h in one call.
# TODO: for rain_1h, try imputing with the mean instead of dropping it.
df.drop(columns=['sea_level', 'grnd_level', 'rain_1h'], inplace=True)

In [1050]:
# Count the missing values in each remaining column and print the summary.
missing_per_column = df.isna().sum()
print(missing_per_column)

hour                      0
normal                    0
economic_schedule         0
self-consumption          0
injection              7777
temp                      0
feels_like                0
temp_min                  0
temp_max                  0
pressure                  0
humidity                  0
clouds_all                0
weather_description       0
direct_radiation          0
dtype: int64


## Write new csv

In [1051]:
# Replace missing injection values with the literal label "None"; the step is
# skipped when the frame has no injection column.
# NOTE(review): pandas' default NA markers include the string "None", so these
# NaNs may be genuine "None" labels that were read as missing - confirm
# against the raw CSV.
if 'injection' in df.columns:
    injection_filled = df['injection'].fillna("None")
    df['injection'] = injection_filled

In [1052]:
# Preview the first rows of the frame that is about to be written out.
df.head()

Unnamed: 0,hour,normal,economic_schedule,self-consumption,injection,temp,feels_like,temp_min,temp_max,pressure,humidity,clouds_all,weather_description,direct_radiation
0,0,0.0,0.0,0.0,,13.97,13.54,11.45,14.04,1027,81,87,5,0.0
1,1,0.0,0.0,0.0,,13.48,13.02,13.43,13.9,1028,82,91,5,0.0
2,2,0.0,0.0,0.0,,12.93,12.47,12.72,13.43,1027,84,93,5,0.0
3,3,0.0,0.0,0.0,,12.61,12.15,10.34,12.93,1027,85,95,5,0.0
4,4,0.0,0.0,0.0,,12.61,12.17,9.79,12.93,1027,86,93,5,0.0


In [1053]:
# Write the prepared frame to disk; index=False leaves the row index out of
# the file.
prepared_csv_path = '../../../datasets/parte2/train_prepared.csv'
df.to_csv(prepared_csv_path, index=False)