## Data acquisition and processing

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from esios import *

### 1. Download data using ESIOS.py 

#### Dictionary with Indicator Name and Number of Indicator

In [2]:
# Short indicator name -> ESIOS indicator number.
indicatorsDict = dict(
    demand=460,
    price=805,
    wind=541,
    solar=10034,
)

# (name, number) pairs, so both can be walked together in a single loop.
indicatorsItems = indicatorsDict.items()

#### Download indicator data for the date range set in `start_date` / `end_date` (the original run covered 01-01-2014 to 01-06-2020) and save it into `".csv"` files.

In [3]:
# Query window: whole calendar days, expanded below to the timestamps sent to the API.
start_date = "2020-08-18"
end_date = "2020-08-28"
start_ = start_date + 'T00:00:00'
end_ = end_date + 'T23:50:00'

# The personal ESIOS token is read from the ESIOS_TOKEN environment variable so that
# the secret never ends up in the saved notebook or in version control.
token = os.environ.get('ESIOS_TOKEN')
if not token:
    raise RuntimeError('Set the ESIOS_TOKEN environment variable to your ESIOS API token')
esios = ESIOS(token)
country = 'Spain' #Spain, France or Portugal are the options

# Create the output folder once, up front; exist_ok avoids the stat/except dance
# and no longer hides unrelated errors behind a bare except.
os.makedirs('Files/', exist_ok=True)

for indicatorName, indicatorValue in indicatorsItems:
    print ('Start Date: ' + start_date)
    print ('End Date: ' + end_date)
    # get_multiple_series is called with a one-element list, one indicator per request.
    indicators_ = [indicatorValue]
    dfmul , df_list, names = esios.get_multiple_series(indicators_, start_, end_, country)
    df = dfmul[names]
    df = df.reset_index()
    df.columns = ['Date', indicatorName]
    # Trim the timestamp string: turn the first '.' into a break and keep what precedes
    # it, then swap the 'T' separator for a space. regex=False makes both replacements
    # literal on every pandas version (as a regex, '.' would match any character).
    df['Date'] = df['Date'].str.replace('.', ' ', regex=False)
    df['Date'] = df['Date'].str.split().str[0]
    df['Date'] = df['Date'].str.replace('T', ' ', regex=False)
    # Export to .csv file ('^' separated, no index column)
    df.to_csv(path_or_buf= 'Files/' + str(indicatorName) + '.csv', sep='^', index=False)
    print('Generated:' + str(indicatorName))

Analyzing indicators...
Start Date: 2020-08-18
End Date: 2020-08-28
Downloading Previsión diaria de la demanda eléctrica peninsular
Generated:demand
Start Date: 2020-08-18
End Date: 2020-08-28
Downloading Precio medio horario componente mercado diario 
Generated:price
Start Date: 2020-08-18
End Date: 2020-08-28
Downloading Previsión de la producción eólica peninsular
Generated:wind
Start Date: 2020-08-18
End Date: 2020-08-28
Downloading Generación prevista Solar
Generated:solar


### 2. Preprocessing ESIOS data

In [4]:
from functools import reduce

In [5]:
def _load_indicator(csv_path):
    """Read one '^'-separated indicator file as strings and parse its 'Date' column."""
    with open(csv_path, 'r') as handle:
        frame = pd.read_csv(handle, sep='^', dtype='object')
        frame['Date'] = pd.to_datetime(frame['Date'])
    return frame


# One frame per indicator; value columns stay as 'object' until cast further down.
dfDemand = _load_indicator('Files/demand.csv')
dfSolar = _load_indicator('Files/solar.csv')
dfWind = _load_indicator('Files/wind.csv')
dfPrice = _load_indicator('Files/price.csv')

#### Merge all files in one DataFrame

#### Duplicate values must be dropped before merging the DataFrames to avoid repeated rows.

In [6]:
# Keep only the first row seen for every 'Date' value in each indicator frame.
dfDemand, dfSolar, dfWind, dfPrice = (
    frame.drop_duplicates(subset='Date', keep='first')
    for frame in (dfDemand, dfSolar, dfWind, dfPrice)
)

In [7]:
def _join_on_date(left, right):
    """Right-join two indicator frames on their shared 'Date' column."""
    return pd.merge(left, right, on='Date', how='right')


# Fold the frames left to right: demand, solar, wind, then price.
data = reduce(_join_on_date, [dfDemand, dfSolar, dfWind, dfPrice])
data.head(3)

Unnamed: 0,Date,demand,solar,wind,price
0,2020-08-18 00:00:00,24946.0,613.5,3679.0,37.7
1,2020-08-18 01:00:00,23544.0,580.9,3691.0,33.68
2,2020-08-18 02:00:00,22550.0,488.1,3611.0,31.27


#### The indicator columns are stored as objects and need to be cast to floats

In [8]:
# Display the dtype of every column of the merged frame (before the float cast).
data.dtypes

Date      datetime64[ns]
demand            object
solar             object
wind              object
price             object
dtype: object

In [9]:
# Convert each indicator column to float, one column at a time.
for column in ('demand', 'solar', 'wind', 'price'):
    data[column] = data[column].astype(float)

In [10]:
# Display the column dtypes again to confirm the cast took effect.
data.dtypes

Date      datetime64[ns]
demand           float64
solar            float64
wind             float64
price            float64
dtype: object

#### Check data distribution using seaborn boxplot

In [None]:
import seaborn as sns
import matplotlib.dates as dates
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# One vertical boxplot per indicator, side by side on a single row of four axes.
fig, axs = plt.subplots(ncols=4, figsize=(15, 7))
plt.subplots_adjust(wspace=1, hspace=None)
for panel, column in zip(axs, ('demand', 'solar', 'wind', 'price')):
    sns.boxplot(data[column], orient='v', ax=panel)
plt.show()

#### 2a. OUTLIERS

In [None]:
# Summary statistics of the merged frame, as a reference for the outlier checks below.
data.describe()

Calculate the interquartile range as the difference between the upper and lower quartiles (IQR = Q3 - Q1).



In [None]:
# Lower and upper quartiles of the numeric indicator columns.
# numeric_only=True keeps the datetime 'Date' column out of the result on every pandas
# version; otherwise Q1/Q3 can contain a Timestamp and IQR a Timedelta, and the entries
# no longer line up with demand/solar/wind/price.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)

# Interquartile range per column, indexed by column name.
IQR = Q3 - Q1
IQR

Calculate the number of outliers for each series of the `data` DataFrame.

#### 'DEMAND' Outliers:


In [None]:
# 1.5 * IQR rule for 'demand'. Quartiles are looked up by column label rather than by
# position, so the result does not depend on the order (or presence) of other columns
# in Q1 / Q3 / IQR.
demandLowerFence = Q1['demand'] - 1.5 * IQR['demand']
demandUpperFence = Q3['demand'] + 1.5 * IQR['demand']

numOutliersQ1_D = data['demand'][data['demand'] < demandLowerFence].count()
numOutliersQ3_D = data['demand'][data['demand'] > demandUpperFence].count()
numOutliersDemand = numOutliersQ1_D + numOutliersQ3_D
numOutliersDemand

In [None]:
# Distribution of the 'demand' series on a wide 15x5 figure.
# NOTE(review): seaborn documents distplot as deprecated; consider histplot/displot.
figure_settings = {"figure.figsize": (15, 5)}
sns.set(rc=figure_settings)
demand_series = data['demand']
sns.distplot(demand_series)
plt.show()

#### 'SOLAR' Outliers:


In [None]:
# 1.5 * IQR rule for 'solar'. Quartiles are looked up by column label rather than by
# position, so the result does not depend on the order (or presence) of other columns
# in Q1 / Q3 / IQR.
solarLowerFence = Q1['solar'] - 1.5 * IQR['solar']
solarUpperFence = Q3['solar'] + 1.5 * IQR['solar']

numOutliersQ1_S = data['solar'][data['solar'] < solarLowerFence].count()
numOutliersQ3_S = data['solar'][data['solar'] > solarUpperFence].count()
numOutliersSolar = numOutliersQ1_S + numOutliersQ3_S
numOutliersSolar

In [None]:
# Distribution of the 'solar' series on a wide 15x5 figure.
# NOTE(review): seaborn documents distplot as deprecated; consider histplot/displot.
figure_settings = {"figure.figsize": (15, 5)}
sns.set(rc=figure_settings)
solar_series = data['solar']
sns.distplot(solar_series)
plt.show()

#### 'WIND' Outliers:


In [None]:
# 1.5 * IQR rule for 'wind'. Quartiles are looked up by column label rather than by
# position, so the result does not depend on the order (or presence) of other columns
# in Q1 / Q3 / IQR.
windLowerFence = Q1['wind'] - 1.5 * IQR['wind']
windUpperFence = Q3['wind'] + 1.5 * IQR['wind']

numOutliersQ1_W = data['wind'][data['wind'] < windLowerFence].count()
numOutliersQ3_W = data['wind'][data['wind'] > windUpperFence].count()
numOutliersWind = numOutliersQ1_W + numOutliersQ3_W
numOutliersWind

In [None]:
# Distribution of the 'wind' series on a wide 15x5 figure.
# NOTE(review): seaborn documents distplot as deprecated; consider histplot/displot.
figure_settings = {"figure.figsize": (15, 5)}
sns.set(rc=figure_settings)
wind_series = data['wind']
sns.distplot(wind_series)
plt.show()

#### 'PRICE' Outliers:


In [None]:
# 1.5 * IQR rule for 'price'. Quartiles are looked up by column label rather than by
# position, so the result does not depend on the order (or presence) of other columns
# in Q1 / Q3 / IQR.
priceLowerFence = Q1['price'] - 1.5 * IQR['price']
priceUpperFence = Q3['price'] + 1.5 * IQR['price']

numOutliersQ1_P = data['price'][data['price'] < priceLowerFence].count()
numOutliersQ3_P = data['price'][data['price'] > priceUpperFence].count()
numOutliersPrice = numOutliersQ1_P + numOutliersQ3_P
numOutliersPrice

In [None]:
# Distribution of the 'price' series on a wide 15x5 figure.
# NOTE(review): seaborn documents distplot as deprecated; consider histplot/displot.
figure_settings = {"figure.figsize": (15, 5)}
sns.set(rc=figure_settings)
price_series = data['price']
sns.distplot(price_series)
plt.show()

In 'demand' series -> 0 outliers.<br>
In 'solar' series -> 403 outliers (0.707% of data are outliers).<br>
In 'wind' series -> 506 outliers (0.888% of data are outliers).<br>
In 'price' series -> 1609 outliers (2.814% of data are outliers).<br>

#### 3. Distribution

In [None]:
# Scatter of the price series over time, with one x tick per month.
ax = plt.figure(figsize=(15,10)).add_subplot(111)
ax.scatter(data['Date'].tolist(), data['price'], alpha=0.5)

# Axis limits are built with pd.Timestamp instead of `datetime.date(...)`: the notebook
# imports the `datetime` class by name, and on that class `.date` is an instance method,
# so `datetime.date(2014, 1, 1)` raises TypeError unless the name was rebound elsewhere.
ax.set_xlim([pd.Timestamp(2014, 1, 1), pd.Timestamp(2020, 12, 31)])

ax.xaxis.set_major_locator(dates.MonthLocator())
hfmt = dates.DateFormatter('%Y-%m')
ax.xaxis.set_major_formatter(hfmt)
ax.set_xlabel('Date')
ax.set_ylabel('price')
plt.xticks(rotation=45)
plt.show()

In [11]:
# Save the merged, float-cast frame to Data_new.csv ('^' separated, index not written).
data.to_csv('Data_new.csv', sep = '^', index = False)