# Basics of the study

This study is based on [this article](https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/) from Jason Brownlee. The explanation is great and quite intuitive!

## Importing essential libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras as keras

from sklearn.metrics import mean_squared_error, r2_score

## Importing series

### IPCA (general)  
IPCA (Índice Nacional de Preços ao Consumidor Amplo) is Brazil's Extended National Consumer Price Index; the general series covers the index as a whole. An overview of the series helps to understand what the models will be working with and which adjustments make it easier to handle.

In [None]:
# Load the general IPCA series (';' as field separator, ',' as decimal mark).
ipca_general = pd.read_csv('Data/IPCA-general.csv', sep=';', decimal=',')
# Name the three columns; the third one is discarded right away.
ipca_general.columns = ['Date', 'Variation_Percentage', 'Dropme']
ipca_general = ipca_general.drop(columns='Dropme').set_index('Date')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit "None").
ipca_general.info()
print()
print(ipca_general.head())

In [None]:
# Plot the percentage series; only every `tick_step`-th position gets a tick
# so the rotated labels on the x axis stay readable.
n_obs = len(ipca_general)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation percentage of Brazilian inflation across time', fontsize=10)
plt.plot(ipca_general)
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

Percentage values ranging from 0 to 100 and beyond are not a convenient scale for a machine learning model, so the series will be divided by 100 to express it as a fraction.

In [None]:
# Turn percentage points into fractions (e.g. 5 -> 0.05) and plot the result.
ipca_general_raw = ipca_general / 100
n_obs = len(ipca_general_raw)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation tax of Brazilian inflation across time', fontsize=10)
plt.plot(ipca_general_raw)
# Thin out the x ticks so the rotated labels do not overlap.
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

Now the series can be properly worked with.

### IPCA (food and beverages)  
The inflation of food and beverages might behave differently from general inflation overall, so it might be interesting to analyze it as well.

In [None]:
# Load the food and beverages IPCA series (';' separator, ',' decimal mark).
ipca_food = pd.read_csv('Data/IPCA-food.csv', sep=';', decimal=',')
# Name the three columns; the third one is discarded right away.
ipca_food.columns = ['Date', 'Variation_Percentage', 'Dropme']
ipca_food = ipca_food.drop(columns='Dropme').set_index('Date')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit "None").
ipca_food.info()
print()
print(ipca_food.head())

There's one missing value in the series, so it will be replaced by the mean of its two neighboring values.

In [None]:
# Plot the percentage series; only every `tick_step`-th position gets a tick
# so the rotated labels on the x axis stay readable.
n_obs = len(ipca_food)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation percentage of Brazilian inflation for food and beverages across time', fontsize=10)
plt.plot(ipca_food)
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

The missing value is between 1991.06 and 1992.10, so it's important to look for it in this slice.

In [None]:
# Show the rows whose index label lies between '1991.06' and '1992.10' (inclusive).
ipca_food.loc['1991.06':'1992.10']

In [None]:
# Fill every missing entry with the mean of 12.79 and 37.85.
# NOTE(review): presumably these are the values right before and after the
# gap in the slice inspected above - confirm against the data.
ipca_food_notnull = ipca_food.fillna((12.79 + 37.85) / 2)
# Display the same slice again to check the gap is gone.
ipca_food_notnull.loc['1991.06':'1992.10']

In [None]:
# Turn percentage points into fractions (e.g. 5 -> 0.05) and plot the result.
ipca_food_raw = ipca_food_notnull / 100
n_obs = len(ipca_food_raw)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation tax of Brazilian inflation for food and beverages across time', fontsize=10)
plt.plot(ipca_food_raw)
# Thin out the x ticks so the rotated labels do not overlap.
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

### IPCA (health expenses)

In [None]:
# Load the health expenses IPCA series (';' separator, ',' decimal mark).
ipca_health = pd.read_csv('Data/IPCA-health.csv', sep=';', decimal=',')
# Name the three columns; the third one is discarded right away.
ipca_health.columns = ['Date', 'Variation_Percentage', 'Dropme']
ipca_health = ipca_health.drop(columns='Dropme').set_index('Date')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit "None").
ipca_health.info()
print()
print(ipca_health.head())

In [None]:
# Plot the percentage series; only every `tick_step`-th position gets a tick
# so the rotated labels on the x axis stay readable.
n_obs = len(ipca_health)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation percentage of Brazilian inflation for health expenses across time', fontsize=10)
plt.plot(ipca_health)
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

In [None]:
# Show the rows whose index label lies between '1991.06' and '1992.10' (inclusive).
ipca_health.loc['1991.06':'1992.10']

In [None]:
# Fill every missing entry with the mean of 15.00 and 36.40.
# NOTE(review): presumably these are the values right before and after a gap
# in the slice inspected above - confirm against the data.
ipca_health_notnull = ipca_health.fillna((15.00 + 36.40) / 2)
# Display the same slice again to check no missing value remains.
ipca_health_notnull.loc['1991.06':'1992.10']

In [None]:
# Turn percentage points into fractions (e.g. 5 -> 0.05) and plot the result.
ipca_health_raw = ipca_health_notnull / 100
n_obs = len(ipca_health_raw)
tick_step = n_obs // 33 + 1
plt.figure(figsize=(10, 5))
plt.title('Variation tax of Brazilian inflation for health expenses across time', fontsize=10)
plt.plot(ipca_health_raw)
# Thin out the x ticks so the rotated labels do not overlap.
plt.xticks(range(0, n_obs, tick_step), rotation=90)
plt.show()

# LSTM applications

## Standard LSTM

### Split method and model  
Each series must first be split into fixed-length input sequences so that it fits the format an LSTM expects. The method used for the Standard LSTM can also be used for a Stacked LSTM without any changes.  
Since the data is monthly, the model will use 12 timesteps: each sample is a window of one year used to predict the following month.

In [None]:
# The series must first be prepared to fit the type of LSTM.
# This method can be used for both Standard and Stacked LSTMs.
def split_standard(sequence, n_steps):
    """Slice a sequence into sliding input windows and next-step targets.

    Window ``k`` is ``sequence[k:k + n_steps]`` and its target is the element
    that immediately follows it, ``sequence[k + n_steps]``. Windows are only
    produced while such a following element exists.

    Args:
        sequence: Indexable, sliceable sequence of observations.
        n_steps: Number of consecutive observations in each input window.

    Returns:
        A tuple ``(X, y)`` of numpy arrays holding the windows and their
        targets, in order of appearance.
    """
    total = len(sequence)
    # A window starting at `start` needs index `start + n_steps` to exist;
    # the min() keeps the start positions inside the sequence itself.
    starts = range(min(total, total - n_steps))
    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# Input layout of the model: 12 timesteps (one per month of a year),
# each carrying a single feature.
n_steps_std = 12
n_features_std = 1

# One LSTM layer (50 units, ReLU activation) followed by a single-unit dense
# output, trained with the Adam optimizer on mean squared error.
lstm_std = keras.models.Sequential([
    keras.layers.LSTM(50, activation='relu', input_shape=(n_steps_std, n_features_std)),
    keras.layers.Dense(1),
])
lstm_std.compile(optimizer='adam', loss='mse')

### IPCA (general)

In [None]:
# Build the sliding windows and their next-step targets.
X_IPCAgen_std, y_IPCAgen_std = split_standard(np.array(ipca_general_raw), n_steps_std)
# Reshape into [samples, timesteps, features]
n_samples, n_timesteps = X_IPCAgen_std.shape[:2]
X_IPCAgen_std = X_IPCAgen_std.reshape((n_samples, n_timesteps, n_features_std))

# Chronological split, no shuffling:
#   first 70% -> training, next 20% -> validation, last 10% -> testing
cuts_IPCAgen_std = [int(len(X_IPCAgen_std) * 0.7), int(len(X_IPCAgen_std) * 0.9)]
Xtrain_IPCAgen_std, Xval_IPCAgen_std, Xtest_IPCAgen_std = np.split(X_IPCAgen_std, cuts_IPCAgen_std)
ytrain_IPCAgen_std, yval_IPCAgen_std, ytest_IPCAgen_std = np.split(y_IPCAgen_std, cuts_IPCAgen_std)

In [None]:
# Build a dedicated model for this series from the `lstm_std` template.
# clone_model creates new layers with newly initialised weights, so training
# here does not share state with models fitted on other series. The clone is
# not compiled, hence the explicit compile() with the template's settings.
lstm_IPCAgen_std = keras.models.clone_model(lstm_std)
lstm_IPCAgen_std.compile(optimizer='adam', loss='mse')
lstm_IPCAgen_std.fit(Xtrain_IPCAgen_std, ytrain_IPCAgen_std,
                     validation_data=(Xval_IPCAgen_std, yval_IPCAgen_std),
                     epochs=200, verbose=1)

In [None]:
# Overlay the model's predictions on the true targets of the training split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Inflation forecasting - Training set')
ax.plot(ytrain_IPCAgen_std, color='blue', label='True')
ax.plot(lstm_IPCAgen_std.predict(Xtrain_IPCAgen_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the validation split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Inflation forecasting - Validation set')
ax.plot(yval_IPCAgen_std, color='blue', label='True')
ax.plot(lstm_IPCAgen_std.predict(Xval_IPCAgen_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the test split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Inflation forecasting - Test set')
ax.plot(ytest_IPCAgen_std, color='blue', label='True')
ax.plot(lstm_IPCAgen_std.predict(Xtest_IPCAgen_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the whole series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Inflation forecasting - Full series')
ax.plot(y_IPCAgen_std, color='blue', label='True')
ax.plot(lstm_IPCAgen_std.predict(X_IPCAgen_std), color='orange', label='Predicted')
ax.legend()
plt.show()

### IPCA (food)

In [None]:
# Build the sliding windows and their next-step targets.
X_IPCAfood_std, y_IPCAfood_std = split_standard(np.array(ipca_food_raw), n_steps_std)
# Reshape into [samples, timesteps, features]
n_samples, n_timesteps = X_IPCAfood_std.shape[:2]
X_IPCAfood_std = X_IPCAfood_std.reshape((n_samples, n_timesteps, n_features_std))

# Chronological split, no shuffling:
#   first 70% -> training, next 20% -> validation, last 10% -> testing
cuts_IPCAfood_std = [int(len(X_IPCAfood_std) * 0.7), int(len(X_IPCAfood_std) * 0.9)]
Xtrain_IPCAfood_std, Xval_IPCAfood_std, Xtest_IPCAfood_std = np.split(X_IPCAfood_std, cuts_IPCAfood_std)
ytrain_IPCAfood_std, yval_IPCAfood_std, ytest_IPCAfood_std = np.split(y_IPCAfood_std, cuts_IPCAfood_std)

In [None]:
# Build a dedicated model for this series from the `lstm_std` template.
# clone_model creates new layers with newly initialised weights, so training
# here does not share state with models fitted on other series. The clone is
# not compiled, hence the explicit compile() with the template's settings.
lstm_IPCAfood_std = keras.models.clone_model(lstm_std)
lstm_IPCAfood_std.compile(optimizer='adam', loss='mse')
lstm_IPCAfood_std.fit(Xtrain_IPCAfood_std, ytrain_IPCAfood_std,
                      validation_data=(Xval_IPCAfood_std, yval_IPCAfood_std),
                      epochs=200, verbose=1)

In [None]:
# Overlay the model's predictions on the true targets of the training split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Food inflation forecasting - Training set')
ax.plot(ytrain_IPCAfood_std, color='blue', label='True')
ax.plot(lstm_IPCAfood_std.predict(Xtrain_IPCAfood_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the validation split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Food inflation forecasting - Validation set')
ax.plot(yval_IPCAfood_std, color='blue', label='True')
ax.plot(lstm_IPCAfood_std.predict(Xval_IPCAfood_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the test split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Food inflation forecasting - Testing set')
ax.plot(ytest_IPCAfood_std, color='blue', label='True')
ax.plot(lstm_IPCAfood_std.predict(Xtest_IPCAfood_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the whole series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Food inflation forecasting - Full series')
ax.plot(y_IPCAfood_std, color='blue', label='True')
ax.plot(lstm_IPCAfood_std.predict(X_IPCAfood_std), color='orange', label='Predicted')
ax.legend()
plt.show()

### IPCA (health)

In [None]:
# Build the sliding windows and their next-step targets.
X_IPCAhlth_std, y_IPCAhlth_std = split_standard(np.array(ipca_health_raw), n_steps_std)
# Reshape into [samples, timesteps, features]
n_samples, n_timesteps = X_IPCAhlth_std.shape[:2]
X_IPCAhlth_std = X_IPCAhlth_std.reshape((n_samples, n_timesteps, n_features_std))

# Chronological split, no shuffling:
#   first 70% -> training, next 20% -> validation, last 10% -> testing
cuts_IPCAhlth_std = [int(len(X_IPCAhlth_std) * 0.7), int(len(X_IPCAhlth_std) * 0.9)]
Xtrain_IPCAhlth_std, Xval_IPCAhlth_std, Xtest_IPCAhlth_std = np.split(X_IPCAhlth_std, cuts_IPCAhlth_std)
ytrain_IPCAhlth_std, yval_IPCAhlth_std, ytest_IPCAhlth_std = np.split(y_IPCAhlth_std, cuts_IPCAhlth_std)

In [None]:
# Build a dedicated model for this series from the `lstm_std` template.
# clone_model creates new layers with newly initialised weights, so training
# here does not share state with models fitted on other series. The clone is
# not compiled, hence the explicit compile() with the template's settings.
lstm_IPCAhlth_std = keras.models.clone_model(lstm_std)
lstm_IPCAhlth_std.compile(optimizer='adam', loss='mse')
lstm_IPCAhlth_std.fit(Xtrain_IPCAhlth_std, ytrain_IPCAhlth_std,
                      validation_data=(Xval_IPCAhlth_std, yval_IPCAhlth_std),
                      epochs=200, verbose=1)

In [None]:
# Overlay the model's predictions on the true targets of the training split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Health inflation forecasting - Training set')
ax.plot(ytrain_IPCAhlth_std, color='blue', label='True')
ax.plot(lstm_IPCAhlth_std.predict(Xtrain_IPCAhlth_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the validation split.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Health inflation forecasting - Validation set')
ax.plot(yval_IPCAhlth_std, color='blue', label='True')
ax.plot(lstm_IPCAhlth_std.predict(Xval_IPCAhlth_std), color='orange', label='Predicted')
ax.legend()
plt.show()

In [None]:
# Compare predictions with the true targets on the held-out test split.
# The test arrays (not the validation ones) are plotted so the figure
# matches its "Testing set" title.
plt.figure(figsize=(10,5))
plt.title('Health inflation forecasting - Testing set')
plt.plot(ytest_IPCAhlth_std, color='blue', label='True')
plt.plot(lstm_IPCAhlth_std.predict(Xtest_IPCAhlth_std), color='orange', label='Predicted')
plt.legend()
plt.show()

In [None]:
# Overlay the model's predictions on the true targets of the whole series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Health inflation forecasting - Full series')
ax.plot(y_IPCAhlth_std, color='blue', label='True')
ax.plot(lstm_IPCAhlth_std.predict(X_IPCAhlth_std), color='orange', label='Predicted')
ax.legend()
plt.show()

## Stacked LSTM

### IPCA (general)

## CNN LSTM

### IPCA (general)