### A Guide to Time Series Analysis in Python

Source: https://builtin.com/data-science/time-series-python

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from math import sqrt
from sklearn.metrics import mean_squared_error




In [None]:
# Load the monthly airline passenger counts from the local CSV file
csv_path = 'AirPassengers.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (shown as the cell output in a notebook)
df.head()


In [None]:
# Take the 'Month' column out of the frame and parse its 'YYYY-MM' strings
# into datetime values
parsed_months = pd.to_datetime(df.pop('Month'), format='%Y-%m')

# Use the parsed months as the row index (the column itself is already removed)
df.index = parsed_months


In [None]:
# Plot the data
# Pass the frame by keyword: the meaning of the first positional argument of
# lineplot differs between seaborn releases, while `data=` is always the
# wide-form dataset (one line per column, index on the x axis).
sns.lineplot(data=df)
plt.ylabel('Number of Passengers')
plt.show()


#### Stationarity (do the statistical properties of the data, such as mean and variance, stay constant over time?)

Augmented Dickey-Fuller test:

Null hypothesis: the series has a unit root, i.e. it is not stationary.

In [None]:
# Rolling statistics over a window of 7 consecutive rows (months);
# the first 6 rows have no full window and therefore come out as NaN
seven_month_window = df.rolling(7)
rolling_mean = seven_month_window.mean()
rolling_std = seven_month_window.std()


In [None]:
# Overlay the original series with its rolling mean and rolling standard
# deviation; each entry is (data, line colour, legend label)
curves = [
    (df, 'blue', 'Original Passenger Data'),
    (rolling_mean, 'red', 'Rolling Mean Passenger Number'),
    (rolling_std, 'black', 'Rolling Standard Deviation in Passenger Number'),
]
for curve_data, curve_color, curve_label in curves:
    plt.plot(curve_data, color=curve_color, label=curve_label)

plt.title('Passenger Time Series, Rolling Mean, Standard Deviation')
plt.legend(loc='best')
plt.show()


In [None]:
# Dickey Fuller Test
# `adfuller` is already imported at the top of the file.
# The passenger column is selected explicitly so the test always receives a
# single one-dimensional series, regardless of which other columns `df` holds.
adft = adfuller(df['#Passengers'], autolag='AIC')

# The first five entries of the result are, in order: test statistic, p-value,
# number of lags used, number of observations used, and the dict of critical
# values keyed by significance level.
test_statistic, p_value, lags_used, n_observations, critical_values = adft[:5]

# Collect the results in a small table for printing
output_df = pd.DataFrame({
    'Values': [
        test_statistic,
        p_value,
        lags_used,
        n_observations,
        critical_values['1%'],
        critical_values['5%'],
        critical_values['10%'],
    ],
    'Metric': [
        'Test Statistics',
        'p-value',
        'No. of lags used',
        'Number of observations used',
        'critical value (1%)',
        'critical value (5%)',
        'critical value (10%)',
    ],
})
print(output_df)


In [None]:
# The data is not stationary: the p-value is above 5% and the test statistic is greater than the critical values, so the null hypothesis cannot be rejected

#### Autocorrelation (is there a correlation between the data and its past values?)

In [None]:
# One month lag
autocorrelation_lag1 = df['#Passengers'].autocorr(lag=1)
print('One Month Lag:', autocorrelation_lag1)

# Three month lag
autocorrelation_lag3 = df['#Passengers'].autocorr(lag=3)
print('Three Month Lag:', autocorrelation_lag3)

# Six month lag
autocorrelation_lag6 = df['#Passengers'].autocorr(lag=6)
print('Six Month Lag:', autocorrelation_lag6)

# Nine month lag
autocorrelation_lag9 = df['#Passengers'].autocorr(lag=9)
print('Nine Month Lag:', autocorrelation_lag9)


#### Decomposition (to visualize the trend, seasonal and residual components)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the passenger series into trend, seasonal and residual components
# using an additive model.
# The observations are monthly, so one full seasonal cycle is 12 periods;
# a period of 7 would correspond to a weekly cycle in daily data.
decompose = seasonal_decompose(df['#Passengers'], model='additive', period=12)
decompose.plot()
plt.show()


#### Forecasting

with ARIMA

In [None]:
# Chronological split: every month before August 1960 is used for training,
# August 1960 onwards is held out for testing.
split_date = pd.to_datetime('1960-08', format='%Y-%m')

# Expose the index as a regular column so rows can be filtered by date
df['Date'] = df.index

# Training data
# Selecting the passenger column and renaming it in one step yields an
# independent single-column frame; adding a column to a filtered frame and
# deleting the others afterwards triggers pandas' SettingWithCopyWarning.
train = df.loc[df['Date'] < split_date, ['#Passengers']].rename(
    columns={'#Passengers': 'train'}
)

# Test data
test = df.loc[df['Date'] >= split_date, ['#Passengers']].rename(
    columns={'#Passengers': 'test'}
)


In [None]:
# Plot train and test data together
# Apply the seaborn theme before anything is drawn so that it is already in
# effect when this figure is created.
sns.set()
plt.plot(train, color='black')
plt.plot(test, color='red')
plt.title('Train/Test split for Passenger Data')
plt.ylabel('Passenger Number')
plt.xlabel('Year-Month')
plt.show()


In [None]:
# Train the model
from pmdarima.arima import auto_arima

# auto_arima searches over candidate ARIMA orders (trace=True prints each
# candidate) and returns the best model already fitted to `train`, so no
# separate fit() call is needed afterwards.
# NOTE(review): no seasonal period `m` is passed; since the data is monthly,
# m=12 may give a better model — confirm before changing.
model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)


In [None]:
# Forecast
# Predict one value for every row of the held-out test set
forecast = model.predict(n_periods=len(test))

# Attach the predictions to the test dates by position. The raw values are
# extracted first: if predict() returns a labelled Series, passing it to the
# DataFrame constructor together with index=test.index would align on the
# labels instead and could leave the 'Prediction' column filled with NaN.
forecast = pd.DataFrame(
    {'Prediction': pd.Series(forecast).to_numpy()},
    index=test.index,
)


In [None]:
# Root mean squared error between the held-out values and the predictions:
# first the mean squared error, then its square root
mse = mean_squared_error(test, forecast)
rms = sqrt(mse)
print('RMSE:', rms)


In [None]:
# Plot the results
# Apply the seaborn theme before anything is drawn so that it is already in
# effect when this figure is created.
sns.set()

# Every line gets a label so the legend identifies all three series
plt.plot(train, color='black', label='training values')
plt.plot(test, color='red', label='true values')
plt.plot(forecast, color='limegreen', label='predicted values')
plt.title('Train/Test split for Passenger Data')
plt.ylabel('Passenger Number')
plt.xlabel('Year-Month')
plt.legend()
plt.show()
