<a href="https://colab.research.google.com/github/dennistay1981/Resources/blob/main/Miscellaneous/EDA%20TSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [None]:
!pip install pmdarima
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

Import, scale, resample, and plot data

In [None]:
# Load the recording; the 'Time' column becomes the index.
data = pd.read_csv('test.csv', index_col='Time')
# Optional: parse the index as timestamps (required before resampling below).
#data.index = pd.to_datetime(data.index)

# Scale the 'SCL' column with MinMaxScaler; the scaler is fed a 2-D column
# vector, hence the reshape.
scaler = MinMaxScaler()
scl_column = data['SCL'].values.reshape(-1, 1)
data['scaled'] = scaler.fit_transform(scl_column)

# Optional: average the scaled signal per second instead of using it as-is.
#series = data['scaled'].resample('s').mean()
series = data['scaled']

# Plot the series that the remaining cells analyse.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(series)
ax.set_title('Resampled Data (Seconds)')
plt.show()


Compute and plot (P)ACF

In [None]:
# Numeric (P)ACF values up to lag 20, kept for inspection alongside the plots.
lag_acf = acf(series, nlags=20)
lag_pacf = pacf(series, nlags=20, method='ols')

# Side-by-side correlograms: ACF on the left, PACF on the right.
fig, (ax_acf, ax_pacf) = plt.subplots(1, 2, figsize=(12, 6))
plot_acf(series, lags=20, ax=ax_acf)
plot_pacf(series, lags=20, method='ols', ax=ax_pacf)
# Use a figure-level title: plt.title() would only retitle the current (PACF)
# panel and replace the per-panel title, leaving the figure itself unlabelled.
fig.suptitle('(P)ACF for XXX condition (XXX.csv)')
plt.show()

Auto fit best ARIMA model

In [None]:
# Stepwise auto_arima search over p and q from 0 up to 3. d and D are left as
# None so the search chooses them (test='adf' is the test passed for d).
# NOTE(review): seasonal=True is set but no seasonal period `m` is passed;
# confirm against the pmdarima docs whether the default makes this a
# non-seasonal search.
model = auto_arima(
    series,
    start_p=0,
    max_p=3,
    start_q=0,
    max_q=3,
    d=None,
    test='adf',
    seasonal=True,
    start_P=0,
    D=None,
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

# Report the selected model.
print(model.summary())

Plot data vs. predictions and evaluate model

In [None]:
# In-sample (fitted) values from the model selected above.
predictions = model.predict_in_sample()

# Both curves share one positional axis 0..n-1.
positions = np.arange(len(series))

plt.plot(positions, series, label="actual", color='dodgerblue')
plt.plot(positions, predictions, label="predictions", color='orange')
# Dashed horizontal lines mark the mean of each curve in the matching colour.
for values, colour, mean_label in (
    (series, 'dodgerblue', 'actual mean'),
    (predictions, 'orange', 'predictions mean'),
):
    plt.axhline(y=values.mean(), color=colour, linestyle='--', label=mean_label)
plt.legend()
plt.title('Actual vs. predicted for XXX condition (XXX.csv)')
plt.show()


# Goodness-of-fit summary for the in-sample predictions.
residuals = series - predictions
print("RMSE:", np.sqrt(np.mean(residuals ** 2)))
print("MAPE:", mean_absolute_percentage_error(series, predictions))
print("R2:", r2_score(series, predictions))
print("AIC:", model.aic())

If the first in-sample prediction is zero, remove the first value of both the series and the predictions, then replot and re-evaluate

In [None]:
# Drop the leading observation only when its in-sample prediction is zero, as
# the cell heading describes. The check also stops a re-run of this cell from
# trimming data whose first prediction is non-zero.
first_prediction = np.asarray(predictions, dtype=float)[:1]
if first_prediction.size and np.isclose(first_prediction[0], 0.0):
    # .iloc keeps the slice positional; plain series[1:] is label-based when
    # the index has a float dtype.
    if isinstance(predictions, pd.Series):
        predictions = predictions.iloc[1:]
    else:
        predictions = predictions[1:]
    series = series.iloc[1:]


# Both curves share one positional axis 0..n-1.
x1 = np.arange(0, len(series))
x3 = np.arange(0, len(series))

plt.plot(x3, series, label="actual", color='dodgerblue')
plt.plot(x1, predictions, label="predictions", color='orange')
plt.axhline(y=series.mean(), color='dodgerblue', linestyle='--', label='actual mean')
plt.axhline(y=predictions.mean(), color='orange', linestyle='--', label='predictions mean')
plt.legend()
plt.title('Actual vs. predicted for XXX condition (XXX.csv)')
plt.show()


# Compare values by position so differing index labels cannot turn the
# residuals into NaN through pandas alignment.
actual_values = np.asarray(series, dtype=float)
predicted_values = np.asarray(predictions, dtype=float)
print("RMSE:", np.sqrt(np.mean((actual_values - predicted_values) ** 2)))
print("MAPE:", mean_absolute_percentage_error(series, predictions))
print("R2:", r2_score(series, predictions))
print("AIC:", model.aic())

Train-test approach to diagnose overfitting

In [None]:
# Chronological 80/20 split. The split point is computed once, and .iloc keeps
# both slices positional: plain series[:k] is label-based when the index has a
# float dtype, which would split on 'Time' values rather than on row count.
split_at = int(len(series) * 0.8)
series_train = series.iloc[:split_at]
series_test = series.iloc[split_at:]


# Refit on the training portion only, with the same search settings as the
# full-series fit.
model = auto_arima(series_train, start_p=0, start_q=0, test='adf', max_p=3, max_q=3, d=None, seasonal=True,
                   start_P=0, D=None, trace=True, error_action='ignore', suppress_warnings=True,
                   stepwise=True)


print(model.summary())

In [None]:
# Predict on the train set (in-sample) and forecast the length of the test set.
predictions_train = model.predict_in_sample()
predictions_test = model.predict(n_periods=len(series_test))

# Positional copies for the error metrics. Subtracting pandas objects aligns
# on index labels, and out-of-sample forecasts may carry a different index
# from series_test, which would yield NaN residuals.
train_actual = np.asarray(series_train, dtype=float)
train_predicted = np.asarray(predictions_train, dtype=float)
test_actual = np.asarray(series_test, dtype=float)
test_predicted = np.asarray(predictions_test, dtype=float)


# Plotting and evaluation for train set
plt.figure(figsize=(12, 6))
plt.plot(series_train, label="Actual Train")
plt.plot(predictions_train, label="Predicted Train")
plt.legend()
plt.title("Actual vs. Predicted (Train)")
plt.show()

rmse_train = np.sqrt(np.mean((train_actual - train_predicted) ** 2))
mape_train = mean_absolute_percentage_error(train_actual, train_predicted)
print("Train RMSE:", rmse_train)
print("Train MAPE:", mape_train)


# Plotting and evaluation for test set
plt.scatter(test_actual, test_predicted)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs. Predicted (Test)")
plt.show()

# Optional alternative: line plot of the test set instead of the scatter plot.
#plt.figure(figsize=(12, 6))
#plt.plot(test_actual, label="Actual Test")
#plt.plot(test_predicted, label="Predicted Test")
#plt.legend()
#plt.title("Actual vs. Predicted (Test)")
#plt.show()

rmse_test = np.sqrt(np.mean((test_actual - test_predicted) ** 2))
mape_test = mean_absolute_percentage_error(test_actual, test_predicted)
print("Test RMSE:", rmse_test)
print("Test MAPE:", mape_test)

Compare with SARIMAX (optional)

In [None]:
import statsmodels.api as sm

# Fit a fixed-order SARIMAX model for comparison with the auto_arima result.
# seasonal_order = (P, D, Q, m); all zeros means no seasonal component.
model = sm.tsa.SARIMAX(series, order=(1, 1, 2), seasonal_order=(0, 0, 0, 0), trend='c').fit()
# print() is needed: a bare summary() call that is not the last expression of
# the cell displays nothing.
print(model.summary())


# In-sample predictions covering every position of the series.
predictions = model.predict(start=0, end=len(series) - 1)

# Both curves share one positional axis 0..n-1.
x1 = np.arange(0, len(series))
x3 = np.arange(0, len(series))

plt.plot(x3, series, label="actual", color='dodgerblue')
plt.plot(x1, predictions, label="predictions", color='orange')
plt.legend()
plt.title('Actual vs. predicted for XXX condition (XXX.csv)')
plt.show()


# Compare values by position so differing index labels cannot turn the
# residuals into NaN through pandas alignment.
actual_values = np.asarray(series, dtype=float)
predicted_values = np.asarray(predictions, dtype=float)
print("RMSE:", np.sqrt(np.mean((actual_values - predicted_values) ** 2)))
print("MAPE:", mean_absolute_percentage_error(actual_values, predicted_values))
print("R2:", r2_score(actual_values, predicted_values))
# On statsmodels results objects `aic` is an attribute, not a method (unlike
# pmdarima's model.aic()); calling it raises TypeError.
print("AIC:", model.aic)