In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:


# List the names of the example datasets that seaborn can load.
sns.get_dataset_names()

In [None]:


# Load the 'flights' example dataset and index it by a first-of-month timestamp
# assembled from its 'month' and 'year' columns.
df = sns.load_dataset('flights')
month_start_text = "01-" + df['month'].astype(str) + "-" + df['year'].astype(str)
df = df.assign(yearMonth=pd.to_datetime(month_start_text)).set_index('yearMonth')
df.head()

In [None]:


# Raw passenger counts over time.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df, x=df.index, y='passengers', ax=ax)

In [None]:


# 12-row rolling mean and standard deviation of the passenger counts.
passengers_rolling = df['passengers'].rolling(window=12)
df['rollMean'] = passengers_rolling.mean()
df['rollStd'] = passengers_rolling.std()

In [None]:


# Passenger counts together with their rolling mean and rolling std on one axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df, x=df.index, y='passengers', ax=ax)
sns.lineplot(data=df, x=df.index, y='rollMean', ax=ax)
sns.lineplot(data=df, x=df.index, y='rollStd', ax=ax)

In [None]:


from statsmodels.tsa.stattools import adfuller

In [None]:


# Augmented Dickey-Fuller test on the raw passenger counts; the lag length is
# selected automatically via AIC.
adfTest = adfuller(df['passengers'],autolag='AIC',)

In [None]:


# Show the raw result returned by adfuller.
adfTest

In [None]:


# Label the first four entries of the ADF result for readable display.
adf_labels = ['Test Statistic', 'p-value', '#lags used', 'number of observations used']
stats = pd.Series(data=adfTest[:4], index=adf_labels)
stats

In [None]:


# adfTest[4] is a mapping; print each of its entries on its own line.
for level, critical_value in adfTest[4].items():
    print('criticality', level, ":", critical_value)

In [None]:


def test_stationarity(dataFrame, var, window=12):
    """Print an Augmented Dickey-Fuller report for one column and plot it with its rolling stats.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame containing the column to examine. Its index is used as the
        x-axis of the plot. The frame is not modified.
    var : str
        Name of the column in ``dataFrame`` to test and plot.
    window : int, default 12
        Number of rows in the rolling window used for the rolling mean and
        rolling standard deviation.

    Returns
    -------
    None
        The ADF summary and critical values are printed, and three lines
        (the column, its rolling mean, its rolling std) are drawn on the
        current matplotlib axes.
    """
    from statsmodels.tsa.stattools import adfuller

    # Work on a copy: callers often pass a slice of another frame, and adding
    # the helper columns to it would mutate their data and trigger pandas'
    # SettingWithCopyWarning.
    frame = dataFrame.copy()
    series_rolling = frame[var].rolling(window=window)
    frame['rollMean'] = series_rolling.mean()
    frame['rollStd'] = series_rolling.std()

    # Shifted/differenced columns start with NaN; drop missing values for the
    # test itself because the ADF regression cannot be fit on them.
    adfTest = adfuller(frame[var].dropna(), autolag='AIC')
    stats = pd.Series(adfTest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used'])
    print(stats)

    for key, values in adfTest[4].items():
        print('criticality',key,":",values)

    # Labels let the three overlaid lines be told apart in the legend.
    sns.lineplot(data=frame, x=frame.index, y=var, label=var)
    sns.lineplot(data=frame, x=frame.index, y='rollMean', label='rollMean')
    sns.lineplot(data=frame, x=frame.index, y='rollStd', label='rollStd')

In [None]:


# Take an explicit copy: columns are added to air_df afterwards, and assigning
# into a bare selection of df triggers pandas' SettingWithCopyWarning.
air_df = df[['passengers']].copy()
air_df.head()

In [None]:


# time shift

# 'shift' holds the previous row's passenger count; 'shiftDiff' is the change
# from that previous row.
air_df['shift'] = air_df['passengers'].shift(1)
air_df['shiftDiff'] = air_df['passengers'].sub(air_df['shift'])
air_df.head()

In [None]:


# Stationarity report for the first-differenced series; rows containing NaN
# (introduced by the shift) are dropped before the call.
test_stationarity(air_df.dropna(),'shiftDiff')

In [None]:


# Explicit copy so that adding the 'log' column does not write into a
# selection of df (which triggers pandas' SettingWithCopyWarning).
log_df = df[['passengers']].copy()
# Natural log of the passenger counts.
log_df['log'] = np.log(log_df['passengers'])
log_df.head()

In [None]:


# Stationarity report for the log-transformed passenger counts.
test_stationarity(log_df,'log')

In [None]:


# Explicit copy so that adding the 'sqrt' column does not write into a
# selection of df (which triggers pandas' SettingWithCopyWarning).
sqrt_df = df[['passengers']].copy()
# Square root of the passenger counts, read from sqrt_df itself so the cell
# only depends on the frame it is building.
sqrt_df['sqrt'] = np.sqrt(sqrt_df['passengers'])
sqrt_df.head()

In [None]:


# Stationarity report for the square-root-transformed passenger counts.
test_stationarity(sqrt_df,'sqrt')

In [None]:


# Explicit copy so that adding the 'cbrt' column does not write into a
# selection of df (which triggers pandas' SettingWithCopyWarning).
cbrt_df = df[['passengers']].copy()
# Cube root of the passenger counts.
cbrt_df['cbrt'] = np.cbrt(cbrt_df['passengers'])
cbrt_df.head()

In [None]:


# Stationarity report for the cube-root-transformed passenger counts.
test_stationarity(cbrt_df,'cbrt')

In [None]:


# Explicit copy so that the new columns are not written into a selection of
# log_df (which triggers pandas' SettingWithCopyWarning).
log_df2 = log_df[['passengers','log']].copy()
# Square root of the already log-transformed series.
log_df2['log_sqrt'] = np.sqrt(log_df2['log'])
# NOTE: despite its name, this column is the first difference of
# sqrt(log(passengers)), not of the plain log series.
log_df2['logShiftDiff'] = log_df2['log_sqrt'].diff()
log_df2.head()

In [None]:


# Stationarity report for the differenced sqrt(log) series; rows containing NaN
# are dropped before the call.
test_stationarity(log_df2.dropna(),'logShiftDiff')

In [None]:


# Log-transform the passenger counts, then difference the log series against
# its previous row. `assign` returns a new frame, so df is left untouched.
log_shift = (
    df[['passengers']]
    .assign(log=lambda frame: np.log(frame['passengers']))
    .assign(logShift=lambda frame: frame['log'].shift())
    .assign(logShiftDiff=lambda frame: frame['log'] - frame['logShift'])
)
log_shift.head()

In [None]:


# Stationarity report for the differenced log series; rows containing NaN are
# dropped before the call.
test_stationarity(log_shift.dropna(),'logShiftDiff')


# # Next - 2

In [None]:


# Independent working frame with a lag-1 difference and a lag-12 difference of
# the passenger counts.
airP = df[['passengers']].copy()
airP['firstDiff'] = airP['passengers'].diff(periods=1)
airP['Diff12'] = airP['passengers'].diff(periods=12)

In [None]:


# Preview the first rows; the difference columns start with NaN.
airP.head()

In [None]:


from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:


# Partial autocorrelation of the first-differenced series up to lag 20
# (NaN rows dropped first; the trailing ';' suppresses the returned figure repr).
plot_pacf(airP['firstDiff'].dropna(),lags=20);

In [None]:


# Autocorrelation of the first-differenced series up to lag 20
# (NaN rows dropped first; the trailing ';' suppresses the returned figure repr).
plot_acf(airP['firstDiff'].dropna(),lags=20);

In [None]:


# Chosen ARIMA order (p, d, q) = (1, 1, 3): p = 1 AR lag, d = 1 (first difference), q = 3 MA lags

In [None]:


# Chronological split: the first 70% of rows form the training set, the
# remaining rows the test set.
split_at = round(len(airP) * 70 / 100)
train, test = airP.iloc[:split_at], airP.iloc[split_at:]
test.head()

In [None]:


# Fit ARIMA(1, 1, 3) on the training passengers, predict across the test
# period, and store the predictions on airP (aligned by index).
model = ARIMA(
    train['passengers'],
    order=(1, 1, 3),
)
model_fit = model.fit()
test_start, test_end = test.index[0], test.index[-1]
prediction = model_fit.predict(start=test_start, end=test_end)
airP['arimaPred'] = prediction
airP.tail()

In [None]:


# Actual passengers against the ARIMA predictions. airP is plotted as-is:
# 'arimaPred' only has values for the test period and is NaN elsewhere, so no
# dropna() is applied (it would also remove the training rows of 'passengers').
sns.lineplot(data=airP, x=airP.index, y='passengers', label='passengers')
sns.lineplot(data=airP, x=airP.index, y='arimaPred', label='ARIMA prediction')

In [None]:


from sklearn.metrics import mean_squared_error

In [None]:


# Root-mean-squared error of `prediction` on the test passengers.
# NOTE: `prediction` is whatever the most recently executed model cell assigned.
np.sqrt(mean_squared_error(test['passengers'],prediction))

In [None]:


from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:


# PACF and ACF of the lag-12 differenced series, up to lag 20.
seasonal_diff = airP['Diff12'].dropna()
plot_pacf(seasonal_diff, lags=20);
plot_acf(seasonal_diff, lags=20);

In [None]:


# Fit SARIMAX with order (1, 1, 3) and seasonal order (2, 1, 2, 12) on the
# training passengers, predict across the test period, and store the
# predictions on airP (aligned by index).
model = SARIMAX(
    train['passengers'],
    order=(1, 1, 3),
    seasonal_order=(2, 1, 2, 12),
)
model_fit = model.fit()
test_start, test_end = test.index[0], test.index[-1]
prediction = model_fit.predict(start=test_start, end=test_end)
airP['sarimaxPred'] = prediction

In [None]:


# Actual passengers against both models' test-period predictions. airP is
# plotted as-is: the prediction columns are NaN outside the test period, so no
# dropna() is applied (it would also remove the training rows of 'passengers').
sns.lineplot(data=airP, x=airP.index, y='passengers', label='passengers')
sns.lineplot(data=airP, x=airP.index, y='sarimaxPred', label='SARIMAX prediction')
sns.lineplot(data=airP, x=airP.index, y='arimaPred', label='ARIMA prediction')

In [None]:


# Root-mean-squared error of `prediction` on the test passengers.
# NOTE: `prediction` is whatever the most recently executed model cell assigned.
np.sqrt(mean_squared_error(test['passengers'],prediction))

In [None]:


# Empty frame indexed by month-start dates from 1961-01-01 through 1962-12-01;
# only its index is used, as the forecast horizon.
futureDate = pd.DataFrame(
    {'Dates': pd.date_range(start='1961-01-01', end='1962-12-01', freq='MS')}
).set_index('Dates')
futureDate.head()

In [None]:


# Predictions over the futureDate range from `model_fit`, i.e. whichever model
# was fitted most recently (fitted on `train` only in the cells above).
model_fit.predict(start=futureDate.index[0],end=futureDate.index[-1])

In [None]:


# Actual passengers, both models' test-period predictions, and the forecast
# over the futureDate range (black). airP is plotted as-is: the prediction
# columns are NaN outside the test period, so no dropna() is applied (it would
# also remove the training rows of 'passengers').
sns.lineplot(data=airP, x=airP.index, y='passengers', label='passengers')
sns.lineplot(data=airP, x=airP.index, y='sarimaxPred', label='SARIMAX prediction')
sns.lineplot(data=airP, x=airP.index, y='arimaPred', label='ARIMA prediction')
model_fit.predict(start=futureDate.index[0], end=futureDate.index[-1]).plot(
    color='black', label='forecast', legend=True
)


# # Next - 3

In [None]:


# Explicit copy so that adding 'diff1' does not write into a selection of df
# (which triggers pandas' SettingWithCopyWarning).
checkDf = df[['passengers']].copy()
# Difference the 'passengers' column itself. Differencing the whole frame and
# assigning the resulting DataFrame to one column only works while the frame
# has exactly one column.
checkDf['diff1'] = checkDf['passengers'].diff()
# NOTE: pandas has no `diffinv`, so the line below stays disabled.
# checkDf['diffInv'] = checkDf['diff1'].diffinv()
checkDf.head()