In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels

## 1. Getting to know the data

In [None]:
# Data from https://datahub.io/core/global-temp

# Read the table and convert the 'Date' strings to timestamps so that rows
# can be compared against dates.
df = pd.read_csv('../data/monthly_csv.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Preview: the first ten rows dated before 1 June 1880.
cutoff = pd.Timestamp('1880-06-01')
df.loc[df['Date'] < cutoff].head(10)

In [None]:
# Reshape from long to wide: one row per 'Date', one column per 'Source',
# each cell holding that source's 'Mean' value.
temps = df.pivot(
    index='Date',
    columns='Source',
    values='Mean',
)
# Uncomment to inspect the reshaped frame:
#temps.info()
#temps.head()

In [None]:
from numpy import polyfit

def fit(X, y, degree=3):
    """Fit a least-squares polynomial to (X, y) and evaluate it back at X.

    Parameters
    ----------
    X : sequence of numbers
        Sample positions.
    y : sequence of numbers
        Observed values, one per entry of X.
    degree : int, default 3
        Degree of the fitted polynomial.

    Returns
    -------
    numpy.ndarray
        The fitted polynomial evaluated at every entry of X.
    """
    coefficients = np.polyfit(X, y, degree)
    return np.poly1d(coefficients)(X)

def get_season(s, yearly_periods=4, degree=3, samples_per_year=365):
    """Estimate a repeating seasonal component with a per-cycle polynomial fit.

    Each observation's integer position is wrapped into a cycle that is
    ``samples_per_year / yearly_periods`` samples long. A polynomial of the
    requested degree is then fitted (via ``fit``) to the values against that
    wrapped position, so every cycle shares the same fitted shape.

    Parameters
    ----------
    s : pandas.Series
        Series to model, e.g. a series with its trend already subtracted.
    yearly_periods : int or float, default 4
        Number of seasonal cycles per year. Previously this argument was
        accepted but ignored (the value 4 was hard-coded).
    degree : int, default 3
        Degree of the polynomial fitted over one cycle.
    samples_per_year : int or float, default 365
        Number of observations that make up one year. The default keeps the
        historical 365-based period; pass e.g. 12 for monthly observations.

    Returns
    -------
    pandas.Series
        Fitted seasonal values carrying the same index as ``s``.

    Raises
    ------
    ValueError
        If ``yearly_periods`` or ``samples_per_year`` is not positive.
    """
    if yearly_periods <= 0 or samples_per_year <= 0:
        raise ValueError(
            "yearly_periods and samples_per_year must both be positive"
        )
    # Length of one seasonal cycle, measured in samples.
    period = samples_per_year / yearly_periods
    X = [i % period for i in range(len(s))]
    seasonal = fit(X, s.values, degree)
    return pd.Series(data=seasonal, index=s.index)

def get_trends(s, degree=3):
    """Estimate the overall trend of a series with a polynomial fit.

    The values of ``s`` are fitted (via ``fit``) against their integer
    positions 0, 1, 2, ...; the index labels are not used as x-values.

    Parameters
    ----------
    s : pandas.Series
        Series whose trend is estimated.
    degree : int, default 3
        Degree of the fitted polynomial.

    Returns
    -------
    pandas.Series
        Fitted trend values carrying the same index as ``s``.
    """
    positions = [*range(len(s))]
    fitted = fit(positions, s.values, degree)
    return pd.Series(fitted, index=s.index)

# Decompose GCAG: fit the polynomial trend first, then model the seasonal
# component on what remains once that trend is subtracted.
gcag = temps['GCAG']
temps['trend'] = get_trends(gcag)
# NOTE(review): by default get_season wraps positions with a period of 365/4
# samples, while the input file is named monthly_csv.csv -- confirm that this
# period suits the sampling rate of the data.
temps['season'] = get_season(gcag - temps['trend'])

In [None]:
# Plot the GCAG series together with its fitted seasonal and trend components.
fig, ax = plt.subplots(figsize=(12, 6))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.lineplot(data=temps[['GCAG', 'season', 'trend']], ax=ax)
# Label the figure so it can be read on its own.
ax.set_title('GCAG global temperature: observed, seasonal fit and trend')
ax.set_xlabel('Date')
# NOTE(review): unit taken from the dataset description at
# https://datahub.io/core/global-temp -- confirm.
ax.set_ylabel('Mean temperature anomaly (°C)')
plt.show()

## 2. Autocorrelation
Autocorrelation is the correlation of a signal with a lagged version of itself. The autocorrelation plot draws the autocorrelation as a function of lag; this can help find repeating patterns, and is often used in signal processing.

In [None]:
# from https://github.com/owid/owid-datasets/tree/master/datasets/Air%20pollution%20by%20city%20-%20Fouquet%20and%20DPCC%20(2011)
# The first row of the file is skipped and the columns get these short names instead.
pollution_columns = ['City', 'Year', 'Smoke', 'SPM']
pollution = pd.read_csv(
    '../data/smoke_pollution.csv',
    skiprows=1,
    names=pollution_columns,
)
#pollution.head()


In [None]:
# Let's only use the SPM column for London in this example.
# NOTE: this rebinds `df`, which earlier held the temperature table; the
# adfuller cell further down reads this series through the same name.
is_london = pollution['City'] == 'London'
df = pollution.loc[is_london, 'SPM']

# autocorrelation_plot draws on the current axes, i.e. the ones created here.
fig, ax = plt.subplots(figsize=(12, 6))
pd.plotting.autocorrelation_plot(df)
plt.show()

We can see high autocorrelations with a lag of only a few years. There is a negative autocorrelation at a lag of around one hundred years, after which the autocorrelation stays around zero. This plot clearly shows that air pollution is not a stationary process, since the autocorrelation is not flat. Let's use the [Dickey-Fuller test](https://en.wikipedia.org/wiki/Dickey%E2%80%93Fuller_test) to check this more rigorously – for this we can use [`statsmodels.tsa.stattools.adfuller`](https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html). Have a look at [this notebook](https://www.statsmodels.org/dev/examples/notebooks/generated/stationarity_detrending_adf_kpss.html) if you are unsure how to interpret the test result.

In [None]:
from statsmodels.tsa import stattools

# Augmented Dickey-Fuller unit-root test on the London SPM series held in `df`.
# The displayed tuple starts with the test statistic followed by the p-value.
adf_result = stattools.adfuller(df)
adf_result