In [None]:
# Enable IPython's autoreload extension in mode 2 so edited project modules
# are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the parent of the current working directory on sys.path so the
# project packages imported below can be resolved.
sys.path.append(os.path.dirname(os.getcwd()))

# NOTE(review): star imports. Names used later without an explicit import in
# this notebook (pd, plt, sns, join, PATH, generate_full_timestamp, ...)
# presumably come from these two modules — confirm.
from common import *
from ydj.analysis_tools import *

# 1. Load dataset

In [None]:
# Read the raw training table and show its row count plus the first rows.
data = pd.read_csv(join(PATH.input, 'train_data.csv'))
print(data.shape[0])
data.head()

# 2. Preprocessing

## 1) Generate `Time`

In [None]:
%%time
data = generate_full_timestamp(data)
data = impute_data(data)
data.head()

## 2) Feature engineering

In [None]:
%%time
# Run the project's preprocess() on `data` and keep the result under a
# separate name, so `data` itself is not rebound by this cell.
data_proc = preprocess(data)
data_proc

In [None]:
# Drop the helper columns and move the target `Patv` to the last position.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# helper columns are already gone and Index.drop would otherwise raise
# KeyError. A missing `Patv` still fails loudly in the column selection.
data_proc = data_proc[
    list(data_proc.columns.drop(['Patv', 'Tmstamp', 'X', 'Y', 'Etmp_abs'], errors='ignore'))
    + ['Patv']
]
data_proc

# 3. Get correlation

In [None]:
# Rows of turbine 1 only, without the (now constant) TurbID column.
# drop() returns a new frame here; the previous inplace drop on a filtered
# slice of data_proc triggered pandas' SettingWithCopyWarning.
d = data_proc[data_proc['TurbID'] == 1].drop(columns=['TurbID'])
d

In [None]:
# NOTE(review): star import — presumably where plot_corr (used in the
# following cells) comes from; confirm.
from analysis_tools.eda import *
# Use matplotlib's 'ggplot' style sheet for the figures that follow.
plt.style.use('ggplot')

In [None]:
# Pairwise column correlations of the turbine-1 subset `d`, drawn with plot_corr.
plot_corr(d.corr(), figsize=(15, 8))

In [None]:
# Same correlation plot over all rows of data_proc (every TurbID, with the
# TurbID column itself included).
plot_corr(data_proc.corr(), figsize=(15, 8))

In [None]:
# Pairwise scatter plots over the first 288 rows of `d`.
# NOTE(review): 288 = 2 * 144; 144 is presumably one day of samples — confirm.
sns.pairplot(d.iloc[:288])

In [None]:
%%time
# Pairwise scatter plots over rows 0..143 of `d` (first half of the 288-row window).
sns.pairplot(d.iloc[:144])

In [None]:
%%time
# Pairwise scatter plots over rows 144..287 of `d` (second half of the 288-row window).
sns.pairplot(d.iloc[144:288])

# 4. Time series analysis

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
# from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

# Use matplotlib's 'ggplot' style sheet for the time-series figures below.
plt.style.use('ggplot')

In [None]:
%%time
data = pd.read_csv(join(PATH.input, 'train_data.csv'))
data = generate_full_timestamp(data)
data = impute_data(data)
data = preprocess(data)

In [None]:
%%time
data_tid = data[data['TurbID'] == 1]
data_tid['T'] = data_tid.apply(lambda row: pd.to_timedelta(f"{row['Day']} days {row['Tmstamp']}:00"), axis='columns')
data_tid

In [None]:
# Keep only the identifier, day, and the two measured series.
data = data.loc[:, ['TurbID', 'Day', 'Patv', 'Wspd']]
data

In [None]:
# Turbine 1, days up to and including day 10.
d = data_tid[(data_tid['TurbID'] == 1) & (data_tid['Day'] <= 10)]
d

In [None]:
# Re-apply the 'ggplot' style sheet (same call as in earlier cells).
plt.style.use('ggplot')

In [None]:
# Line plot of Patv against the timedelta column T for the selected rows.
ax = d.set_index('T')['Patv'].plot(figsize=(20, 10))
# Rotate/align the x tick labels of the current figure.
plt.gcf().autofmt_xdate()

In [None]:
%%time
from statsmodels.tsa.seasonal import seasonal_decompose

decomp = seasonal_decompose(d['Patv'], period=1)
fig = decomp.plot()
fig.set_size_inches((30, 10))
fig.tight_layout()

In [None]:
%%time
from statsmodels.tsa.seasonal import seasonal_decompose

decomp = seasonal_decompose(d['Patv'], period=144)
fig = decomp.plot()
fig.set_size_inches((30, 10))
fig.tight_layout()

In [None]:
# Chronological split: shuffle=False keeps row order, so the last 144 rows of
# `d` become test_data and everything before them train_data.
train_data, test_data = train_test_split(d, test_size=144, shuffle=False)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, month_plot, quarter_plot

In [None]:
%%time
# Autocorrelation (top axis) and partial autocorrelation (bottom axis) of the
# training Patv series, limited to the first 30 lags.
fig, ax = plt.subplots(2, figsize=(30, 10))
plot_acf(train_data['Patv'], ax=ax[0], lags=30)
plot_pacf(train_data['Patv'], ax=ax[1], lags=30)

In [None]:
# First difference of the training Patv series; the leading NaN produced by
# diff() is removed.
diff_train_data = train_data['Patv'].diff().dropna()

In [None]:
# Raw training Patv (top) against its first difference (bottom).
fig, axes = plt.subplots(2, 1, figsize=(40, 10))
axes[0].plot(train_data['Patv'])
axes[1].plot(diff_train_data);

In [None]:
%%time
# ACF (top) and PACF (bottom) of the differenced training series, first 30 lags.
fig, ax = plt.subplots(2, figsize=(30, 10))
plot_acf(diff_train_data, ax=ax[0], lags=30)
plot_pacf(diff_train_data, ax=ax[1], lags=30)

In [None]:
# auto_arima_model = auto_arima(tmp, start_p=1, start_q=1, max_p=2, max_q=2,
#                               start_P=1, start_Q=1, max_P=2, max_Q=2, m=144, seasonal=True,
#                               d=1, D=1,
#                               trace=True,
#                               error_action='ignore',
#                               suppress_warnings=True,
#                               stepwise=False, n_jobs=-1)

In [None]:
%%time
from itertools import product

orders = [(0, 1, 0)]  # (p, d, q)
seasonal_orders = [(0, 1, 0, 144), (0, 1, 1, 144), (1, 1, 0, 144), (1, 1, 1, 144)]  # (P, D, Q, S)

for order, seasonal_order in product(orders, seasonal_orders):
    model = SARIMAX(train_data['Patv'], order=order, seasonal_order=seasonal_order)  # order=(p, d, q)
    model_fit = model.fit()
    print(order, seasonal_order, ':', model_fit.aic)

In [None]:
# Fit one SARIMAX specification on the training Patv series.
# NOTE(review): (0, 1, 0) x (0, 1, 1, 144) is presumably the pick from the AIC
# comparison in the preceding cell — confirm.
model = SARIMAX(train_data['Patv'], order=(0, 1, 0), seasonal_order=(0, 1, 1, 144))  # order=(p, d, q)
model_fit = model.fit()

In [None]:
# Out-of-sample forecast of 144*6 = 864 steps past the end of the training series.
pred_uc = model_fit.get_forecast(steps=144*6)
# Confidence interval of the forecast (not used by the visible plotting cells).
pred_ci = pred_uc.conf_int()
# Point forecast.
pred    = pred_uc.predicted_mean

In [None]:
# Turbine 1, days up to and including day 10 (same selection as earlier in this section).
d = data_tid[(data_tid['TurbID'] == 1) & (data_tid['Day'] <= 10)]

In [None]:
# Observed Patv against the SARIMAX forecast, on a shared integer sample axis.
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(range(len(d)), d['Patv'], label='target')
# The forecast starts right after the last training sample. Its x positions
# are derived from len(pred): the forecast horizon is not tied to the size of
# the hold-out set, and ax.plot raises ValueError when x and y lengths differ
# (the previous range(len(train_data), len(d)) only had as many points as
# the hold-out set).
ax.plot(range(len(train_data), len(train_data) + len(pred)), pred, label='pred')
plt.legend();

---

In [None]:
# Chronological split for the Wspd model: the last 144 rows of `d` are held out.
train_data, test_data = train_test_split(d, test_size=144, shuffle=False)

In [None]:
%%time

# Same SARIMAX specification as used for Patv, fitted on the training Wspd series.
model = SARIMAX(train_data['Wspd'], order=(0, 1, 0), seasonal_order=(0, 1, 1, 144))  # order=(p, d, q)
model_fit = model.fit()

In [None]:
# Out-of-sample Wspd forecast of 144*6 = 864 steps past the end of the training series.
pred_uc = model_fit.get_forecast(steps=144*6)
# Confidence interval of the forecast (not used by the visible plotting cells).
pred_ci = pred_uc.conf_int()
# Point forecast.
pred    = pred_uc.predicted_mean

In [None]:
# Observed Wspd against the SARIMAX forecast, on a shared integer sample axis.
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(range(len(d)), d['Wspd'], label='target')
# Forecast x positions start after the last training sample and are derived
# from len(pred); range(len(train_data), len(d)) only had as many points as
# the hold-out set, and ax.plot raises ValueError on mismatched x/y lengths.
ax.plot(range(len(train_data), len(train_data) + len(pred)), pred, label='pred')
plt.legend();

In [None]:
# Chronological split, last 144 rows of `d` held out.
# NOTE(review): identical to the split a few cells above; `d` is not rebound
# between the two, so this looks like a leftover duplicate — confirm.
train_data, test_data = train_test_split(d, test_size=144, shuffle=False)

In [None]:
%%time

# SARIMAX (0, 1, 0) x (0, 1, 1, 144) fitted on the training Wspd series.
# NOTE(review): repeats the earlier Wspd fit verbatim — confirm whether a
# different column or order was intended.
model = SARIMAX(train_data['Wspd'], order=(0, 1, 0), seasonal_order=(0, 1, 1, 144))  # order=(p, d, q)
model_fit = model.fit()

In [None]:
# Out-of-sample forecast of 144*6 = 864 steps from the model fitted just above.
pred_uc = model_fit.get_forecast(steps=144*6)
# Confidence interval of the forecast (not used by the visible plotting cells).
pred_ci = pred_uc.conf_int()
# Point forecast.
pred    = pred_uc.predicted_mean

In [None]:
# Observed Wspd against the SARIMAX forecast, on a shared integer sample axis.
fig, ax = plt.subplots(figsize=(20, 10))

ax.plot(range(len(d)), d['Wspd'], label='target')
# Forecast x positions start after the last training sample and are derived
# from len(pred); range(len(train_data), len(d)) only had as many points as
# the hold-out set, and ax.plot raises ValueError on mismatched x/y lengths.
ax.plot(range(len(train_data), len(train_data) + len(pred)), pred, label='pred')
plt.legend();

---

In [None]:
%%time

# Seasonal plot: rows with Day <= 5, one Patv line per Day, with Tmstamp on
# the x axis.
tmp = d[d['Day'] <= 5]
fig, ax = plt.subplots(figsize=(40, 10))
sns.lineplot(data=tmp, x='Tmstamp', y='Patv', hue='Day', legend='full', ax=ax)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.02, 1), loc=2)
plt.xticks(rotation=45)
plt.title('Seasonal Plot')
In [None]:
import plotly.express as px

# Polar version of the seasonal plot for rows with Day <= 2: Tmstamp is the
# angle, Patv the radius, one closed line per Day.
tmp = d[d['Day'] <= 2]
fig = px.line_polar(tmp, r='Patv', theta='Tmstamp', 
                    color='Day', line_close=True, 
                    title='Polar seasonal plot',
                    width=1000, height=1000)
fig.show()

In [None]:
%%time
from statsmodels.tsa.seasonal import seasonal_decompose

# Decomposition with a 144-sample period, restricted to rows with Day <= 3.
tmp = d[d['Day'] <= 3]
decomp = seasonal_decompose(tmp['Patv'], period=144)
fig = decomp.plot()
fig.set_size_inches((30, 10))
fig.tight_layout()

In [None]:
%%time

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, month_plot, quarter_plot

# ACF (top) and PACF (bottom) of Patv for rows with Day <= 5, using the
# library's default number of lags.
tmp = d[d['Day'] <= 5]
fig, ax = plt.subplots(2, figsize=(30, 10))
plot_acf(tmp['Patv'], ax=ax[0])
plot_pacf(tmp['Patv'], ax=ax[1])