In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


In [3]:
# --- analysis parameters -------------------------------------------------
# Last calendar date of the "recent" window.
date_end = pd.Timestamp('2022-02-03')
# Length of the recent window, in months, counted back from date_end.
pred_months = 12
# Length of the training window, in years, ending the day before the recent window starts.
train_years = 10
# Column of the raw data that is used as the price series.
col_price = 'close'
# Width of the outlier band, in residual standard deviations.
n_std = 2

In [10]:
# Load the raw end-of-day file and turn the 'date' column into timestamps.
df0 = pd.read_csv('data/IVV_EOD.csv.gz')
df0['date'] = pd.to_datetime(df0['date'])
date_min, date_max = df0['date'].min(), df0['date'].max()
print(f"Loaded data contains {len(df0)} rows over {df0['date'].nunique()} dates from {date_min.date()} to {date_max.date()}")

# Recent window: ends at date_end (capped at the last loaded date) and starts
# pred_months earlier, plus one day so that both ends are inclusive.
# NOTE(review): the start is anchored on date_end, not on the capped
# date_recent_end, so if the data stops before date_end the recent window is
# shorter than pred_months -- confirm this is intended.
date_recent_end = min(date_end, date_max)
date_recent_start = date_end - pd.DateOffset(months=pred_months) + pd.DateOffset(days=1)

# Training window: the train_years years that end the day before the recent
# window starts, clipped so it never begins before the first loaded date.
date_train_end = date_recent_start - pd.DateOffset(days=1)
date_train_start = max(
    date_train_end - pd.DateOffset(years=train_years) + pd.DateOffset(days=1),
    date_min,
)

# Only the date and the chosen price column are kept; the latter is renamed to 'price'.
cols_select = ['date', col_price]
cols_map = {col_price: 'price'}

# Series.between is inclusive on both ends, i.e. start <= date <= end.
df_train0 = (
    df0.loc[df0['date'].between(date_train_start, date_train_end), cols_select]
    .rename(columns=cols_map)
    .reset_index(drop=True)
)
print(f"Train data contains {len(df_train0)} rows over {df_train0['date'].nunique()} dates from {df_train0['date'].min().date()} to {df_train0['date'].max().date()}")

df_recent0 = (
    df0.loc[df0['date'].between(date_recent_start, date_recent_end), cols_select]
    .rename(columns=cols_map)
    .reset_index(drop=True)
)
print(f"Recent data contains {len(df_recent0)} rows over {df_recent0['date'].nunique()} dates from {df_recent0['date'].min().date()} to {df_recent0['date'].max().date()}")



Loaded data contains 5463 rows over 5463 dates from 2000-05-19 to 2022-02-03
Train data contains 2516 rows over 2516 dates from 2011-02-04 to 2021-02-03
Recent data contains 253 rows over 253 dates from 2021-02-04 to 2022-02-03


In [46]:
### linear trend fit with outlier filtering ###
# A straight line is fitted to the price against a 0-based row index. Rows whose
# residual is more than n_std standard deviations away from the mean residual are
# flagged as outliers, and the line is then re-fitted without them.
# The price is used as-is: no log transform is applied in this cell.

df_train = df_train0.copy()

# 0-based row position inside the training window; the only regressor.
df_train['x'] = range(len(df_train))
x_train_max = df_train['x'].max()
x_train = np.array(df_train['x']).reshape(-1, 1)  # column vector, as expected by fit/predict

# first pass: fit on every training row
lm = LinearRegression().fit(x_train, df_train['price'])
df_train['trend'] = lm.predict(x_train)
# Computed as trend minus price; the band below is symmetric around the mean,
# so the sign convention does not change which rows are flagged.
df_train['residual'] = df_train['trend'] - df_train['price']

residual_std = df_train['residual'].std()
residual_mean = df_train['residual'].mean()
upper_bond = residual_mean + n_std * residual_std
lower_bond = residual_mean - n_std * residual_std

df_train['is_outlier'] = (df_train['residual'] > upper_bond) | (df_train['residual'] < lower_bond)
print(f"{df_train['is_outlier'].sum()} out of {len(df_train)} dates are identified as outliers.")

# re-train with outliers removed
index_select = ~df_train['is_outlier']
x_train_filter = np.array(df_train.loc[index_select, 'x']).reshape(-1, 1)
lm = LinearRegression().fit(x_train_filter, df_train.loc[index_select, 'price'])
# The trend column is overwritten for ALL training rows (outliers included)
# using the re-fitted line.
df_train['trend'] = lm.predict(x_train)

# Drop the helper column and tag the rows as training data without mutating in place.
df_train = df_train.drop(columns=['residual']).assign(is_recent=False)

df_train.tail(3)


130 out of 2516 dates are identified as outliers.


Unnamed: 0,date,price,x,trend,is_outlier,is_recent
2513,2021-02-01,377.600006,2513,327.708579,True,False
2514,2021-02-02,382.959991,2514,327.794178,True,False
2515,2021-02-03,383.26001,2515,327.879776,True,False


In [47]:
# Extend the fitted trend line over the recent window.
df_recent = df_recent0.copy()

# Continue the row index right after the last training index so that both
# windows share one x axis.
df_recent['x'] = x_train_max + 1 + np.arange(len(df_recent))
x_recent = df_recent[['x']].to_numpy()  # (n, 1) column vector for predict
df_recent['trend'] = lm.predict(x_recent)

# Recent rows are never flagged as outliers; is_recent tells them apart from training rows.
df_recent = df_recent.assign(is_outlier=False, is_recent=True)

df_recent.head(3)

Unnamed: 0,date,price,x,trend,is_outlier,is_recent
0,2021-02-04,387.600006,2516,327.965375,False,True
1,2021-02-05,389.119995,2517,328.050973,False,True
2,2021-02-08,392.049988,2518,328.136572,False,True


In [49]:
# Stack training rows on top of recent rows and renumber the index from 0.
df_model = pd.concat([df_train, df_recent], ignore_index=True)