In [2]:
import pandas as pd
from openbb_terminal.sdk import openbb
import pytimetk as ptk
import plotly.graph_objects as go

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score


In [3]:
# Run configuration: the ticker and the date boundaries used by the cells below.
SYMBOL = 'SPY'  # ticker passed to the data loader

# First and last dates requested from the data loader.
START, END = '2015-12-01', '2023-12-01'

# Rows dated on or before this boundary form the training split.
TRAIN_UNTIL = '2019-12-01'

In [5]:
# Load the price history for SYMBOL over the configured date range.
dat = openbb.stocks.load(
    SYMBOL,
    start_date=START,
    end_date=END,
)

In [7]:
# Move the index into a regular column and keep only the date and OHLC columns.
ohlc_columns = ['date', 'Open', 'High', 'Low', 'Close']
df = dat.reset_index()[ohlc_columns]
df

Unnamed: 0,date,Open,High,Low,Close
0,2015-12-01,181.553651,182.749912,181.267588,182.628540
1,2015-12-02,182.576520,182.905928,180.504743,180.764801
2,2015-12-03,181.024848,181.302234,177.488088,178.233582
3,2015-12-04,178.233551,182.013029,178.233551,181.709625
4,2015-12-07,181.371540,181.804966,179.611831,180.608719
...,...,...,...,...,...
2010,2023-11-27,454.649994,455.489990,454.079987,454.480011
2011,2023-11-28,454.079987,456.269989,453.500000,454.929993
2012,2023-11-29,457.149994,458.320007,454.200012,454.609985
2013,2023-11-30,455.480011,456.760010,453.339996,456.399994


In [8]:
# Plot the closing price. The title is derived from SYMBOL (as in the later
# prediction plot) so the chart stays correctly labelled if the ticker changes.
df.plot_timeseries(
    date_column="date",
    value_column="Close",
    title=f"{SYMBOL} Close",
    x_lab="Date",
    y_lab="Close",
)

In [9]:
# Feature Engineering
#
# Every feature is a ratio of the current close to a reference level, minus 1,
# and is stored in a column prefixed with "feat_" so it can be selected later.

close = df['Close']
ma_windows = [10, 20, 30, 50, 100]
lookback_windows = [6, 10, 15, 20, 30, 50, 100]

# Distance from Moving Averages: close relative to its rolling mean
for window in ma_windows:
    rolling_mean = close.rolling(window).mean()
    df[f'feat_dist_from_ma_{window}'] = close / rolling_mean - 1

# Distance from n day max/min: close relative to the rolling High max / Low min
for window in lookback_windows:
    rolling_high = df['High'].rolling(window).max()
    rolling_low = df['Low'].rolling(window).min()
    df[f'feat_dist_from_max_{window}'] = close / rolling_high - 1
    df[f'feat_dist_from_min_{window}'] = close / rolling_low - 1

# Price Distance: close relative to the close `window` rows earlier
for window in lookback_windows:
    df[f'feat_price_dist_{window}'] = close / close.shift(window) - 1

df.glimpse()

<class 'pandas.core.frame.DataFrame'>: 2015 rows of 31 columns
date:                    datetime64[ns]    [Timestamp('2015-12-01 00:00: ...
Open:                    float64           [181.55365104134404, 182.5765 ...
High:                    float64           [182.7499121280941, 182.90592 ...
Low:                     float64           [181.26758803246364, 180.5047 ...
Close:                   float64           [182.6285400390625, 180.76480 ...
feat_dist_from_ma_10:    float64           [nan, nan, nan, nan, nan, nan ...
feat_dist_from_ma_20:    float64           [nan, nan, nan, nan, nan, nan ...
feat_dist_from_ma_30:    float64           [nan, nan, nan, nan, nan, nan ...
feat_dist_from_ma_50:    float64           [nan, nan, nan, nan, nan, nan ...
feat_dist_from_ma_100:   float64           [nan, nan, nan, nan, nan, nan ...
feat_dist_from_max_6:    float64           [nan, nan, nan, nan, nan, -0. ...
feat_dist_from_min_6:    float64           [nan, nan, nan, nan, nan, 0.0 ...
feat_dist_fro

In [11]:
# Inspect the most recent rows, now including the engineered feature columns.
df.tail(5)

Unnamed: 0,date,Open,High,Low,Close,feat_dist_from_ma_10,feat_dist_from_ma_20,feat_dist_from_ma_30,feat_dist_from_ma_50,feat_dist_from_ma_100,...,feat_dist_from_min_50,feat_dist_from_max_100,feat_dist_from_min_100,feat_price_dist_6,feat_price_dist_10,feat_price_dist_15,feat_price_dist_20,feat_price_dist_30,feat_price_dist_50,feat_price_dist_100
2010,2023-11-27,454.649994,455.48999,454.079987,454.480011,0.007281,0.030781,0.044547,0.048359,0.033707,...,0.110628,-0.007306,0.110628,0.00944,0.031479,0.045527,0.106652,0.053256,0.025058,0.037354
2011,2023-11-28,454.079987,456.269989,453.5,454.929993,0.004995,0.027219,0.04407,0.04885,0.034309,...,0.111727,-0.006324,0.111727,0.009184,0.033486,0.04416,0.094661,0.043322,0.025472,0.041009
2012,2023-11-29,457.149994,458.320007,454.200012,454.609985,0.002985,0.022294,0.041854,0.047537,0.033194,...,0.110945,-0.008095,0.110945,0.00077,0.013104,0.040464,0.087064,0.042636,0.02688,0.037651
2013,2023-11-30,455.480011,456.76001,453.339996,456.399994,0.005444,0.02244,0.043868,0.050802,0.036897,...,0.11532,-0.004189,0.11532,0.006905,0.014944,0.043796,0.079828,0.060877,0.040489,0.035144
2014,2023-12-01,455.769989,459.649994,455.160004,459.100006,0.009419,0.025234,0.047435,0.055671,0.042684,...,0.121918,-0.001197,0.121918,0.008967,0.019701,0.058224,0.065791,0.076613,0.064234,0.032957


In [13]:
# Target Variable (Predict price above 20SMA in 5 days)

TARGET_MA_WINDOW = 20  # length of the moving average the close is compared with
TARGET_HORIZON = 5     # how many rows ahead the label looks

df['target_ma'] = df['Close'].rolling(TARGET_MA_WINDOW).mean()
df['price_above_ma'] = df['Close'] > df['target_ma']

# A comparison against NaN evaluates to False, so rows where the moving average
# is not yet defined would silently be labelled 0. Mark them as missing instead,
# so that a later dropna() removes them rather than training on a bogus label.
# The shift leaves the last TARGET_HORIZON rows without a label (NaN) as before.
df['target'] = (
    df['price_above_ma']
    .astype(int)
    .where(df['target_ma'].notna())
    .shift(-TARGET_HORIZON)
)

df.tail()

Unnamed: 0,date,Open,High,Low,Close,feat_dist_from_ma_10,feat_dist_from_ma_20,feat_dist_from_ma_30,feat_dist_from_ma_50,feat_dist_from_ma_100,...,feat_price_dist_6,feat_price_dist_10,feat_price_dist_15,feat_price_dist_20,feat_price_dist_30,feat_price_dist_50,feat_price_dist_100,target_ma,price_above_ma,target
2010,2023-11-27,454.649994,455.48999,454.079987,454.480011,0.007281,0.030781,0.044547,0.048359,0.033707,...,0.00944,0.031479,0.045527,0.106652,0.053256,0.025058,0.037354,440.908501,True,
2011,2023-11-28,454.079987,456.269989,453.5,454.929993,0.004995,0.027219,0.04407,0.04885,0.034309,...,0.009184,0.033486,0.04416,0.094661,0.043322,0.025472,0.041009,442.8755,True,
2012,2023-11-29,457.149994,458.320007,454.200012,454.609985,0.002985,0.022294,0.041854,0.047537,0.033194,...,0.00077,0.013104,0.040464,0.087064,0.042636,0.02688,0.037651,444.695999,True,
2013,2023-11-30,455.480011,456.76001,453.339996,456.399994,0.005444,0.02244,0.043868,0.050802,0.036897,...,0.006905,0.014944,0.043796,0.079828,0.060877,0.040489,0.035144,446.382999,True,
2014,2023-12-01,455.769989,459.649994,455.160004,459.100006,0.009419,0.025234,0.047435,0.055671,0.042684,...,0.008967,0.019701,0.058224,0.065791,0.076613,0.064234,0.032957,447.799998,True,


In [14]:
# Clean and Train Test Split

# Drop every row that has a missing value in any column.
df = df.dropna()

# Feature columns are identified by the "feat" marker in their name.
feat_cols = [name for name in df.columns if 'feat' in name]
train_until = TRAIN_UNTIL

# Chronological split: rows on or before the cutoff train, later rows test.
# NOTE(review): `target` is built with a negative shift in the target cell, so
# the last few training labels depend on closes after the cutoff -- confirm
# whether those rows should be purged from the training split.
in_train = df['date'] <= train_until
in_test = df['date'] > train_until

x_train, y_train = df.loc[in_train, feat_cols], df.loc[in_train, 'target']
x_test, y_test = df.loc[in_test, feat_cols], df.loc[in_test, 'target']

In [15]:
# Train Model

# Shallow random forest with a fixed seed and balanced class weights.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
    class_weight='balanced'
)

clf.fit(x_train, y_train)

# Hard class predictions and positive-class probabilities for both splits.
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)
train_prob = clf.predict_proba(x_train)[:, 1]
test_prob = clf.predict_proba(x_test)[:, 1]

# (label, metric function, train scores, test scores); ROC AUC is computed
# from probabilities, the other two metrics from hard predictions.
metric_table = [
    ("Accuracy", accuracy_score, y_train_pred, y_test_pred),
    ("Precision", precision_score, y_train_pred, y_test_pred),
    ("ROC AUC", roc_auc_score, train_prob, test_prob),
]

for metric_name, metric_fn, train_scores, test_scores in metric_table:
    print(f"Train {metric_name}: {metric_fn(y_train, train_scores)}")
    print(f"Test {metric_name}: {metric_fn(y_test, test_scores)}")

Train Accuracy: 0.7883131201764058
Test Accuracy: 0.7387836490528414
Train Precision: 0.9122486288848263
Test Precision: 0.8445229681978799
Train ROC AUC: 0.8665523822988832
Test ROC AUC: 0.7980375963504799


In [16]:
# Visualize

# Test-period rows with the model's positive-class probability and the
# resulting prediction at a 0.5 threshold.
df_test = df[df['date'] > train_until].reset_index(drop=True)
df_test['pred_prob'] = clf.predict_proba(x_test)[:, 1]
df_test['pred'] = df_test['pred_prob'] > 0.5

fig = df_test.plot_timeseries(
    date_column="date",
    value_column="Close",
    title=f"{SYMBOL} Price with Predicted Patterns",
    x_lab="Date",
    y_lab="Close",
)

# Overlay the moving average used for the target. go.Line is a deprecated
# plotly graph object; a Scatter trace drawn in "lines" mode replaces it.
fig.add_trace(
    go.Scatter(
        x=df_test['date'],
        y=df_test['target_ma'],
        mode="lines",
        name="Target 20SMA",
    )
)

# Collapse consecutive positive predictions into (first date, last date) spans.
# The running count of negative predictions is constant within an unbroken run
# of positives, so it serves as the run identifier.
run_id = (~df_test['pred']).cumsum()
df_pattern = (
    df_test[df_test['pred']]
        .groupby(run_id)['date']
        .agg(['first', 'last'])
)

# Shade each predicted-positive span on the chart.
for span_start, span_end in zip(df_pattern['first'], df_pattern['last']):
    fig.add_vrect(
        x0=span_start,
        x1=span_end,
        line_width=0,
        fillcolor="green",
        opacity=0.2,
    )

fig.update_layout(
    width=800,
    height=600,
    xaxis_rangeslider_visible=True,
)

fig.show()
