In [1]:
# Basics
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

# ML
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

# OOP
from pipeline import Pipeline

**Technical Analysis (TA)** <br>

&#9675; **Moving Average Differential/Spread (`50_20_ma_spread`)**  
  Difference between short-term and long-term moving averages; helps determine directional momentum.  
  If the short-term MA is above the long-term MA, momentum is **bullish**; otherwise it is **bearish**.

&#9675; **Exponentially Weighted Volatility (`MSFT_ewm`)**  
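  Measures recent volatility, weighting the most recent returns most heavily.  
  Captures regime shifts and fast-changing market risk more responsively than simple rolling volatility.  
  *Note: the prepared data contains negative `MSFT_ewm` values, which a volatility (standard deviation) cannot take; the column may actually be an exponentially weighted mean of log returns — confirm against the `pipeline` module.*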

&#9675; **Log Returns (`*_logreturns`)**  
  Continuously compounded returns, used to normalize price changes across assets and to preserve time-additivity:
  - `MSFT_logreturns` – Microsoft  
  - `SPY_logreturns` – S&P 500 ETF  
  - `^VIX_logreturns` – Volatility Index  
  - `^TNX_logreturns` – 10-Year Treasury Yield  
  - `XLK_logreturns` – Technology Select Sector ETF

**Supplementary Tickers** <br>

&#9675; **SPY** – ETF of the S&P 500; provides overall market direction.  
Features: `SPY_logreturns`, `SPY_volume`

&#9675; **^VIX** – CBOE Volatility Index; reflects the implied volatility of S&P 500 index (SPX) options and is often referred to as the "fear gauge".  
Feature: `^VIX_logreturns`

&#9675; **^TNX** – 10-Year Treasury Yield; rising yields signal inflationary pressure and impact equity risk premiums.  
Feature: `^TNX_logreturns`

&#9675; **XLK** – Technology sector ETF; contextualizes sector-specific behavior (especially for MSFT).  
Features: `XLK_logreturns`, `XLK_volume`

&#9675; **MSFT Volume (`MSFT_volume`)** – Total shares traded daily; proxy for liquidity and interest in MSFT.

&#9675; **SPY Volume (`SPY_volume`)** – Market-wide volume benchmark.

&#9675; **Target** – Binary prediction label (`0`/`1` in the prepared data; e.g., next-day movement).


In [6]:
# Build the modelling frame: instantiate the project's Pipeline over the chosen
# date window, then read its `prepare_data` attribute (accessed without a call)
# and display the result as the cell output.
date_window = dict(start='1999-01-01', end='2025-01-03')
pipeline = Pipeline(**date_window)
df = pipeline.prepare_data
df

Unnamed: 0_level_0,MSFT_logreturns,^VIX_logreturns,^TNX_logreturns,SPY_logreturns,XLK_logreturns,MSFT_ewm,50_20_ma_spread,MSFT_volume,SPY_volume,XLK_volume,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1999-03-16,0.019034,-0.003572,-0.006454,-0.003818,0.017019,0.007388,0.266955,68554000,4547500,758800,0.0
1999-03-17,-0.011527,0.016562,0.004893,-0.004312,-0.006511,0.005497,0.215713,44919600,4524100,425400,1.0
1999-03-18,0.031292,-0.057964,-0.004108,0.015959,0.013841,0.008076,0.090488,56231200,3506300,321400,0.0
1999-03-19,-0.007275,0.007843,0.009172,-0.017144,-0.024349,0.006541,-0.028211,91980400,5526700,636500,1.0
1999-03-22,0.009448,0.027577,0.008896,0.001926,0.000000,0.006832,-0.143937,55719200,4603800,773500,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.009330,-0.162028,-0.001741,0.011054,0.010280,0.000338,-12.136488,7164500,33160100,2326700,0.0
2024-12-26,-0.002781,0.031727,-0.002617,0.000067,0.000665,0.000026,-12.238154,8194200,41219100,3575400,0.0
2024-12-27,-0.017453,0.079573,0.008698,-0.010582,-0.013384,-0.001722,-12.310184,18117700,64969300,4363200,0.0
2024-12-30,-0.013328,0.087011,-0.016150,-0.011477,-0.012798,-0.002882,-12.200056,13158700,56578800,3975200,0.0


In [3]:
# NOTE: This entire cell is commented out, so nothing here executes.
# What the disabled code would do if re-enabled:
#   1. Split `df` by row position: the first 90% of rows (rounded up with
#      np.ceil) become the training set, the remaining rows the test set.
#      No shuffling is performed, so row order is preserved.
#   2. Separate the `Target` column (y) from all other columns (X) and convert
#      each part to a NumPy array.
#   3. Fit a StandardScaler on the training features only and apply that same
#      fitted transform to the test features.
# (`p`, the column count unpacked from `df.shape`, is not used in this cell.)
# # (X_tr, y_tr), (X_ts, y_ts)
# split = {'prop_tr':0.90,'prop_ts':0.10}
# assert abs(sum(split.values()) - 1.0) < 1e-8, "Split proportions must sum to 1.0"

# n,p = df.shape
# n_tr = int(np.ceil(n*split['prop_tr']))

# X = df.drop(columns='Target')
# y = df['Target']

# X_tr, y_tr = X.iloc[:n_tr].to_numpy(), y.iloc[:n_tr].to_numpy()
# X_ts, y_ts = X.iloc[n_tr:].to_numpy(), y.iloc[n_tr:].to_numpy()


# scaler = StandardScaler() # Scale features
# X_tr_scaled = scaler.fit_transform(X_tr)
# X_ts_scaled = scaler.transform(X_ts)