# Time Series Modeling the S&P 500

### Disclaimer: This notebook should not be considered any kind of financial advice. It exists only for the purposes of practicing modeling and making predictions 

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, \
log_loss, auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import XGBClassifier

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

import datetime as dt

# New import of my custom class
from classification_classes import Model, Importance

pd.set_option("display.max_rows", 25)
pd.set_option('display.max_columns', 35)
sns.set_style("dark")

# Preprocessing

In [53]:
# Load the S&P Data
df = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/INDICES_FILLED.csv')
df.head(2)

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,...,dollar_volume(M),Quarter,Month,cycle_year,day_of_week,one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
0,1964-02-06,^GSPC,76.93,76.93,77.260002,76.470001,0.0,4110000.0,669.822,3.5,0.0024,0.0006,-0.0023,0.0004,0.0006,,64.329769,...,316.1823,1,2,4,Thursday,1964-03-06,1964-05-06,1964-08-06,1965-02-08,0.017938,0.053685,0.057325,0.130248,1.0,1.0,1.0,1.0
1,1964-02-07,^GSPC,77.18,77.18,77.510002,76.660004,0.0,4710000.0,669.822,3.52,0.0037,-0.0006,0.0014,0.0008,0.0036,,68.197375,...,363.5178,1,2,4,Friday,1964-03-09,1964-05-07,1964-08-07,1965-02-08,0.0149,0.051438,0.060637,0.126587,1.0,1.0,1.0,1.0


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43966 entries, 0 to 43965
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   43966 non-null  object 
 1   Ticker                 43966 non-null  object 
 2   Adj Close              43966 non-null  float64
 3   Close                  43966 non-null  float64
 4   High                   43966 non-null  float64
 5   Low                    43966 non-null  float64
 6   Open                   43966 non-null  float64
 7   Volume                 43966 non-null  float64
 8   GDP_Filled             43966 non-null  float64
 9   Interest_Rates         43966 non-null  float64
 10  Mkt-RF                 43966 non-null  float64
 11  SMB                    43966 non-null  float64
 12  HML                    43966 non-null  float64
 13  RMW                    43966 non-null  float64
 14  CMA                    43966 non-null  float64
 15  ga

In [64]:
# Prepare the dataframe appropriately
df['Date'] = pd.to_datetime(sp["Date"])
df.set_index("Date", inplace=True)
df["Quarter"] = df["Quarter"].astype(str)
df['Month'] = df['Month'].astype(str)
df['cycle_year'] = df['cycle_year'].astype(str)
# Isolate S&P 500 index
df = df[df['Ticker'] == "^GSPC"]

In [65]:
df.shape

(14634, 40)

In [66]:
df.head()

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,...,dollar_volume(M),Quarter,Month,cycle_year,day_of_week,one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
1964-02-06,^GSPC,76.93,76.93,77.260002,76.470001,0.0,4110000.0,669.822,3.5,0.0024,0.0006,-0.0023,0.0004,0.0006,,64.329769,76.168076,...,316.1823,1,2,4,Thursday,1964-03-06,1964-05-06,1964-08-06,1965-02-08,0.017938,0.053685,0.057325,0.130248,1.0,1.0,1.0,1.0
1964-02-07,^GSPC,77.18,77.18,77.510002,76.660004,0.0,4710000.0,669.822,3.52,0.0037,-0.0006,0.0014,0.0008,0.0036,,68.197375,76.234617,...,363.5178,1,2,4,Friday,1964-03-09,1964-05-07,1964-08-07,1965-02-08,0.0149,0.051438,0.060637,0.126587,1.0,1.0,1.0,1.0
1964-02-10,^GSPC,77.050003,77.050003,77.769997,76.830002,0.0,4150000.0,669.822,3.53,-0.0013,0.0017,-0.002,0.0013,-0.0016,,64.376738,76.331949,...,319.7575,1,2,4,Monday,1964-03-10,1964-05-11,1964-08-10,1965-02-10,0.019987,0.049968,0.061389,0.122128,1.0,1.0,1.0,1.0
1964-02-11,^GSPC,77.330002,77.330002,77.650002,76.809998,0.0,4040000.0,669.822,3.52,0.0033,-0.0003,-0.0014,0.0021,0.0038,,68.391587,76.385595,...,312.4132,1,2,4,Tuesday,1964-03-11,1964-05-11,1964-08-11,1965-02-11,0.020949,0.046166,0.057287,0.106168,1.0,1.0,1.0,1.0
1964-02-13,^GSPC,77.519997,77.519997,77.93,77.099998,0.0,4820000.0,669.822,3.52,0.0008,-0.0001,-0.0007,0.0013,-0.0029,,69.894212,76.383568,...,373.6464,1,2,4,Thursday,1964-03-13,1964-05-13,1964-08-13,1965-02-15,0.020898,0.044505,0.063081,0.110294,1.0,1.0,1.0,1.0


### Working df object

In [67]:
df.describe()

Unnamed: 0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
count,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,10143.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0,14634.0
mean,895.91581,895.91581,901.186077,889.954599,865.699879,1373188000.0,8744.134898,4.460946,0.000236,8.8e-05,0.000159,0.000135,0.000143,7.5e-05,53.448075,865.097317,879.324913,893.552509,907.780106,922.007702,11.784443,1.755583,2640127.0,0.006548,0.019573,0.03998,0.082204,0.607284,0.646986,0.68088,0.736367
std,1000.251152,1000.251152,1005.802066,994.00602,1024.836753,1814063000.0,6915.580368,3.279356,0.010237,0.005437,0.005818,0.003985,0.003762,0.000195,10.472723,964.997087,981.121631,997.464771,1014.015939,1030.765113,15.83035,15.396667,4576708.0,0.04561,0.07598,0.110604,0.160591,0.488371,0.477923,0.466152,0.440617
min,62.279999,62.279999,63.23,60.959999,0.0,3020000.0,669.822,-0.05,-0.1744,-0.1119,-0.0502,-0.0301,-0.0587,0.0,15.561473,60.160341,63.756685,66.122499,67.775022,68.551043,0.869632,-237.020227,249.429,-0.329668,-0.409638,-0.467638,-0.488228,0.0,0.0,0.0,0.0
25%,107.502501,107.502501,108.362501,106.612501,0.0,29700000.0,2476.949,1.66,-0.0042,-0.0028,-0.0023,-0.0017,-0.0018,1.6e-05,46.151287,104.197559,105.85501,107.506374,108.878364,110.583693,1.798892,-0.849727,2996.542,-0.017332,-0.020559,-0.026359,-0.007251,0.0,0.0,0.0,0.0
50%,448.470001,448.470001,449.979996,446.925003,448.410004,257540000.0,6882.098,4.67,0.0005,0.0003,0.0001,0.0001,0.0001,3.3e-05,53.978064,440.605738,444.857573,448.071,451.756315,455.536899,4.124392,0.752002,115530.4,0.010137,0.02404,0.046083,0.095086,1.0,1.0,1.0,1.0
75%,1308.359955,1308.359955,1316.130035,1299.167542,1308.014954,2879958000.0,14448.882,6.17,0.005,0.0031,0.0025,0.0019,0.002,7.2e-05,60.984455,1269.461287,1291.552938,1309.826132,1331.769042,1350.338849,16.499133,5.06071,4367037.0,0.033031,0.065953,0.105077,0.182529,1.0,1.0,1.0,1.0
max,4796.560059,4796.560059,4818.620117,4780.040039,4804.509766,11456230000.0,26408.405,17.14,0.0977,0.0617,0.0673,0.0451,0.0253,0.006982,84.072869,4610.997106,4653.915217,4716.387012,4795.193216,4925.005874,152.756467,92.583711,37419630.0,0.266108,0.399522,0.527456,0.747797,1.0,1.0,1.0,1.0


In [68]:
df.index.min(), df.index.max()

('1964-02-06', '2022-11-02')

In [10]:
knn = KNNImputer(add_indicator=True)

In [11]:
ar = ARIMA(endog=df["Adj Close"], )

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
