# Time Series Modeling the S&P 500

### Disclaimer: This notebook should not be considered any kind of financial advice. It exists only for the purposes of practicing modeling and making predictions.

In [54]:
import datetime as dt
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, \
log_loss, auc, roc_auc_score, roc_curve, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import XGBClassifier

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# New import of my custom class
from classification_classes import Model, Importance

# Notebook-wide display preferences: limit how many rows / columns pandas renders
# and use seaborn's "dark" style for plots.
pd.options.display.max_rows = 25
pd.options.display.max_columns = 35
sns.set_style("dark")

# Preprocessing

In [32]:
# Load the S&P Data
# The data directory can be overridden with the SP500_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the default is the original local folder.
DATA_DIR = Path(os.environ.get("SP500_DATA_DIR", "/Users/samalainabayeva/Desktop/Capstone Project"))
sp = pd.read_csv(DATA_DIR / "INDICES_FILLED.csv")

In [33]:
# Shape the raw frame for analysis: give the unnamed first CSV column the name "Date",
# make it the index, then keep only the rows whose "Index" ticker is "^GSPC".
sp = sp.rename(columns={'Unnamed: 0': 'Date'}).set_index("Date")
sp = sp.loc[sp['Index'] == "^GSPC"]

In [34]:
# (rows, columns) of the frame after it has been filtered to the "^GSPC" rows
sp.shape

(15064, 24)

In [35]:
# Count the missing values in each column
sp.isnull().sum()

Index                     0
Adj Close                 0
Close                     0
High                      0
Low                       0
Open                      0
Volume                    0
garman_klass_vol          0
RSI                      20
lowest_bollinger_band    19
lower_bollinger_band     19
20_day_SMA               19
one_up_bollinger_band    19
upper_bollinger_band     19
ATR                      14
MACD                     25
dollar_volume(M)          0
Mkt-RF                   67
SMB                      67
HML                      67
RMW                      67
CMA                      67
GDP_Filled                0
Interest Rates            0
dtype: int64

In [36]:
# Create the permanent df object that we will work with.
# Rows containing any null are dropped (92 rows out of 15064 in the recorded run).
# It is built from `sp`, so this cell also runs on a fresh kernel where `df` does not exist yet.
df = sp.dropna().copy()

# Number of rows removed by dropping nulls
print(sp.shape[0] - df.shape[0])

92


### Working df object

In [50]:
# Have many -inf values due to zeros in the "Open" feature being a denominator for garman_klass_vol ("zero-division")
# Mark them as NaN here; we will impute those values later when we model.
# The result is assigned back to the column rather than calling replace(..., inplace=True) on
# df["garman_klass_vol"]: that chained in-place form acts on an intermediate object and does not
# update df when pandas Copy-on-Write is active.
df["garman_klass_vol"] = df["garman_klass_vol"].replace(-np.inf, np.nan)

In [51]:
# Summary statistics for the working frame.
# In the recorded output garman_klass_vol has a lower count (10421) than the other columns (14972),
# which reflects the NaNs introduced by replacing -inf; "Open" also shows 0.0 for both min and the 25% quantile.
df.describe()

Unnamed: 0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M),Mkt-RF,SMB,HML,RMW,CMA,GDP_Filled,Interest Rates
count,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,10421.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0,14972.0
mean,933.231748,933.231748,938.680396,927.033412,903.256919,1402298000.0,7.4e-05,53.487227,900.769995,915.584615,930.399236,945.213857,960.028477,12.309743,2.040333,2806906.0,0.000271,7.8e-05,0.00014,0.000138,0.000132,9064.084889,0.04472
std,1054.84885,1054.84885,1060.623892,1048.270047,1078.887655,1828670000.0,0.000193,10.445511,1016.778954,1033.684447,1050.798942,1068.112392,1085.615278,16.526116,15.832952,4823673.0,0.010273,0.005445,0.005854,0.004004,0.0038,7236.276739,0.032613
min,62.279999,62.279999,63.23,60.959999,0.0,3020000.0,0.0,15.561473,60.160341,63.756685,66.122499,67.775022,68.551043,0.869632,-237.020227,249.429,-0.1744,-0.1119,-0.0502,-0.0301,-0.0587,678.674,-0.0005
25%,107.827501,107.827501,108.755001,106.907503,0.0,30727500.0,1.6e-05,46.21539,104.856303,106.312262,107.773625,109.433707,110.977014,1.812673,-0.847751,3090.348,-0.0042,-0.0028,-0.0024,-0.0017,-0.0018,2591.247,0.0169
50%,453.059998,453.059998,454.220001,450.705002,453.050003,269655000.0,3.3e-05,54.048673,444.058837,447.628338,452.163,456.907365,461.731549,4.220744,0.766159,121656.7,0.0005,0.0002,0.0001,0.0001,0.0001,7115.652,0.0468
75%,1325.182526,1325.182526,1332.974976,1313.934998,1324.837463,2997040000.0,7.2e-05,60.985406,1281.393648,1300.849336,1322.913246,1345.605311,1369.072329,16.875214,5.312635,4569764.0,0.0051,0.0031,0.0025,0.0019,0.002,14651.249,0.061525
max,4796.560059,4796.560059,4818.620117,4780.040039,4804.509766,11456230000.0,0.006982,84.072869,4610.997106,4653.915217,4716.387012,4795.193216,4925.005874,152.756467,92.583711,37419630.0,0.1135,0.0617,0.0673,0.0451,0.0253,27623.543,0.1714


In [52]:
# Build a KNN imputer with add_indicator=True; it is only constructed here, not fitted or applied.
# Presumably intended for the NaNs left in garman_klass_vol — TODO confirm where it is used.
knn = KNNImputer(add_indicator=True)

In [55]:
# Build an ARIMA model on the adjusted close series; no order is passed, so the library defaults apply.
# NOTE(review): the TypeError recorded under this cell ("missing ... 'endog'") does not match this call,
# which does pass `endog` — the output looks stale; re-run the cell to refresh it.
ar = ARIMA(endog=df["Adj Close"])

TypeError: ARIMA.__init__() missing 1 required positional argument: 'endog'