In [29]:
import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib.pyplot import subplots
import statsmodels.api as sm

from ISLP import load_data
from ISLP.models import (
    ModelSpec as MS,
    summarize,
)
from ISLP import confusion_table
from ISLP.models import contrast

from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis as qda,
    QuadraticDiscriminantAnalysis as QDA,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [30]:
df = load_data('Weekly')
df

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.270,Down
1,1990,-0.270,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.270,0.816,1.572,-3.936,0.159837,3.514,Up
3,1990,3.514,-2.576,-0.270,0.816,1.572,0.161630,0.712,Up
4,1990,0.712,3.514,-2.576,-0.270,0.816,0.153728,1.178,Up
...,...,...,...,...,...,...,...,...,...
1084,2010,-0.861,0.043,-2.173,3.599,0.015,3.205160,2.969,Up
1085,2010,2.969,-0.861,0.043,-2.173,3.599,4.242568,1.281,Up
1086,2010,1.281,2.969,-0.861,0.043,-2.173,4.835082,0.283,Up
1087,2010,0.283,1.281,2.969,-0.861,0.043,4.454044,1.034,Up


In [68]:
px.imshow(
    (
        df
        .drop(columns=["Year", "Direction"])
        .corr()
        .replace(to_replace=1, value=np.nan)
    ),
    title="Weekly Correlation",
    height=600,
    text_auto=".2f",
)


In [35]:
px.line(
    df,
    y=["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"],
)

In [42]:
px.histogram(
    df,
    x=["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"],
)

In [43]:
px.histogram(
    df,
    x=["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"],
    facet_col="Direction",
)

In [32]:
px.line(
    df,
    y="Volume",
)

In [40]:
px.histogram(
    df,
    x="Volume",
)

In [50]:
predictors = ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]
response = "Direction"

design = MS(df[predictors])
X = design.fit_transform(df)
y = df[response] == 'Up'

model = sm.Logit(endog=y, exog=X)
results = model.fit()
summarize(results)

Optimization terminated successfully.
         Current function value: 0.682441
         Iterations 4


Unnamed: 0,coef,std err,z,P>|z|
intercept,0.2669,0.086,3.106,0.002
Lag1,-0.0413,0.026,-1.563,0.118
Lag2,0.0584,0.027,2.175,0.03
Lag3,-0.0161,0.027,-0.602,0.547
Lag4,-0.0278,0.026,-1.05,0.294
Lag5,-0.0145,0.026,-0.549,0.583
Volume,-0.0227,0.037,-0.616,0.538


In [83]:
probabilities = results.predict()
print(f"{probabilities[:10]=}")

predictions = np.where(probabilities > 0.5, "Up", "Down")
print(f"{predictions[:10]=}")

px.imshow(confusion_table(predictions, df[response]), text_auto=True)


probabilities[:10]=array([0.60862494, 0.60103144, 0.58756995, 0.48164156, 0.61690129,
       0.56841902, 0.57860971, 0.51519724, 0.57151998, 0.55542873])
predictions[:10]=array(['Up', 'Up', 'Up', 'Down', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up'],
      dtype='<U4')


In [89]:
train_mask = df["Year"] < 2009
test_mask = ~train_mask

print(f"{train_mask.sum()=}, {test_mask.sum()=}")

X_train = X[train_mask]
X_test = X[test_mask]

y_train = y[train_mask]
y_test = y[test_mask]

lag_2_predictor = "Lag2"


train_mask.sum()=985, test_mask.sum()=104


In [90]:
lag_2_model = sm.Logit(endog=y_train, exog=X_train[["intercept", lag_2_predictor]])
lag_2_results = lag_2_model.fit()
summarize(lag_2_results)

lag_2_probabilities = lag_2_results.predict(X_test[["intercept", lag_2_predictor]])
print(f"{lag_2_probabilities[:10]=}")

lag_2_predictions = np.where(lag_2_probabilities > 0.5, "Up", "Down")
print(f"{lag_2_predictions[:10]=}")

px.imshow(confusion_table(lag_2_predictions, df.loc[test_mask, response]), text_auto=True)


Optimization terminated successfully.
         Current function value: 0.685555
         Iterations 4
lag_2_probabilities[:10]=985    0.526129
986    0.644736
987    0.486216
988    0.485200
989    0.519767
990    0.540125
991    0.623348
992    0.480993
993    0.451220
994    0.484881
dtype: float64
lag_2_predictions[:10]=array(['Up', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Down', 'Down',
       'Down'], dtype='<U4')


In [100]:
lda = LDA(store_covariance=True)

lda.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{lda.means_=
}

{lda.classes_=
}

{lda.priors_=
}

{lda.scalings_=
}
""")

lda_probabilities = lda.predict(X_test[[lag_2_predictor]])
print(f"{lda_probabilities[:10]=}")

lda_predictions = np.where(lda_probabilities > 0.5, "Up", "Down")
print(f"{lda_predictions[:10]=}")

px.imshow(confusion_table(lda_predictions, df.loc[test_mask, response]), text_auto=True)



lda.means_=
array([[-0.03568254],
       [ 0.26036581]])

lda.classes_=
array([False,  True])

lda.priors_=
array([0.44771574, 0.55228426])

lda.scalings_=
array([[0.44141622]])

lda_probabilities[:10]=array([ True,  True, False, False,  True,  True,  True, False, False,
       False])
lda_predictions[:10]=array(['Up', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Down', 'Down',
       'Down'], dtype='<U4')


In [101]:
qda = QDA(store_covariance=True)

qda.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{qda.means_=
}

{qda.classes_=
}

{qda.priors_=
}

{qda.scalings_=
}
""")

qda_probabilities = qda.predict(X_test[[lag_2_predictor]])
print(f"{qda_probabilities[:10]=}")

qda_predictions = np.where(qda_probabilities > 0.5, "Up", "Down")
print(f"{qda_predictions[:10]=}")

px.imshow(confusion_table(qda_predictions, df.loc[test_mask, response]), text_auto=True)



qda.means_=
array([[-0.03568254],
       [ 0.26036581]])

qda.classes_=
array([False,  True])

qda.priors_=
array([0.44771574, 0.55228426])

qda.scalings_=
[array([4.83781758]), array([5.37073888])]

qda_probabilities[:10]=array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])
qda_predictions[:10]=array(['Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up'],
      dtype='<U4')


In [103]:
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(y=y_train, X=X_train[[lag_2_predictor]])

knn_probabilities = knn.predict(X_test[[lag_2_predictor]])
print(f"{knn_probabilities[:10]=}")

knn_predictions = np.where(knn_probabilities > 0.5, "Up", "Down")
print(f"{knn_predictions[:10]=}")

px.imshow(confusion_table(knn_predictions, df.loc[test_mask, response]), text_auto=True)


knn_probabilities[:10]=array([ True,  True, False, False, False,  True,  True, False,  True,
       False])
knn_predictions[:10]=array(['Up', 'Up', 'Down', 'Down', 'Down', 'Up', 'Up', 'Down', 'Up',
       'Down'], dtype='<U4')


In [106]:
nb = GaussianNB()

nb.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{nb.class_prior_=
}

{nb.classes_=
}

{nb.theta_=
}
""")

nb_probabilities = nb.predict(X_test[[lag_2_predictor]])
print(f"{nb_probabilities[:10]=}")

nb_predictions = np.where(nb_probabilities > 0.5, "Up", "Down")
print(f"{nb_predictions[:10]=}")

px.imshow(confusion_table(nb_predictions, df.loc[test_mask, response]), text_auto=True)



nb.class_prior_=
array([0.44771574, 0.55228426])

nb.classes_=
array([False,  True])

nb.theta_=
array([[-0.03568254],
       [ 0.26036581]])

nb_probabilities[:10]=array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])
nb_predictions[:10]=array(['Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up'],
      dtype='<U4')


: 