## Imports 

In [32]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix 

from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score 


if not "root" in locals():
    current_path = Path(os.getcwd())
    root = current_path.parent.absolute()

In [33]:
os.chdir(root)
from data_treatment.box_creator import create_binary_box

df = pd.read_csv('database.csv')
df = create_binary_box(df, relative_threshold = 0.05, box_length=5).set_index("Date").dropna(axis = 0)

# Feature selection 

In [34]:
our_features= ['PX_OPEN_VIX_volatility', 'VOLUME_TOTAL_CALL_VIX_volatility',
       'PUT_CALL_VOLUME_RATIO_CUR_DAY_SPX_volatility',
       'VOLUME_TOTAL_PUT_SPX_volatility', 'VOLATILITY_30D_SPX_volatility',
       'VOLATILITY_20D_SPX_volatility', 'VOLATILITY_30D_SX5E_volatility',
       'VOLATILITY_90D_SX5E_volatility', 'VOLATILITY_60D_SX5E_volatility',
       'VOLATILITY_20D_SX5E_volatility', 'PX_OPEN_VSTOXX_volatility',
       'PX_LOW_VSTOXX_volatility', 'QMJ USA_QMJ Factors',
       'QMJ Global_QMJ Factors', 'BAB USA_BAB Factors',
       'BAB Global_BAB Factors', 'Bullish_SENTIMENT',
       'Bullish 8-week Mov Avg_SENTIMENT', 'Bull-Bear Spread_SENTIMENT',
       'Mkt-RF_F-F_Research_Data_5_Factors_2x3_daily',
       'SMB_F-F_Research_Data_5_Factors_2x3_daily',
       'CMA_F-F_Research_Data_5_Factors_2x3_daily']

## Train/Test split

In [35]:
prescale_X = df[our_features]
y = df[["Box"]]

X_train, X_test, y_train, y_test = train_test_split(prescale_X, y, test_size=0.2)


## Choosing the best solver 

In [36]:
# define model
model = LinearDiscriminantAnalysis()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr']
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.613
Config: {'solver': 'svd'}


  y = column_or_1d(y, warn=True)


In [37]:
full_pipeline= make_pipeline(StandardScaler(),LinearDiscriminantAnalysis(solver='svd') )

final_predictions = full_pipeline.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [38]:
y_pred= final_predictions.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.72      0.69       277
           1       0.57      0.50      0.53       207

    accuracy                           0.62       484
   macro avg       0.61      0.61      0.61       484
weighted avg       0.62      0.62      0.62       484



we can see that the default SVD solver performs the best compared to the other built-in solvers

### Tuning the shrinkage hyperparameter

In [39]:
import numpy as np

model = LinearDiscriminantAnalysis(solver='lsqr')
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid

grid = dict()
grid['shrinkage'] = np.arange(0, 1, 0.01)
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.603
Config: {'shrinkage': 0.0}


  y = column_or_1d(y, warn=True)
