In [17]:
import pandas as pd

from headers import *
from training_helper import trainPipeline
from utils.custom_train_test_split import *
from joblib import dump, load
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [18]:
# Alpha levels used to binarise the target: the evaluation cell labels a row 1
# when its alpha is strictly above the cutoff and 0 otherwise. The cutoff is
# also part of each saved model's file name.
cutoffs = [
    0.10,
    0.15,
    0.20,
]

# CSV inputs handed to per_year_train_test_split in the evaluation cell.
files = [
    '../data/combined_inner_ticker.csv',
    '../data/extra-data/combined_inner_ticker.csv',
]

# Percentiles of the SVC decision scores; rows scoring at or above the
# percentile are treated as positive predictions.
thresholds = [
    80,
    90,
    95,
]

# Target columns evaluated one at a time.
yLabels = [
    'alpha1Year',
    'alpha2Year',
]

In [19]:
# Build the train/test frames from the deduplicated combined dataset.
# custom_train_test_split comes from utils.custom_train_test_split (star import);
# presumably it splits by ticker so no company appears on both sides -- TODO confirm.
trainDf, testDf = custom_train_test_split(['../data/deduplicated/combined_inner_ticker.csv'])
# TODO: RERUN WITH DEDUPLICATED!
# NOTE(review): the path above already points at the deduplicated file, so this
# TODO may only mean the downstream outputs are stale -- confirm and remove.
# NOTE(review): the evaluation cell further down rebinds trainDf/testDf from
# per_year_train_test_split(files), so this split is overwritten there.

In [21]:
# Distinct tickers in the held-out frame, in order of first appearance.
# Swap in trainDf to inspect the training side instead.
testDf['ticker'].unique()

array(['ABT.json', 'ALGN.json', 'ALTR.json', 'AMAT.json', 'AMT.json',
       'ANET.json', 'AVGO.json', 'AWK.json', 'CAT.json', 'CB.json',
       'CE.json', 'CHTR.json', 'CMG.json', 'COKE.json', 'CZR.json',
       'DAL.json', 'DOW.json', 'DRI.json', 'DTE.json', 'DVN.json',
       'EBAY.json', 'ECL.json', 'EFX.json', 'EL.json', 'ELS.json',
       'EMR.json', 'EPAM.json', 'EQIX.json', 'ESS.json', 'ESTC.json',
       'FMC.json', 'GD.json', 'GRMN.json', 'HAL.json', 'HIG.json',
       'HOLX.json', 'HQY.json', 'HRL.json', 'HUM.json', 'IBM.json',
       'ICE.json', 'IDXX.json', 'ILMN.json', 'INSM.json', 'ISRG.json',
       'JNJ.json', 'K.json', 'KHC.json', 'KMI.json', 'LDOS.json',
       'LUV.json', 'LYV.json', 'MCHP.json', 'MET.json', 'META.json',
       'MKTX.json', 'MSCI.json', 'MUSA.json', 'NBIX.json', 'NDAQ.json',
       'NDSN.json', 'NOC.json', 'NOW.json', 'NRG.json', 'NSC.json',
       'NTAP.json', 'NVDA.json', 'O.json', 'OC.json', 'ORI.json',
       'PAG.json', 'PCG.json', 'PCTY.json',

In [3]:
# Evaluate the persisted grid-search models on the per-year test split.
#
# For every (model, target column, cutoff) combination this cell loads the
# fitted search object from ../data/cvPipeline/, prints the confusion matrix and
# classification report for its default predictions and, for the SVC only,
# repeats the report with the decision rule replaced by "score is at or above
# the p-th percentile of test-set decision scores" for each p in `thresholds`.
# For each percentile it also prints the mean realised alpha of the selected
# rows and a histogram of those alphas.
#
# NOTE: this rebinds trainDf/testDf, replacing any split made by earlier cells.
trainDf, testDf = per_year_train_test_split(files)

# To generate models
# trainPipeline(cutoffs, yLabels, trainDf, testDf, thresholds, saveModel=True)

X_train = trainDf[ratioKeys + relativeRatioKeys]
X_test = testDf[ratioKeys + relativeRatioKeys]

# Right-closed bin edges for the histogram of realised alpha among selected rows.
# Values outside (-1.0, 10] fall into no bin and are not counted.
returnBinEdges = [-1.0, -0.6, -0.3, -0.1, 0.1, 0.3, 0.6, 1.0, 10]

modelKeys = ['svc', 'tree']
for modelKey in modelKeys:
    for yLabel in yLabels:
        for cutoff in cutoffs:
            print(f'{modelKey} - {yLabel} - {cutoff}')
            # joblib.load unpickles the file: only load model files you produced yourself.
            grid = load(f'../data/cvPipeline/{modelKey}{yLabel}{cutoff}.joblib')
            y_pred = grid.predict(X_test)
            # Positive class (1) means alpha strictly above the cutoff.
            y_true = np.where(testDf[yLabel] <= cutoff, 0, 1)
            print(confusion_matrix(y_true, y_pred))
            print(classification_report(y_true, y_pred))

            # Percentile thresholding needs decision_function, which is only
            # taken from the SVC pipeline here.
            if modelKey != 'svc':
                continue
            model = grid.best_estimator_['svc']
            scaler = grid.best_estimator_['scaler']
            X_scaled = scaler.transform(X_test)
            scores = model.decision_function(X_scaled)
            for threshold in thresholds:
                barrier = np.percentile(scores, threshold)
                y_pred_threshold = scores >= barrier
                # Select rows by position with the boolean mask. `scores` is
                # aligned row-for-row with testDf, so this picks exactly the
                # rows above the barrier. Going through index labels instead
                # would select the wrong rows whenever the index contains
                # repeated labels (e.g. after concatenating several CSVs).
                takenDataThreshold = testDf.loc[y_pred_threshold]
                returns = takenDataThreshold[yLabel]
                print(
                    f'For threshold {threshold} data - mean was {round(returns.mean() * 100, 2)}%')
                print(confusion_matrix(y_true, y_pred_threshold))
                print(classification_report(y_true, y_pred_threshold))

                # Distribution of realised alpha among the selected rows.
                bins = pd.cut(returns, bins=returnBinEdges)
                bin_counts = bins.value_counts().sort_index()
                bin_df = bin_counts.reset_index()
                bin_df.columns = ['Bin Range', 'Count']
                print(bin_df)


svc - alpha1Year - 0.1
[[8469    6]
 [4380    0]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79      8475
           1       0.00      0.00      0.00      4380

    accuracy                           0.66     12855
   macro avg       0.33      0.50      0.40     12855
weighted avg       0.43      0.66      0.52     12855

For threshold 80 data - mean was 6.25%
[[6873 1602]
 [3411  969]]
              precision    recall  f1-score   support

           0       0.67      0.81      0.73      8475
           1       0.38      0.22      0.28      4380

    accuracy                           0.61     12855
   macro avg       0.52      0.52      0.51     12855
weighted avg       0.57      0.61      0.58     12855

      Bin Range  Count
0  (-1.0, -0.6]      6
1  (-0.6, -0.3]    245
2  (-0.3, -0.1]    561
3   (-0.1, 0.1]    790
4    (0.1, 0.3]    527
5    (0.3, 0.6]    325
6    (0.6, 1.0]     82
7   (1.0, 10.0]     35
For threshold 90 data -