In [1]:
import pandas as pd

from headers import *
from training_helper import trainPipeline
from utils.custom_train_test_split import *
from joblib import dump, load
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from numpy import mean

In [2]:
# Experiment grid shared by the cells below.

# CSV input(s) passed to the per-year train/test split.
files = [
    '../data/deduplicated/combined_inner_ticker.csv',
]

# Model keys, target columns and cutoffs together form the saved-model file names.
modelKeys = ['svc', 'tree']
yLabels = ['alpha1Year', 'alpha2Year']

# A row is labelled positive (1) when its target value is strictly above the cutoff.
cutoffs = [0.10, 0.15, 0.20]

# Percentiles of the SVC decision scores used as alternative selection barriers.
thresholds = [90, 95]

In [3]:
# Split the input files into train and test frames, then keep only the
# feature columns (plain ratios followed by relative ratios) for the models.
trainDf, testDf = per_year_train_test_split(
    files,
    format='%Y-%m-%d',
)
X_train = trainDf.loc[:, ratioKeys + relativeRatioKeys]
X_test = testDf.loc[:, ratioKeys + relativeRatioKeys]

In [4]:
# Training is disabled (commented out); the output shown under this cell is
# retained from an earlier run.
# NOTE(review): presumably this call with saveModel=True produced the .joblib
# models that are loaded later in this notebook — confirm before re-enabling.
# trainPipeline(cutoffs, yLabels, trainDf, testDf, thresholds, saveModel=True)

-----------------
% of True in dataset 35.23% resulted in 33.35% and median 25.45%
MODEL tree
For alpha1Year, yCutoff = 0.1, taken used - 3602 is 34.12%
Mean was 6.04%, median was 2.27%
MODEL svc
For alpha1Year, yCutoff = 0.1, taken used - 19 is 0.18%
Mean was 5.73%, median was -1.89%
% of True taken with threshold 90 adjustment -  1056
For threshold data - mean was 7.47%, median was 1.82%
% of True taken with threshold 95 adjustment -  528
For threshold data - mean was 7.62%, median was 3.64%
-----------------
-----------------
% of True in dataset 28.24% resulted in 38.53% and median 29.9%
MODEL tree
For alpha1Year, yCutoff = 0.15, taken used - 3114 is 29.5%
Mean was 5.69%, median was 1.17%
MODEL svc
For alpha1Year, yCutoff = 0.15, taken used - 9 is 0.09%
Mean was 9.79%, median was -13.9%
% of True taken with threshold 90 adjustment -  1056
For threshold data - mean was 8.49%, median was 2.68%
% of True taken with threshold 95 adjustment -  528
For threshold data - mean was 8.01%, me

{'alpha1Year': {'tree': [{'mean': np.float64(0.06036016084213012),
    'median': np.float64(0.02272711924272555),
    'taken': np.int64(3602)},
   {'mean': np.float64(0.056872310887688966),
    'median': np.float64(0.01172876676442605),
    'taken': np.int64(3114)},
   {'mean': np.float64(0.052522327238327386),
    'median': np.float64(0.01395581407404505),
    'taken': np.int64(2572)}],
  'svc': [{'mean': np.float64(0.05725471820444397),
    'median': np.float64(-0.0189153230747683),
    'taken': np.int64(19)},
   {'mean': np.float64(0.09791089394438221),
    'median': np.float64(-0.1389913260393867),
    'taken': np.int64(9)},
   {'mean': np.float64(0.5890923759435145),
    'median': np.float64(0.8006267657160063),
    'taken': np.int64(3)}],
  'svcThreshold90': [{'mean': np.float64(0.0746767941267405),
    'median': np.float64(0.0182337097483905),
    'taken': -1},
   {'mean': np.float64(0.0849160674624662),
    'median': np.float64(0.0268083570377613),
    'taken': -1},
   {'mean':

In [5]:
# Evaluate every saved model on the test split.
#
# For each (model, target, cutoff) combination the fitted search object is
# loaded from disk, its predictions are scored against the binarised target,
# and the mean target value of the rows predicted positive is printed.
# For the 'svc' models only, rows are additionally selected by taking the top
# percentile(s) of the decision scores; for every such selection the mean with
# and without returns of 100% or more is recorded in `noSpecReturns` / `diffs`
# (flat lists, appended in loop order) for the summary cells that follow.

# Bin edges used to tabulate the distribution of selected returns.
returnBinEdges = [-1.0, -0.6, -0.3, -0.1, 0.1, 0.3, 0.6, 1.0, 10]
# Returns at or above this value are treated as "special" outliers.
outlierReturnLimit = 1.0

noSpecReturns = []
diffs = []
for modelKey in modelKeys:
    for yLabel in yLabels:
        print('========')
        for cutoff in cutoffs:
            print(f'{modelKey} - {yLabel} - {cutoff}')
            # NOTE: joblib files are pickle-based; only load artefacts you trust.
            grid = load(f'../data/cvPipeline-2/{modelKey}{yLabel}{cutoff}.joblib')
            y_pred = grid.predict(X_test)
            # Positive class = target strictly above the cutoff.
            y_true = np.where(testDf[yLabel] <= cutoff, 0, 1)
            print(confusion_matrix(y_true, y_pred))
            print(classification_report(y_true, y_pred))

            # X_test is a column subset of testDf, so the prediction array lines
            # up row-for-row with testDf. A positional boolean mask selects exactly
            # the predicted-positive rows, even if the index has duplicate labels.
            takenReturns = testDf.loc[y_pred == 1, yLabel]
            print(f'Overall mean was {round(takenReturns.mean() * 100,2)}%')

            if modelKey != 'svc':
                continue

            # Score through the whole fitted pipeline instead of re-applying the
            # scaler and SVC steps by hand, so no preprocessing step can be missed.
            scores = grid.decision_function(X_test)
            for threshold in thresholds:
                # Keep only rows whose decision score reaches the given percentile.
                barrier = np.percentile(scores, threshold)
                y_pred_threshold = scores >= barrier
                returns = testDf.loc[y_pred_threshold, yLabel]
                print(
                    f'For threshold {threshold} data - mean was {round(returns.mean() * 100, 2)}%')
                print(confusion_matrix(y_true, y_pred_threshold))
                print(classification_report(y_true, y_pred_threshold))

                # Distribution of the selected returns across fixed bins.
                bins = pd.cut(returns, bins=returnBinEdges)
                bin_counts = bins.value_counts().sort_index()
                bin_df = bin_counts.reset_index()
                bin_df.columns = ['Bin Range', 'Count']
                print(bin_df)

                # Mean without the outliers, and how much the outliers add to it.
                noSpecialReturns = mean(returns[returns < outlierReturnLimit])
                noSpecReturns.append(noSpecialReturns)
                print(f'Without 100+% returns mean was {noSpecialReturns}.')
                diff = mean(returns) - noSpecialReturns
                diffs.append(diff)
                print(f'Generated diff = {diff}')
                print('--------')


svc - alpha1Year - 0.1
[[6824   13]
 [3713    6]]
              precision    recall  f1-score   support

           0       0.65      1.00      0.79      6837
           1       0.32      0.00      0.00      3719

    accuracy                           0.65     10556
   macro avg       0.48      0.50      0.39     10556
weighted avg       0.53      0.65      0.51     10556

Overall mean was 5.73%
For threshold 90 data - mean was 7.47%
[[6191  646]
 [3309  410]]
              precision    recall  f1-score   support

           0       0.65      0.91      0.76      6837
           1       0.39      0.11      0.17      3719

    accuracy                           0.63     10556
   macro avg       0.52      0.51      0.46     10556
weighted avg       0.56      0.63      0.55     10556

      Bin Range  Count
0  (-1.0, -0.6]      5
1  (-0.6, -0.3]     93
2  (-0.3, -0.1]    253
3   (-0.1, 0.1]    295
4    (0.1, 0.3]    211
5    (0.3, 0.6]    140
6    (0.6, 1.0]     39
7   (1.0, 10.0]     20


In [6]:
# Averages over all threshold runs: first the outlier-free mean return,
# then the gap between the full mean and the outlier-free mean.
for collected in (noSpecReturns, diffs):
    print(mean(collected))

0.018329605630572895
0.1561246343978376


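Complete destruction of the 2-year returns: excluding returns of 100% or more lowers the mean far more for alpha2Year than for alpha1Year (compare the last six diffs printed below with the first six).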

In [7]:
# Raw per-run gaps in append order: target label outermost, then cutoff,
# then threshold. Only the 'svc' model contributes entries to this list.
print(diffs)

[np.float64(0.038392496858124334), np.float64(0.03449594393848685), np.float64(0.03657574291112117), np.float64(0.029746088633546844), np.float64(0.029149685788108483), np.float64(0.03509732324508082), np.float64(0.24182026932799922), np.float64(0.3216883608520682), np.float64(0.2469447126370399), np.float64(0.3128643196700065), np.float64(0.22996465219661455), np.float64(0.31675601671585474)]
