In [6]:
from headers import *
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline
from joblib import dump, load
from IPython.display import Markdown, display
from math import sqrt

In [7]:
# Train one decision-tree classifier per alpha target listed in `yAlpha`.
# For every target the cell:
#   1. binarises the target (<= 0 -> class 0, otherwise class 1),
#   2. makes a stratified 80/20 train/test split,
#   3. grid-searches the tree's split criterion with 3-fold stratified CV,
#   4. writes the full held-out rows to TREE_RESULTS_DIR and the fitted
#      GridSearchCV object to TREE_MODELS_DIR (both keyed by the target name),
#   5. prints the mean target value of the test rows predicted as class 1.
TREE_RESULTS_DIR = '../data/tree_results/'
TREE_MODELS_DIR = '../data/tree_models/'
# One seed for both the split and the tree: an unseeded DecisionTreeClassifier
# permutes features at every split, so repeated runs could give different trees.
TREE_SEED = 2

# Neither DataFrame.to_csv nor joblib.dump creates missing parent directories.
os.makedirs(TREE_RESULTS_DIR, exist_ok=True)
os.makedirs(TREE_MODELS_DIR, exist_ok=True)

df = pd.read_csv('../data/combined_inner.csv')
X = df[ratioKeys + relativeRatioKeys]

for yLabel in yAlpha:
    yContinuous = df[yLabel]
    # `NaN <= 0` evaluates to False, so a missing target would silently be
    # labelled as class 1. Only rows with a known target are used.
    hasTarget = yContinuous.notna()
    y = np.where(yContinuous[hasTarget] <= 0, 0, 1)

    X_train, X_test, y_train, y_test = train_test_split(
        X[hasTarget], y, test_size=0.2, random_state=TREE_SEED, stratify=y
    )
    # The scaler lives inside the pipeline so it is re-fitted on the training
    # part of every CV fold.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('tree', DecisionTreeClassifier(random_state=TREE_SEED))
    ])

    param_grid = {
        'tree__criterion': ["gini", "entropy", "log_loss"],
    }

    grid = GridSearchCV(pipeline,
                        param_grid,
                        cv=StratifiedKFold(3),
                        verbose=1,
                        refit=True,
                        n_jobs=-1
                        )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    xTaken = X_test[y_pred == 1]

    # Index.intersection keeps the rows in df's own order, so the saved CSV is
    # not in the shuffled order produced by train_test_split.
    wholeTestData = df.loc[df.index.intersection(X_test.index)]
    # index=False: the evaluation cell reloads this file with a fresh index.
    wholeTestData.to_csv(TREE_RESULTS_DIR + yLabel + '.csv', index=False)

    takenData = df.loc[df.index.intersection(xTaken.index)]

    # Not used further in this cell; left at cell scope as in the original.
    estimator = grid.best_estimator_
    # The whole GridSearchCV object is persisted (refit=True lets it predict).
    dump(grid, TREE_MODELS_DIR + yLabel + '.joblib')

    print(f'Results {yLabel} mean of: {takenData.loc[:, yLabel].mean()}')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results alpha1Year mean of: 0.15653675508716436
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results adjustedAlpha1Year mean of: 0.1635819123795836
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results equalAlpha1Year mean of: 0.1441426871690846
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results equalAdjustedAlpha1Year mean of: 0.15035732547406913
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results alpha2Year mean of: 0.32320891221244263
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results adjustedAlpha2Year mean of: 0.28812714479347945
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results equalAlpha2Year mean of: 0.31628891587900254
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Results equalAdjustedAlpha2Year mean of: 0.3211670876589014


In [8]:
# Re-score every saved model on the held-out rows saved next to it and report,
# per target: count/mean/median of the whole test set, the same statistics for
# the rows predicted as class 1, the confusion matrix, and the difference in
# mean target between the selected rows and the whole test set.
for label in yAlpha:
    # NOTE: joblib.load unpickles arbitrary objects; only load model files that
    # were produced by the training cell of this notebook.
    model = load('../data/tree_models/' + label + '.joblib')
    testedData = pd.read_csv('../data/tree_results/' + label + '.csv')

    X = testedData[ratioKeys + relativeRatioKeys]
    # Same binarisation as in training: <= 0 -> class 0, otherwise class 1.
    y = np.where(testedData[label] <= 0, 0, 1)
    benchmarkMean = testedData[label].mean()

    display(Markdown(f'**Results for key: {label}**'))
    print(f'data count: {testedData[label].count()}')
    print(f'data mean: {benchmarkMean}')
    print(f'data median: {testedData[label].median()}')

    print('====TEST=====')
    y_pred = model.predict(X)
    # X shares testedData's index, so the prediction mask can select the rows
    # directly (same rows, same order as going through an index intersection).
    results = testedData[y_pred == 1]
    resultsMean = results[label].mean()

    print(f'res count {results[label].count()}')
    print(f'res mean {resultsMean}')
    print(f'res median {results[label].median()}')

    # Pin the label order so the matrix is always 2x2 (rows: true 0/1,
    # columns: predicted 0/1), even if only one class is present.
    print(confusion_matrix(y, y_pred, labels=[0, 1]))

    excessMean = resultsMean - benchmarkMean
    display(Markdown(f'**diff: {excessMean}**'))
    # Targets with '2' in their name get the difference annualised over two
    # periods: sqrt(1 + diff) - 1.
    if '2' in label:
        growth = excessMean + 1
        # math.sqrt raises ValueError for negative input, which would abort the
        # loop for all remaining targets; report NaN instead.
        cagrToBenchmark = sqrt(growth) - 1 if growth >= 0 else float('nan')
        print(f'CAGR: {cagrToBenchmark}')
    print('-------------------------------------------------')

# TODO: iterate over y cutoffs, training each cutoff with GridSearchCV. Train LinearSVC first and check how much faster it is.
# TODO: maybe use (Linear)SVR (regression), since we already have linear regression.

**Results for key: alpha1Year**

data count: 6053
data mean: 0.0300172769962029
data median: -0.0031354597826141
====TEST=====
res count 3052
res mean 0.15653675508716436
res median 0.1198777801275849
[[2269  788]
 [ 732 2264]]


**diff: 0.12651947809096145**

-------------------------------------------------


**Results for key: adjustedAlpha1Year**

data count: 6053
data mean: 0.03069154232727888
data median: -0.0027346138213903
====TEST=====
res count 3012
res mean 0.1635819123795836
res median 0.1229411614801334
[[2277  766]
 [ 764 2246]]


**diff: 0.13289037005230472**

-------------------------------------------------


**Results for key: equalAlpha1Year**

data count: 6053
data mean: -0.0005140143373880067
data median: -0.0269341720292398
====TEST=====
res count 2827
res mean 0.1441426871690846
res median 0.1087162778017972
[[2521  759]
 [ 705 2068]]


**diff: 0.1446567015064726**

-------------------------------------------------


**Results for key: equalAdjustedAlpha1Year**

data count: 6053
data mean: 0.0022943754574641026
data median: -0.0252829451284293
====TEST=====
res count 2813
res mean 0.15035732547406913
res median 0.1080186725528246
[[2520  758]
 [ 720 2055]]


**diff: 0.14806295001660502**

-------------------------------------------------


**Results for key: alpha2Year**

data count: 6053
data mean: 0.03438167051225602
data median: -0.046537226661542
====TEST=====
res count 2767
res mean 0.32320891221244263
res median 0.1967748108634572
[[2690  647]
 [ 596 2120]]


**diff: 0.2888272417001866**

CAGR: 0.13526527371367547
-------------------------------------------------


**Results for key: adjustedAlpha2Year**

data count: 6053
data mean: 0.0308540392764738
data median: -0.0467539697115497
====TEST=====
res count 2749
res mean 0.28812714479347945
res median 0.1907788604227936
[[2662  649]
 [ 642 2100]]


**diff: 0.25727310551700566**

CAGR: 0.12128190278671913
-------------------------------------------------


**Results for key: equalAlpha2Year**

data count: 6053
data mean: -0.006875623891206189
data median: -0.0953112579061575
====TEST=====
res count 2448
res mean 0.31628891587900254
res median 0.19273643372549498
[[3029  574]
 [ 576 1874]]


**diff: 0.3231645397702087**

CAGR: 0.15028889404801649
-------------------------------------------------


**Results for key: equalAdjustedAlpha2Year**

data count: 6053
data mean: 0.002947332607081462
data median: -0.0894219371820089
====TEST=====
res count 2438
res mean 0.3211670876589014
res median 0.1774244201179481
[[3003  596]
 [ 612 1842]]


**diff: 0.31821975505181993**

CAGR: 0.14813751574095857
-------------------------------------------------
