In [1]:
from joblib import dump, load
import pandas as pd
import numpy as np
from headers import *
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from IPython.display import Markdown, display
from math import sqrt

Mean return for purchases (1- and 2-year holding periods) that beat the benchmark

In [2]:
# For every alpha label, print the mean over the rows whose value is >= 0
# next to the mean and median over all rows (each rounded to 3 decimals).
df = pd.read_csv('../data/combined_inner.csv')
for label in yAlpha:
    positiveAlpha = df.loc[df[label] >= 0]
    beatMean = round(positiveAlpha[label].mean(), 3)
    overallMean = round(df[label].mean(), 3)
    overallMedian = round(df[label].median(), 3)
    print(f'{label}: {beatMean}, overall mean: {overallMean}, median: {overallMedian}')


alpha1Year: 0.255, overall mean: 0.025, median: -0.004
adjustedAlpha1Year: 0.255, overall mean: 0.027, median: -0.002
equalAlpha1Year: 0.246, overall mean: -0.001, median: -0.026
equalAdjustedAlpha1Year: 0.245, overall mean: -0.001, median: -0.026
alpha2Year: 0.456, overall mean: 0.035, median: -0.048
adjustedAlpha2Year: 0.452, overall mean: 0.037, median: -0.045
equalAlpha2Year: 0.46, overall mean: -0.004, median: -0.091
equalAdjustedAlpha2Year: 0.457, overall mean: -0.004, median: -0.09


Note: the 2-year figures are total returns over the whole two-year holding period, not annualized returns (CAGR), so they cannot be compared one-to-one with the 1-year figures.

In [3]:

# For each alpha label: load the saved model and its results CSV, then compare
# the label's statistics over all rows against the rows the model predicts as 1.
for label in yAlpha:
    # NOTE(review): joblib.load unpickles arbitrary objects - only load model
    # files that come from a trusted source.
    model = load('./models2/' + label + '.joblib')
    testedData = pd.read_csv('../data/svm_results/' + label + '.csv')

    X = testedData[ratioKeys + relativeRatioKeys]
    # TODO: adjust/test cut off for another svm batch - DONE in general_learning.ipynb
    # Binary target: 1 when the label value is strictly positive, otherwise 0.
    y = np.where(testedData[label] <= 0, 0, 1)

    # Aggregate once and reuse; the baseline mean is needed again for the diff.
    allValues = testedData[label]
    dataMean = allValues.mean()
    display(Markdown(f'**Results for key: {label}**'))
    print(f'data count: {allValues.count()}')
    print(f'data mean: {dataMean}')
    print(f'data median: {allValues.median()}')

    print('====TEST=====')
    y_pred = model.predict(X)
    # Rows predicted as class 1. X is a column subset of testedData, so the
    # prediction mask lines up with testedData row for row and can be applied
    # directly (no detour through index intersection needed).
    results = testedData[y_pred == 1]

    resMean = results[label].mean()
    print(f'res count {results[label].count()}')
    print(f'res mean {resMean}')
    print(f'res median {results[label].median()}')

    # Rows are the true classes (0, 1), columns the predicted classes.
    print(confusion_matrix(y, y_pred))

    diff = resMean - dataMean
    display(Markdown(f'**diff: {diff}**'))
    if '2' in label:
        # Only labels containing '2' get the excess return annualized as
        # sqrt(1 + diff) - 1.
        growth = diff + 1
        # math.sqrt raises ValueError for negative input, which would abort the
        # whole loop; report NaN for that label instead and keep going.
        cagrToBenchmark = sqrt(growth) - 1 if growth >= 0 else float('nan')
        print(f'CAGR: {cagrToBenchmark}')
    print('-------------------------------------------------')

# TODO: iterate over y cutoffs. Train each cutoff with gridCV. Train linearSVC first, and check how much faster it is.
# TODO: maybe use (Linear)SVR - regression, since we have linear regression

**Results for key: alpha1Year**

data count: 6053
data mean: 0.0300172769962029
data median: -0.0031354597826141
====TEST=====
res count 3191
res mean 0.08618092754640873
res median 0.0552135037242193
[[1746 1311]
 [1116 1880]]


**diff: 0.05616365055020583**

-------------------------------------------------


**Results for key: adjustedAlpha1Year**

data count: 6053
data mean: 0.03069154232727888
data median: -0.0027346138213903
====TEST=====
res count 3319
res mean 0.08724656949335886
res median 0.055379492847763
[[1663 1380]
 [1071 1939]]


**diff: 0.056555027166079984**

-------------------------------------------------


**Results for key: equalAlpha1Year**

data count: 6053
data mean: -0.0005140143373880067
data median: -0.0269341720292398
====TEST=====
res count 1304
res mean 0.12032397185252566
res median 0.07278516440849955
[[2775  505]
 [1974  799]]


**diff: 0.12083798618991368**

-------------------------------------------------


**Results for key: equalAdjustedAlpha1Year**

data count: 6053
data mean: 0.0022943754574641026
data median: -0.0252829451284293
====TEST=====
res count 1452
res mean 0.11743820774544717
res median 0.0676370123580763
[[2710  568]
 [1891  884]]


**diff: 0.11514383228798306**

-------------------------------------------------


**Results for key: alpha2Year**

data count: 6053
data mean: 0.03438167051225602
data median: -0.046537226661542
====TEST=====
res count 1896
res mean 0.23426657599418027
res median 0.120840753925159
[[2625  712]
 [1532 1184]]


**diff: 0.19988490548192425**

CAGR: 0.09539258053079958
-------------------------------------------------


**Results for key: adjustedAlpha2Year**

data count: 6053
data mean: 0.0308540392764738
data median: -0.0467539697115497
====TEST=====
res count 1683
res mean 0.2279131598860506
res median 0.1219773565401627
[[2698  613]
 [1672 1070]]


**diff: 0.1970591206095768**

CAGR: 0.09410196993222564
-------------------------------------------------


**Results for key: equalAlpha2Year**

data count: 6053
data mean: -0.006875623891206189
data median: -0.0953112579061575
====TEST=====
res count 690
res mean 0.3509518193232506
res median 0.1882448713570007
[[3402  201]
 [1961  489]]


**diff: 0.35782744321445675**

CAGR: 0.1652585306336345
-------------------------------------------------


**Results for key: equalAdjustedAlpha2Year**

data count: 6053
data mean: 0.002947332607081462
data median: -0.0894219371820089
====TEST=====
res count 668
res mean 0.3558874474355833
res median 0.16869667708090824
[[3380  219]
 [2005  449]]


**diff: 0.35294011482850185**

CAGR: 0.1631595397143515
-------------------------------------------------
