In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline
from joblib import dump, load
from headers import *
from IPython.display import Markdown, display
from math import sqrt

TRAINING

In [11]:
df = pd.read_csv('../data/combined_inner.csv')
X = df[ratioKeys + relativeRatioKeys]

for yLabel in yAlpha:
    yContinuous = df[yLabel]
    y = np.where(yContinuous <= 0, 0, 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', LinearSVC())
    ])

    param_grid = {
        'svc__C': [0.1, 0.5, 1.0, 5.0, 10.0],
        'svc__penalty': ['l1', 'l2']
    }

    grid = GridSearchCV(pipeline,
                        param_grid,
                        cv=StratifiedKFold(3),
                        verbose=1,
                        refit=True,
                        n_jobs=-1
                        )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    xTaken = X_test[y_pred == 1]
    wholeTestData = df.loc[df.index.intersection(X_test.index)]
    wholeTestData.to_csv('../data/linear_svm_results/' + yLabel + '.csv', index=False)

    takenData = df.loc[df.index.intersection(xTaken.index)]

    estimator = grid.best_estimator_
    dump(grid, '../data/linear_svm_models/' + yLabel + '.joblib')

    print(f'Results {yLabel} mean of: {takenData.loc[:, yLabel].mean()}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results alpha1Year mean of: 0.058401035094455864
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results adjustedAlpha1Year mean of: 0.05138855192309529
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results equalAlpha1Year mean of: 0.061566350069853566
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results equalAdjustedAlpha1Year mean of: 0.049143291866449
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results alpha2Year mean of: 0.1561368966435215
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results adjustedAlpha2Year mean of: 0.12291775356300404
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results equalAlpha2Year mean of: 0.23382518930691878
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results equalAdjustedAlpha2Year mean of: 0.8459305816939832


RESULTS OVERVIEW

In [15]:
for label in yAlpha:
    model = load('../data/linear_svm_models/' + label + '.joblib')
    testedData = pd.read_csv('../data/linear_svm_results/' + label + '.csv')

    X = testedData[ratioKeys + relativeRatioKeys]
    # TODO: adjust/test cut off for another svm batch
    y = np.where(testedData[label] <= 0, 0, 1)
    display(Markdown(f'**Results for key: {label}**'))
    print(f'data count: {testedData[label].count()}')
    print(f'data mean: {testedData[label].mean()}')
    print(f'data median: {testedData[label].median()}')

    print('====TEST=====')
    y_pred = model.predict(X)
    xTaken = X[y_pred == 1]
    results = testedData.loc[testedData.index.intersection(xTaken.index)]

    print(f'res count {results[label].count()}')
    print(f'res mean {results[label].mean()}')
    print(f'res median {results[label].median()}')

    print(confusion_matrix(y, y_pred))

    display(Markdown(f'**diff: {results[label].mean() - testedData[label].mean()}**'))
    if '2' in label:
        cagrToBenchmark = sqrt(results[label].mean() - testedData[label].mean() + 1) - 1
        print(f'CAGR: {cagrToBenchmark}')
    print('-------------------------------------------------')

# TODO: iterate over y cutoffs. Train each cutoff with gridCV. Train linearSVC first, and check how faster it is. 
# TODO: maybe use (Linear)SVR - regression, since we have linear regression

**Results for key: alpha1Year**

data count: 6053
data mean: 0.0300172769962029
data median: -0.0031354597826141
====TEST=====
res count 2768
res mean 0.058401035094455864
res median 0.012927754741060251
[[1726 1331]
 [1559 1437]]


**diff: 0.028383758098252963**

-------------------------------------------------


**Results for key: adjustedAlpha1Year**

data count: 6053
data mean: 0.03069154232727888
data median: -0.0027346138213903
====TEST=====
res count 2717
res mean 0.05138855192309529
res median 0.0074582397724619
[[1715 1328]
 [1621 1389]]


**diff: 0.020697009595816412**

-------------------------------------------------


**Results for key: equalAlpha1Year**

data count: 6053
data mean: -0.0005140143373880067
data median: -0.0269341720292398
====TEST=====
res count 851
res mean 0.061566350069853566
res median 0.0130280468493187
[[2872  408]
 [2330  443]]


**diff: 0.06208036440724157**

-------------------------------------------------


**Results for key: equalAdjustedAlpha1Year**

data count: 6053
data mean: 0.0022943754574641026
data median: -0.0252829451284293
====TEST=====
res count 806
res mean 0.049143291866449
res median 0.00733619785329025
[[2882  396]
 [2365  410]]


**diff: 0.0468489164089849**

-------------------------------------------------


**Results for key: alpha2Year**

data count: 6053
data mean: 0.03438167051225602
data median: -0.046537226661542
====TEST=====
res count 635
res mean 0.1561368966435215
res median 0.0364710608386151
[[3039  298]
 [2379  337]]


**diff: 0.12175522613126549**

CAGR: 0.059129466180251944
-------------------------------------------------


**Results for key: adjustedAlpha2Year**

data count: 6053
data mean: 0.0308540392764738
data median: -0.0467539697115497
====TEST=====
res count 662
res mean 0.12291775356300404
res median 0.03871233580331945
[[3010  301]
 [2381  361]]


**diff: 0.09206371428653023**

CAGR: 0.045018523417901024
-------------------------------------------------


**Results for key: equalAlpha2Year**

data count: 6053
data mean: -0.006875623891206189
data median: -0.0953112579061575
====TEST=====
res count 82
res mean 0.23382518930691878
res median 0.0651943971238062
[[3567   36]
 [2404   46]]


**diff: 0.24070081319812497**

CAGR: 0.11386750253256106
-------------------------------------------------


**Results for key: equalAdjustedAlpha2Year**

data count: 6053
data mean: 0.002947332607081462
data median: -0.0894219371820089
====TEST=====
res count 81
res mean 0.8459305816939832
res median 0.1581123758336411
[[3561   38]
 [2411   43]]


**diff: 0.8429832490869017**

CAGR: 0.35756519146849874
-------------------------------------------------
