In [2]:
from headers import *
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from utils.custom_train_test_split import custom_train_test_split
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline
from joblib import dump, load
from IPython.display import Markdown, display
from math import sqrt

Obserwacja:
Podział train/test per firma daje modelom dużo trudniejsze zadanie.
Wygląda więc na to, że dane w obrębie jednej firmy są od siebie dość silnie zależne. Widać to szczególnie dla wartości cutoff > 0.1,
gdzie SVM i DTree obecnie raczej się przeuczają (overfit), niż faktycznie trafiają.
Wcześniej było inaczej.

Można jeszcze poeksperymentować z krzywą ROC i predict_proba albo z wagami klas, żeby spróbować dostosować SVM do większych wartości cutoff.
Obecnie przy dużym cutoff model zwraca tylko 0–3 predykcje True.

In [3]:
# Raw combined dataset.
# NOTE(review): the evaluation below no longer reads from `df`; it is still loaded
# so that any other cell referencing `df` keeps working.
df = pd.read_csv('../data/combined_inner.csv')

# Alpha thresholds: a row is labelled positive (1) when its alpha is strictly above the cutoff.
cutoffs = [0.0, 0.01, 0.03, 0.05, 0.07, 0.10, 0.15, 0.20, 0.25, 0.50]

trainDf, testDf = custom_train_test_split(['../data/combined_inner_ticker.csv', '../data/extra-data/combined_inner_ticker.csv'])
X_train = trainDf[ratioKeys + relativeRatioKeys]
X_test = testDf[ratioKeys + relativeRatioKeys]

# Fixed seed so repeated runs of this cell give the same models.
SEED = 42

# Built once: GridSearchCV clones the estimator it is given, so these templates
# are never fitted themselves and can be reused for every (label, cutoff) pair.
pipelines = {
    'tree': Pipeline([
        ('scaler', StandardScaler()),
        ('tree', DecisionTreeClassifier(random_state=SEED))
    ]),
    'svc': Pipeline([
        ('scaler', StandardScaler()),
        # dual=False is required for penalty='l1' with the default squared_hinge loss;
        # with dual=True those grid candidates are rejected by LinearSVC.
        ('svc', LinearSVC(dual=False, random_state=SEED))
    ])
}
# Parameter grids, in the same order as `pipelines` (they are paired with zip below).
params = [
    {
        'tree__criterion': ["gini", "entropy", "log_loss"],
    },
    {
        'svc__C': [0.1, 0.5, 1.0, 5.0, 10.0],
        'svc__penalty': ['l1', 'l2']
    }
]

for yLabel in yAlpha:
    # Realised values of the target on the test rows; does not depend on cutoff or model.
    testReturns = testDf[yLabel]
    totalCount = testReturns.count()
    # Optional export of the test rows:
    # testDf.to_csv('../data/general_results/' + yLabel + '.csv', index=False)

    for cutoff in cutoffs:
        y_train = np.where(trainDf[yLabel] <= cutoff, 0, 1)
        # Not used by the report below; available for scoring predictions against labels.
        y_test = np.where(testDf[yLabel] <= cutoff, 0, 1)

        for (key, pipeline), param_grid in zip(pipelines.items(), params):
            # NOTE(review): scoring is left at the estimator default; for heavily
            # imbalanced labels (large cutoffs) a different scorer may be more suitable.
            grid = GridSearchCV(pipeline,
                        param_grid,
                        cv=StratifiedKFold(3),
                        verbose=1,
                        refit=True,
                        n_jobs=-1
                        )
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            # Select the predicted-positive rows positionally from testDf itself.
            # Looking index labels up in `df` (a different file) could silently
            # pick unrelated rows.
            takenReturns = testReturns[y_pred == 1]
            takenCount = takenReturns.count()
            # Scale to percent before formatting to avoid output like 1.1199999999999999%.
            prc = 100 * takenCount / totalCount if totalCount else float('nan')
            print(f'For {yLabel}, yCutoff = {cutoff}, model - {key} mean was {takenReturns.mean()}, taken used - {takenCount} is {prc:.2f}%')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.01, model - tree mean was 0.02100151497813131, taken used - 2894 is 49.82%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.01, model - svc mean was 0.04827342435064167, taken used - 1767 is 30.42%
Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.03, model - tree mean was 0.03319044508554306, taken used - 2635 is 45.36%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.03, model - svc mean was 0.08227992001688397, taken used - 318 is 5.47%
Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.05, model - tree mean was 0.02270206689708208, taken used - 2552 is 43.93%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.05, model - svc mean was -0.017878307077527734, taken used - 65 is 1.1199999999999999%
Fitting 3 fold

In [ ]:
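# TODO: Adjust class_weight to favor the positive class.
# TODO: Manually adjust the decision threshold, e.g.:
# probs = model.predict_proba(X_test)[:, 1]
# preds = (probs > 0.3).astype(int)  # Lower threshold from default 0.5
# NOTE: LinearSVC has no predict_proba; threshold decision_function scores instead,
# or wrap the model in CalibratedClassifierCV to obtain probabilities.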