In [21]:
from headers import *
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline
from joblib import dump, load
from IPython.display import Markdown, display
from math import sqrt

In [22]:
# Train a decision tree and a linear SVM for every (target column, cutoff)
# pair and report the mean realised target value of the test rows that each
# model classifies as 1 ("taken").
#
# `ratioKeys`, `relativeRatioKeys` and `yAlpha` are not defined in this
# notebook; they arrive through the `from headers import *` star-import.
df = pd.read_csv('../data/combined_inner.csv')
X = df[ratioKeys + relativeRatioKeys]

# Thresholds used to binarise each continuous target:
# value <= cutoff -> class 0, anything else -> class 1.
cutoffs = [0.01, 0.03, 0.05, 0.07, 0.10, 0.15, 0.20, 0.25, 0.50]

# Unfitted model templates. GridSearchCV clones the estimator it is given, so
# these objects are never fitted themselves and can be built once, outside the
# loops.
pipelines = {
    'tree': Pipeline([
        ('scaler', StandardScaler()),
        # Fixed seed: the tree permutes features at every split, so without it
        # tied splits can resolve differently from run to run.
        ('tree', DecisionTreeClassifier(random_state=2))
    ]),
    'svc': Pipeline([
        ('scaler', StandardScaler()),
        # dual=False is required for penalty='l1' with the default
        # squared-hinge loss; with the dual solver those candidates fail to
        # fit and are scored NaN, so half of the grid would never be evaluated.
        ('svc', LinearSVC(dual=False))
    ])
}

# Hyper-parameter grids keyed by the same names as `pipelines`, so a pipeline
# can never be paired with the wrong grid by accident of ordering.
params = {
    'tree': {
        'tree__criterion': ["gini", "entropy", "log_loss"],
    },
    'svc': {
        'svc__C': [0.1, 0.5, 1.0, 5.0, 10.0],
        'svc__penalty': ['l1', 'l2']
    }
}

for yLabel in yAlpha:
    yContinuous = df[yLabel]
    for cutoff in cutoffs:
        # NOTE(review): a NaN target compares False against the cutoff and is
        # therefore labelled class 1 - confirm the target columns contain no
        # missing values.
        y = np.where(yContinuous <= cutoff, 0, 1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=2, stratify=y
        )

        # Full rows of the held-out split; identical for every model, so it is
        # computed once per split rather than once per model.
        wholeTestData = df.loc[df.index.intersection(X_test.index)]
        # wholeTestData.to_csv('../data/general_results/' + yLabel + '.csv', index=False)
        testCount = wholeTestData.loc[:, yLabel].count()

        for key, pipeline in pipelines.items():
            grid = GridSearchCV(pipeline,
                        params[key],
                        cv=StratifiedKFold(3),
                        verbose=1,
                        refit=True,
                        n_jobs=-1
                        )
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            # Test rows the refitted best model assigns to class 1.
            xTaken = X_test[y_pred == 1]
            takenData = df.loc[df.index.intersection(xTaken.index)]
            takenCount = takenData.loc[:, yLabel].count()

            # Round the percentage itself; rounding the fraction and then
            # multiplying by 100 re-introduces float noise such as
            # 47.980000000000004.
            prc = round(100 * takenCount / testCount, 2)
            print(f'For {yLabel}, yCutoff = {cutoff}, model - {key} mean was {takenData.loc[:, yLabel].mean()}, taken used - {takenCount} is {prc}%')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.01, model - tree mean was 0.16808197026062494, taken used - 2904 is 47.980000000000004%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.01, model - svc mean was 0.07869735028071999, taken used - 1552 is 25.64%
Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.03, model - tree mean was 0.16377100845361955, taken used - 2698 is 44.57%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.03, model - svc mean was 0.07983834421668513, taken used - 443 is 7.32%
Fitting 3 folds for each of 3 candidates, totalling 9 fits
For alpha1Year, yCutoff = 0.05, model - tree mean was 0.19635757097166942, taken used - 2464 is 40.71%
Fitting 3 folds for each of 10 candidates, totalling 30 fits
For alpha1Year, yCutoff = 0.05, model - svc mean was 0.16617620414028872, taken used - 105 is 1.73%
Fitting 3 folds 