In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline


ratioKeys = ['psRatio', 'peRatio', 'priceToBook', 'evToEbitda', 'evToEbit', 'priceToFreeCashFlow', 'evToSales', 'evToGrossProfit', 'priceToGrossProfit']
yKeys = ['yAdjustedPriceSpinoffExcl1Year', 'yAdjustedPrice1Year', 'yPrice1Year']
y2Keys = ['yAdjustedPriceSpinoffExcl2Year', 'yAdjustedPrice2Year', 'yPrice2Year']
yAlpha = ['alpha1Year','adjustedAlpha1Year','equalAlpha1Year','equalAdjustedAlpha1Year','alpha2Year','adjustedAlpha2Year','equalAlpha2Year','equalAdjustedAlpha2Year']

numericalKeys = ratioKeys + yKeys + y2Keys
relativeRatioKeys = [x + '_relative' for x in ratioKeys]

In [8]:
df = pd.read_csv('../data/combined_inner.csv')
X = df[ratioKeys + relativeRatioKeys]
# yLabels = yKeys + y2Keys + yAlpha

for yLabel in yAlpha:
    yContinuous = df[yLabel]
    # TODO: adjust/test cut off
    y = np.where(yContinuous <= 0, 0, 1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    pipeline = Pipeline([
        ('scaler', StandardScaler()), 
        ('svc', SVC()) 
    ])
    param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto'],
    'svc__kernel': ['rbf', 'linear']
    }

    grid = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
    grid.fit(X_train, y_train)
    
    y_pred = grid.predict(X_test)
    print("Best parameters:", grid.best_params_)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    # TODO: Do I train this for each y column?
    break

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=  24.6s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=  24.4s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=  23.8s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=  11.5s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=  11.2s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=  11.1s
[CV] END .......svc__C=0.1, svc__gamma=auto, svc__kernel=rbf; total time=  24.9s
[CV] END .......svc__C=0.1, svc__gamma=auto, svc__kernel=rbf; total time=  24.1s
[CV] END .......svc__C=0.1, svc__gamma=auto, svc__kernel=rbf; total time=  24.0s
[CV] END ....svc__C=0.1, svc__gamma=auto, svc__kernel=linear; total time=  11.5s
[CV] END ....svc__C=0.1, svc__gamma=auto, svc__kernel=linear; total time=  11.2s
[CV] END ....svc__C=0.1, svc__gamma=auto, svc__k

In [13]:
y = np.where(df['adjustedAlpha1Year'] <= 0, 0, 1)
print('0: ' + str(len(y[y==0])) + ', %: ' + str(len(y[y==0])/len(y)))
print('1: ' + str(len(y[y==1])))

0: 15211, %: 0.5026601896830905
1: 15050
