In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import os
os.chdir('/home/jovyan/work/ufc/')

In [161]:
import pandas as pd
import numpy as np

import os
import time

import click
import joblib
import json

from ruamel.yaml import YAML

import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from src.feature_selection_funcs import BFS2Thres


conf = YAML().load(open('params.yaml'))
SEED=conf['seed']
np.random.seed(SEED)

In [80]:
DN = conf['train_eval']['report_dn']

df = pd.read_csv(conf['feat_shortlist_sel']['feat_short_fn'])
nfeats = ['event', 'fighter', 'opponent', 'event_day', 'target', 'split']
feats = [it for it in df.columns if not it in nfeats]

In [81]:
fake_split = lambda x: [(x[x.split=='tr'].index.tolist(), x[x.split=='val'].index.tolist())]

# LogisticRegression

In [169]:
maxiter = 1000
# для b значимые в начале, не меняем порядок
selector = BFS2Thres(Pipeline(steps=[('sc', RobustScaler()), ('clf', LogisticRegression(max_iter=maxiter, random_state=conf['seed']))]), scoring='roc_auc', cv = fake_split(df), thresh = 0.01, 
                     direction='b')
selector.fit(df[feats], df['target'], verbose=False)
 
feats_new = selector.feat_ar[selector.mask]


In [170]:
len(feats_new), len(feats)

(8, 66)

In [173]:
model = Pipeline(steps=[('sc', RobustScaler()), ('clf', LogisticRegression(solver='liblinear', random_state=conf['seed']))])

params_d = {'clf__C':np.logspace(start=-10, stop=7, num=100, base=2), 'clf__penalty':['l1', 'l2']}

gr_s = RandomizedSearchCV(model, param_distributions=params_d, n_iter=100, scoring='roc_auc', cv=fake_split(df), random_state=SEED)
gr_s.fit(df[feats_new], df['target'])

print(gr_s.best_score_), print(gr_s.best_params_)

0.6837353958694918
{'clf__penalty': 'l2', 'clf__C': 5.146659592037726}


(None, None)

In [174]:
model = gr_s.best_estimator_
for idx_tr, idx_val in fake_split(df):
    model.fit(df.loc[idx_tr, feats_new], df.loc[idx_tr, 'target'])

roc_auc_score(df.loc[idx_tr, 'target'], model.predict_proba(df.loc[idx_tr, feats_new])[:,1]), roc_auc_score(df.loc[idx_val, 'target'], model.predict_proba(df.loc[idx_val, feats_new])[:,1])

(0.6286376953124999, 0.6837353958694918)

# HistGradientBoostingClassifier

In [165]:
# для b значимые в начале, не меняем порядок
selector = BFS2Thres(HistGradientBoostingClassifier(random_state=SEED), scoring='roc_auc', cv = fake_split(df), thresh = 0.02, 
                     direction='b')

selector.fit(df[feats], df['target'], verbose=False)
 
feats_new = selector.feat_ar[selector.mask]


In [166]:
len(feats_new), len(feats)

(8, 66)

In [167]:
model = HistGradientBoostingClassifier(random_state=SEED)

params_d = {'max_depth':[3, 5, 7, 10,14], 'learning_rate': [1e-4, 1e-3,1e-2, 1e-1], 'l2_regularization': [1e-4, 1e-3,1e-2, 1e-1],
            'max_iter':np.arange(50, 350, 50), 'min_samples_leaf':np.arange(1, 10, 1), 'max_features': [0.7, 0.8, 0.9]}


gr_s = RandomizedSearchCV(model, param_distributions=params_d, n_iter=100, scoring='roc_auc', cv=fake_split(df), random_state=SEED)
gr_s.fit(df[feats_new], df['target'])

print(gr_s.best_score_), print(gr_s.best_params_)

0.6417954781069086
{'min_samples_leaf': 9, 'max_iter': 50, 'max_features': 0.9, 'max_depth': 10, 'learning_rate': 0.01, 'l2_regularization': 0.0001}


(None, None)

In [168]:
model = gr_s.best_estimator_
for idx_tr, idx_val in fake_split(df):
    model.fit(df.loc[idx_tr, feats_new], df.loc[idx_tr, 'target'])

roc_auc_score(df.loc[idx_tr, 'target'], model.predict_proba(df.loc[idx_tr, feats_new])[:,1]), roc_auc_score(df.loc[idx_val, 'target'], model.predict_proba(df.loc[idx_val, feats_new])[:,1])

(0.7490540907118056, 0.6417954781069086)

# Tree

In [None]:
# для b значимые в начале, не меняем порядок
selector = BFS2Thres(HistGradientBoostingClassifier(random_state=SEED), scoring='roc_auc', cv = fake_split(df), thresh = 0.02, 
                     direction='b')

selector.fit(df[feats], df['target'], verbose=False)
 
feats_new = selector.feat_ar[selector.mask]

In [None]:
params = [{'reg':[DecisionTreeClassifier()], 'reg__max_depth':[3, 5, 7,10,14]},
          {'reg':[RandomForestClassifier(),ExtraTreesClassifier()],
           'reg__max_depth':[3, 5, 7, 10,14], 'reg__n_estimators':np.arange(50,500,10),
           'reg__min_samples_leaf':[1,2,3,4]},
         ]

# LinearSVC

In [148]:
maxiter = 1000
# для b значимые в начале, не меняем порядок

selector = BFS2Thres(Pipeline(steps=[('sc', RobustScaler()), ('clf', LinearSVC(max_iter=maxiter*10, random_state=SEED))]), scoring='roc_auc', cv = fake_split(df), thresh = 0.001, 
                     direction='b')
selector.fit(df[feats], df['target'], verbose=False)
 
feats_new = selector.feat_ar[selector.mask]
  

In [149]:
len(feats_new), len(feats)

(29, 66)

In [150]:
model = Pipeline(steps=[('sc', RobustScaler()), ('clf', LinearSVC(max_iter=maxiter*10, random_state=SEED))])

params_d = {'clf__C':np.logspace(start=-10, stop=5, num=100, base=2), 'clf__penalty':['l1', 'l2']}

gr_s = RandomizedSearchCV(model, param_distributions=params_d, n_iter=100, scoring='roc_auc', cv=fake_split(df), random_state=SEED)
gr_s.fit(df[feats_new], df['target'])

print(gr_s.best_score_), print(gr_s.best_params_)

0.7204888816410209
{'clf__penalty': 'l2', 'clf__C': 13.812272411144424}


(None, None)

In [151]:
model = CalibratedClassifierCV(estimator=Pipeline(steps=[('sc', RobustScaler()), ('clf', LinearSVC(max_iter=maxiter*10, random_state=SEED))]), method='sigmoid', cv=5, n_jobs=-1, ensemble=False)
for idx_tr, idx_val in fake_split(df):
    model.fit(df.loc[idx_tr, feats_new], df.loc[idx_tr, 'target'])

roc_auc_score(df.loc[idx_tr, 'target'], model.predict_proba(df.loc[idx_tr, feats_new])[:,1]), roc_auc_score(df.loc[idx_val, 'target'], model.predict_proba(df.loc[idx_val, feats_new])[:,1])

(0.6611268446180556, 0.7204684970121957)

# GaussianNB

In [118]:
maxiter = 1000
# для b значимые в начале, не меняем порядок
selector = BFS2Thres(Pipeline(steps=[('sc', RobustScaler()), ('clf', GaussianNB())]), scoring='roc_auc', cv = fake_split(df), thresh = 0.01, 
                     direction='b')

selector.fit(df[feats], df['target'], verbose=False)

feats_new = selector.feat_ar[selector.mask]

In [119]:
len(feats_new), len(feats)

(4, 66)

In [120]:
model = Pipeline(steps=[('sc', RobustScaler()), ('clf', GaussianNB())])
for idx_tr, idx_val in fake_split(df):
    model.fit(df.loc[idx_tr, feats_new], df.loc[idx_tr, 'target'])

roc_auc_score(df.loc[idx_tr, 'target'], model.predict_proba(df.loc[idx_tr, feats_new])[:,1]), roc_auc_score(df.loc[idx_val, 'target'], model.predict_proba(df.loc[idx_val, feats_new])[:,1])

(0.6959461805555556, 0.6436970727673008)