In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import seaborn as sns
from mord import LogisticAT
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
from kappa_loss_perceptron import KappaLossPerceptron
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.pipeline import Pipeline
from skll.metrics import kappa
pd.set_option('display.max_columns', None)
sns.set(rc={'figure.figsize': (15.7, 8.27)})

In [2]:
from sklearn.base import is_classifier
LogisticAT._estimator_type = "classifier" # type: ignore
assert is_classifier(LogisticAT) 
assert is_classifier(KappaLossPerceptron)

TODO: 
- properly collect model outputs from nested kfold loop
	- doesn't matter at this stage
- ensure stratification
	- default for gridsearch, if classifier, had to reimpliment for perceptron AND mord
	- ![](2021-12-10-10-31-54.png) 
- reunderstand classweight
- double check categorical/1-hot values are working correctly

- MAYBE try custom loss for xgboost IF THERE's time
https://towardsdatascience.com/custom-loss-functions-for-gradient-boosting-f79c1b40466d

We have some clean data. Now we can thing about modeling, starting with transformations.

0) Ensure categorical features are identified and encoded as such
	- All features are continuous with the exception of `Tendency` which is ordinal (and can still be treated as continuous)
1) We want to normalize everything that's scalar so that regularization works properly
2) We'll want to try out a PCA step since many inputs are correlated
3) Because of class imbalance, we'll want to at least think about options like SMOTE
### For training:
1) We have a smaller dataset, so a nested k-fold will work. The inner fold is for hyperparam optimization, the outer is for evaluating model performance. 
2) We have imbalanced classes so k-folding should be stratified by class
### For modeling, we need to choose classification models. 
1) we're dealing with an ordinal regression, not just classification. 
2) We need to think of smart error metrics

Goal: Add in learning curve with one model and see how scalable it is to use on a basked

Sometimes when you want to debug, you just a get a lot of "abstract" data types. I found it's helpful to temporarily disable JIT

In [3]:
df = pd.read_csv("./data/preprocessed_data.csv", index_col=0)

TARGET = "NSP"
FEATURES = df.columns.drop(TARGET)

# Reserve a "final final" test set
X = df[FEATURES].values
y = df[TARGET].values.astype(int) - 1
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [4]:
# class weight between None and "Balanced"
class_weight_intermediate = {0: 1, 1: 2, 2: 3}
class_weight_high = {0: 1, 1: 10, 2: 20}
model_params = {
    'logisticOVR': {
        'model__C': [1, 5, 10],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'logisticMN': {
        'model__C': [1, 5, 10],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'kappaPerceptron': {
    },
    'ordinal': {'model__alpha': [0, 0.5, 1, 2]},
    'RF': {
        'model__n_estimators': [1, 10, 20],
        'model__max_features': ['sqrt', None],
        'model__max_depth': [None, 5, 10, 20],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'rbfSVM': {
        'model__C': [1, 5, 10],
        'model__gamma': [0.001, 0.0001],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'KNN': {'model__n_neighbors': [3, 7, 10],
            'model__weights': ['uniform', 'distance']},
    'XGBoost': {
        'model__n_estimators': [1, 10, 20],
        'model__max_features': ['sqrt', None],
        'model__max_depth': [None, 5, 10, 20],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    }
}

pipe_params = {
    'PCA': [None, PCA(), PCA(15), PCA(10), PCA(5)],
    'smote': [None, SMOTE()]
}

In [5]:
# error metrics for each class of fetal state
f1Scorer1 = make_scorer(lambda x, y: f1_score(x, y, average=None)[0])
f1Scorer2 = make_scorer(lambda x, y: f1_score(x, y, average=None)[1])
f1Scorer3 = make_scorer(lambda x, y: f1_score(x, y, average=None)[2])
precScorer1 = make_scorer(lambda x, y: precision_score(x, y, average=None)[0])
precScorer2 = make_scorer(lambda x, y: precision_score(x, y, average=None)[1])
precScorer3 = make_scorer(lambda x, y: precision_score(x, y, average=None)[2])
recallScorer1 = make_scorer(lambda x, y: recall_score(x, y, average=None)[0])
recallScorer2 = make_scorer(lambda x, y: recall_score(x, y, average=None)[1])
recallScorer3 = make_scorer(lambda x, y: recall_score(x, y, average=None)[2])

# Penalties for different types of class confusion
weights = np.array([
# Predicted   N    S    P     # True
            [0.0, 0.5, 0.6],  # N
            [1.0, 0.0, 0.3],  # S
            [2.0, 0.6, 0.0]   # P
])
def weightedKappa(x, y): return kappa(x, y, weights=weights)

kappaScorer = make_scorer(weightedKappa)

complete_scorer = {
    'f1-N': f1Scorer1, 'f1-S': f1Scorer2, 'f1-P': f1Scorer3,
    'prec-N': precScorer1, 'prec-S': precScorer2, 'prec-P': precScorer3,
    'recall-N': recallScorer1, 'recall-S': recallScorer2, 'recall-P': recallScorer3,
    'weighted-kappa': kappaScorer
}

In [9]:
def make_pipeline(model):
    return Pipeline(steps=[
               ('smote', SMOTE()),
               ('scaler', StandardScaler()),
               ('PCA', PCA()),
               ('model', model)
           ])

def make_inner_kfold(pipeline, preprocessing_grid, model_grid):
    full_grid = {}
    full_grid.update(preprocessing_grid)
    full_grid.update(model_grid)
    return GridSearchCV(pipeline, param_grid=full_grid, cv=2, n_jobs=-1, 
                        scoring=kappaScorer)

models = {
    "logisticOVR": LogisticRegression(max_iter=5000, multi_class='ovr'),
    "logisticMN": LogisticRegression(
        max_iter=5000, multi_class='multinomial'
    ),
    "kappaPerceptron": KappaLossPerceptron(
        num_classes=3, weight_matrix=weights, max_iter=5000
    ),
    "ordinal": LogisticAT(),
    "RF": RandomForestClassifier(),
    "rbfSVM": svm.SVC(kernel="rbf"),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}

piped_models = {
    key: make_pipeline(model) 
    for key, model in models.items()
}

piped_folded_models = {
    key: make_inner_kfold(pipe, pipe_params, model_params[key])
    for key, pipe in piped_models.items()
}

In [10]:
piped_models['XGBoost'].fit(X=X_train, y=y_train)

Pipeline(steps=[('smote', SMOTE()), ('scaler', StandardScaler()),
                ('PCA', PCA()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='mlogloss', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=12, num_parallel_tree=1,
                               objective='multi:softprob', predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=None, subsample

In [14]:
piped_folded_models['XGBoost'].best_estimator_['model']

XGBClassifier(base_score=0.5, booster='gbtree', class_weight=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=20,
              max_features='sqrt', min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1, ...)

In [10]:
scores = {}
for key, model in piped_folded_models.items():
    scores[key] = cross_validate(
        model, X=X_train, y=y_train, 
        cv=5, scoring=complete_scorer, n_jobs=-1, verbose=10
        )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
Stopping early after 11 iterations.
Stopping early after 11 iterations.
Stopping early after 11 iterations.
Stopping early after 203 iterations.
Stopping early after 211 iterations.
Stopping early after 24 iterations.
Stopping early after 20 iterations.
Stopping early after 199 iterations.
Stopping early after 23 iterations.
Stopping early after 28 iterations.
Stopping early after 11 iterations.
Stopping early after 11 iterations.
Stopping early after 11 iterations.
Stopping early after 11 iterations.
Stopping early after 143 iterations.
Stopping early after 198 iterations.
Stopping early

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   22.2s remaining:   33.3s


Stopping early after 208 iterations.
[CV] END  f1-N: (test=0.894) f1-P: (test=0.586) f1-S: (test=0.598) prec-N: (test=0.967) prec-P: (test=0.531) prec-S: (test=0.479) recall-N: (test=0.831) recall-P: (test=0.654) recall-S: (test=0.795) weighted-kappa: (test=0.671) total time=  22.5s


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   22.6s remaining:   15.1s


Stopping early after 239 iterations.
[CV] END  f1-N: (test=0.891) f1-P: (test=0.714) f1-S: (test=0.595) prec-N: (test=0.990) prec-P: (test=0.667) prec-S: (test=0.453) recall-N: (test=0.810) recall-P: (test=0.769) recall-S: (test=0.867) weighted-kappa: (test=0.730) total time=  23.0s
Stopping early after 228 iterations.
[CV] END  f1-N: (test=0.879) f1-P: (test=0.656) f1-S: (test=0.587) prec-N: (test=0.985) prec-P: (test=0.568) prec-S: (test=0.451) recall-N: (test=0.794) recall-P: (test=0.778) recall-S: (test=0.841) weighted-kappa: (test=0.703) total time=  23.4s
[CV] START .....................................................................
[CV] START .....................................................................


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  f1-N: (test=0.899) f1-P: (test=0.780) f1-S: (test=0.537) prec-N: (test=0.986) prec-P: (test=0.719) prec-S: (test=0.418) recall-N: (test=0.827) recall-P: (test=0.852) recall-S: (test=0.750) weighted-kappa: (test=0.744) total time=   8.0s
[CV] END  f1-N: (test=0.882) f1-P: (test=0.806) f1-S: (test=0.475) prec-N: (test=0.967) prec-P: (test=0.694) prec-S: (test=0.378) recall-N: (test=0.811) recall-P: (test=0.962) recall-S: (test=0.636) weighted-kappa: (test=0.701) total time=   8.0s
[CV] END  f1-N: (test=0.891) f1-P: (test=0.800) f1-S: (test=0.538) prec-N: (test=0.971) prec-P: (test=0.833) prec-S: (test=0.412) recall-N: (test=0.823) recall-P: (test=0.769) recall-S: (test=0.778) weighted-kappa: (test=0.710) total time=   8.0s
[CV] END  f1-N: 

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    8.1s remaining:   12.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.1s remaining:    5.4s


[CV] END  f1-N: (test=0.886) f1-P: (test=0.655) f1-S: (test=0.480) prec-N: (test=0.967) prec-P: (test=0.643) prec-S: (test=0.370) recall-N: (test=0.819) recall-P: (test=0.667) recall-S: (test=0.682) weighted-kappa: (test=0.674) total time=   8.6s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.7s finished


In [12]:
pd.DataFrame(scores)

NameError: name 'scores' is not defined