In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.pipeline import Pipeline
from skll.metrics import kappa
pd.set_option('display.max_columns', None)
sns.set(rc={'figure.figsize': (15.7, 8.27)})

- ensure smote is working correctly on categorical variables
- double check categorical stuff

We have some clean data. Now we can thing about modeling, starting with transformations.

0) Ensure categorical features are identified and encoded as such
	- All features are continuous with the exception of `Tendency` which is ordinal (and can still be treated as continuous)
1) We want to normalize everything that's scalar so that regularization works properly
2) We'll want to try out a PCA step since many inputs are correlated
3) Because of class imbalance, we'll want to at least think about options like SMOTE
### For training:
1) We have a smaller dataset, so a nested k-fold will work. The inner fold is for hyperparam optimization, the outer is for evaluating model performance. 
2) We have imbalanced classes so k-folding should be stratified by class
### For modeling, we need to choose classification models. 
1) we're dealing with an ordinal regression, not just classification. 
2) We need to think of smart error metrics

Goal: Add in learning curve with one model and see how scalable it is to use on a basked

In [3]:
df = pd.read_csv("./data/preprocessed_data.csv", index_col=0)

TARGET = "NSP"
FEATURES = df.columns.drop(TARGET)

# Reserve a "final final" test set
X = df[FEATURES]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [4]:
# class weight between None and "Balanced"
class_weight_intermediate = {1: 1, 2: 2, 3: 3}
class_weight_high = {1: 1, 2: 10, 3: 20}
model_params = {
    'logisticOVR': {
        'model__C': [1, 5, 10],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'logisticMN': {
        'model__C': [1, 5, 10],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'ordinal': {'model__alpha': [0, 0.5, 1, 2]},
    'RF': {
        'model__n_estimators': [1, 10, 20],
        'model__max_features': ['sqrt', None],
        'model__max_depth': [None, 5, 10, 20],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'lSVM': {
        'model__C': [1, 5, 10],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'rbfSVM': {
        'model__C': [1, 5, 10],
        'model__gamma': [0.001, 0.0001],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    },
    'NB': {'model__var_smoothing': [1e-08, 1e-10]},
    'KNN': {'model__n_neighbors': [3, 7, 10],
            'model__weights': ['uniform', 'distance']},
    'XGBoost': {
        'model__n_estimators': [1, 10, 20],
        'model__max_features': ['sqrt', None],
        'model__max_depth': [None, 5, 10, 20],
        'model__class_weight': ['balanced', None, class_weight_intermediate, class_weight_high]
    }
}

pipe_params = {
    'PCA': [None, PCA(), PCA(15), PCA(10), PCA(5)],
    'smote': [None, SMOTE()]
}

In [5]:
# error metrics for each class of fetal state
f1Scorer1 = make_scorer(lambda x, y: f1_score(x, y, average=None)[0])
f1Scorer2 = make_scorer(lambda x, y: f1_score(x, y, average=None)[1])
f1Scorer3 = make_scorer(lambda x, y: f1_score(x, y, average=None)[2])
precScorer1 = make_scorer(lambda x, y: precision_score(x, y, average=None)[0])
precScorer2 = make_scorer(lambda x, y: precision_score(x, y, average=None)[1])
precScorer3 = make_scorer(lambda x, y: precision_score(x, y, average=None)[2])
recallScorer1 = make_scorer(lambda x, y: recall_score(x, y, average=None)[0])
recallScorer2 = make_scorer(lambda x, y: recall_score(x, y, average=None)[1])
recallScorer3 = make_scorer(lambda x, y: recall_score(x, y, average=None)[2])

# Penalties for different types of class confusion
weights = np.array([
# Predicted   N    S    P     # True
            [0.0, 0.5, 0.6],  # N
            [1.0, 0.0, 0.3],  # S
            [2.0, 0.6, 0.0]   # P
])
def weightedKappa(x, y): return kappa(x, y, weights=weights)

kappaScorer = make_scorer(weightedKappa)

complete_scorer = {
    'f1-N': f1Scorer1, 'f1-S': f1Scorer2, 'f1-P': f1Scorer3,
    'prec-N': precScorer1, 'prec-S': precScorer2, 'prec-P': precScorer3,
    'recall-N': recallScorer1, 'recall-S': recallScorer2, 'recall-P': recallScorer3,
    'weighted-kappa': kappaScorer
}

In [6]:
def make_pipeline(model):
    return Pipeline(steps=[
               ('smote', SMOTE()),
               ('scaler', StandardScaler()),
               ('PCA', PCA()),
               ('model', model)
           ])

def make_inner_kfold(pipeline, preprocessing_grid, model_grid):
    full_grid = {}
    full_grid.update(preprocessing_grid)
    full_grid.update(model_grid)
    return GridSearchCV(pipeline, param_grid=full_grid, cv=3, n_jobs=-1, 
                        scoring=kappaScorer)

models = {
    "logisticOVR": LogisticRegression(max_iter=5000, multi_class='ovr'),
    "logisticMN": LogisticRegression(max_iter=5000, multi_class='multinomial'),
    # "ordinal": LogisticAT(),
    # "RF": RandomForestClassifier(),
    # "lSVM": svm.SVC(kernel="linear"),
    # "rbfSVM": svm.SVC(kernel="rbf"),
    # "NB": GaussianNB(),
    # "KNN": KNeighborsClassifier(),
    # "XGBoost": XGBClassifier()
}

piped_models = {
    key: make_pipeline(model) 
    for key, model in models.items()
}

piped_folded_models = {
    key: make_inner_kfold(pipe, pipe_params, model_params[key])
    for key, pipe in piped_models.items()
}


In [61]:
from sklearn.model_selection import GridSearchCV, cross_validate

scores = {}
for key, model in piped_folded_models.items():
    scores[key] = cross_validate(model, X=X_train, y=y_train, cv=5, scoring=complete_scorer, n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  f1-N: (test=0.913) f1-P: (test=0.704) f1-S: (test=0.667) prec-N: (test=0.986) prec-P: (test=0.568) prec-S: (test=0.574) recall-N: (test=0.851) recall-P: (test=0.926) recall-S: (test=0.795) weighted-kappa: (test=0.775) total time=  39.5s
[CV] END  f1-N: (test=0.914) f1-P: (test=0.793) f1-S: (test=0.638) prec-N: (test=0.986) prec-P: (test=0.719) prec-S: (test=0.514) recall-N: (test=0.851) recall-P: (test=0.885) recall-S: (test=0.841) weighted-kappa: (test=0.778) total time=  39.6s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   39.6s remaining:   59.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   39.8s remaining:   26.5s


[CV] END  f1-N: (test=0.928) f1-P: (test=0.800) f1-S: (test=0.660) prec-N: (test=0.978) prec-P: (test=0.706) prec-S: (test=0.574) recall-N: (test=0.883) recall-P: (test=0.923) recall-S: (test=0.778) weighted-kappa: (test=0.792) total time=  39.8s
[CV] END  f1-N: (test=0.927) f1-P: (test=0.714) f1-S: (test=0.661) prec-N: (test=0.986) prec-P: (test=0.667) prec-S: (test=0.544) recall-N: (test=0.875) recall-P: (test=0.769) recall-S: (test=0.841) weighted-kappa: (test=0.786) total time=  40.7s
[CV] END  f1-N: (test=0.926) f1-P: (test=0.731) f1-S: (test=0.673) prec-N: (test=0.973) prec-P: (test=0.760) prec-S: (test=0.551) recall-N: (test=0.883) recall-P: (test=0.704) recall-S: (test=0.864) weighted-kappa: (test=0.753) total time=  41.0s
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   41.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   41.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[CV] END  f1-N: (test=0.930) f1-P: (test=0.807) f1-S: (test=0.655) prec-N: (test=0.986) prec-P: (test=0.742) prec-S: (test=0.545) recall-N: (test=0.880) recall-P: (test=0.885) recall-S: (test=0.818) weighted-kappa: (test=0.806) total time=  50.8s
[CV] END  f1-N: (test=0.932) f1-P: (test=0.762) f1-S: (test=0.667) prec-N: (test=0.986) prec-P: (test=0.667) prec-S: (test=0.574) recall-N: (test=0.883) recall-P: (test=0.889) recall-S: (test=0.795) weighted-kappa: (test=0.803) total time=  51.6s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   51.6s remaining:  1.3min


[CV] END  f1-N: (test=0.924) f1-P: (test=0.691) f1-S: (test=0.631) prec-N: (test=0.973) prec-P: (test=0.655) prec-S: (test=0.530) recall-N: (test=0.879) recall-P: (test=0.731) recall-S: (test=0.778) weighted-kappa: (test=0.748) total time=  52.7s


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   52.7s remaining:   35.1s


[CV] END  f1-N: (test=0.929) f1-P: (test=0.727) f1-S: (test=0.646) prec-N: (test=0.957) prec-P: (test=0.690) prec-S: (test=0.582) recall-N: (test=0.903) recall-P: (test=0.769) recall-S: (test=0.727) weighted-kappa: (test=0.749) total time=  53.0s
[CV] END  f1-N: (test=0.949) f1-P: (test=0.784) f1-S: (test=0.694) prec-N: (test=0.963) prec-P: (test=0.833) prec-S: (test=0.630) recall-N: (test=0.935) recall-P: (test=0.741) recall-S: (test=0.773) weighted-kappa: (test=0.791) total time=  53.7s
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   53.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   53.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
Parameters: { "class_weight", "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "class_weight", "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "class_weight", "max_features" } might not be used.

  This could be a false alarm, wit

In [None]:
scores

{'logisticOVR': {'fit_time': array([0.03958917, 0.03519082, 0.03857303, 0.04261994, 0.08312702]),
  'score_time': array([0., 0., 0., 0., 0.]),
  'test_f1-N': array([nan, nan, nan, nan, nan]),
  'test_f1-S': array([nan, nan, nan, nan, nan]),
  'test_f1-P': array([nan, nan, nan, nan, nan]),
  'test_prec-N': array([nan, nan, nan, nan, nan]),
  'test_prec-S': array([nan, nan, nan, nan, nan]),
  'test_prec-P': array([nan, nan, nan, nan, nan]),
  'test_recall-N': array([nan, nan, nan, nan, nan]),
  'test_recall-S': array([nan, nan, nan, nan, nan]),
  'test_recall-P': array([nan, nan, nan, nan, nan]),
  'test_weighted-kappa': array([nan, nan, nan, nan, nan])},
 'logisticMN': {'fit_time': array([0.04826474, 0.11896491, 0.04307222, 0.03440714, 0.045578  ]),
  'score_time': array([0., 0., 0., 0., 0.]),
  'test_f1-N': array([nan, nan, nan, nan, nan]),
  'test_f1-S': array([nan, nan, nan, nan, nan]),
  'test_f1-P': array([nan, nan, nan, nan, nan]),
  'test_prec-N': array([nan, nan, nan, nan, nan])