In [19]:
import os
from os.path import join as oj
import sys, time
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from copy import deepcopy
import pickle as pkl
import pandas as pd
import data_pecarn
import data_psrc
import data
import train
import matplotlib.gridspec as grd
from data import feats_numerical, feats_categorical, meta, outcome_def
# sns.set(style="black")
# plt.style.use('dark_background')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# combined

In [17]:
df_pecarn, df_psrc, common_feats, filtered_feats_pecarn, filtered_feats_psrc = data.load_it_all(dummy=True)
df = df_pecarn[common_feats].append(df_psrc[common_feats])

processed_feats = data.select_final_feats(common_feats)
train_idxs = data.pecarn_train_idxs
test_idxs1 = data.pecarn_test_idxs
test_idxs2 = data.psrc_train_idxs + data.psrc_test_idxs
print(len(processed_feats), processed_feats)
print(train_idxs, test_idxs1, test_idxs2)

computing pecarn preprocessing...
computing psrc preprocessing...
26 ['MOI_Motorcycle/ATV/Scooter collision', 'AbdDistention_unknown', 'MOI_Motor vehicle collision', 'AbdTenderDegree_Severe', 'InitSysBPRange', 'GCSScore_Full_yes', 'ThoracicTrauma_yes', 'MOI_Bike collision/fall', 'MOI_unknown', 'InitHeartRate', 'RtCostalTender', 'MOI_Object struck abdomen', 'VomitWretch_yes', 'CostalTender', 'AbdTrauma_or_SeatBeltSign_yes', 'LtCostalTender', 'AbdTenderDegree_Moderate', 'MOI_Fall from an elevation', 'MOI_Pedestrian/bicyclist struck by moving vehicle', 'AbdDistention_yes', 'VomitWretch_unknown', 'AbdTenderDegree_Mild', 'Age', 'Hypotension_yes', 'AbdomenPain_yes', 'DecrBreathSound_yes']
[1, 2, 3, 4] [5, 6] [8, 9, 10, 11, 12, 13]


In [62]:
def get_sample_weights(balancing_ratio):
    # class weights
    class_weights = {0: 1, 1: balancing_ratio}
    sample_weights_class = pd.Series(df[outcome_def]).map(class_weights).values
    
    # weights for different risk populations
    age_discrete = pd.cut(df['Age'], bins=(-1, 4, 9, 1000), labels=['<5', '5-9', '>9']).values
    # we don't have sex for psrc, so just fill in 0 (only matters for training anyway)
    sex = pd.Series(np.hstack((df_pecarn['Sex_M'].values, np.zeros(df_psrc.shape[0])))).map({0: 'F', 1: 'M'}).values
    risk_identity = [(sex[i], age_discrete[i]) for i in range(age_discrete.shape[0])]

    risk_weights = {
        ('F', '<5'): 2, ('F', '5-9'): 1, ('F', '>9'): 1,
        ('M', '<5'): 1, ('M', '5-9'): 1, ('M', '>9'): 1                
    }
    sample_weights_identity = pd.Series(risk_identity).map(risk_weights).values
    sample_weights = sample_weights_class * sample_weights_identity # elementwise multiply
    return sample_weights

# predict

In [None]:
out_dir = f'results/jun17_1'
os.makedirs(out_dir, exist_ok=True)
for balancing in ['sample_weights']: # ['ros', 'smote']:
    for balancing_ratio in [100, 5]: 
        sample_weights = get_sample_weights(balancing_ratio)
        for model_type in ['logistic', 'dt']: #, 'rf', 'mlp2', 'svm']): # 'rf', 'mlp2', 'svm', 'gb'
            for feature_selection in ['select_stab_lasso', 'select_lasso', 'select_rf']: #, 'select_lasso', 'select_rf']: # select_lasso, select_rf, None
                for feature_selection_num in tqdm([5, 6, 7, 10, len(processed_feats)]):
                    out_name = f'{model_type}_{feature_selection}={feature_selection_num}_{balancing}={balancing_ratio}'
                    train.train(df,
                                feat_names=processed_feats,
                                model_type=model_type, 
                                balancing=balancing,
                                outcome_def=outcome_def,
                                sample_weights=sample_weights,
                                balancing_ratio=balancing_ratio,
                                out_name=f'{out_dir}/{out_name}.pkl',
                                feature_selection=feature_selection,
                                feature_selection_num=feature_selection_num,
                                train_idxs=train_idxs,
                                test_idxs1=test_idxs1,
                                test_idxs2=test_idxs2)









balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights








100%|██████████| 5/5 [01:51<00:00, 22.35s/it][A[A[A[A[A[A







balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights








100%|██████████| 5/5 [01:58<00:00, 23.30s/it][A[A[A[A[A[A







balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights










balancing sample_weights
balancing sample_weights
balancing sample_weights
balancing sample_weights








 80%|████████  | 4/5 [01:29<00:22, 22.35s/it][A[A[A[A[A[A

In [27]:
train_idxs

[1, 2, 3, 4]

In [13]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from stability_selection import StabilitySelection


def _generate_dummy_classification_data(p=1000, n=1000, k=5, random_state=123321):

    rng = check_random_state(random_state)

    X = rng.normal(loc=0.0, scale=1.0, size=(n, p))
    betas = np.zeros(p)
    important_betas = np.sort(rng.choice(a=np.arange(p), size=k))
    betas[important_betas] = rng.uniform(size=k)

    probs = 1 / (1 + np.exp(-1 * np.matmul(X, betas)))
    y = (probs > 0.5).astype(int)

    return X, y, important_betas

## This is all preparation of the dummy data set
n, p, k = 500, 1000, 5

X, y, important_betas = _generate_dummy_classification_data(n=n, k=k)
base_estimator = LogisticRegression(penalty='l1', solver='liblinear')

## Here stability selection is instantiated and run
selector = StabilitySelection(base_estimator=base_estimator, lambda_name='C',
                              lambda_grid=np.logspace(-5, -1, 5), max_features=20).fit(X, y)

print(len(selector.get_support(indices=True)), selector.get_support(indices=True))

num max feats 20
25 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19 180 495 523 524 647]


In [11]:
selector.stability_scores_.shape

(1000, 5)