In [4]:
import os
from os.path import join as oj
import sys, time
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from copy import deepcopy
import pickle as pkl
import pandas as pd
import data_pecarn
import data_psrc
import data
import train
import matplotlib.gridspec as grd
from data import feats_numerical, feats_categorical, meta, outcome_def
# sns.set(style="black")
# plt.style.use('dark_background')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# combined

In [5]:
# Load both cohorts plus the list of features they have in common.
# (dummy=True presumably requests dummy/one-hot encoded categoricals -- confirm in data.load_it_all)
df_pecarn, df_psrc, common_feats, filtered_feats_pecarn, filtered_feats_psrc = data.load_it_all(dummy=True)

# Stack PECARN rows followed by PSRC rows, restricted to the shared columns.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps each frame's own index labels.
# Row order matters: get_sample_weights builds its sex array in the same
# PECARN-then-PSRC order.
df = pd.concat([df_pecarn[common_feats], df_psrc[common_feats]])

processed_feats = data.select_final_feats(common_feats)

# Split identifiers: train on the PECARN training ids, use the PECARN test ids
# as the first test set, and pool every PSRC id (train + test) as the second.
train_idxs = data.pecarn_train_idxs
test_idxs1 = data.pecarn_test_idxs
test_idxs2 = data.psrc_train_idxs + data.psrc_test_idxs
print(len(processed_feats), processed_feats)
print(train_idxs, test_idxs1, test_idxs2)

computing pecarn preprocessing...
computing psrc preprocessing...
24 ['RtCostalTender', 'InitHeartRate', 'CostalTender', 'ThoracicTrauma_yes', 'MOI_Motorcycle/ATV/Scooter collision', 'Age', 'GCSScore_Full_yes', 'AbdDistention_yes', 'AbdomenPain_yes', 'VomitWretch_yes', 'MOI_Fall from an elevation', 'MOI_Pedestrian/bicyclist struck by moving vehicle', 'DecrBreathSound_yes', 'AbdTenderDegree_Mild', 'AbdTenderDegree_None', 'MOI_Bike collision/fall', 'LtCostalTender', 'MOI_Motor vehicle collision', 'MOI_Object struck abdomen', 'AbdTenderDegree_Moderate', 'InitSysBPRange', 'AbdTenderDegree_Severe', 'Hypotension_yes', 'AbdTrauma_or_SeatBeltSign_yes']
[1, 2, 3, 4] [5, 6] [8, 9, 10, 11, 12, 13]


In [6]:
def get_sample_weights(balancing_ratio):
    """Build one weight per row of the combined `df`.

    Each weight is the product of
      * a class weight: 1 where the outcome is 0, `balancing_ratio` where it is 1
      * a fixed weight looked up from the row's (sex, age group) pair

    Reads the notebook-level `df`, `df_pecarn`, `df_psrc` and `outcome_def`.

    Params
    ------
    balancing_ratio
        weight assigned to rows whose outcome is 1

    Returns
    -------
    array holding one weight per row of `df`
    """
    # lookup table keyed by (sex, age group)
    # NOTE(review): the origin of these numbers is not shown in this notebook
    weight_by_group = {
        ('F', '<5'): 33.9, ('F', '5-9'): 25.8, ('F', '>9'): 27.2,
        ('M', '<5'): 14.8, ('M', '5-9'): 13.7, ('M', '>9'): 13.1
    }

    # class component
    outcome_to_weight = {0: 1, 1: balancing_ratio}
    w_class = pd.Series(df[outcome_def]).map(outcome_to_weight).values

    # age groups: (-1, 4] -> '<5', (4, 9] -> '5-9', (9, 1000] -> '>9'
    age_group = pd.cut(df['Age'], bins=(-1, 4, 9, 1000), labels=['<5', '5-9', '>9']).values

    # sex is not available for psrc, so those rows are filled with 0 (i.e. 'F');
    # per the original note this only matters for training
    sex_pecarn = df_pecarn['Sex_M'].values
    sex_psrc_fill = np.zeros(df_psrc.shape[0])
    sex_label = pd.Series(np.hstack((sex_pecarn, sex_psrc_fill))).map({0: 'F', 1: 'M'}).values

    # pair every row's sex with its age group, then look up the group weight
    n_rows = age_group.shape[0]
    group_keys = [(sex_label[row], age_group[row]) for row in range(n_rows)]
    w_group = pd.Series(group_keys).map(weight_by_group).values

    # elementwise product of the two components
    return w_class * w_group

# predict

In [None]:
# Sweep over every (balancing, ratio, model, feature selector, #features)
# combination, calling train.train once per combination.
out_dir = 'results/jul8_1'
os.makedirs(out_dir, exist_ok=True)

# Grids being swept. Values that appeared only in the original's commented-out
# alternatives: balancing 'ros'/'smote'; models 'rf'/'mlp2'/'svm'/'gb';
# feature_selection None.
sweep_balancing = ['sample_weights']
sweep_ratios = [100, 5]
sweep_models = ['logistic', 'dt']
sweep_selectors = ['select_stab_lasso', 'select_lasso', 'select_rf']
sweep_n_feats = [5, 6, 7, 10, len(processed_feats)]

for balancing in sweep_balancing:
    for balancing_ratio in sweep_ratios:
        # weights depend only on the ratio, so compute them once per ratio
        sample_weights = get_sample_weights(balancing_ratio)
        for model_type in sweep_models:
            for feature_selection in sweep_selectors:
                # one progress bar per (ratio, model, selector) combination
                for feature_selection_num in tqdm(sweep_n_feats):
                    out_name = f'{model_type}_{feature_selection}={feature_selection_num}_{balancing}={balancing_ratio}'
                    # handed to train.train as out_name (presumably where the result is pickled)
                    out_path = f'{out_dir}/{out_name}.pkl'
                    train.train(
                        df,
                        feat_names=processed_feats,
                        outcome_def=outcome_def,
                        model_type=model_type,
                        balancing=balancing,
                        balancing_ratio=balancing_ratio,
                        sample_weights=sample_weights,
                        feature_selection=feature_selection,
                        feature_selection_num=feature_selection_num,
                        train_idxs=train_idxs,
                        test_idxs1=test_idxs1,
                        test_idxs2=test_idxs2,
                        out_name=out_path,
                    )

100%|██████████| 5/5 [00:53<00:00, 10.73s/it]
100%|██████████| 5/5 [00:02<00:00,  2.09it/s]
100%|██████████| 5/5 [00:04<00:00,  1.21it/s]
100%|██████████| 5/5 [00:53<00:00, 10.67s/it]
100%|██████████| 5/5 [00:02<00:00,  2.33it/s]
100%|██████████| 5/5 [00:03<00:00,  1.29it/s]
100%|██████████| 5/5 [00:53<00:00, 10.71s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5/5 [00:02<00:00,  2.10it/s]
100%|██████████| 5/5 [00:04<00:00,  1.22it/s]
 80%|████████  | 4/5 [00:42<00:10, 10.71s/it]