In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *

In [2]:
# load in the preprocessed data
preprocessed_data = pd.read_csv("../mimic_data_after_preprocess/mimic_preprocessed.csv", index_col='subject_id')

In [3]:
# split preprocesses dataset into two part for training model and simulation respectively
train_model_set, simulation_set = train_test_split(preprocessed_data, test_size=0.2)
train_model_set.to_csv("../mimic_data_after_preprocess/train_model_set25.csv", encoding='utf-8')
simulation_set.to_csv("../mimic_data_after_preprocess/simulation_set25.csv", encoding='utf-8')

In [4]:
# split train_model_set into two part
train_set, test_set = train_test_split(train_model_set, test_size=0.2)
print(train_set.shape, sum(train_set['adverse_flag']))
print(test_set.shape, sum(test_set['adverse_flag']))
# turn pandas dataframe to csv file
train_set.to_csv("../mimic_data_after_preprocess/training_set25.csv", encoding='utf-8')
test_set.to_csv("../mimic_data_after_preprocess/testing_set25.csv", encoding='utf-8')

(73912, 26) 1707
(18478, 26) 446


In [5]:
# when directly sampling on age group features, it may be some cases whose several age group flags are 1
# it may also generate similar errors on narcotic or anti_narcotic flag and total amount of drugs
# therefore, turn the original form into another form

train_set_wo_flag = train_set.drop(['age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8',
                    'anti_narcotic', 'narcotic', 'n_anti_narcotic', 'n_narcotic'], axis=1)

In [6]:
subjects_train = train_set.index
count = 0
for subject in subjects_train:
    if train_set.loc[subject, 'age_1'] == 1:
        age = 1
    elif train_set.loc[subject, 'age_2'] == 1:
        age = 2
    elif train_set.loc[subject, 'age_3'] == 1:
        age = 3
    elif train_set.loc[subject, 'age_4'] == 1:
        age = 4
    elif train_set.loc[subject, 'age_5'] == 1:
        age = 5
    elif train_set.loc[subject, 'age_6'] == 1:
        age = 6
    elif train_set.loc[subject, 'age_7'] == 1:
        age = 7
    elif train_set.loc[subject, 'age_8'] == 1:
        age = 8
    train_set_wo_flag.loc[subject, 'age'] = age
    count += 1
    if count % 5000 == 0 :
        print("finish",count)


finish 5000
finish 10000
finish 15000
finish 20000
finish 25000
finish 30000
finish 35000
finish 40000
finish 45000
finish 50000
finish 55000
finish 60000
finish 65000
finish 70000


In [7]:
train_set_wo_flag['age'] = train_set_wo_flag['age'].astype("int64")

In [8]:
# Do oversampling. After sampling, in default, the number of minority class should be the same with the majority class
# SMOTE ratio 1:1
features_train = train_set_wo_flag.loc[:, train_set_wo_flag.columns!="adverse_flag"]
labels_train = train_set_wo_flag['adverse_flag']
oversample = SMOTE(sampling_strategy=0.25)
features_train_o, labels_train_o = oversample.fit_resample(features_train, labels_train)

In [9]:
print(features_train.shape, sum(labels_train==1), sum(labels_train==0))
print(features_train_o.shape, sum(labels_train_o==1), sum(labels_train_o==0))
print(max(features_train['age']), min(features_train['age']))
print(max(features_train_o['age']), min(features_train_o['age']))

(73912, 14) 1707 72205
(90256, 14) 18051 72205
8 2
8 2


In [10]:
# now compute the new age group flags, drug flags, and total amount of drugs
train_set_smote = features_train_o.drop('age', axis=1)
subjects_smote = features_train_o.index

In [11]:
# compute age group first
for i in range(1,9):
    train_set_smote['age_'+str(i)] = 0
    train_set_smote['age_'+str(i)][features_train_o['age']==i] = 1

In [12]:
# compute 'anti_narcotic', 'narcotic' flag and total amounts of drugs
drug_names = ['oxymorphone', 'oxycodone', 'morphine', 'meperidine',
                'hydromorphone', 'hydrocodone', 'fentanyl', 'codeine', 'buprenorphine',
                'methadone', 'methadone', 'naloxone']
narcotic = train_set_smote['oxymorphone']
for drug in drug_names[1:-1]:
    narcotic = narcotic + train_set_smote[drug]
train_set_smote['n_narcotic'] = narcotic
train_set_smote['narcotic'] = 0
train_set_smote['narcotic'][narcotic>0] = 1

anti_narcotic = train_set_smote['methadone']+train_set_smote['naloxone']
train_set_smote['n_anti_narcotic'] = anti_narcotic
train_set_smote['anti_narcotic'] = 0
train_set_smote['anti_narcotic'][anti_narcotic>0] = 1

In [13]:
# add the labels and switch the columns

columns=['age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8',
            'gender', 'n_hosp', 'anti_narcotic', 'narcotic',
            'n_anti_narcotic', 'n_narcotic',
            'oxymorphone', 'oxycodone', 'morphine', 'meperidine',
            'hydromorphone', 'hydrocodone', 'fentanyl', 'codeine',
            'buprenorphine', 'methadone', 'naloxone',
            'adverse_flag']

train_set_smote['adverse_flag'] = labels_train_o
train_set_smote = train_set_smote[columns]

In [14]:
# save the train_set_smote to csv
train_set_smote.to_csv("../mimic_data_after_preprocess/training_set_smote25.csv", encoding='utf-8')