In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from pandas import DataFrame, concat
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import itertools
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from joblib import parallel_backend
from sklearn.model_selection import GridSearchCV
import sys
import os

# Prepare Exaggerate

In [23]:
# load exaggerate dataset
exaggerate = pd.read_csv('../exaggerate/data/working_sheet.csv', sep=";")
target='aecopd_12m' # aecopd_12m
# rename variables for more practical handling
exaggerate = exaggerate.rename(columns={'sex': 'gender', 'dyspnoea_mMRC': 'mmrc', 'ami':'myocardial_infarct', 'cbd':'stroke', 'dyspnoea_yesno':'dyspnoea', target:'target'})
relevant_vars=['gender', 'age', 'bmi', 'sbp', 'dbp', 'diabetes', 'heart_failure', 'temperature',  'dyspnoea', 'mmrc', 'target', 'myocardial_infarct', 'cancer', 'stroke'] #, 'rr', ', ,'prev_exacerb'

# drop irrelevant variables
exaggerate = exaggerate[relevant_vars]

# sort columns
exaggerate = exaggerate.reindex(sorted(exaggerate.columns), axis=1)

# correct bmi scale.

# if bmi is between 100 and 1000, then divide by 10 to get the correct scale
exaggerate.loc[(exaggerate['bmi'] > 100) & (exaggerate['bmi'] < 1000), 'bmi'] = exaggerate['bmi'] / 10
# if bmi is betweeen 1000 and 10000 then divide by 100 to get the correct scale
exaggerate.loc[(exaggerate['bmi'] > 1000) & (exaggerate['bmi'] < 10000), 'bmi'] = exaggerate['bmi'] / 100

# correct the temperature scale (if temperature is between 100 and 1000, then divide by 10 to get the correct scale)
exaggerate.loc[(exaggerate['temperature'] > 100) & (exaggerate['temperature'] < 1000), 'temperature'] = exaggerate['temperature'] / 10

# create fever variable
exaggerate['fever'] = 0
# according to CDC, john hopkins etc. fever is defined as a temperature of 38 degrees or higher
exaggerate.loc[exaggerate['temperature'] > 38, 'fever'] = 1
exaggerate = exaggerate.drop(columns=['temperature'])

# if prev_exacerb is 1 or above then set it to 1 else 0
# exaggerate.loc[exaggerate['prev_exacerb'] >= 1, 'prev_exacerb'] = 1

# # create fast_breathing variable
# exaggerate['fast_breathing'] = 0
# # clinically normal respiratory rate is 12 - 20 anything above is considered fast breathing for adults
# exaggerate.loc[exaggerate['rr'] > 20, 'fast_breathing'] = 1
# exaggerate = exaggerate.drop(columns=['rr'])

# plot_hist(exaggerate)

########
# Missing values
########



# print(exaggerate.shape)
# drop rows with more than 85% missing values
exaggerate = exaggerate.dropna(thresh=0.70*exaggerate.shape[1], axis=0)

# drop all rows where mmrc is missing
# exaggerate = exaggerate.dropna(subset=['mmrc'], axis=0)

# reset index to be sequential again
exaggerate = exaggerate.reset_index(drop=True)
# print(exaggerate.shape)

# print missing values in a table for each variable
# print("missing values in the dataset: ")
# print(exaggerate.isna().sum())


# impute mmrc values using median and store it for later
# mmrc = exaggerate['mmrc'].fillna(exaggerate['mmrc'].median())

# imputer = IterativeImputer(random_state=42)
# imputed = imputer.fit_transform(exaggerate)
# exaggerate = pd.DataFrame(imputed, columns=exaggerate.columns)


# plot 2x5 histograms for each variable


# print missing values in a table for each variable
# print("missing values in the dataset after: ")
# print(exaggerate.isna().sum())

numeric_vars = ['age', 'bmi', 'sbp', 'dbp']
for var in numeric_vars:
    exaggerate[var] = exaggerate[var].fillna(exaggerate[var].median())

categorical_vars = ['gender', 'diabetes', 'heart_failure', 'mmrc', 'target', 'fever',   'dyspnoea', 'cancer',  'myocardial_infarct', 'stroke'] # , '', ,,'prev_exacerb' 'fast_breathing',
for var in categorical_vars:
    exaggerate[var] = exaggerate[var].fillna(exaggerate[var].mode()[0])


# plot_hist(exaggerate)

#########
# Outliers
#########

# dealing with outliers is not necessary in this dataset

########
# Feature Engineering
########

# create a new variable for hypertension. If sbp variable is greater than 140 or dbp variable is greater than 90, then hypertension is 1, otherwise 0
exaggerate['hypertension'] = 0
exaggerate.loc[(exaggerate['sbp'] > 140) | (exaggerate['dbp'] > 90), 'hypertension'] = 1

# drop sbp and dbp columns
exaggerate = exaggerate.drop(columns=['sbp', 'dbp'])

# drop mmrc column for now
# exaggerate = exaggerate.drop(columns=['mmrc'])

#Cardiovascular disease was defined as heart failure, acute myocardial infarction, cerebrovascular disease, or peripheral arterial disease


##########
# Normalization
##########



numeric_vars = ['age', 'bmi', 'mmrc']
df_nr = exaggerate[numeric_vars]
df_rest = exaggerate.drop(columns=numeric_vars)
transf = MinMaxScaler(feature_range=(0, 1), copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=exaggerate.index, columns=numeric_vars)
exaggerate= concat([tmp, df_rest], axis=1)

############
# Target variable
############

# Binarize the target variable if data in target column is above 0, then set it to 1 else 0
# print("target variable value counts: ")
# print(exaggerate['target'].value_counts())
exaggerate.loc[exaggerate['target'] > 0, 'target'] = 1


# convert target to 4 classes like in harvard dataset
# exaggerate_data.loc[(exaggerate_data['target'] > 1) & (exaggerate_data['target'] <= 5), 'target'] = 2
# exaggerate_data.loc[(exaggerate_data['target'] > 5) & (exaggerate_data['target'] <= 10), 'target'] = 3
# exaggerate_data.loc[(exaggerate_data['target'] > 10), 'target'] = 4
# # add one to all target classes
# exaggerate_data['target'] = exaggerate_data['target'] + 1

exaggerate = exaggerate.reindex(sorted(exaggerate.columns), axis=1)
exaggerate.to_csv('../exaggerate/data/imputed_hypertension.csv', index=False, sep=';')


# print(exaggerate['target'].value_counts())

# get rid of any nan values in the dataset
exaggerate = exaggerate.dropna()

# use smote to rebalance the exaggerate dataset in place
print("perform smote")
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(exaggerate.drop(columns=['target']), exaggerate['target'])
X_res['target'] = y_res
exaggerate = X_res

# undersampling

# # get number of underrespresented class 0
# u = exaggerate['target'].value_counts().min()
# u_label = exaggerate['target'].value_counts().idxmin()

# # overrepresented class 1
# o = exaggerate['target'].value_counts().max()
# o_label = exaggerate['target'].value_counts().idxmax()

# # add all underrepresented samples to a new dataframe
# underrepresented = exaggerate[exaggerate['target'] == u_label]
# # sample u number of overrepresented samples from the dataset
# overrepresented = exaggerate[exaggerate['target'] == o_label].sample(n=u, random_state=42)
# # concatenate underrepresented and overrepresented
# exaggerate = pd.concat([underrepresented, overrepresented])



# print(X_res['target'].value_counts())

# print(exaggerate.head())
# plot_hist(exaggerate)

# TODO there is something wrong with target value distribution


# sort the columns
exaggerate = exaggerate.reindex(sorted(exaggerate.columns), axis=1)
# convert all variables to int
exaggerate = exaggerate.astype(int)

# save the new dataset
exaggerate.to_csv('../exaggerate/data/harvard_cv.csv', index=False, sep=';')

perform smote


# Prepare Harvard

In [32]:
target = 'fclinra08'

relevant_vars = ['dem02', 'dem03', 'bclinra01', 'bclinra02', 'bclinpt04', 'bclinpt15', 'mmrc', 'bclinpt07', 'bclinpt34',  'bclinpt28',  'login', 'bclinpt19', 'bclinpt08','bclinpt09',target] # (cancer), 'bclinpt36' fast breathing, , ,'bclinpt22' (prev exa)

# load harvard dataset
harvard = pd.read_csv('../harvard/data/dropped_variables.csv', sep=";")
harvard = harvard[relevant_vars]

# change mmrc by starting at 0 instead of 1
harvard['mmrc'] = harvard['mmrc'] - 1

# make female 0 and male = 1
harvard.loc[harvard['dem03'] == 2, 'dem03'] = 0
harvard.loc[harvard['dem03'] == 1, 'dem03'] = 0

# rename columns to common names with exaggerate
harvard = harvard.rename(columns={'dem02':'age', 'dem03': 'gender', 'bclinpt19': 'cancer', 'bclinpt04': 'hypertension', 'bclinpt15': 'diabetes', 'bclinpt07': 'heart_failure', target: 'target', 'bclinra01': 'height', 'bclinra02': 'weight', 'bclinpt34': 'fever',  'bclinpt08':'myocardial_infarct', 'bclinpt09':'stroke', 'bclinpt28':'dyspnoea', 'bclinpt36': 'fast_breathing', 'mmrc': 'mmrc', 'bclinpt22': 'prev_exacerb'
 }) 

# get the baseline dataset
unique_logins = harvard['login'].unique()
# baseline dataframe has the same columns as the original dataset
baseline = []
for login in unique_logins:
    # add the first row of the selection to the baseline dataset
    baseline.append(harvard[harvard['login'] == login].iloc[0])
# turn lists into dataframes but with columns from harvard dataset
harvard = pd.DataFrame(baseline, columns=harvard.columns)
harvard = harvard.drop('login', axis=1)

# swap variable for which yes and no are reversed (yes = 0, no = 1)
harvard['hypertension'] = 1 - harvard['hypertension']
harvard['fever'] = 1 - harvard['fever']
harvard['heart_failure'] = 1- harvard['heart_failure']

# calculate bmi from weight and height
harvard['bmi'] = harvard['weight'] / (harvard['height'] / 100) ** 2
harvard = harvard.drop(columns=['weight', 'height'])
harvard = harvard.reindex(sorted(harvard.columns), axis=1)

# # if prev_exacerb is 1 set it to 0
# harvard.loc[harvard['prev_exacerb'] == 1, 'prev_exacerb'] = 0
# # if prev_exacerb is above 1 set it to 1
# harvard.loc[harvard['prev_exacerb'] > 1, 'prev_exacerb'] = 1
# # plot_hist(harvard)

##########
# Missing Values & duplicates
##########


# print missing values per variable and total
print("missing values per variable")
print("total records: "+str(len(harvard)))
print(harvard.isnull().sum())

# drop all records where target is nan
harvard = harvard.dropna(subset=['target'])

numerical_vars = ['age', 'bmi']
for var in numerical_vars:
    harvard[var] = harvard[var].fillna(harvard[var].median())


# impute gender, cancer, hypertension, diabetes, heart_failure, target with most frequent value
categorical_vars = ['gender', 'hypertension', 'diabetes', 'heart_failure', 'target', 'mmrc', 'fever',   'dyspnoea', 'cancer', 'myocardial_infarct', 'stroke'] # , 'fast_breathing',,, 'prev_exacerb'
for var in categorical_vars:
    harvard[var] = harvard[var].fillna(harvard[var].mode()[0])


# print("empty cells remaining in the dataset")
# print(harvard.isnull().sum().sum())

# print("mv imputation")

# plot_hist(harvard)

##########
# Feature engineering
############

# print("feature engineering")

# plot_hist(harvard)

##########
# Normalization
##########

numeric_vars = ['age', 'bmi', 'mmrc']
df_nr = harvard[numeric_vars]
df_rest = harvard.drop(columns=numeric_vars)
transf = MinMaxScaler(feature_range=(0, 1), copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=harvard.index, columns=numeric_vars)
harvard= concat([tmp, df_rest], axis=1)


###########
# Target variable
###########

# if target variable is 1 then set it to 0, otherwise 1
harvard.loc[harvard['target'] == 2, 'target'] = 0

# set all none 0 values to 1
harvard.loc[harvard['target'] == 1, 'target'] = 1
print(harvard['target'].value_counts())

# use smote to rebalance the harvard dataset in place
# print("perform smote")

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(harvard.drop(columns=['target']), harvard['target'])
X_res['target'] = y_res
harvard = X_res

# use undersampling

# # get number of underrespresented class 0
# u = harvard['target'].value_counts().min()
# u_label = harvard['target'].value_counts().idxmin()

# # overrepresented class 1
# o = harvard['target'].value_counts().max()
# o_label = harvard['target'].value_counts().idxmax()

# # add all underrepresented samples to a new dataframe
# underrepresented = harvard[harvard['target'] == u_label]
# # sample u number of overrepresented samples from the dataset
# overrepresented = harvard[harvard['target'] == o_label].sample(n=u, random_state=42)
# # concatenate underrepresented and overrepresented
# harvard = pd.concat([underrepresented, overrepresented])
# # plot_hist(harvard)
# print(harvard['target'].value_counts())

# sort the columns
harvard = harvard.reindex(sorted(harvard.columns), axis=1)
# convert all variables to int
harvard = harvard.astype(int)

print(harvard.head())

# save dataset
harvard.to_csv('../harvard/data/exaggerate_cv.csv', index=False, sep=';')


missing values per variable
total records: 188
age                    0
bmi                   16
cancer                 8
diabetes              11
dyspnoea               1
fever                  2
gender                 0
heart_failure         11
hypertension          10
mmrc                   0
myocardial_infarct    11
stroke                12
target                38
dtype: int64
0.0    126
1.0     24
Name: target, dtype: int64
   age  bmi  cancer  diabetes  dyspnoea  fever  gender  heart_failure  \
0    0    0       0         0         0      0       0              1   
1    0    0       1         1         1      0       0              1   
2    0    0       0         1         1      0       0              0   
3    0    0       0         0         1      0       0              1   
4    0    0       0         0         0      0       0              1   

   hypertension  mmrc  myocardial_infarct  stroke  target  
0             0     0                   0       0       0  
1      

  harvard = pd.read_csv('../harvard/data/dropped_variables.csv', sep=";")


# Cross Dataset

In [57]:
from IPython.utils import io

# split harvard into blending and validation
blend_percentages = [0, 3, 5, 15, 25]
classifiers = [XGBClassifier(), RandomForestClassifier(),  KNeighborsClassifier()]
results = pd.DataFrame(columns=['combination', 'blend', 'acc_exaggerate', 'acc_harvard', 'model', 'parameters'])
selection = exaggerate.columns


relevant_vars = ['gender', 'age', 'bmi', 'hypertension', 'diabetes', 'heart_failure', 'mmrc', 'fever',  'dyspnoea', 'cancer', 'myocardial_infarct', 'stroke']

# get all possible sets of relevant variables
all_combinations = []
for i in range(1, len(relevant_vars)+1):
    combinations_object = itertools.combinations(relevant_vars, i)
    combinations_list = list(combinations_object)
    all_combinations += combinations_list

for i, selection in enumerate(all_combinations):
    selection = list(selection)
    selection.append('target')
    exaggerate_selection = exaggerate[selection]
    harvard_selection = harvard[selection]
    for percentage in blend_percentages:
        # split into blending and validation
        harvard_blending, harvard_validation = pd.DataFrame(columns=harvard_selection.columns), pd.DataFrame(columns=harvard_selection.columns)
        if percentage != 0:
            harvard_blending, harvard_validation = train_test_split(harvard_selection, test_size=1-percentage/100, random_state=42)
        else:
            harvard_validation = harvard_selection

        X_test = harvard_validation.drop(columns=['target']).to_numpy()
        y_test = harvard_validation['target'].to_numpy()
        # create X_train from harvard_blending and exaggerate
        X_train = pd.concat([harvard_blending.drop(columns=['target']), exaggerate_selection.drop(columns=['target'])]).to_numpy()
        # create y_train from harvard_blending and exaggerate
        y_train = pd.concat([harvard_blending['target'], exaggerate_selection['target']]).to_numpy()
        for clf in classifiers:
            # perform grid search
            if clf.__class__.__name__ == 'XGBClassifier':
                clf = XGBClassifier()
                parameters = {
                    'max_depth': [5, 10, 25],
                    'learning_rate': [.01, .05, .1, .2],
                    'n_estimators': [5, 10, 25, 75, 100, 300]
                }
                grid_search = GridSearchCV(
                    estimator=clf,
                    param_grid=parameters,
                    scoring = 'accuracy',
                    n_jobs = -1,
                    # specify validation set as the validation set above (harvard_validation) and 
                    cv=4,
                    # verbose=True,
                    
                    error_score='raise'
                )
                grid_search.fit(X_train, y_train)
                clf = grid_search
            if clf.__class__.__name__ == 'RandomForestClassifier':
                clf = RandomForestClassifier()
                parameters = {
                    'max_depth': [5, 10, 25],
                    'max_features': [.3, .5, .7],
                    'n_estimators': [5, 10, 25, 75, 100, 300]
                }
                grid_search = GridSearchCV(
                    estimator=clf,
                    param_grid=parameters,
                    scoring = 'accuracy',
                    n_jobs = -1,
                    cv=4,
                    # verbose=True,
                    error_score='raise'
                )
                grid_search.fit(X_train, y_train)
                clf = grid_search
            if clf.__class__.__name__ == 'KNeighborsClassifier':
                clf = KNeighborsClassifier()
                parameters = {
                    'n_neighbors': [3, 5, 7, 9],
                }
                grid_search = GridSearchCV(
                    estimator=clf,
                    param_grid=parameters,
                    scoring = 'accuracy',
                    n_jobs = -1,
                    cv=4,
                    # verbose=True,
                    error_score='raise'
                )
                grid_search.fit(X_train, y_train)
                clf = grid_search
            y_pred_harvard = clf.predict(X_test)
            accuracy_harvard = accuracy_score(y_test, y_pred_harvard)
            y_pred_exaggerate = clf.predict(exaggerate_selection.drop(columns=['target']).to_numpy())
            accuracy_exaggerate = accuracy_score(exaggerate_selection['target'].to_numpy(), y_pred_exaggerate)
            new_row = {'combination': selection, 'blend': percentage, 'acc_exaggerate': accuracy_exaggerate, 'acc_harvard': accuracy_harvard, 'model': str(clf.best_estimator_.__class__.__name__), 'parameters': str(clf.best_params_)} # 'combination': str(combination),
            # TODO replace append as it might be deprecated soon. For now workaround to avoid printing to console
            
            with io.capture_output() as captured:
                results = results.append(new_row, ignore_index=True, )
            
    # print something if multiples of 1% are done
    if i % np.floor(len(all_combinations) / 100) == 0:
        print(f'{i / len(all_combinations) * 100}% done')
        results.to_csv('./results/exaggerate_harvard_2.csv', sep=';', index=False)
        # print row with best acc_mean
        print(results.loc[results['acc_harvard'].idxmax()])

results.to_csv('./results/exaggerate_harvard_2.csv', sep=';', index=False)

0.0% done
combination                                        [gender, target]
blend                                                            25
acc_exaggerate                                             0.525685
acc_harvard                                                0.513228
model                                                 XGBClassifier
parameters        {'learning_rate': 0.01, 'max_depth': 5, 'n_est...
Name: 12, dtype: object
0.9768009768009768% done
combination       [age, diabetes, target]
blend                                   5
acc_exaggerate                   0.480023
acc_harvard                      0.604167
model                KNeighborsClassifier
parameters             {'n_neighbors': 5}
Name: 383, dtype: object


exception calling callback for <Future at 0x200e6789450 state=finished raised PicklingError>
Traceback (most recent call last):
  File "c:\Users\Daniel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\queues.py", line 125, in _feed
    obj_ = dumps(obj, reducers=reducers)
  File "c:\Users\Daniel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 211, in dumps
    dump(obj, buf, reducers=reducers, protocol=protocol)
  File "c:\Users\Daniel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\reduction.py", line 204, in dump
    _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj)
  File "c:\Users\Daniel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\cloudpickle\cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
  File "c:\Users\Daniel\AppData\Local\Programs\Python\Python310\lib\site-packages\j

KeyboardInterrupt: 