In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn import preprocessing, model_selection
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from itertools import combinations 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import os

In [None]:
def greatest_class(prediction):
    """Return the position of the largest score in one prediction vector.

    Parameters
    ----------
    prediction : array-like
        One-dimensional collection of per-class scores (NumPy array, list
        or tuple).

    Returns
    -------
    int
        Index of the first occurrence of the maximum score.
    """
    # np.argmax accepts any array-like, so callers no longer have to pass an
    # object exposing ``.tolist()``; ties still resolve to the first maximum.
    return int(np.argmax(prediction))

def impute(x, y, imputer = None):
    """Fill missing feature values with a k-nearest-neighbours imputer.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix that may contain NaN entries.
    y : object
        Targets; passed through untouched so the call composes with the
        other ``(x, y, ...)`` pipeline steps.
    imputer : object with ``transform``, optional
        Already fitted imputer to reuse (e.g. the one fitted on the training
        fold). When omitted, a ``KNNImputer(n_neighbors=3)`` is fitted on
        ``x``.

    Returns
    -------
    tuple
        ``(imputed_frame, y, imputer)`` where ``imputed_frame`` is built from
        the imputer output and carries the column labels of ``x``.
    """
    # Compare against None explicitly: a truthiness test would silently refit
    # on this data if the supplied object happened to evaluate as falsy.
    if imputer is None:
        imputer = KNNImputer(n_neighbors=3)
        filled = imputer.fit_transform(x)
    else:
        filled = imputer.transform(x)

    return pd.DataFrame(filled, columns = x.columns), y, imputer

def oversample(x, y, random_state = None):
    """Rebalance the classes of ``(x, y)`` with SMOTETomek.

    Parameters
    ----------
    x : array-like
        Feature matrix to resample.
    y : array-like
        Class labels aligned with ``x``.
    random_state : int or None, optional
        Seed forwarded to ``SMOTETomek`` so the resampling can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    sampler = SMOTETomek(random_state=random_state)
    x_smote, y_smote = sampler.fit_resample(x, y)
    return x_smote, y_smote

def select_rows(x, y, cols):
    """Drop every column whose label mentions a triage outcome.

    Despite the name, this removes *columns*: any label matching
    ``Positive discriminator``, ``MTS`` or ``Hospitalisation`` is discarded.

    Parameters
    ----------
    x : array-like
        Feature values, one column per entry of ``cols``.
    y : object
        Targets; returned unchanged.
    cols : sequence
        Column labels used to build the intermediate DataFrame.

    Returns
    -------
    tuple
        ``(values, y, kept_columns)`` with ``values`` a NumPy array holding
        only the retained columns and ``kept_columns`` their labels.
    """
    frame = pd.DataFrame(x, columns = cols)

    # Labels are matched with a regex search, so derived columns whose name
    # merely contains one of the keywords are removed as well.
    excluded = frame.filter(regex='Positive discriminator|MTS|Hospitalisation').columns
    retained = frame.columns.drop(list(excluded))

    frame = frame[retained]
    return frame.to_numpy(), y, frame.columns

def augment(x, y, augmenter = None):
    """Extend the feature frame with pairwise interaction columns.

    Missing values are replaced by the sentinel ``-1`` before the interaction
    terms are computed.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; may contain NaN.
    y : object
        Targets; returned unchanged.
    augmenter : object, optional
        Previously fitted transformer handed on to ``_add_interactions``.

    Returns
    -------
    tuple
        ``(augmented_frame, y, augmenter, augmented_frame.columns)``.
    """
    filled = x.fillna(-1)
    expanded, augmenter = _add_interactions(filled, augmenter)
    return expanded, y, augmenter, expanded.columns

def _add_interactions(df, augmenter):
    """Append the product of every pair of columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features; column labels must be strings because the labels of
        the new columns are built with ``'_'.join``.
    augmenter : object with ``transform`` or None
        Fitted transformer to reuse. When ``None`` a
        ``PolynomialFeatures(interaction_only=True, include_bias=False)`` is
        fitted on ``df``.

    Returns
    -------
    tuple
        ``(frame, augmenter)`` where ``frame`` holds the original columns
        followed by the ``"<a>_<b>"`` interaction columns, minus every column
        that is zero in all rows.
    """
    base_names = list(df.columns)
    # Naming assumes the transformer emits the original features first and
    # then the pairwise products in ``itertools.combinations`` order.
    pair_names = ['_'.join(pair) for pair in combinations(base_names, 2)]

    # Explicit None check: a truthiness test would discard a supplied
    # transformer that happens to evaluate as falsy and refit a new one.
    if augmenter is None:
        augmenter = PolynomialFeatures(interaction_only=True, include_bias=False)
        expanded = augmenter.fit_transform(df)
    else:
        expanded = augmenter.transform(df)

    expanded = pd.DataFrame(expanded)
    expanded.columns = base_names + pair_names

    # Columns that are zero everywhere carry no information; drop them.
    zero_positions = [
        position
        for position, is_all_zero in enumerate(list((expanded == 0).all()))
        if is_all_zero
    ]
    expanded = expanded.drop(expanded.columns[zero_positions], axis=1)

    return expanded, augmenter

In [None]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Load the raw export and keep only the rows that have an age and at least one
# of the four vital-sign measurements.
df = pd.read_csv('data.csv')
dataset = df[
    df[['Respiratory rate', 'Heart rate', 'Temperature', 'Oxygen saturation']].notnull().any(axis=1)
    & df['Age'].notnull()
]

In [None]:
# Working alias for the filtered frame; this binds a second name to the same
# object, it does not copy it.
data = dataset

In [None]:
# One subset per hospital. The column label really is 'Hospital ' with a
# trailing space.
fernando, juliana, sophia, mary = (
    data[data['Hospital '] == hospital]
    for hospital in (
        'Fernando Fonseca Hospital',
        'Juliana Childrens Hospital',
        'Sophia Childrens Hospital',
        'St Mary',
    )
)

In [None]:
def scale(x, y, scaler = None):
    """Rescale the feature columns with a min-max scaler.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame to scale.
    y : object
        Targets; returned unchanged.
    scaler : object with ``transform``, optional
        Already fitted scaler to reuse (e.g. the one fitted on the training
        fold). When omitted, a new ``MinMaxScaler`` is fitted on ``x``.

    Returns
    -------
    tuple
        ``(scaled_frame, y, scaler)`` where ``scaled_frame`` is built from the
        scaler output and carries the column labels of ``x``.
    """
    # Explicit None check so a supplied scaler is never replaced by a freshly
    # fitted one just because it evaluates as falsy.
    if scaler is None:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(x)
    else:
        scaled = scaler.transform(x)

    return pd.DataFrame(scaled, columns = x.columns), y, scaler

In [None]:
def ohe(data):
    """One-hot encode the categorical columns and binarise ``Hospitalisation``.

    ``Presenting problem`` and ``Positive discriminator`` are each replaced by
    their dummy columns, which are placed in front of the remaining columns.
    ``Hospitalisation`` has ``'no'``/``'yes'`` replaced by ``0``/``1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    # Encode one column at a time; each pass prepends its dummies, so the
    # column processed last ends up left-most.
    for categorical in ('Presenting problem', 'Positive discriminator'):
        indicator_columns = pd.get_dummies(data[[categorical]])
        remainder = data.drop([categorical], axis=1)
        data = pd.concat([indicator_columns, remainder], axis=1)

    data['Hospitalisation'] = data['Hospitalisation'].replace({ 'no': 0, 'yes': 1})
    return data

In [None]:
def gen_folds(data, fold, num_folds = 10, random_state = None):
    """Build stratified cross-validation folds and write them to disk.

    For every fold the training part is scaled, imputed, oversampled and
    stripped of outcome-related columns; the test part is scaled and imputed
    with the objects fitted on the training part, then stripped the same way.
    Each fold is handed to ``save_folds``.

    Parameters
    ----------
    data : pandas.DataFrame
        Records holding the identifier/outcome columns dropped below, the
        columns expected by ``ohe`` and the ``Original MTS`` target.
    fold : str
        Name passed to ``save_folds`` to identify this set of folds.
    num_folds : int, optional
        Number of stratified splits (default 10).
    random_state : int or None, optional
        Seed for the shuffled split. ``None`` keeps the split unseeded.
    """
    kfold = model_selection.StratifiedKFold(
        n_splits=num_folds, shuffle=True, random_state=random_state
    )

    data = data.drop([
        'Patientnumber', 'Hospital ', 'Arrival date', 'MTS 1'
    ], axis = 1)

    X = data.drop(['Original MTS'], axis = 1)
    Y = data.filter(['Original MTS'])

    X = ohe(X)
    Y = Y['Original MTS'].replace({ 'Non urgent': 0, 'Standard': 1, 'Urgent': 2, 'Very urgent': 3, 'Emergent': 4})

    # Remember where the raw features are missing before `augment` replaces
    # the gaps with its -1 sentinel.
    missing = X.isna()
    gappy = [name for name, has_gap in missing.any().items() if has_gap]

    X, Y, _, columns = augment(X, Y)

    # Negative values can only come from the -1 sentinel: turn them back into
    # NaN so the imputer handles them. (Comparing with `!= np.nan` is always
    # True, so a plain `< 0` mask is sufficient.)
    X = X.mask(X < 0)

    # The interaction of two missing features is (-1) * (-1) = +1, which the
    # sign test above cannot detect. Blank those cells explicitly; the column
    # labels follow the "<left>_<right>" convention used by the augmenter.
    for left, right in combinations(gappy, 2):
        pair_name = '_'.join((left, right))
        if pair_name in X.columns:
            both_missing = (missing[left] & missing[right]).to_numpy()
            X.loc[both_missing, pair_name] = np.nan

    for fold_no, (train, test) in enumerate(kfold.split(X, Y), start=1):
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # `train`/`test` are positional indices, so select with .iloc; .loc
        # would pick the wrong rows of Y whenever `data` is not indexed 0..n-1.
        print('TRAIN: Normalizing data')
        x_train, y_train, escaler = scale(X.iloc[train], Y.iloc[train])
        print('TRAIN: Imputing')
        x_train, y_train, imputer = impute(x_train, y_train)
        print('TRAIN: Oversampling')
        x_train, y_train = oversample(x_train, y_train)
        print('TRAIN: Selecting rows')
        x_train, y_train, _ = select_rows(x_train, y_train, columns)

        # The test part reuses the scaler and imputer fitted on the training
        # part and is never oversampled.
        print('TEST: Normalizing data')
        x_test, y_test, _ = scale(X.iloc[test], Y.iloc[test], escaler)
        print('TEST: Imputing')
        x_test, y_test, _ = impute(x_test, y_test, imputer)
        print('TEST: Selecting rows')
        x_test, y_test, filtered_columns = select_rows(x_test, y_test, columns)

        x_train_df = pd.DataFrame(x_train, columns = filtered_columns)
        y_train_df = pd.DataFrame(y_train.values, columns = ['MTS'])
        x_test_df  = pd.DataFrame(x_test, columns = filtered_columns)
        y_test_df  = pd.DataFrame(y_test.values, columns = ['MTS'])

        save_folds(x_train_df, y_train_df, x_test_df, y_test_df, fold, fold_no)

In [None]:
def save_folds(x_train, y_train, x_test, y_test, fold, fold_no):
    """Write the four parts of one fold as CSV files under ``folds/<fold>/``.

    Files are named ``<fold_no>_x_train.csv``, ``<fold_no>_y_train.csv``,
    ``<fold_no>_x_test.csv`` and ``<fold_no>_y_test.csv`` and are written
    without the index column.

    Parameters
    ----------
    x_train, y_train, x_test, y_test : pandas.DataFrame
        Frames to persist.
    fold : str
        Sub-directory of ``folds`` that receives the files.
    fold_no : int
        Fold number used as the file-name prefix.
    """
    path = os.path.join('folds', fold)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    parts = (
        ('x_train', x_train),
        ('y_train', y_train),
        ('x_test', x_test),
        ('y_test', y_test),
    )
    for part_name, frame in parts:
        frame.to_csv(os.path.join(path, f'{fold_no}_{part_name}.csv'), index=False)

In [None]:
def run_model(label, fold, model, num_folds = 10):
    """Run ``model`` on every stored fold of one dataset.

    Parameters
    ----------
    label : str
        Tag forwarded to ``model``.
    fold : str
        Sub-directory of ``folds`` holding the ``<n>_x_train.csv`` ...
        ``<n>_y_test.csv`` files.
    model : callable
        Called as ``model(x_train, y_train, x_test, y_test, label, fold_no)``
        with NumPy arrays; the target arrays are flattened to one dimension.
    num_folds : int, optional
        Number of folds to read, numbered from 1 (default 10).
    """
    fold_dir = os.path.join('folds', fold)

    def _load(fold_no, part):
        # Read one stored part of a fold as a plain NumPy array.
        csv_path = os.path.join(fold_dir, f'{fold_no}_{part}.csv')
        return pd.read_csv(csv_path, index_col=False).to_numpy()

    for fold_no in range(1, num_folds + 1):
        x_train = _load(fold_no, 'x_train')
        y_train = _load(fold_no, 'y_train')
        x_test  = _load(fold_no, 'x_test')
        y_test  = _load(fold_no, 'y_test')

        # Targets are stored as a single-column table; flatten them to 1-D.
        y_train = np.ravel(y_train, order = 'C')
        y_test = np.ravel(y_test, order = 'C')

        print('------------------------------------------------------------------------')
        print(f'Fold {fold_no} ...')
        model(x_train, y_train, x_test, y_test, label, fold_no)

In [None]:
def svm(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit an RBF support-vector classifier on one fold and store predictions.

    The predicted class of each test row is the class with the highest
    one-vs-rest decision score.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels; ``y_test`` is only written out next to the
        predictions.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number forwarded to ``save_to_file``.
    """
    print('Fitting model')
    clf = SVC(decision_function_shape='ovr', kernel='rbf', C=100, gamma=100)
    clf.fit(x_train, y_train)

    print('Predicting')
    decision = clf.decision_function(x_test)

    if decision.ndim == 1:
        # Two-class problem: a single score per row, positive meaning the
        # second entry of classes_.
        winners = (decision > 0).astype(int)
    else:
        winners = np.argmax(decision, axis=1)

    # Score columns follow clf.classes_, so map the winning column back to
    # the real label; the raw column index is only correct when every label
    # 0..k-1 is present in the training fold.
    class_per_prediction = clf.classes_[winners].tolist()
    save_to_file([class_per_prediction, y_test], label, fold_no)

In [None]:
def mlp(x_train, y_train, x_test, y_test, label, fold_no):
    """Train a two-hidden-layer perceptron on one fold and store predictions.

    The network is 256 -> 128 -> 5 units with ReLU activations, 20 % dropout
    after each hidden layer and a softmax output, trained for 50 epochs with
    Adam on sparse categorical cross-entropy.

    Parameters
    ----------
    x_train, y_train : numpy.ndarray
        Training features and labels.
        NOTE(review): the 5-unit output assumes integer labels 0-4 - confirm.
    x_test, y_test : numpy.ndarray
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    model = Sequential()
    model.add(Input(shape=(x_train.shape[1],)))
    model.add(Dense(256, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(5, activation = 'softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])

    print('Fitting model')
    model.fit(x_train, y_train, batch_size=64, epochs=50)

    print('Predicting')
    # evaluate() returns [loss, accuracy] for the metrics compiled above;
    # report the accuracy like the other model functions do.
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)
    print(f'Score for fold {fold_no}: {accuracy}')

    predicted = model.predict(x_test)

    # One row of class probabilities per sample: keep the most likely class.
    class_per_prediction = np.argmax(predicted, axis=1).tolist()
    save_to_file([class_per_prediction, y_test], label, fold_no)

In [None]:
def dt(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit a default decision tree on one fold, print its score, save predictions.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    print('Fitting model')
    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)

    print('Predicting')
    fold_score = tree.score(x_test, y_test)
    print(f'Score for fold {fold_no}: {fold_score}')

    save_to_file([tree.predict(x_test), y_test], label, fold_no)

In [None]:
def gb(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit a gradient-boosting classifier on one fold, print its score, save predictions.

    The ensemble uses 100 estimators of depth 5 with a learning rate of 1.0.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    print('Fitting model')
    booster = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=5,
    )
    booster.fit(x_train, y_train)

    print('Predicting')
    fold_score = booster.score(x_test, y_test)
    print(f'Score for fold {fold_no}: {fold_score}')

    fold_predictions = booster.predict(x_test)
    save_to_file([fold_predictions, y_test], label, fold_no)

In [None]:
def ab(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit a 100-estimator AdaBoost classifier on one fold, print its score, save predictions.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    print('Fitting model')
    ensemble = AdaBoostClassifier(n_estimators=100)
    ensemble.fit(x_train, y_train)

    print('Predicting')
    fold_score = ensemble.score(x_test, y_test)
    print(f'Score for fold {fold_no}: {fold_score}')

    outputs = [ensemble.predict(x_test), y_test]
    save_to_file(outputs, label, fold_no)

In [None]:
def rf(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit a 100-tree random forest on one fold, print its score, save predictions.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    print('Fitting model')
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(x_train, y_train)

    print('Predicting')
    fold_score = forest.score(x_test, y_test)
    print(f'Score for fold {fold_no}: {fold_score}')

    fold_predictions = forest.predict(x_test)
    save_to_file([fold_predictions, y_test], label, fold_no)

In [None]:
def knn(x_train, y_train, x_test, y_test, label, fold_no):
    """Fit a 10-nearest-neighbours classifier on one fold, print its score, save predictions.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Test features and labels.
    label : str
        Tag forwarded to ``save_to_file``.
    fold_no : int
        Fold number, used in the log line and forwarded to ``save_to_file``.
    """
    print('Fitting model')
    neighbours = KNeighborsClassifier(n_neighbors=10)
    neighbours.fit(x_train, y_train)

    print('Predicting')
    fold_score = neighbours.score(x_test, y_test)
    print(f'Score for fold {fold_no}: {fold_score}')

    outputs = [neighbours.predict(x_test), y_test]
    save_to_file(outputs, label, fold_no)

In [None]:
def save_to_file(predictions, tag, fold_no):
    """Write the predicted and true labels of one fold under ``results/<tag>/``.

    Two files are produced, ``<fold_no>-predicted.txt`` and
    ``<fold_no>-label.txt``. Each holds the ``str()`` of every value
    concatenated *without any separator*, followed by a single newline.
    NOTE(review): that layout is only unambiguous while every value prints
    as one character; it is kept as is because the reader of these files is
    not visible here.

    Parameters
    ----------
    predictions : sequence
        ``predictions[0]`` is the iterable of predicted classes,
        ``predictions[1]`` the iterable of true labels.
    tag : str
        Sub-directory of ``results``; may itself contain ``/`` to nest runs.
    fold_no : int
        Fold number used as the file-name prefix.
    """
    path = os.path.join('results', tag)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    outputs = (
        ('predicted', predictions[0]),
        ('label', predictions[1]),
    )
    for suffix, values in outputs:
        # Plain write mode is enough: the file is never read back here.
        with open(os.path.join(path, f'{fold_no}-{suffix}.txt'), 'w') as writer:
            writer.write(''.join(map(str, values)))
            writer.write("\n")

# Run tests

In [None]:
# Repeat the AdaBoost evaluation 30 times on the stored 'juliana' folds, each
# repetition under its own 'juliana-ab/<i>' tag. The fold CSVs must already
# exist under folds/juliana/ before this cell runs.
for i in range(30):
    run_model(f'juliana-ab/{i}', 'juliana', ab)

In [None]:
# Evaluate k-nearest-neighbours on the 'juliana-negative' folds.
# NOTE(review): no visible cell writes folds/juliana-negative/ - confirm these
# files are produced elsewhere.
run_model(label='juliana-knn', fold='juliana-negative', model=knn)

# Generate folds

In [None]:
# Build and store the folds for the Juliana cohort. The frame list makes it
# easy to pool several hospitals; the row index is renumbered from 0.
gen_folds(pd.concat([juliana], ignore_index=True), 'juliana')

In [None]:
# Feature labels: the one-hot 'Presenting problem' indicators followed by age
# and the four vital signs.
# NOTE(review): not referenced by any other visible cell - confirm it is
# still needed.
columns = [
    f'Presenting problem_{problem}'
    for problem in (
        'Dyspnea',
        'Ear, nose, throat',
        'Fever without source',
        'Gastro-intestinal',
        'Neurological',
        'Other problems',
        'Rash',
        'Trauma',
        'Urinary tract problems',
        'Wounds',
        'local infection/abscess',
    )
] + [
    'Age',
    'Respiratory rate',
    'Heart rate',
    'Temperature',
    'Oxygen saturation',
]

# Analysis

In [None]:
# Per-triage-level summary statistics for the Juliana cohort, printed as LaTeX.
# `juliana` is a filtered slice of the main frame, so the dtype cast goes
# through `assign` (which returns a new frame) instead of writing into the
# slice, which would raise SettingWithCopyWarning.
juliana = juliana.assign(**{'Oxygen saturation': juliana['Oxygen saturation'].astype('float64')})
print(juliana.groupby('Original MTS').describe().to_latex())

In [None]:
# Per-triage-level summary statistics for the Sophia cohort, printed as LaTeX.
# `sophia` is a filtered slice of the main frame, so the dtype cast goes
# through `assign` (which returns a new frame) instead of writing into the
# slice, which would raise SettingWithCopyWarning.
sophia = sophia.assign(**{'Oxygen saturation': sophia['Oxygen saturation'].astype('float64')})
print(sophia.groupby('Original MTS').describe().to_latex())