In [161]:
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

from sklearn.model_selection import train_test_split

role = get_execution_role()
bucket='hack4med'

przyjecie_file = 'CRACoV-PRZYJECIE.csv'
biochemia_file = 'CRACoV-BIOCHEMIA_3.csv'
echo_file = 'CRACoV-ECHO.csv'
nefro_file = '_CRACoV-NEFRO.csv'
mapowanie_tomografii_file = 'CRACoV-MAPOWANIE-TOMOGRAFII.csv'
radio_file = 'CRACoV-RADIO.csv'
etykiety_file = 'CRACoV-ETYKIETY.csv'

def load_csv(file, skiprows=0):
    data_location = 's3://{}/{}'.format(bucket, file)
    return pd.read_csv(data_location, skiprows=skiprows)

# tutaj mamy wszystkie wczytane dane
przyjecia = load_csv(przyjecie_file)
biochemia = load_csv(biochemia_file)
echo = load_csv(echo_file)
# nefro ma zwalony pierwszy wiersz
nefro = load_csv(nefro_file, 1)
mapowanie_tomografii = load_csv(mapowanie_tomografii_file)
radio = load_csv(radio_file)
etykiety = load_csv(etykiety_file)

def clean_data(df):
    df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
    
    df = df.replace('nie', 0)
    df = df.replace('nie wiadomo', 1)
    df = df.replace('tak', 2)
    
    df = df.replace('covid', 1)
    df = df.replace('inny (współistniejący covid)', 1)
    
    df = df.fillna(0)
    return df

def lpAndidLabToInt(df):
    return df.astype({"LP.": int, "ID_LAB": int})

def createId(df):
    df["ID"] = df["LP."] + 100000*df["ID_LAB"]
    df = df.astype({"ID": int})
    df.drop('LP.', inplace=True, axis=1)
    df.drop('ID_LAB', inplace=True, axis=1)
    return df

def load_and_parse(file):
    df = load_csv(file)
    df = clean_data(df)
    df = lpAndidLabToInt(df)
    df = createId(df)
    return df





def load_przyjecia():
    df = load_and_parse(przyjecie_file)
    df
#     df = df.astype({"WIEK": int, "PRZENIESIENIE": int, "HOSP_PRZYCZ": int, "NT": int, "DM": int, "ASTMA": int, "POCHP": int, "HF": int, "AF": int, "UDAR": int, "CHD": int, "MI": int, "ZAP_PLUC": int, "PCHN": int, "DEKSAMETEZON": int, "HDCZ": int, "BB": int, "STATYNA": int, "ASA": int, "NOAC": int, "MRA": int, "ACE": int, "SARTANY": int, "CA_BLOKER": int})
#     df = df.filter(items=['ID', 'PLEC', 'WIEK', 'WZROST', 'PRZENIESIENIE', 'HOSP_PRZYCZ', 'MASA_CIALA', 'BMI', 'RRS', 'RRD', 'PO2_ATM', 'ODDECH', 'AS', 'NT', 'DM', 'ASTMA', 'POCHP', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC', 'PCHN', 'DEKSAMETEZON', 'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER'])
    return df

def load_biochemia():
    df = load_and_parse(biochemia_file)
    df = df.filter(items=['ID', 'KOD', 'WYNIK'])
    df = df.drop_duplicates(subset=['ID', 'KOD'], keep='last')
    df = df.pivot(index="ID", columns="KOD", values="WYNIK")
    df = df.applymap(lambda x: x.replace('<', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace('>', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(' mg/l', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    return df


def load_echo():
    df = load_and_parse(echo_file)
    return df

# def load_nefro():
#     return load_csv(nefro_file, 1)

def load_mapowanie_tomografii():
    df = load_and_parse(mapowanie_tomografii_file)
    return df


def load_radio():
    df = load_and_parse(radio_file)
    return df


def load_etykiety():
    df = load_and_parse(etykiety_file)
    return df



def all_data():
    joined = pd.merge(load_przyjecia(), load_biochemia(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_echo(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_mapowanie_tomografii(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_radio(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_etykiety(), on=["ID"], how='outer')
    joined = joined.fillna(0)
    return joined


In [169]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

labels_in_data = ['WIEK',  'WZROST', 'BMI', 'DM', 'DEKSAMETEZON', 'ZAP_PLUC']
    
data = all_data().filter(labels_in_data + ['ZGON_LUB_OIT'])

for header in labels_in_data:
    print(header)
    data = data.astype({header: np.float32})


# file = 'data_v31.csv'
# data_location = 's3://{}/{}'.format(bucket, file)
# data.to_csv(data_location)

random_state=200

dead = data[data['ZGON_LUB_OIT'] == 1]
alive = data[data['ZGON_LUB_OIT'] == 0]

train_dead=dead.sample(frac=0.9, random_state=random_state)
test_dead=dead.drop(train_dead.index)

train_alive=alive.sample(frac=0.9, random_state=random_state)
test_alive=alive.drop(train_alive.index)

train = pd.concat([train_dead, train_alive])
test = pd.concat([test_dead, test_alive])

train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)



def mahalanobis(x=None, data=None, cov=None):

    x_mu = x - np.mean(data)
    if not cov:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(x_mu, inv_covmat)
    mahal = np.dot(left, x_mu.T)
    return mahal.diagonal()


all_cols = [x for x in labels_in_data if x != 'DEKSAMETEZON']

test['mahalanobis_bad'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 1][all_cols])
test['mahalanobis_good'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 0][all_cols])

test['mahalanobis_ZGON_LUB_OIT'] = test['mahalanobis_bad'] <= test['mahalanobis_good']
test[['ZGON_LUB_OIT', 'mahalanobis_ZGON_LUB_OIT', 'mahalanobis_good','mahalanobis_bad']]

np.count_nonzero(test['mahalanobis_ZGON_LUB_OIT'] == True)
np.count_nonzero(test['ZGON_LUB_OIT'] == True)
godCount = np.count_nonzero(test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])
allCount = len(test)

tpr = np.count_nonzero((test['ZGON_LUB_OIT'] == 0) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 0))
tnr = np.count_nonzero((test['ZGON_LUB_OIT'] == 1) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 1))

print('{} / {} = {}%'.format(godCount, allCount, int(godCount/allCount*100)))
print('tpr = {}'.format(tpr))
print('tnr = {}'.format(tnr))

WIEK
WZROST
BMI
DM
DEKSAMETEZON
ZAP_PLUC
42 / 50 = 84%
tpr = 0.8444444444444444
tnr = 0.8


In [242]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

labels_in_datas = [
    'WIEK', 
                   'PLEC', 
#                    'n58.11.11342_pct', 
#                    'i81.11.1112_crp', 
#                    'g49.122.1113_dd', 
#                    'm05_il-6', 
#                    'o59_tnhs', 
#                    'n11.126.20.1cito_mlecz',
#                    'm37.11.191_krea', 
#                    'c55.103.02_wbc', 
#                    'c55.103.02_plt', 
#                    'WZROST', 
#                    'MASA_CIALA', 
                   'BMI', 
                   'RRS', 
#                    'RRD', 
#                    'PO2_ATM', 
#                    'ODDECH', 
                   'AS', 
#                    'NT', 
#                    'DM', 
                   'ASTMA', 
                   'POCHP', 
                   'HF', 
                   'AF', 
                   'UDAR', 
                   'CHD', 
#                    'MI', 
                   'ZAP_PLUC', 
                   'PCHN', 
#                    'DEKSAMETEZON', 
                   'HDCZ', 
                   'BB',
                   'STATYNA', 
#                    'ASA', 
                   'NOAC', 
#                    'MRA', 
                   'ACE', 
                   'SARTANY', 
                   'CA_BLOKER'
                  ]

for l in labels_in_datas:
    labels_in_data = [x for x in labels_in_datas if x != l]
    
    print('wywalic {}'.format(l))
    
    data = all_data().filter(labels_in_data + ['ZGON_LUB_OIT'])

    # x = list(data['OBJAWY_DATA'].unique())
    # x.remove(0)
    # for _x in x:
    #     data = data.replace(_x, 1)


    for header in labels_in_data:
        data = data.astype({header: np.float32})


    # file = 'data_v31.csv'
    # data_location = 's3://{}/{}'.format(bucket, file)
    # data.to_csv(data_location)

    random_state=200

    dead = data[data['ZGON_LUB_OIT'] == 1]
    alive = data[data['ZGON_LUB_OIT'] == 0]

    train_dead=dead.sample(frac=0.8, random_state=random_state)
    test_dead=dead.drop(train_dead.index)

    train_alive=alive.sample(frac=0.8, random_state=random_state)
    test_alive=alive.drop(train_alive.index)

    train = pd.concat([train_dead, train_alive])
    test = pd.concat([test_dead, test_alive])

    train = train.sample(frac=1).reset_index(drop=True)
    test = test.sample(frac=1).reset_index(drop=True)



    def mahalanobis(x=None, data=None, cov=None, pref=''):

        x_mu = x - np.mean(data)
        np.save('data_mean_{}'.format(pref),np.mean(data))
        if not cov:
            cov = np.cov(data.values.T)
        inv_covmat = np.linalg.inv(cov)
        np.save('inv_covmat_{}'.format(pref),inv_covmat)
        left = np.dot(x_mu, inv_covmat)
        mahal = np.dot(left, x_mu.T)
        return mahal.diagonal()


    #  'DEKSAMETEZON' , 'HOSP_PRZYCZ'
    # all_cols = ['WIEK', 'PRZENIESIENIE', 'MASA_CIALA', 'RRS', 'RRD', 'ODDECH', 'NT', 'DM', 'ASTMA', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC', 'PCHN', 'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER']
    all_cols = [x for x in labels_in_data if x != 'DEKSAMETEZON']

    test['mahalanobis_bad'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 1][all_cols], pref='{}_bad'.format(l))
    test['mahalanobis_good'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 0][all_cols], pref='{}_god'.format(l))

    # for multiplier in np.arange(0.1, 100.0, 0.1):
    #     if(multiplier.is_integer()):
    #         print('{}%'.format(multiplier))

    #     test['mahalanobis_bad'] = np.minimum(test['mahalanobis_bad'], mahalanobis(x=test[all_cols]*multiplier, data=train[train['ZGON_LUB_OIT'] == 1][all_cols]))
    #     test['mahalanobis_good'] = np.minimum(test['mahalanobis_good'], mahalanobis(x=test[all_cols]*multiplier, data=train[train['ZGON_LUB_OIT'] == 0][all_cols]))


    test['mahalanobis_ZGON_LUB_OIT'] = test['mahalanobis_bad'] <= test['mahalanobis_good']
    test[['ZGON_LUB_OIT', 'mahalanobis_ZGON_LUB_OIT', 'mahalanobis_good','mahalanobis_bad']]

    np.count_nonzero(test['mahalanobis_ZGON_LUB_OIT'] == True)
    np.count_nonzero(test['ZGON_LUB_OIT'] == True)
    godCount = np.count_nonzero(test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])
    allCount = len(test)

    tpr = np.count_nonzero((test['ZGON_LUB_OIT'] == 0) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 0))
    tnr = np.count_nonzero((test['ZGON_LUB_OIT'] == 1) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 1))

    print('{} / {} = {}%'.format(godCount, allCount, int(godCount/allCount*100)))
    print('tpr = {}'.format(tpr))
    print('tnr = {}'.format(tnr))
    
    
    
print('wywalic nic')
labels_in_data = labels_in_datas
data = all_data().filter(labels_in_data + ['ZGON_LUB_OIT'])

# x = list(data['OBJAWY_DATA'].unique())
# x.remove(0)
# for _x in x:
#     data = data.replace(_x, 1)


for header in labels_in_data:
    data = data.astype({header: np.float32})


# file = 'data_v31.csv'
# data_location = 's3://{}/{}'.format(bucket, file)
# data.to_csv(data_location)

random_state=200

dead = data[data['ZGON_LUB_OIT'] == 1]
alive = data[data['ZGON_LUB_OIT'] == 0]

train_dead=dead.sample(frac=0.9, random_state=random_state)
test_dead=dead.drop(train_dead.index)

train_alive=alive.sample(frac=0.9, random_state=random_state)
test_alive=alive.drop(train_alive.index)

train = pd.concat([train_dead, train_alive])
test = pd.concat([test_dead, test_alive])

train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)



def mahalanobis(x=None, data=None, cov=None, pref=''):

    x_mu = x - np.mean(data)
    if not cov:
        cov = np.cov(data.values.T)
    inv_covmat = np.linalg.inv(cov)
    np.save('inv_covmat_{}'.format(pref),inv_covmat)
    left = np.dot(x_mu, inv_covmat)
    mahal = np.dot(left, x_mu.T)
    return mahal.diagonal()


#  'DEKSAMETEZON' , 'HOSP_PRZYCZ'
# all_cols = ['WIEK', 'PRZENIESIENIE', 'MASA_CIALA', 'RRS', 'RRD', 'ODDECH', 'NT', 'DM', 'ASTMA', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC', 'PCHN', 'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER']
all_cols = [x for x in labels_in_data if x != 'DEKSAMETEZON']

test['mahalanobis_bad'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 1][all_cols], pref='bad')
test['mahalanobis_good'] = mahalanobis(x=test[all_cols], data=train[train['ZGON_LUB_OIT'] == 0][all_cols], pref='good')

# for multiplier in np.arange(0.1, 100.0, 0.1):
#     if(multiplier.is_integer()):
#         print('{}%'.format(multiplier))

#     test['mahalanobis_bad'] = np.minimum(test['mahalanobis_bad'], mahalanobis(x=test[all_cols]*multiplier, data=train[train['ZGON_LUB_OIT'] == 1][all_cols]))
#     test['mahalanobis_good'] = np.minimum(test['mahalanobis_good'], mahalanobis(x=test[all_cols]*multiplier, data=train[train['ZGON_LUB_OIT'] == 0][all_cols]))


# test['mahalanobis_ZGON_LUB_OIT'] = (test['DEKSAMETEZON'] >= 0.1) | ((test['DEKSAMETEZON'] < 0.1) & (test['mahalanobis_bad'] <= test['mahalanobis_good']))
test['mahalanobis_ZGON_LUB_OIT'] = test['mahalanobis_bad'] <= test['mahalanobis_good']
test[['ZGON_LUB_OIT', 'mahalanobis_ZGON_LUB_OIT', 'mahalanobis_good','mahalanobis_bad']]

np.count_nonzero(test['mahalanobis_ZGON_LUB_OIT'] == True)
np.count_nonzero(test['ZGON_LUB_OIT'] == True)
godCount = np.count_nonzero(test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])
allCount = len(test)

tpr = np.count_nonzero((test['ZGON_LUB_OIT'] == 0) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 0))
tnr = np.count_nonzero((test['ZGON_LUB_OIT'] == 1) & (test['ZGON_LUB_OIT'] == test['mahalanobis_ZGON_LUB_OIT'])) / np.count_nonzero((test['ZGON_LUB_OIT'] == 1))

print('{} / {} = {}%'.format(godCount, allCount, int(godCount/allCount*100)))
print('tpr = {}'.format(tpr))
print('tnr = {}'.format(tnr))

wywalic WIEK
85 / 100 = 85%
tpr = 0.9213483146067416
tnr = 0.2727272727272727
wywalic PLEC
87 / 100 = 87%
tpr = 0.9438202247191011
tnr = 0.2727272727272727
wywalic BMI
81 / 100 = 81%
tpr = 0.8539325842696629
tnr = 0.45454545454545453
wywalic RRS
85 / 100 = 85%
tpr = 0.9101123595505618
tnr = 0.36363636363636365
wywalic AS
85 / 100 = 85%
tpr = 0.9101123595505618
tnr = 0.36363636363636365
wywalic ASTMA
87 / 100 = 87%
tpr = 0.9438202247191011
tnr = 0.2727272727272727
wywalic POCHP
83 / 100 = 83%
tpr = 0.898876404494382
tnr = 0.2727272727272727
wywalic HF
87 / 100 = 87%
tpr = 0.9438202247191011
tnr = 0.2727272727272727
wywalic AF
84 / 100 = 84%
tpr = 0.898876404494382
tnr = 0.36363636363636365
wywalic UDAR
85 / 100 = 85%
tpr = 0.9325842696629213
tnr = 0.18181818181818182
wywalic CHD
83 / 100 = 83%
tpr = 0.898876404494382
tnr = 0.2727272727272727
wywalic ZAP_PLUC
88 / 100 = 88%
tpr = 0.9438202247191011
tnr = 0.36363636363636365
wywalic PCHN
86 / 100 = 86%
tpr = 0.9550561797752809
tnr = 0.090