In [6]:
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import matplotlib.pyplot as plt
import seaborn as sns


role = get_execution_role()
bucket='hack4med'

przyjecie_file = 'CRACoV-PRZYJECIE.csv'
biochemia_file = 'CRACoV-BIOCHEMIA_3.csv'
echo_file = 'CRACoV-ECHO.csv'
nefro_file = '_CRACoV-NEFRO.csv'
mapowanie_tomografii_file = 'CRACoV-MAPOWANIE-TOMOGRAFII.csv'
radio_file = 'CRACoV-RADIO.csv'
etykiety_file = 'CRACoV-ETYKIETY.csv'

def load_csv(file, skiprows=0):
    data_location = 's3://{}/{}'.format(bucket, file)
    return pd.read_csv(data_location, skiprows=skiprows)

# tutaj mamy wszystkie wczytane dane
przyjecia = load_csv(przyjecie_file)
biochemia = load_csv(biochemia_file)
echo = load_csv(echo_file)
# nefro ma zwalony pierwszy wiersz
nefro = load_csv(nefro_file, 1)
mapowanie_tomografii = load_csv(mapowanie_tomografii_file)
radio = load_csv(radio_file)
etykiety = load_csv(etykiety_file)

def clean_data(df):
    df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
    
    df = df.replace('nie', 0)
    df = df.replace('nie wiadomo', 1)
    df = df.replace('tak', 2)
    
    df = df.replace('covid', 1)
    df = df.replace('inny (współistniejący covid)', 1)
    
    df = df.fillna(0)
    return df

def lpAndidLabToInt(df):
    return df.astype({"LP.": int, "ID_LAB": int})

def createId(df):
    df["ID"] = df["LP."] + 100000*df["ID_LAB"]
    df = df.astype({"ID": int})
    df.drop('LP.', inplace=True, axis=1)
    df.drop('ID_LAB', inplace=True, axis=1)
    return df

def load_and_parse(file):
    df = load_csv(file)
    df = clean_data(df)
    df = lpAndidLabToInt(df)
    df = createId(df)
    return df





def load_przyjecia():
    df = load_and_parse(przyjecie_file)
    df = df.astype({"WIEK": int, "PRZENIESIENIE": int, "HOSP_PRZYCZ": int, "NT": int, "DM": int, "ASTMA": int, "POCHP": int, "HF": int, "AF": int, "UDAR": int, "CHD": int, "MI": int, "ZAP_PLUC": int, "PCHN": int, "DEKSAMETEZON": int, "HDCZ": int, "BB": int, "STATYNA": int, "ASA": int, "NOAC": int, "MRA": int, "ACE": int, "SARTANY": int, "CA_BLOKER": int})
    df = df.filter(items=['ID', 'PLEC', 'WIEK', 'WZROST', 'PRZENIESIENIE', 'HOSP_PRZYCZ', 'MASA_CIALA', 'BMI', 'RRS', 'RRD', 'PO2_ATM', 'ODDECH', 'AS', 'NT', 'DM', 'ASTMA', 'POCHP', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC', 'PCHN', 'DEKSAMETEZON', 'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER'])
    return df

def load_biochemia():    
    df = load_and_parse(biochemia_file)
    df = df.filter(items=['ID', 'KOD', 'WYNIK'])
    df = df.drop_duplicates(subset=['ID', 'KOD'], keep='last')
    df = df.pivot(index="ID", columns="KOD", values="WYNIK")
    df = df.applymap(lambda x: x.replace('<', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace('>', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(' mg/l', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df = df.astype({"n58.11.11342_pct": float, 'i81.11.1112_crp': float, 'g49.122.1113_dd': float, 'm05_il-6': float, 'o59_tnhs': float, 'n11.126.20.1cito_mlecz': float, 'm37.11.191_krea': float, 'c55.103.02_wbc': float, 'c55.103.02_plt': float})
    return df


def load_echo():
    df = load_and_parse(echo_file)
    return df

# def load_nefro():
#     return load_csv(nefro_file, 1)

def load_mapowanie_tomografii():
    df = load_and_parse(mapowanie_tomografii_file)
    return df


def load_radio():
    df = load_and_parse(radio_file)
    return df


def load_etykiety():
    df = load_and_parse(etykiety_file)
    return df



def all_data():
    joined = pd.merge(load_przyjecia(), load_biochemia(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_echo(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_mapowanie_tomografii(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_radio(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_etykiety(), on=["ID"], how='outer')
    joined = joined.fillna(0)
    return joined

data = all_data()

data

Unnamed: 0,ID,PLEC,WIEK,WZROST,PRZENIESIENIE,HOSP_PRZYCZ,MASA_CIALA,BMI,RRS,RRD,...,Volume of ground glass in both lungs (cm3),Consolidation volume of both lungs (cm3),Proportion of lung lesions (%),Proportion of ground glass in both lungs (%),Consolidation of both lungs accounted for (%),Pneumonia grade,ZGON_SZPITALNY,PROGRESJA,OIT,ZGON_LUB_OIT
0,1,0,84,178.0,2,1,65.0,20.51,98.0,57.0,...,0.00,0.00,0.00,0.00,0.00,0,1,1,0,1
1,2,0,83,170.0,0,1,75.0,25.95,122.0,75.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,0,0
2,5,0,78,178.0,0,1,90.0,28.40,119.0,65.0,...,1064.70,158.17,24.54,21.56,3.20,critical,1,0,1,1
3,3,0,76,178.0,0,1,75.0,23.67,145.0,90.0,...,0.00,0.00,0.00,0.00,0.00,0,1,0,0,1
4,10,1,72,163.0,0,1,77.0,28.98,131.0,98.0,...,1889.63,660.23,67.30,50.35,17.59,critical,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,49000496,1,65,168.0,0,1,102.0,36.13,150.0,84.0,...,439.32,62.66,9.02,7.97,1.14,moderate,0,0,0,0
496,49100497,0,77,171.0,0,1,86.0,29.41,149.0,91.0,...,1820.57,207.83,42.02,38.05,4.34,critical,0,0,1,1
497,49200498,0,58,0.0,0,0,0.0,0.00,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,0,0
498,49300499,1,66,167.0,0,1,93.2,33.41,130.0,85.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,1,1


In [12]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

labels_in_datas = [
                   'PLEC',
                   'ODDECH',
                   'AS',
                   'POCHP',
                   'HF',
                   'CHD',
                   'PCHN',
                  ]

data = all_data().filter(labels_in_data + ['ZGON_LUB_OIT'])

for header in labels_in_data:
    data = data.astype({header: np.float32})
    
random_state=200

dead = data[data['ZGON_LUB_OIT'] == 1]
alive = data[data['ZGON_LUB_OIT'] == 0]

train_dead=dead.sample(frac=1, random_state=random_state)

train_alive=alive.sample(frac=1, random_state=random_state)
train = pd.concat([train_dead, train_alive])

train

Unnamed: 0,WIEK,n58.11.11342_pct,g49.122.1113_dd,m05_il-6,o59_tnhs,m37.11.191_krea,c55.103.02_wbc,WZROST,BMI,PCHN,DEKSAMETEZON,ZGON_LUB_OIT
445,77.0,0.00,0.00,0.000000,0.000000,0.000000,0.00,161.0,25.840000,2.0,0.0,1
263,66.0,0.05,0.77,7.550000,39.080002,141.000000,4.41,172.0,22.980000,2.0,0.0,1
314,37.0,0.00,1.19,120.000000,0.000000,45.000000,4.60,156.0,34.509998,0.0,0.0,1
285,71.0,0.32,0.74,128.600006,17.820000,120.000000,7.62,160.0,37.490002,0.0,0.0,1
412,72.0,0.00,0.00,0.000000,0.000000,0.000000,0.00,182.0,30.180000,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
336,65.0,0.12,0.90,6.720000,12.490000,66.699997,10.63,163.0,28.219999,0.0,0.0,0
363,69.0,0.10,1.19,31.830000,10.510000,80.800003,7.45,176.0,34.730000,0.0,0.0,0
309,41.0,0.05,4.27,17.459999,2.770000,69.000000,3.57,180.0,27.160000,0.0,0.0,0
125,57.0,0.26,1.06,32.520000,9.180000,91.199997,12.69,176.0,32.279999,0.0,0.0,0


In [46]:
import pickle

good = np.load('inv_covmat_PLEC_god.npy')
bad = np.load('inv_covmat_PLEC_bad.npy')

data_mean_good = np.load('data_mean_PLEC_god.npy')
data_mean_bad = np.load('data_mean_PLEC_bad.npy')

pickle.dump([bad, data_mean_bad, good, data_mean_good], open("hindus_model.dat", "wb"))

print('saved')

saved


In [8]:
model = pickle.load(open("hindus_model.dat", "rb"))

model

Unnamed: 0,WIEK,n58.11.11342_pct,g49.122.1113_dd,m05_il-6,o59_tnhs,m37.11.191_krea,c55.103.02_wbc,WZROST,BMI,PCHN,DEKSAMETEZON,ZGON_LUB_OIT
445,77.0,0.00,0.00,0.000000,0.000000,0.000000,0.00,161.0,25.840000,2.0,0.0,1
263,66.0,0.05,0.77,7.550000,39.080002,141.000000,4.41,172.0,22.980000,2.0,0.0,1
314,37.0,0.00,1.19,120.000000,0.000000,45.000000,4.60,156.0,34.509998,0.0,0.0,1
285,71.0,0.32,0.74,128.600006,17.820000,120.000000,7.62,160.0,37.490002,0.0,0.0,1
412,72.0,0.00,0.00,0.000000,0.000000,0.000000,0.00,182.0,30.180000,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
186,56.0,0.00,0.19,7.240000,16.600000,0.000000,0.00,182.0,36.220001,0.0,0.0,0
8,59.0,0.07,15.39,12.870000,2.500000,64.699997,4.21,180.0,30.860001,0.0,0.0,0
213,42.0,0.04,0.70,22.540001,4.360000,82.900002,4.52,183.0,28.959999,0.0,0.0,0
421,61.0,0.00,0.00,0.000000,0.000000,0.000000,0.00,164.0,29.740000,0.0,0.0,0


In [9]:
test = data.sample(n = 1)

test['mahalanobis_bad'] = mahalanobis(x=test[all_cols], data=model[model['ZGON_LUB_OIT'] == 1][all_cols])
test['mahalanobis_good'] = mahalanobis(x=test[all_cols], data=model[model['ZGON_LUB_OIT'] == 0][all_cols])

test['mahalanobis_ZGON_LUB_OIT'] = (test['DEKSAMETEZON'] >= 0.1) | ((test['DEKSAMETEZON'] < 0.1) & (test['mahalanobis_bad'] <= test['mahalanobis_good']))

test['mahalanobis_ZGON_LUB_OIT']

444    False
Name: mahalanobis_ZGON_LUB_OIT, dtype: bool