In [262]:
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

role = get_execution_role()
bucket='hack4med'

przyjecie_file = 'CRACoV-PRZYJECIE.csv'
biochemia_file = 'CRACoV-BIOCHEMIA_3.csv'
echo_file = 'CRACoV-ECHO.csv'
nefro_file = '_CRACoV-NEFRO.csv'
mapowanie_tomografii_file = 'CRACoV-MAPOWANIE-TOMOGRAFII.csv'
radio_file = 'CRACoV-RADIO.csv'
etykiety_file = 'CRACoV-ETYKIETY.csv'

def load_csv(file, skiprows=0):
    data_location = 's3://{}/{}'.format(bucket, file)
    return pd.read_csv(data_location, skiprows=skiprows)

# tutaj mamy wszystkie wczytane dane
przyjecia = load_csv(przyjecie_file)
biochemia = load_csv(biochemia_file)
echo = load_csv(echo_file)
# nefro ma zwalony pierwszy wiersz
nefro = load_csv(nefro_file, 1)
mapowanie_tomografii = load_csv(mapowanie_tomografii_file)
radio = load_csv(radio_file)
etykiety = load_csv(etykiety_file)

def clean_data(df):
    df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
    
    df = df.replace('nie', 0)
    df = df.replace('nie wiadomo', 1)
    df = df.replace('tak', 2)
    
    df = df.replace('covid', 1)
    df = df.replace('inny (współistniejący covid)', 1)
    
    df = df.fillna(0)
    return df

def lpAndidLabToInt(df):
    return df.astype({"LP.": int, "ID_LAB": int})

def createId(df):
    df["ID"] = df["LP."] + 100000*df["ID_LAB"]
    df = df.astype({"ID": int})
    df.drop('LP.', inplace=True, axis=1)
    df.drop('ID_LAB', inplace=True, axis=1)
    return df

def load_and_parse(file):
    df = load_csv(file)
    df = clean_data(df)
    df = lpAndidLabToInt(df)
    df = createId(df)
    return df





def load_przyjecia():
    df = load_and_parse(przyjecie_file)
    df = df.astype({"WIEK": int, "PRZENIESIENIE": int, "HOSP_PRZYCZ": int, "NT": int, "DM": int, "ASTMA": int, "POCHP": int, "HF": int, "AF": int, "UDAR": int, "CHD": int, "MI": int, "ZAP_PLUC": int, "PCHN": int, "DEKSAMETEZON": int, "HDCZ": int, "BB": int, "STATYNA": int, "ASA": int, "NOAC": int, "MRA": int, "ACE": int, "SARTANY": int, "CA_BLOKER": int})
    df = df.filter(items=['ID', 'PLEC', 'WIEK', 'WZROST', 'PRZENIESIENIE', 'HOSP_PRZYCZ', 'MASA_CIALA', 'BMI', 'RRS', 'RRD', 'PO2_ATM', 'ODDECH', 'AS', 'NT', 'DM', 'ASTMA', 'POCHP', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC', 'PCHN', 'DEKSAMETEZON', 'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER'])
    return df

def load_biochemia():    
    df = load_and_parse(biochemia_file)
    df = df.filter(items=['ID', 'KOD', 'WYNIK'])
    df = df.drop_duplicates(subset=['ID', 'KOD'], keep='last')
    df = df.pivot(index="ID", columns="KOD", values="WYNIK")
    df = df.applymap(lambda x: x.replace('<', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace('>', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.replace(' mg/l', '') if isinstance(x, str) else x)
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    return df


def load_echo():
    df = load_and_parse(echo_file)
    return df

# def load_nefro():
#     return load_csv(nefro_file, 1)

def load_mapowanie_tomografii():
    df = load_and_parse(mapowanie_tomografii_file)
    return df


def load_radio():
    df = load_and_parse(radio_file)
    return df


def load_etykiety():
    df = load_and_parse(etykiety_file)
    return df



def all_data():
    joined = pd.merge(load_przyjecia(), load_biochemia(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_echo(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_mapowanie_tomografii(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_radio(), on=["ID"], how='outer')
    joined = pd.merge(joined, load_etykiety(), on=["ID"], how='outer')
    joined = joined.fillna(0)
    return joined

data = all_data()

data

Unnamed: 0,ID,PLEC,WIEK,WZROST,PRZENIESIENIE,HOSP_PRZYCZ,MASA_CIALA,BMI,RRS,RRD,...,Volume of ground glass in both lungs (cm3),Consolidation volume of both lungs (cm3),Proportion of lung lesions (%),Proportion of ground glass in both lungs (%),Consolidation of both lungs accounted for (%),Pneumonia grade,ZGON_SZPITALNY,PROGRESJA,OIT,ZGON_LUB_OIT
0,1,0,84,178.0,2,1,65.0,20.51,98.0,57.0,...,0.00,0.00,0.00,0.00,0.00,0,1,1,0,1
1,2,0,83,170.0,0,1,75.0,25.95,122.0,75.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,0,0
2,5,0,78,178.0,0,1,90.0,28.40,119.0,65.0,...,1064.70,158.17,24.54,21.56,3.20,critical,1,0,1,1
3,3,0,76,178.0,0,1,75.0,23.67,145.0,90.0,...,0.00,0.00,0.00,0.00,0.00,0,1,0,0,1
4,10,1,72,163.0,0,1,77.0,28.98,131.0,98.0,...,1889.63,660.23,67.30,50.35,17.59,critical,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,49000496,1,65,168.0,0,1,102.0,36.13,150.0,84.0,...,439.32,62.66,9.02,7.97,1.14,moderate,0,0,0,0
496,49100497,0,77,171.0,0,1,86.0,29.41,149.0,91.0,...,1820.57,207.83,42.02,38.05,4.34,critical,0,0,1,1
497,49200498,0,58,0.0,0,0,0.0,0.00,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,0,0
498,49300499,1,66,167.0,0,1,93.2,33.41,130.0,85.0,...,0.00,0.00,0.00,0.00,0.00,0,0,0,1,1


In [271]:
badania = ['n58.11.11342_pct', 'i81.11.1112_crp', 'g49.122.1113_dd', 'm05_il-6', 'o59_tnhs', 'n11.126.20.1cito_mlecz', 'm37.11.191_krea', 'c55.103.02_wbc', 'c55.103.02_plt']


cols = ['PLEC', 'WIEK', 'WZROST', 'PRZENIESIENIE', 'MASA_CIALA', 'BMI', 'RRS', 'RRD', 'PO2_ATM', 'ODDECH', 'AS', 'NT', 'DM', 'ASTMA', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC',  'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER', 'ZGON_LUB_OIT']
all_to_filter = cols + badania

# 'DEKSAMETEZON',
data = data.filter(all_to_filter)

# for header in badania:
#     data = data.astype({header: np.float32})

# dead = data[data['ZGON_LUB_OIT'] == 1]
# dead = dead.head(50)

# alive = data[data['ZGON_LUB_OIT'] == 0]
# alive = alive.head(50)

# data = pd.concat([dead, alive])
# data = data.sample(frac=1).reset_index(drop=True)

data

Unnamed: 0,PLEC,WIEK,WZROST,PRZENIESIENIE,MASA_CIALA,BMI,RRS,RRD,PO2_ATM,ODDECH,...,HDCZ,BB,STATYNA,ASA,NOAC,MRA,ACE,SARTANY,CA_BLOKER,ZGON_LUB_OIT
0,0,84,178.0,2,65.0,20.51,98.0,57.0,98.0,19.0,...,0,2,2,0,0,0,2,0,0,1
1,0,83,170.0,0,75.0,25.95,122.0,75.0,96.0,14.0,...,0,0,0,0,2,0,0,0,2,0
2,0,78,178.0,0,90.0,28.40,119.0,65.0,85.0,20.0,...,0,0,0,0,0,0,2,0,0,1
3,0,76,178.0,0,75.0,23.67,145.0,90.0,0.0,16.0,...,0,1,1,0,0,1,1,1,1,1
4,1,72,163.0,0,77.0,28.98,131.0,98.0,78.0,20.0,...,0,0,0,0,0,0,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,65,168.0,0,102.0,36.13,150.0,84.0,93.0,14.0,...,0,2,2,0,0,0,0,0,0,0
496,0,77,171.0,0,86.0,29.41,149.0,91.0,0.0,14.0,...,2,2,2,2,0,0,0,2,0,1
497,0,58,0.0,0,0.0,0.00,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
498,1,66,167.0,0,93.2,33.41,130.0,85.0,75.0,20.0,...,0,0,0,0,0,0,0,0,0,1


In [272]:
random_state=200

dead = data[data['ZGON_LUB_OIT'] == 1]
alive = data[data['ZGON_LUB_OIT'] == 0]

train_dead=dead.sample(frac=0.8, random_state=random_state)
test_dead=dead.drop(train_dead.index)

train_alive=alive.sample(frac=0.8, random_state=random_state)
test_alive=alive.drop(train_alive.index)

train = pd.concat([train_dead, train_alive])
test = pd.concat([test_dead, test_alive])

train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)


# train, test = train_test_split(data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

320 train examples
80 validation examples
100 test examples


In [273]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('ZGON_LUB_OIT')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

def get_normalization_layer(name, dataset):
  normalizer = preprocessing.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)

  return normalizer

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = preprocessing.StringLookup(max_tokens=max_tokens)
  else:
    index = preprocessing.IntegerLookup(max_values=max_tokens)
  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
  return lambda feature: encoder(index(feature))

batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

train_ds

<PrefetchDataset shapes: ({PLEC: (None,), WIEK: (None,), WZROST: (None,), PRZENIESIENIE: (None,), MASA_CIALA: (None,), BMI: (None,), RRS: (None,), RRD: (None,), PO2_ATM: (None,), ODDECH: (None,), AS: (None,), NT: (None,), DM: (None,), ASTMA: (None,), HF: (None,), AF: (None,), UDAR: (None,), CHD: (None,), MI: (None,), ZAP_PLUC: (None,), HDCZ: (None,), BB: (None,), STATYNA: (None,), ASA: (None,), NOAC: (None,), MRA: (None,), ACE: (None,), SARTANY: (None,), CA_BLOKER: (None,)}, (None,)), types: ({PLEC: tf.int64, WIEK: tf.int64, WZROST: tf.float64, PRZENIESIENIE: tf.int64, MASA_CIALA: tf.float64, BMI: tf.float64, RRS: tf.float64, RRD: tf.float64, PO2_ATM: tf.float64, ODDECH: tf.float64, AS: tf.float64, NT: tf.int64, DM: tf.int64, ASTMA: tf.int64, HF: tf.int64, AF: tf.int64, UDAR: tf.int64, CHD: tf.int64, MI: tf.int64, ZAP_PLUC: tf.int64, HDCZ: tf.int64, BB: tf.int64, STATYNA: tf.int64, ASA: tf.int64, NOAC: tf.int64, MRA: tf.int64, ACE: tf.int64, SARTANY: tf.int64, CA_BLOKER: tf.int64}, tf.

In [274]:
all_inputs = []
encoded_features = []

print('Normalizing numbers')

number_headers = ['WIEK', 'WZROST', 'MASA_CIALA', 'BMI', 'RRS', 'RRD', 'PO2_ATM', 'ODDECH', 'AS']
to_norm_header = number_headers + badania
for header in to_norm_header:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

print('Normalizing categories')

# 'DEKSAMETEZON',
categorical_cols = ['PLEC', 'PRZENIESIENIE', 'NT', 'DM', 'ASTMA', 'HF', 'AF', 'UDAR', 'CHD', 'MI', 'ZAP_PLUC',  'HDCZ', 'BB', 'STATYNA', 'ASA', 'NOAC', 'MRA', 'ACE', 'SARTANY', 'CA_BLOKER']
for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
  encoding_layer = get_category_encoding_layer(header, train_ds, dtype='int64',
                                               max_tokens=5)
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

print('Done')

Normalizing numbers


KeyError: in user code:

    <ipython-input-268-55edeeced9cc>:13 None  *
        feature_ds = dataset.map(lambda x, y: x[name])

    KeyError: 'n58.11.11342_pct'


In [256]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.1)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])


model.fit(train_ds, epochs=15, validation_data=val_ds)

# loss, accuracy = model.evaluate(test_ds)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f717662b410>

In [257]:
test.iloc[0]

PLEC                  1.0
WIEK                 61.0
WZROST              155.0
PRZENIESIENIE         0.0
MASA_CIALA           64.0
                    ...  
o55.11.1137_ft3       0.0
o69.11.1137_ft4       0.0
o77.11.191_ca         0.0
o83.11.1137_b-12      0.0
o91.11.1137_d-3       0.0
Name: 0, Length: 132, dtype: float64

In [261]:
sample = test.iloc[0]
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)
tf.nn.sigmoid(predictions[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>