<a href="https://colab.research.google.com/github/gokullan/early-ARDS-prediction/blob/main/Early_Prediction_of_ARDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Setup
from google.colab import auth
from google.cloud import bigquery
from google.colab import data_table

project = 'fit-parity-376107' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project, location=location)
data_table.enable_dataframe_formatter()
auth.authenticate_user()

## Load data from BigQuery

In [None]:
table_id = 'fit-parity-376107.resultsdata.abvvs_2'
dataset = client.get_table(table_id)
# convert to Dataframe
dataset = client.list_rows(dataset).to_dataframe()

## Exploring the Data

For details on how the data was selected from the `mimiciv` database, please visit [this link](https://annipriscillaa.atlassian.net/l/cp/z107dHaM) 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def display_col_names(cols):
  for num in range(1, len(cols) + 1):
    print("{}. {}".format(num, cols[num - 1]))

columns = dataset.columns
display_col_names(dataset.columns)

1. age
2. hadm_id
3. sub_id
4. stay_id
5. c_time
6. respiratory_rate_set
7. respiratory_rate_total
8. respiratory_rate_spontaneous
9. minute_volume
10. tidal_volume_set
11. tidal_volume_observed
12. tidal_volume_spontaneous
13. plateau_pressure
14. peep
15. fio2
16. flow_rate
17. ventilator_mode
18. ventilator_mode_hamilton
19. ventilator_type
20. heart_rate
21. sbp
22. dbp
23. mbp
24. sbp_ni
25. dbp_ni
26. mbp_ni
27. resp_rate
28. temperature
29. temperature_site
30. spo2
31. glucose
32. charttime
33. specimen
34. so2
35. po2
36. pco2
37. fio2_chartevents
38. aado2
39. aado2_calc
40. pao2fio2ratio
41. ph
42. baseexcess
43. bicarbonate
44. totalco2
45. hematocrit
46. hemoglobin
47. carboxyhemoglobin
48. methemoglobin
49. chloride
50. calcium
51. potassium
52. sodium
53. lactate


In [None]:
def plot

In [None]:
def get_categorical_cols(all_cols):
  num_cols = dataset._get_numeric_data().columns
  categorical = list(set(all_cols) - set(num_cols))
  return categorical

display_col_names(get_categorical_cols(columns))

1. temperature_site
2. ventilator_type
3. ventilator_mode
4. specimen
5. temperature
6. charttime
7. c_time
8. ventilator_mode_hamilton


### Selection of relevant parameters

In [None]:
cols_eliminated = ['sub_id',
 'aado2',
 'aado2_calc',
 'fio2_chartevents',
 'baseexcess',
 'bicarbonate',
 'totalco2',
 'hematocrit',
 'hemoglobin',
 'carboxyhemoglobin',
 'methemoglobin',
 'chloride',
 'calcium',
 'potassium',
 'sodium',
 'lactate']

cols_selected = list(set(dataset._get_numeric_data().columns) - set(cols_eliminated))
cols_selected = ['c_time'] + cols_selected
samples_all = dataset[cols_selected]

In [None]:
cols_selected

['c_time',
 'dbp_ni',
 'hadm_id',
 'pco2',
 'spo2',
 'plateau_pressure',
 'heart_rate',
 'sbp_ni',
 'stay_id',
 'tidal_volume_observed',
 'sbp',
 'ph',
 'fio2',
 'dbp',
 'respiratory_rate_spontaneous',
 'glucose',
 'peep',
 'respiratory_rate_set',
 'age',
 'mbp_ni',
 'tidal_volume_set',
 'respiratory_rate_total',
 'flow_rate',
 'mbp',
 'po2',
 'so2',
 'pao2fio2ratio',
 'minute_volume',
 'resp_rate',
 'tidal_volume_spontaneous']

### Fill missing values using 'forward fill' method

In [None]:
samples_all = samples_all.fillna(method='ffill')
n = len(samples_all)

### Order all samples by their `stay_id`s, then by their `c_time`s

In [None]:
samples_grouped = samples_all.groupby(by='stay_id')
grouped_and_sorted = []
for name, group in samples_grouped.__iter__():
  # consider only those records that have at least 10 samples
  if (len(group) > 10):
    grouped_and_sorted.append(group.sort_values(by='c_time').reset_index(drop=True))

In [None]:
len(grouped_and_sorted)

5084

### Prepare the training set

In [None]:
def find_onset_time(stay):
  for index, row in stay.iterrows():
    if row['peep'] >= 5 and row['pao2fio2ratio'] < 300:
      return index
  return -1

In [None]:
positive = []
negative = []
for group in grouped_and_sorted:
  onset_time_index = find_onset_time(group)
  if onset_time_index != -1:
    duration = group.iloc[onset_time_index]['c_time'] - group.iloc[0]['c_time']
    duration = duration.total_seconds() / 3600
    if duration >= 6 and onset_time_index >= 9:
      positive.append(group.iloc[:onset_time_index + 1])
  else:
    negative.append(group)
print("{} {}".format(len(positive), len(negative)))

84 54


In [None]:
def make_rnn_samples(samples, lookback):
  rnn_samples = []
  for sample in samples:
    n = len(sample)
    for i in range(n - lookback):
      rnn_samples.append(sample.iloc[i: i + lookback])
  return rnn_samples

In [None]:
positive_rnn = make_rnn_samples(positive, 5)
negative_rnn = make_rnn_samples(negative, 5)
print("No. of positive samples: {}".format(len(positive_rnn)))
print("No. of negative samples: {}".format(len(negative_rnn)))

No. of positive samples: 717
No. of negative samples: 494


In [None]:
rnn_samples_all = positive_rnn + negative_rnn
labels_all = [1 for i in range(len(positive_rnn))] + [0 for i in range(len(negative_rnn))]

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

X, y = shuffle(rnn_samples_all, labels_all)

In [None]:
import numpy as np
X = [x.drop(columns=['hadm_id', 'stay_id', 'c_time', 'glucose', 'sbp_ni', 'dbp_ni', 'mbp_ni']).to_numpy() for x in X]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()
X_train_s = np.array([scalar.fit_transform(x) for x in X_train]).astype(np.float32)
X_test_s = np.array([scalar.fit_transform(x) for x in X_test]).astype(np.float32)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential

In [None]:
# parameters
n_inputs = 24  # number of features
n_neurons = 100
n_outputs = 2
n_layers = 2
batch_size = 32
n_epochs = 20

In [None]:
RNNcells = [tf.keras.layers.SimpleRNNCell(n_neurons) for _ in range(n_layers)]
rnn = tf.keras.layers.StackedRNNCells(RNNcells, input_shape = (5, n_inputs))
RNNmodel = Sequential()
RNNmodel.add(layers.RNN(rnn))
# RNNmodel.add(layers.Dense(64, activation='relu'))
RNNmodel.add(layers.Dense(n_outputs, activation='softmax'))

In [None]:
RNNmodel.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
  )

In [None]:
y_train = np.array(y_train).astype(np.int32)
y_test = np.array(y_test).astype(np.int32)

In [None]:
RNNmodel.fit(
    X_train_s,
    y_train,
    validation_data=(X_test_s, y_test), 
    batch_size=batch_size, 
    epochs=n_epochs,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb655915fa0>

In [None]:
RNNmodel.evaluate(
    X_test_s,
    y_test
)



[1.0634305477142334, 0.5659340620040894]