# 0. Initialize

In [29]:
import os
from helper_code import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import numpy as np
import pandas as pd
from ast import literal_eval

from team_code import get_features, get_outcome, get_times

In [30]:
# Root folder of the raw training data; passed to find_data_folders() and load_challenge_data().
data_folder = "a_data/00_raw/physionet.org/files/i-care/1.0/training"
# Directory for the intermediate artefacts that later cells write and re-read.
output_dir = "a_data/01_intermediate/"
# Forwarded to get_features() as max_hours (presumably an upper bound on the recording hours used - confirm in team_code).
max_hours = 72
# Forwarded to get_features() as min_quality (presumably a lower bound on recording quality - confirm in team_code).
min_quality = 0.0

In [31]:
# Make sure the output directory (and any missing parents) exists.
# exist_ok=True turns this into one idempotent call, so there is no window
# between an existence check and the creation, and re-running the cell is safe.
os.makedirs(output_dir, exist_ok=True)

# 1. Get and prep data

In [32]:
# Discover the patient folders and stop early when there is nothing to load.
patient_ids = find_data_folders(data_folder)
num_patients = len(patient_ids)

if num_patients == 0:
    raise FileNotFoundError('No data was provided.')

# Per-patient accumulators, filled in the same order as patient_ids.
features, recordings, outcomes, cpcs = [], [], [], []

for patient_id in tqdm(patient_ids):
    # Load data.
    patient_metadata, recording_metadata, recording_data = load_challenge_data(data_folder, patient_id)

    # One row per recording: the raw data columns plus the metadata of that recording.
    df_patient_recordings = pd.DataFrame(
        recording_data, columns=["signals", "frequencies", "channels"]
    ).assign(
        quality_score=get_quality_scores(recording_metadata),
        hours=get_hours(recording_metadata),
        times=get_times(recording_metadata),
        patient_id=patient_id,
    )
    recordings.append(df_patient_recordings)

    # Extract features.
    features.append(
        get_features(
            patient_metadata,
            recording_metadata,
            recording_data,
            return_as_dict=True,
            max_hours=max_hours,
            min_quality=min_quality,
        )
    )

    # Extract labels.
    outcomes.append(get_outcome(patient_metadata))
    cpcs.append(get_cpc(patient_metadata))

# Patient-level table: one row per patient with features and both labels.
df_meta = pd.DataFrame(features)
df_meta["patient_id"] = patient_ids
df_meta["outcomes"] = np.vstack(outcomes)
df_meta["cpcs"] = np.vstack(cpcs)

# Recording-level table, restricted to rows that actually carry signal data.
df_recordings = pd.concat(recordings, ignore_index=True)
df_recordings_not_nan = df_recordings.loc[df_recordings["signals"].notna()]

# Attach the patient-level columns to every recording; the join must not add or drop rows.
df_combined = pd.merge(df_recordings_not_nan, df_meta, on="patient_id", how="left")
assert len(df_combined) == len(df_recordings_not_nan), "The number of rows in the combined dataframe should be the same as the number of rows in the recordings dataframe."

 11%|█         | 67/607 [00:22<02:58,  3.02it/s]


KeyboardInterrupt: 

In [None]:
# Persist both frames so the analysis below can restart without re-reading the raw data.
# os.path.join yields a valid path whether or not output_dir ends with a separator.
df_meta.to_csv(os.path.join(output_dir, "df_meta.csv"), index=False)
df_recordings_not_nan.to_pickle(os.path.join(output_dir, "df_recordings_not_nan.pkl"))

# 2. Analyse

In [None]:
# Reload the intermediate artefacts written by the save cell.
# os.path.join yields a valid path whether or not output_dir ends with a separator.
df_meta = pd.read_csv(os.path.join(output_dir, "df_meta.csv"))
# NOTE(review): unpickling can execute arbitrary code - only load a pickle this notebook produced itself.
df_recordings_not_nan = pd.read_pickle(os.path.join(output_dir, "df_recordings_not_nan.pkl"))

# Attach the patient-level columns to every recording; the join must not add or drop rows.
df_combined = df_recordings_not_nan.merge(df_meta, on="patient_id", how="left")
assert df_combined.shape[0] == df_recordings_not_nan.shape[0], "The number of rows in the combined dataframe should be the same as the number of rows in the recordings dataframe."

In [None]:
# Headline counts for the loaded data.
patient_count = len(df_meta)
recording_count = len(df_recordings_not_nan)
# Sum of the first dimension of every "signals" array.
signal_count = df_recordings_not_nan["signals"].apply(lambda signal: signal.shape[0]).sum()

print("Number of patients: {}".format(patient_count))
print("Number of recordings: {}".format(recording_count))
print("Number of signals: {}".format(signal_count))
# Reference value: the count that would result from 18 signals per recording.
print("{}".format(recording_count) + " * 18 = {}".format(recording_count * 18))

In [None]:
# Summary statistics of the patient-level feature and label columns.
df_meta.loc[
    :, ["age", "female", "male", "other", "rosc", "ttm", "outcomes", "cpcs"]
].describe()

In [None]:
# "male" flag against outcome; normalize="all" expresses every cell as a share of all patients.
pd.crosstab(
    index=df_meta["male"],
    columns=df_meta["outcomes"],
    normalize="all",
    margins=True,
)

In [None]:
# "male" flag against outcome; normalize="index" makes every row sum to 1.
pd.crosstab(
    index=df_meta["male"],
    columns=df_meta["outcomes"],
    normalize="index",
    margins=True,
)

In [None]:
# CPC score against outcome; normalize="columns" makes every column sum to 1.
pd.crosstab(
    index=df_meta["cpcs"],
    columns=df_meta["outcomes"],
    normalize="columns",
    margins=True,
)

In [None]:
# Age against ROSC, one point per patient, coloured by outcome.
sns.scatterplot(
    x="age",
    y="rosc",
    hue="outcomes",
    data=df_meta,
)

In [None]:
# Age distribution with explicit decade bin edges 10, 20, ..., 100.
sns.histplot(
    data=df_meta,
    x="age",
    bins=list(range(10, 101, 10)),
    stat="frequency",
)

In [None]:
# Distribution of the largest "hours" value per patient, split by outcome.
# Several columns must be selected from a GroupBy with a list (double brackets):
# the bare-tuple form ['hours', 'outcomes'] was deprecated and raises on pandas >= 2.0.
sns.histplot(
    df_combined.groupby(['patient_id'])[['hours', 'outcomes']].max(),
    x="hours",
    hue="outcomes",
    bins=10,
    stat="frequency",
)

# 3. EEG

## 3.1 Analysis

In [None]:
# Distribution of the per-recording quality score in ten bins.
sns.histplot(
    x="quality_score",
    data=df_recordings_not_nan,
    stat="frequency",
    bins=10,
)

## 3.2 Plots

In [None]:
# Pick one example recording per quality extreme: the row at position 42 of each filtered frame.
low_quality_mask = df_recordings_not_nan["quality_score"].lt(0.1)
high_quality_mask = df_recordings_not_nan["quality_score"].gt(0.99999)

df_plot_bad_quality = df_recordings_not_nan.loc[low_quality_mask].iloc[42]
df_plot_good_quality = df_recordings_not_nan.loc[high_quality_mask].iloc[42]

In [None]:
# Pick one example patient per outcome label: the id at position 42 among the matching patients.
good_patient_id = df_meta.loc[df_meta["outcomes"].eq(0), "patient_id"].values[42]
bad_patient_id = df_meta.loc[df_meta["outcomes"].eq(1), "patient_id"].values[42]

In [None]:
# Start of every recording as a sample offset on a 100-samples-per-second grid,
# built from the first two ":"-separated fields of "times" (hours and minutes);
# any further fields are ignored.
# .assign() returns a new frame instead of writing a column into a frame that may
# be a filtered view of another one (the source of SettingWithCopyWarning), and
# keeps the cell re-runnable.
df_recordings_not_nan = df_recordings_not_nan.assign(
    **{
        "100Hz_seconds": df_recordings_not_nan["times"].apply(
            lambda x: (int(x.split(":")[0]) * 60 * 60 + int(x.split(":")[1]) * 60) * 100
        )
    }
)

# Get channels (taken from the first recording; assumes every recording shares this layout - TODO confirm)
channels = df_recordings_not_nan["channels"].values[0]

# Filter
df_plot_good = df_recordings_not_nan[df_recordings_not_nan.patient_id == good_patient_id]
df_plot_bad = df_recordings_not_nan[df_recordings_not_nan.patient_id == bad_patient_id]
# Last row of each patient's recordings.
df_plot_good_last_hour = df_plot_good.iloc[-1]
df_plot_bad_last_hour = df_plot_bad.iloc[-1]


def _signals_on_timeline(df_patient, n_channels, n_samples):
    """Lay one patient's recordings onto a common (n_channels, n_samples) time axis.

    Every row of ``df_patient`` is written at its "100Hz_seconds" offset. Samples
    not covered by any recording stay NaN, so gaps are left blank when plotted
    instead of showing whatever an uninitialised buffer happened to contain.
    Recordings that start at or beyond ``n_samples`` are skipped, and recordings
    that run past the end are truncated rather than raising a shape mismatch.
    """
    timeline = np.full((n_channels, n_samples), np.nan)
    for _, row in df_patient.iterrows():
        start = row["100Hz_seconds"]
        if start >= n_samples:
            continue
        stop = min(start + row["signals"].shape[1], n_samples)
        # All channels of the recording are copied in one slice assignment.
        timeline[:, start:stop] = row["signals"][:n_channels, : stop - start]
    return timeline


# Prep
# 72 hours at 100 samples per second. Each buffer holds len(channels) x 25,920,000
# float64 values (about 3.7 GB for 18 channels) and the NaN fill touches all of it.
timeline_samples = 72 * 60 * 60 * 100
empty_signals_good = _signals_on_timeline(df_plot_good, len(channels), timeline_samples)
empty_signals_bad = _signals_on_timeline(df_plot_bad, len(channels), timeline_samples)


In [None]:
# First channel of the good-outcome patient across the whole timeline buffer.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(empty_signals_good[0, :], label=channels[0])
ax.set_title(f"Good outcome, patient {good_patient_id}")
ax.legend()
plt.show()

In [None]:
# Last recording of the good-outcome patient, all channels overlaid in one plot.
fig, ax = plt.subplots(figsize=(12, 8))
for channel_idx, channel_name in enumerate(df_plot_good_last_hour["channels"]):
    ax.plot(df_plot_good_last_hour["signals"][channel_idx], label=channel_name)
ax.set_title(f"Good outcome, patient {df_plot_good_last_hour['patient_id']} in hour {df_plot_good_last_hour['hours']}, quality {df_plot_good_last_hour['quality_score']}")
ax.legend()
plt.show()

In [None]:
# Last recording of the poor-outcome patient, all channels overlaid in one plot.
fig, ax = plt.subplots(figsize=(12, 8))
for channel_idx, channel_name in enumerate(df_plot_bad_last_hour["channels"]):
    ax.plot(df_plot_bad_last_hour["signals"][channel_idx], label=channel_name)
ax.set_title(f"Poor outcome, patient {df_plot_bad_last_hour['patient_id']} in hour {df_plot_bad_last_hour['hours']}, quality {df_plot_bad_last_hour['quality_score']}")
ax.legend()
plt.show()

In [None]:
# Last recording of the good-outcome patient, one subplot row per channel.
channel_names = df_plot_good_last_hour["channels"]
fig, axs = plt.subplots(nrows=len(channel_names), ncols=1, figsize=(15, 70))
# Only the first 18 channels are drawn.
for row_idx, channel_name in enumerate(channel_names[:18]):
    channel_ax = axs[row_idx]
    channel_ax.plot(df_plot_good_last_hour["signals"][row_idx], label=channel_name)
    channel_ax.title.set_text(channel_name)
fig.suptitle(f"Good outcome, patient {df_plot_good_last_hour['patient_id']} in hour {df_plot_good_last_hour['hours']}, quality {df_plot_good_last_hour['quality_score']}")
fig.tight_layout(pad=7)

In [None]:
# Last recording of the poor-outcome patient, one subplot row per channel.
channel_names = df_plot_bad_last_hour["channels"]
fig, axs = plt.subplots(nrows=len(channel_names), ncols=1, figsize=(15, 70))
# Only the first 18 channels are drawn.
for row_idx, channel_name in enumerate(channel_names[:18]):
    channel_ax = axs[row_idx]
    channel_ax.plot(df_plot_bad_last_hour["signals"][row_idx], label=channel_name)
    channel_ax.title.set_text(channel_name)
fig.suptitle(f"Bad outcome, patient {df_plot_bad_last_hour['patient_id']} in hour {df_plot_bad_last_hour['hours']}, quality {df_plot_bad_last_hour['quality_score']}")
fig.tight_layout(pad=7.0)

In [None]:
# Example recording with a quality score above 0.99999, all channels overlaid.
fig, ax = plt.subplots(figsize=(12, 8))
for channel_idx, channel_name in enumerate(df_plot_good_quality["channels"]):
    ax.plot(df_plot_good_quality["signals"][channel_idx], label=channel_name)
ax.set_title(f"Good quality ({df_plot_good_quality['quality_score']}), patient {df_plot_good_quality['patient_id']} in hour {df_plot_good_quality['hours']}")
ax.legend()
plt.show()

In [None]:
# Example recording with a quality score below 0.1, all channels overlaid.
fig, ax = plt.subplots(figsize=(12, 8))
for channel_idx, channel_name in enumerate(df_plot_bad_quality["channels"]):
    ax.plot(df_plot_bad_quality["signals"][channel_idx], label=channel_name)
ax.set_title(f"Bad quality ({df_plot_bad_quality['quality_score']}), patient {df_plot_bad_quality['patient_id']} in hour {df_plot_bad_quality['hours']}")
ax.legend()
plt.show()