# 0. Initialize

In [None]:
from helper_code import *
from team_code import get_features, get_outcome

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd

In [None]:
# Root folder that is scanned for patient data (passed to find_data_folders and
# load_challenge_data below).
data_folder = "a_data/00_raw/training_subset"
# Forwarded to get_features(max_hours=...); presumably the latest recording hour
# that is still used for feature extraction -- confirm in team_code.get_features.
max_hours = 72
# Forwarded to get_features(min_quality=...); presumably the minimum recording
# quality score that is still used -- confirm in team_code.get_features.
min_quality = 0.0


# 1. Get and prep data

In [None]:
# Discover the patients available under the data folder and stop right away
# when there is nothing to load.
patient_ids = find_data_folders(data_folder)
num_patients = len(patient_ids)
if not num_patients:
    raise FileNotFoundError('No data was provided.')

# Per-patient accumulators; entries are appended in the order of `patient_ids`.
features, recordings, outcomes, cpcs = [], [], [], []

for i, patient_id in enumerate(patient_ids):
    # Load this patient's metadata and recordings.
    patient_metadata, recording_metadata, recording_data = load_challenge_data(
        data_folder, patient_id
    )

    # Build one row per recording and annotate it with the per-recording
    # quality score and hour taken from the recording metadata, plus the
    # owning patient's id.
    df_recordings = pd.DataFrame(
        recording_data, columns=["signals", "frequencies", "channels"]
    ).assign(
        quality_score=get_quality_scores(recording_metadata),
        hours=get_hours(recording_metadata),
        patient_id=patient_id,
    )
    recordings.append(df_recordings)

    # Patient-level features, returned as a dict so they can become table columns.
    current_features = get_features(
        patient_metadata,
        recording_metadata,
        recording_data,
        return_as_dict=True,
        max_hours=max_hours,
        min_quality=min_quality,
    )
    features.append(current_features)

    # Patient-level labels: outcome and CPC.
    current_outcome = get_outcome(patient_metadata)
    outcomes.append(current_outcome)
    current_cpc = get_cpc(patient_metadata)
    cpcs.append(current_cpc)

# Per-patient table: one row of extracted features per patient, extended with
# the patient id and both labels.
df_meta = pd.DataFrame(features).assign(
    patient_id=patient_ids,
    outcomes=np.vstack(outcomes),
    cpcs=np.vstack(cpcs),
)

# Per-recording table across all patients; rows without a signal are dropped
# before the patient-level columns are joined on.
df_recordings = pd.concat(recordings, ignore_index=True)
has_signal = df_recordings["signals"].notna()
df_recordings_not_nan = df_recordings.loc[has_signal]
df_combined = pd.merge(df_recordings_not_nan, df_meta, on="patient_id", how="left")

# A left merge only gains rows when a patient_id occurs more than once in
# df_meta, so equal lengths confirm every recording matched at most one patient.
assert len(df_combined) == len(df_recordings_not_nan), (
    "The number of rows in the combined dataframe should be the same as the number of rows in the recordings dataframe."
)

# 2. Analyse

In [None]:
# Preview the first three rows of the per-patient table.
df_meta.head(3)

In [None]:
# Summary statistics of the per-patient table.
df_meta.describe()

In [None]:
# Summary statistics of the per-recording table (recordings joined with patient data).
df_combined.describe()

In [None]:
# Report the dataset size: patients, recordings that contain a signal, and the
# total number of signal rows (first axis of each `signals` array).
num_recordings = df_recordings_not_nan.shape[0]
num_signals = df_recordings_not_nan["signals"].apply(lambda sig: sig.shape[0]).sum()

print(f"Number of patients: {df_meta.shape[0]}")
print(f"Number of recordings: {num_recordings}")
print(f"Number of signals: {num_signals}")
# Cross-check against the line above; 18 is presumably the expected number of
# channels per recording -- confirm against the dataset description.
print(f"{num_recordings} * 18 = {num_recordings * 18}")

In [None]:
# Positional index (row of df_combined) of the recording shown in the plots below.
i = 0
df_plot = df_combined.iloc[i]

In [None]:
# Overlay every channel of the selected recording in a single figure, using
# the channel name as the legend entry.
plot_channels = df_plot["channels"]
plot_signals = df_plot["signals"]

plt.figure(figsize=(12, 8))
for idx, c in enumerate(plot_channels):
    plt.plot(plot_signals[idx], label=c)
plt.legend()
plt.show()

In [None]:
# Plot every channel of the selected recording in its own subplot, stacked
# vertically and titled with the channel name.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. With the
# default (squeeze=True) a recording with exactly one channel yields a bare
# Axes object, and `axs[idx]` below would raise a TypeError. Taking column 0
# gives a 1-D array with one Axes per channel, which matches what the default
# returns for two or more channels.
fig, axs = plt.subplots(len(df_plot["channels"]), 1, figsize=(12, 50), squeeze=False)
axs = axs[:, 0]
for idx, c in enumerate(df_plot["channels"]):
    axs[idx].plot(df_plot["signals"][idx], label=c)
    axs[idx].set_title(c)
# Add padding so the subplot titles do not collide with the neighbouring axes.
fig.tight_layout(pad=1.0)
