# IMPORTS

In [11]:
import pandas as pd
from functools import reduce
import numpy as np
import datetime as dt

# CONSTANTS

##### USING TWO RELEVANT PARAMETERS

In [12]:
# Locations of the two input CSV files read by this notebook.
# These are absolute paths on one specific Windows machine; edit them before
# running the notebook anywhere else.
# Airway.csv is read into `airway_df` and ETCO2.csv into `etco2_df`.
AIRWAY_PATH = r"C:\Users\Dana\Documents\Final_Project\Data\processed\categorial_files\Airway.csv"
ETCO2_PATH = r"C:\Users\Dana\Documents\Final_Project\Data\processed\numeric_files\ETCO2.csv"

In [13]:
# Read both input tables in one pass; the order of the paths matches the
# order of the unpacked names.
airway_df, etco2_df = (
    pd.read_csv(csv_path) for csv_path in (AIRWAY_PATH, ETCO2_PATH)
)

##### DEFINE PMV 

In [1]:
# Minimum number of whole days between a subject's first and last ETCO2
# reading (only readings >= 2 are counted) for the subject to be kept.
# NOTE(review): presumably this is the PMV threshold named in the header above — confirm.
MIN_DAYS_WITH_ETCO2 = 7

# CREATE GROUPED DATA OF THE RELEVANT PARAMETERS

In [14]:
# Outer-join the airway and ETCO2 tables on subject key and timestamp, so a
# row is kept even when only one of the two parameters was recorded at that time.
# The overlapping "value" columns receive pandas' default _x / _y suffixes.
full_df = airway_df.merge(etco2_df, how="outer", on=["key", "time"])

In [15]:
# Keep only the four columns used downstream and give the merge-suffixed
# value columns descriptive names (x = airway table, y = ETCO2 table).
full_df = (
    full_df
    .loc[:, ["key", "value_x", "value_y", "time"]]
    .rename(columns={"value_x": "airway_value", "value_y": "etco2_value"})
)

In [16]:
# Normalise dtypes: timestamps as datetime64, subject key as integer and
# ETCO2 readings as float.
full_df["time"] = pd.to_datetime(full_df["time"])
full_df = full_df.astype({"key": int, "etco2_value": float})

In [17]:
# Readings below this value are ignored when measuring how long ETCO2 was recorded.
MIN_VALID_ETCO2 = 2

grp_df = full_df[full_df["etco2_value"] >= MIN_VALID_ETCO2].sort_values("time")

# First and last valid reading per subject. A single groupby restricted to the
# "time" column replaces two groupbys over every column; rows are time-sorted,
# so first/last are the earliest/latest timestamps of each subject.
time_by_key = grp_df.groupby("key")["time"]
start_time = time_by_key.first()
end_time = time_by_key.last()

# Keep subjects whose valid readings span at least MIN_DAYS_WITH_ETCO2 whole days.
good_keys = (end_time - start_time).dt.days >= MIN_DAYS_WITH_ETCO2
good_keys_list = good_keys[good_keys].index.tolist()
full_df_filtered = full_df[full_df["key"].isin(good_keys_list)]

# ...and, among those, only subjects with at least one recorded airway value.
keys_with_airway_value = (
    full_df_filtered.loc[full_df_filtered["airway_value"].notna(), "key"]
    .unique()
    .tolist()
)
full_df_filtered = full_df_filtered[full_df_filtered["key"].isin(keys_with_airway_value)]
full_df_filtered = full_df_filtered.sort_values("time")

# CREATE LABELS

### REGARDING TRACHEOSTOMY SUBJECTS

In [40]:
# Remove the "Unnamed: 0" column when it exists. An explicit membership check
# replaces a bare ``except`` that silenced every possible error.
# Note: the message is printed when the column is ABSENT (nothing to drop).
if "Unnamed: 0" in full_df_filtered.columns:
    full_df_filtered = full_df_filtered.drop(columns="Unnamed: 0")
else:
    print("dropped")
full_df_filtered = full_df_filtered.sort_values("time")

# Row filters that do not depend on the subject are built once, outside the loop.
is_trach_row = full_df_filtered["airway_value"] == "Trach"
has_positive_etco2 = full_df_filtered["etco2_value"] > 0
trach_keys_list = list(full_df_filtered.loc[is_trach_row, "key"].unique())

# For every subject with a "Trach" record: whole days between the first
# "Trach" record and the last positive ETCO2 reading (rows are time-sorted).
etco2_dur = []
for key in trach_keys_list:
    is_subject_row = full_df_filtered["key"] == key
    etco2_times = full_df_filtered.loc[is_subject_row & has_positive_etco2, "time"]
    if etco2_times.empty:
        # No positive ETCO2 reading at all for this subject: count it as 0 days.
        etco2_dur.append(0)
        continue
    # A "Trach" row always exists here because the key list was built from such rows.
    first_trach_time = full_df_filtered.loc[is_subject_row & is_trach_row, "time"].iloc[0]
    etco2_dur.append((etco2_times.iloc[-1] - first_trach_time).days)

dropped


In [49]:
# One row per tracheostomy subject, indexed by key, holding the number of days
# ETCO2 kept being recorded after the first "Trach" record.
trach_df = pd.DataFrame(
    {"num_days_after_trach": etco2_dur, "key": trach_keys_list}
).set_index("key")

# Discard subjects whose last ETCO2 reading precedes the tracheostomy (negative
# duration). The explicit copy makes the result an independent frame, so the
# label assignment done on it later does not raise SettingWithCopyWarning.
trach_df = trach_df[trach_df["num_days_after_trach"] >= 0].copy()



#### 'Legitimate tracheostomy placement' VS 'Illegitimate tracheostomy placement'

In [None]:
# Label by how long ETCO2 was still recorded after the first "Trach" record:
# 5 days or more -> "positive_trach", fewer -> "negative_trach".
days_after_trach = trach_df["num_days_after_trach"]
reached_threshold = days_after_trach >= 5
below_threshold = days_after_trach < 5
trach_df.loc[reached_threshold, "label"] = "positive_trach"
trach_df.loc[below_threshold, "label"] = "negative_trach"

### REGARDING ENDOTRACHEAL SUBJECTS

In [62]:
# Subjects with at least one "Tracheal tube" airway record.
is_tube_row = full_df_filtered["airway_value"] == "Tracheal tube"
tube_keys = full_df_filtered.loc[is_tube_row, "key"].unique().tolist()

# All rows of those subjects that carry a positive ETCO2 reading, in time order.
from_tube_subject = full_df_filtered["key"].isin(tube_keys)
has_positive_etco2 = full_df_filtered["etco2_value"] > 0
df_tube = full_df_filtered[from_tube_subject & has_positive_etco2].sort_values("time")

# Time elapsed since the same subject's previous reading (NaT for the first one).
df_tube["diff"] = df_tube.groupby("key")["time"].diff()

In [84]:
# For each subject keep only the readings taken up to (and including) the
# first "Trach" record, i.e. the endotracheal period before tracheostomy
# placement. Subjects without a "Trach" record keep all of their readings.
tube_df_list = []
# sort=False visits subjects in order of first appearance, the same order as
# df_tube["key"].unique(); each group keeps df_tube's row order.
for key, subject_rows in df_tube.groupby("key", sort=False):
    # Look the timestamp up by column label rather than by column position.
    trach_times = subject_rows.loc[subject_rows["airway_value"] == "Trach", "time"]
    if trach_times.empty:
        temp_tube_df = subject_rows
    else:
        trach_date = trach_times.iloc[0]
        temp_tube_df = subject_rows[subject_rows["time"] <= trach_date]
    tube_df_list.append(temp_tube_df)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when there are no endotracheal subjects.
df_tube_filtered = pd.concat(tube_df_list) if tube_df_list else df_tube.iloc[:0]

In [89]:
# Longest gap, in whole days, between consecutive readings of each subject.
# The "diff" column is selected BEFORE aggregating so that max() is not also
# evaluated on unrelated columns (e.g. the mixed text/NaN "airway_value").
df_tube_max_diff = (
    df_tube_filtered.groupby("key")["diff"].max().dt.days.to_frame()
)

#### 'Successful extubation' VS 'Unsuccessful extubation'

In [101]:
# Label by the longest gap between consecutive readings:
# 2 days or more -> "positive_tube", shorter -> "negative_tube".
# Subjects whose gap is NaN match neither condition and stay unlabelled.
longest_gap_days = df_tube_max_diff["diff"]
gap_reached = longest_gap_days >= 2
gap_not_reached = longest_gap_days < 2
df_tube_max_diff.loc[gap_reached, "label"] = "positive_tube"
df_tube_max_diff.loc[gap_not_reached, "label"] = "negative_tube"

# CREATE CLASSIFICATION LABELS

#### 'Positive example' VS 'Negative example'

In [124]:
# Numeric code for every textual label. Tracheostomy labels get the higher
# codes, so a per-subject max() prefers them over endotracheal labels.
labels_to_num_dict = {
    "positive_trach": 4,
    "negative_trach": 3,
    "negative_tube": 2,
    "positive_tube": 1,
}

# Binary classification target derived from the numeric code.
labels_to_clf_dict = {
    4: "yes",
    2: "yes",
    3: "no",
    1: "no",
}

In [125]:
# Stack the tracheostomy and endotracheal labels into one frame indexed by
# subject key; a subject can appear once per source frame.
label_frames = [trach_df.loc[:, ["label"]], df_tube_max_diff.loc[:, ["label"]]]
labeled_df = pd.concat(label_frames)

# Translate the textual labels into their numeric codes.
labeled_df_numbers = labeled_df.replace({"label": labels_to_num_dict})

In [126]:
# Collapse to one row per subject by keeping the highest label code. Because
# tracheostomy codes (3, 4) are higher than endotracheal codes (1, 2), a subject
# present in both groups ends up with the tracheostomy label.
labeled_df_numbers = labeled_df_numbers.groupby("key")[["label"]].agg("max")

Unnamed: 0_level_0,label
key,Unnamed: 1_level_1
0,2
11,2
20,2
26,4
29,2
43,1
45,2
48,1
49,4
59,1


In [127]:
# Add the binary "yes"/"no" target by mapping each numeric code through
# labels_to_clf_dict; the numeric "label" column itself is left untouched.
labeled_df_numbers["label_clf"] = labeled_df_numbers["label"].replace(labels_to_clf_dict)

In [130]:
# Class balance of the binary target (value_counts omits missing labels by default).
labeled_df_numbers.loc[:, "label_clf"].value_counts()

yes    534
no     195
Name: label_clf, dtype: int64

In [132]:
# Persist the per-subject labels (index = subject key) for downstream notebooks.
# Absolute path on one specific Windows machine; edit before running elsewhere.
LABELS_PATH = r"C:\Users\Dana\Documents\Final_Project\Data\processed\labels.csv"
labeled_df_numbers.to_csv(LABELS_PATH)