In [5]:
EXTERNAL_FOLDER = "../data/external"
INTERIM_DATA_FOLDER = "../data/interim"
PROCESSED_DATA_FOLDER = "../data/processed"

In [6]:
import pandas as pd
import plotly.graph_objects as go

In [7]:
glucose_norm = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/glucose_norm.csv")
insulin_inputs_in_glucose_pacients = pd.read_csv(f"{INTERIM_DATA_FOLDER}/insulin_inputs_in_glucose_pacients.csv")
bmi_glucose = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/bmi_glucose.csv")
d_items = pd.read_csv(f"{EXTERNAL_FOLDER}/d_items.csv")
patients = pd.read_csv(f"{EXTERNAL_FOLDER}/patients.csv")

bmi_norm = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/bmi_norm.csv")
ck_norm = pd.read_csv(f"{PROCESSED_DATA_FOLDER}/ck_norm.csv")
insuline_in_ck_patients = pd.read_csv(f"{INTERIM_DATA_FOLDER}/insulin_inputs_in_ck_pacients.csv")

In [8]:
glucose_norm = (
    glucose_norm
    .merge(d_items, on="itemid", how="left")
    [["subject_id", "hadm_id", "stay_id", "charttime", "valuenum", "valueuom", "itemid", "label", "category"]]
)

glucose_norm['chartdate'] = pd.to_datetime(glucose_norm['charttime'])
glucose_norm['chartdate'] = glucose_norm['chartdate'].dt.strftime('%Y-%m-%d')

bmi_glucose = bmi_glucose[["subject_id", "chartdate", "result_name", "result_value"]]
bmi_glucose = bmi_glucose.groupby("subject_id", as_index=False).first()

glucose = (
    glucose_norm
    .merge(bmi_glucose, on=["subject_id"], how="left")  # Aqui o mais correto deveria ser on=["subject_id", "chartdate"], mas o join dessa forma não funciona, possivelmente por haver inconsistências na definição das datas.
    .dropna(subset=["result_value"])
)
glucose["charttime"] = pd.to_datetime(glucose["charttime"]) 
glucose = glucose.sort_values(['stay_id', 'charttime'])

insulin = (
    insulin_inputs_in_glucose_pacients[["subject_id", "hadm_id", "stay_id", "starttime", "endtime", "itemid", "amount", "amountuom", "rate", "rateuom", "ordercategoryname", "ordercategorydescription", "totalamount", "totalamountuom"]]
)
insulin["charttime"] = pd.to_datetime(insulin["starttime"]) 
insulin = insulin.sort_values(['stay_id', 'charttime'])

patients["died"] = patients["dod"].isna()
patients = patients[["subject_id", "died"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


* Rule 1: A glucose reading should precede a regular insulin administration by up to 90 minutes. Tis basis
for this time window was derived from the diabetic ketoacidosis guidelines which recommend measuring
glucose values every 60minutes while receiving an insulin infusion10. An additional 30minutes were added,
90minutes in total, to this interval to account for the time it may take for providers to register the event. Tese
time intervals are within the recommendations11.

* Rule 2: When a regular insulin event was not preceded, but instead followed, by a blood glucose measurement, this glucose reading was paired with the regular insulin administration if they were recorded within
90minutes of each other.

* Rule 3: Sometimes a regular insulin infusion/bolus appeared between 2 blood glucose measurements. In this
case, the higher glucose value was paired with the regular insulin entry as long as they were entered within
90minutes of each other.

* Rule 4: When a regular insulin bolus occurred very close to a regular insulin infusion rate, it was assumed
that the patient was given a bolus and then commenced on an infusion. Both regular insulin entries were
paired with the preceding blood glucose measurement, or the posterior glucose reading in case its value was
higher than the preceding blood glucose and was entered within 90minutes of the insulin dose.

* Rule 5: No glucose values below 90mg/dL were paired with a subsequent regular insulin bolus or infusion.
No clinician will treat this low of a blood glucose value with a regular insulin bolus or infusion.

In [9]:
glucose_insulin = (
    glucose
    .merge(insulin, on="stay_id", how="left")
)

glucose_insulin = glucose_insulin[
    (glucose_insulin["charttime_x"] - glucose_insulin["charttime_y"] >= pd.Timedelta('-90 minutes')) &
    (glucose_insulin["charttime_x"] - glucose_insulin["charttime_y"] < pd.Timedelta('90 minutes'))
]

In [10]:
glucose_insulin = glucose_insulin[glucose_insulin['valuenum'] >= 90]

In [11]:
def is_intravenous(row):
    if "Non IV" in row['insulin_ordercategoryname']:
        return 0
    return 1

glucose_insulin = glucose_insulin[
    [
        "subject_id_x", 
        "hadm_id_x", 
        "stay_id", 
        "charttime_x", 
        "valuenum", 
        "valueuom", 
        "label", 
        "category", 
        "result_value", 
        "starttime", 
        "endtime", 
        "itemid_y", 
        "amount", 
        "amountuom", 
        "rate", 
        "rateuom", 
        "ordercategoryname", 
        "ordercategorydescription", 
        "totalamount", 
        "totalamountuom"
    ]
]

glucose_insulin = glucose_insulin.rename(
    columns={
        "subject_id_x": "subject_id", 
        "hadm_id_x": "hadm_id",
        "charttime_x": "glucose_charttime",
        "valuenum": "glucose_value", 
        "valueuom": "glucose_valueuom", 
        "result_value": "BMI (kg/m2)", 
        "starttime": "insulin_starttime",
        "endtime": "insulin_endtime",
        "itemid_y": "insulin_itemid",
        "amount": "insulin_amount", 
        "amountuom": "insulin_amountuom", 
        "rate": "insulin_rate", 
        "rateuom": "insulin_rateuom", 
        "ordercategoryname": "insulin_ordercategoryname", 
        "ordercategorydescription": "insulin_ordercategorydescription", 
        "totalamount": "insuline_totalamount", 
        "totalamountuom": "insuline_totalamountuom",
    }
)
glucose_insulin["is_overweight"] = (glucose_insulin["BMI (kg/m2)"] >= 30).astype(int)
glucose_insulin['is_intravenous'] = glucose_insulin.apply(is_intravenous, axis=1)

In [12]:
glucose_insulin.to_csv(f"{INTERIM_DATA_FOLDER}/paired_glucose_insulin.csv", index=False)

In [13]:
ck_norm = ck_norm.rename(
    columns={
        "charttime": "ck_charttime", 
        "valuenum": "CK (IU/L)",
    }
)[["stay_id", "ck_charttime", "CK (IU/L)"]]

ck_norm["ck_charttime"] = pd.to_datetime(ck_norm["ck_charttime"]) 

In [14]:
glucose_insulin = (
    glucose_insulin.merge(
        ck_norm, 
        on="stay_id", how="left"
    )
)

glucose_insulin = glucose_insulin[
    (glucose_insulin["glucose_charttime"] - glucose_insulin["ck_charttime"] >= pd.Timedelta('-4 hours')) &
    (glucose_insulin["glucose_charttime"] - glucose_insulin["ck_charttime"] < pd.Timedelta('4 hours'))
]

In [15]:
pd.set_option('display.max_columns', None)
glucose_insulin = glucose_insulin.merge(patients, on="subject_id", how="left")

In [16]:
glucose_insulin.to_csv(f"{INTERIM_DATA_FOLDER}/paired_glucose_insulin_ck_death.csv", index=False)

In [17]:
glucose_insulin["died"] = glucose_insulin["died"].astype(int)

In [18]:
import plotly.express as px

def plot_xy_distribution(df, col_1, col_2, num_bins=10):
    x = df[[col_1, col_2]]

    bin_edges = pd.qcut(x[col_1], num_bins, precision=1, duplicates='drop')
    grouped_df = x.groupby(bin_edges).mean()
    df_plot = pd.DataFrame({col_1: [interval.mid for interval in grouped_df.index], f'mean_{col_2}': grouped_df[col_2]})
    fig = px.line(df_plot, x=col_1, y=f'mean_{col_2}')
    fig.update_layout(
        xaxis_title=col_1,
        yaxis_title=f"Mean of {col_2}",
        title=f"Mean of {col_2} vs {col_1} (Binned)"
    )
    fig.show()

In [19]:
glucose_insulin["high_ck"] = (glucose_insulin["CK (IU/L)"] >= 200).astype(int)

(
    glucose_insulin
    .drop_duplicates(subset=["subject_id"])
    .groupby(["is_intravenous", "is_overweight", "high_ck"], as_index=False)["died"].mean()
)

Unnamed: 0,is_intravenous,is_overweight,high_ck,died
0,0,0,0,0.451613
1,0,0,1,0.555556
2,0,1,0,0.457399
3,0,1,1,0.592233
4,1,0,0,0.521212
5,1,0,1,0.608247
6,1,1,0,0.580952
7,1,1,1,0.662338


In [20]:
import plotly.express as px
import pandas as pd

import plotly.express as px
import pandas as pd

def plot_xy_distribution(df, col_1, col_2, num_bins=10):
    x = df[[col_1, col_2]].sort_values(by=col_1)

    x['cumulative_percentage'] = x[col_1].rank(pct=True)
    x['bin'] = pd.cut(x['cumulative_percentage'], bins=num_bins, labels=False, include_lowest=True)

    grouped_df = x.groupby('bin').mean()
    df_plot = pd.DataFrame({col_1: grouped_df[col_1], f'mean_{col_2}': grouped_df[col_2]})
    
    fig = px.line(df_plot, x=col_1, y=f'mean_{col_2}')
    fig.update_layout(
        xaxis_title=col_1,
        yaxis_title=f"Mean of {col_2}",
        title=f"Mean of {col_2} vs {col_1} (Binned)"
    )
    fig.show()

In [21]:
col_1 = "CK (IU/L)"
col_2 = "died"
plot_xy_distribution(glucose_insulin, col_1, col_2)

In [29]:
col_1 = "BMI (kg/m2)"
col_2 = "is_intravenous"
plot_xy_distribution(glucose_insulin[glucose_insulin["died"] == 1], col_1, col_2)

In [37]:
glucose_insulin[["BMI (kg/m2)", "is_intravenous"]].corr()

Unnamed: 0,BMI (kg/m2),is_intravenous
BMI (kg/m2),1.0,-0.034169
is_intravenous,-0.034169,1.0


In [38]:
import scipy.stats as stats

filtered_data = glucose_insulin[["BMI (kg/m2)", "is_intravenous"]]
corr_coeff, p_value = stats.pearsonr(filtered_data["BMI (kg/m2)"], filtered_data["is_intravenous"])

print("Correlation Coefficient:", corr_coeff)
print("P-value:", p_value)

Correlation Coefficient: -0.03416911268595384
P-value: 0.005890699757027705


In [61]:
col_1 = "CK (IU/L)"
col_2 = "is_intravenous"

filtered_data = glucose_insulin[glucose_insulin["is_overweight"] == 0][[col_1, col_2]]
corr_coeff, p_value = stats.pearsonr(filtered_data[col_1], filtered_data[col_2])

print("Correlation Coefficient:", corr_coeff)
print("P-value:", p_value)

Correlation Coefficient: 0.07350583199757205
P-value: 9.181435753008023e-06


In [60]:
col_1 = "CK (IU/L)"
col_2 = "is_intravenous"

filtered_data = glucose_insulin[glucose_insulin["is_overweight"] == 1][[col_1, col_2]]
corr_coeff, p_value = stats.pearsonr(filtered_data[col_1], filtered_data[col_2])

print("Correlation Coefficient:", corr_coeff)
print("P-value:", p_value)

Correlation Coefficient: 0.010694147464575003
P-value: 0.5675399366218201


In [55]:
col_1 = "CK (IU/L)"
col_2 = "is_intravenous"

filtered_data = glucose_insulin[(glucose_insulin["is_overweight"] == 1) & (glucose_insulin["died"] == 1)][[col_1, col_2]]
corr_coeff, p_value = stats.pearsonr(filtered_data[col_1], filtered_data[col_2])

print("Correlation Coefficient:", corr_coeff)
print("P-value:", p_value)

Correlation Coefficient: -0.04325374645054212
P-value: 0.08828337603658072


In [85]:
col_1 = "CK (IU/L)"
col_2 = "is_intravenous"
plot_xy_distribution(glucose_insulin[glucose_insulin["is_overweight"] == 1], col_1, col_2)

In [89]:
col_1 = "glucose_value"
col_2 = "is_intravenous"
plot_xy_distribution(glucose_insulin[glucose_insulin["is_overweight"] == 1], col_1, col_2)