In [None]:
import sys
import os
import yaml
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import plotly.graph_objects as go
import sklearn.preprocessing
import sklearn.cluster

import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None

# Add ../python/ to the module search path.
sys.path.append('../python/')

# Plotly Dash doesn't display error messages well, so we use logging.
import logging
# Write log records of level DEBUG and above to dash_logs.log.
logging.basicConfig(level=logging.DEBUG, filename="dash_logs.log")

# Cohort names; used as the keys of dict_pd_df_cohort_tables and as dropdown options.
LIST_COHORT_NAMES = ["Melbourne", "Adelaide"]

In [None]:
# Load the analysis parameters (paths, analyte list, column-name prefix) from YAML.
# The context manager closes the file handle as soon as parsing finishes.
with open("../parameters/july_2024_data_parameters.yaml", "r") as file_parameters:
    dict_parameters = yaml.safe_load(file_parameters)

In [None]:
# Read the estimated-concentrations table from the configured output directory.
# Passing the path (rather than an opened file object) lets pandas open and
# close the file itself, so no handle is leaked.
pd_df_estimated_concentrations = pd.read_csv(
    os.path.join(
        dict_parameters["output directory path"],
        dict_parameters["estimated concentrations file name"],
    )
)

In [None]:
# Display the table that was just loaded.
pd_df_estimated_concentrations

Separate out the Adelaide and Melbourne samples into separate dataframes

In [None]:
# Split the rows into cohorts by the format of the sample name.
#   Melbourne: 3-4 digits, then a space or underscore, then a letter.
#   Adelaide:  4 digits, then "A_".
# The patterns are raw strings so that "\d" is not parsed as a (invalid) string
# escape, and na=False treats a missing sample name as "no match" instead of
# producing a mask that cannot be used for boolean indexing.
pd_series_sample_names = pd_df_estimated_concentrations["sample name annotations"]
dict_pd_df_cohort_tables = {}
dict_pd_df_cohort_tables["Melbourne"] = pd_df_estimated_concentrations[
    pd_series_sample_names.str.contains(r'\d{3,4}[ _][A-Za-z]', regex=True, na=False)
]
dict_pd_df_cohort_tables["Adelaide"] = pd_df_estimated_concentrations[
    pd_series_sample_names.str.contains(r'\d{4}A_.*', regex=True, na=False)
]

In [None]:
# Display the rows assigned to the Melbourne cohort.
dict_pd_df_cohort_tables["Melbourne"]

In [None]:
# Display the rows assigned to the Adelaide cohort.
dict_pd_df_cohort_tables["Adelaide"]

Discard unnecessary columns

In [None]:
# Keep the sample name plus one estimated-concentration column per analyte.
list_columns_to_keep = ["sample name annotations"] + [
    dict_parameters["column name prefix for estimated concentrations"] + str_analyte
    for str_analyte in dict_parameters["list of analytes"]
]
for cohort_name in LIST_COHORT_NAMES:
    pd_df_cohort = dict_pd_df_cohort_tables[cohort_name]
    dict_pd_df_cohort_tables[cohort_name] = pd_df_cohort[list_columns_to_keep]
# Show the trimmed Adelaide table as a check.
dict_pd_df_cohort_tables["Adelaide"]

Take the mean if there are multiple reads per sample

In [None]:
# Collapse repeated reads: one row per sample name, holding the mean of each column.
for cohort_name in LIST_COHORT_NAMES:
    pd_groupby_sample = dict_pd_df_cohort_tables[cohort_name].groupby("sample name annotations")
    dict_pd_df_cohort_tables[cohort_name] = pd_groupby_sample.mean().reset_index()

In [None]:
# Display the Adelaide table after averaging repeated reads.
dict_pd_df_cohort_tables["Adelaide"]

In [None]:
# Rewrite "_D2" as "-D2" in the Adelaide sample names.
# NOTE(review): presumably done so that the later split on "_" / " " does not
# break the D2 suffix off into its own field — confirm.
pd_series_adelaide_sample_names = dict_pd_df_cohort_tables["Adelaide"]["sample name annotations"]
dict_pd_df_cohort_tables["Adelaide"]["sample name annotations"] = (
    pd_series_adelaide_sample_names.str.replace("_D2", "-D2")
)

In [None]:
# Display the Adelaide table after the "_D2" -> "-D2" rename.
dict_pd_df_cohort_tables["Adelaide"]

Extract the patient number and time code from the sample name

In [None]:
# Split each (whitespace-stripped) sample name on "_" or " " into the patient
# number and the time code, then drop the original sample-name column.
for cohort_name in LIST_COHORT_NAMES:
    pd_df_cohort = dict_pd_df_cohort_tables[cohort_name]
    pd_df_name_parts = (
        pd_df_cohort["sample name annotations"].str.strip().str.split(r"[_ ]", expand = True)
    )
    # The split must yield exactly two fields per name for this assignment to succeed.
    pd_df_cohort[["patient number", "time code"]] = pd_df_name_parts
    dict_pd_df_cohort_tables[cohort_name] = pd_df_cohort.drop(columns = ["sample name annotations"])

Reorder the columns

In [None]:
# Move the identifying columns to the front; every other column keeps its relative order.
list_first_columns = ["patient number", "time code"]
for cohort_name in LIST_COHORT_NAMES:
    pd_df_cohort = dict_pd_df_cohort_tables[cohort_name]
    list_remaining_columns = [
        column_name for column_name in pd_df_cohort.columns
        if column_name not in list_first_columns
    ]
    dict_pd_df_cohort_tables[cohort_name] = pd_df_cohort[list_first_columns + list_remaining_columns]

In [None]:
# Display the Adelaide table with the patient number and time code columns first.
dict_pd_df_cohort_tables["Adelaide"]

Check the number of unique patients that we have

In [None]:
# Shape of the array of distinct Adelaide patient numbers: a one-element tuple holding the count.
dict_pd_df_cohort_tables["Adelaide"]["patient number"].unique().shape

In [None]:
# Shape of the array of distinct Melbourne patient numbers: a one-element tuple holding the count.
dict_pd_df_cohort_tables["Melbourne"]["patient number"].unique().shape

Discard all Melbourne patients who do not have all 5 time points. This is only for the preliminary analysis; I will come up with better ways to handle the missing data later.

In [None]:
# Keep only Melbourne patients that have at least 5 rows, then renumber the rows from 0.
pd_df_melbourne = dict_pd_df_cohort_tables["Melbourne"]
pd_series_rows_per_patient = (
    pd_df_melbourne.groupby("patient number")["patient number"].transform('count')
)
dict_pd_df_cohort_tables["Melbourne"] = (
    pd_df_melbourne[pd_series_rows_per_patient >= 5].reset_index(drop = True)
)

Check the number of unique patients after trimming

In [None]:
# Shape of the array of distinct Melbourne patient numbers remaining after the filter.
dict_pd_df_cohort_tables["Melbourne"]["patient number"].unique().shape

Map time codes to integers

In [None]:
# Ordinal position of each time code within its cohort's sampling schedule.
dict_melbourne_time_code_mapping = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
}
dict_adelaide_time_code_mapping = {
    'Pre-1hr': 1,
    '15min': 2,
    '0.5hr': 3,
    '1hr': 4,
    '2hr': 5,
    '4hr': 6,
    '8hr': 7,
}

# Time codes that are absent from a mapping become NaN in "time int".
dict_pd_df_cohort_tables["Melbourne"]["time int"] = (
    dict_pd_df_cohort_tables["Melbourne"]["time code"].map(dict_melbourne_time_code_mapping)
)
dict_pd_df_cohort_tables["Adelaide"]["time int"] = (
    dict_pd_df_cohort_tables["Adelaide"]["time code"].map(dict_adelaide_time_code_mapping)
)

# Identifying columns that are placed first in every cohort table.
list_first_columns = [
    "patient number", "time code", "time int"
]
for cohort_name in LIST_COHORT_NAMES:
    pd_df_sorted = (
        dict_pd_df_cohort_tables[cohort_name]
        .sort_values(by = ["patient number", "time int"])
        # drop=True discards the pre-sort index. Without it the old index is kept
        # as a stray "index" column, which would end up among the pivoted value
        # columns and make this cell fail when it is run a second time.
        .reset_index(drop = True)
    )
    dict_pd_df_cohort_tables[cohort_name] = pd_df_sorted[
        list_first_columns +
        [column_name for column_name in pd_df_sorted.columns if column_name not in list_first_columns]
    ]

In [None]:
# Display the Adelaide table, now sorted by patient number and time int.
dict_pd_df_cohort_tables["Adelaide"]

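Calculate, for each patient, the difference in estimated concentration between each time point and that patient's baseline time point (−1 hour). The baseline is the second time point for Melbourne and the first time point for Adelaide.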

In [None]:
# Display the Adelaide table before the difference columns are added.
dict_pd_df_cohort_tables["Adelaide"]

In [None]:
# Display the Melbourne table before the difference columns are added.
dict_pd_df_cohort_tables["Melbourne"]

In [None]:
# Preview of the per-patient difference for one analyte (IFN-gamma, Adelaide):
# each patient's values minus that patient's first value at time int 1.
dict_pd_df_cohort_tables["Adelaide"].groupby("patient number").apply(
    lambda pd_df_patient: (
        pd_df_patient["estimated concentration " + "IFN-gamma"]
        - pd_df_patient.loc[
            pd_df_patient["time int"] == 1, "estimated concentration " + "IFN-gamma"
        ].iloc[0]
    ),
    include_groups = False,
)


In [None]:
# "time int" of the baseline sample in each cohort: 'B' for Melbourne and
# 'Pre-1hr' for Adelaide. Keyed by cohort name so that it does not depend on
# the order of LIST_COHORT_NAMES.
dict_base_time_int_by_cohort = {"Melbourne": 2, "Adelaide": 1}

# Same prefix the rest of the notebook reads from the parameters file.
str_concentration_prefix = dict_parameters["column name prefix for estimated concentrations"]

for str_cohort_name in LIST_COHORT_NAMES:
    int_base_time_code = dict_base_time_int_by_cohort[str_cohort_name]
    pd_df_cohort = dict_pd_df_cohort_tables[str_cohort_name]

    # One baseline row per patient (the first one, if there are several),
    # indexed by patient number so it can be looked up with Series.map.
    pd_df_baseline = (
        pd_df_cohort[pd_df_cohort["time int"] == int_base_time_code]
        .drop_duplicates(subset = "patient number")
        .set_index("patient number")
    )

    # Patients without a baseline sample get NaN differences; record which ones.
    list_patients_without_baseline = list(
        pd_df_cohort.loc[
            ~pd_df_cohort["patient number"].isin(pd_df_baseline.index), "patient number"
        ].unique()
    )
    if list_patients_without_baseline:
        logging.warning(
            "%s: no baseline sample (time int %s) for patients %s; their diffs are NaN",
            str_cohort_name, int_base_time_code, list_patients_without_baseline,
        )

    for str_analyte in dict_parameters["list of analytes"]:
        str_column_name = str_concentration_prefix + str_analyte
        # Subtracting a label-aligned baseline keeps every row matched to its own
        # patient, whatever the current row order of the table is.
        pd_df_cohort[str_analyte + " diff"] = (
            pd_df_cohort[str_column_name]
            - pd_df_cohort["patient number"].map(pd_df_baseline[str_column_name])
        )


In [None]:
# Spot-check the Melbourne difference columns for three of the analytes.
dict_pd_df_cohort_tables["Melbourne"][["patient number", "time code", "time int", "IP-10 diff", "IL-6 diff", "IFN-gamma diff"]]

In [None]:
# Show the Adelaide "Pre-1hr" rows. "Pre-1hr" is the Adelaide baseline, so these differences are expected to be zero.
dict_pd_df_cohort_tables["Adelaide"][dict_pd_df_cohort_tables["Adelaide"]["time code"] == "Pre-1hr"][["patient number", "time code", "IP-10 diff", "IL-6 diff", "IFN-gamma diff"]]

Do some clustering on the Melbourne patients

In [None]:
# Reshape the Melbourne table to one row per patient, with one column per
# (value column, time code) pair.
pd_df_melbourne_long = dict_pd_df_cohort_tables["Melbourne"]
list_non_value_columns = list_first_columns + ["time int"]
list_pivot_values_columns = [
    str_column_name for str_column_name in pd_df_melbourne_long.columns
    if str_column_name not in list_non_value_columns
]
pd_df_melbourne_concentrations_pivoted = pd_df_melbourne_long.pivot(
    index="patient number",
    columns="time code",
    values=list_pivot_values_columns,
)

# Flatten the two-level column labels into "<value column>_<time code>" strings.
list_flattened_column_names = []
for column_label in pd_df_melbourne_concentrations_pivoted.columns.values:
    if isinstance(column_label, tuple):
        list_flattened_column_names.append('_'.join(column_label).strip())
    else:
        list_flattened_column_names.append(column_label)
pd_df_melbourne_concentrations_pivoted.columns = list_flattened_column_names

# Turn the patient number back into an ordinary column.
pd_df_melbourne_concentrations_pivoted = pd_df_melbourne_concentrations_pivoted.reset_index()

In [None]:
# Display the pivoted (one row per patient) Melbourne table.
pd_df_melbourne_concentrations_pivoted

In [None]:
# Cluster on the difference columns of four analytes at time codes C, D and E.
list_time_codes_for_clustering = ['C', 'D', 'E']
list_columns_to_cluster = [
    str_diff_column_prefix + str_time_code
    for str_time_code in list_time_codes_for_clustering
    for str_diff_column_prefix in (
        "IP-10 diff_",
        "TNF-a diff_",
        "IFN-gamma diff_",
        "IL-6 diff_",
    )
]

# Standardise the selected columns before clustering.
scaler_object = sklearn.preprocessing.StandardScaler()
pd_df_clustering_features = pd_df_melbourne_concentrations_pivoted[list_columns_to_cluster]
np_arr_scaled_data = scaler_object.fit_transform(pd_df_clustering_features)

# K-means with a fixed seed; one label per patient row of the pivoted table.
num_clusters = 5
kmeans_object = sklearn.cluster.KMeans(n_clusters=num_clusters, random_state=42)
np_arr_cluster_labels = kmeans_object.fit_predict(np_arr_scaled_data)

pd_df_melbourne_concentrations_pivoted['cluster'] = np_arr_cluster_labels

In [None]:
# Attach each Melbourne patient's k-means label to that patient's rows in the long table.
# Any "cluster" column left over from a previous run of this cell is dropped
# first, so re-running does not produce "cluster_x" / "cluster_y" columns.
pd_df_melbourne_without_cluster = dict_pd_df_cohort_tables["Melbourne"].drop(
    columns = ["cluster"], errors = "ignore"
)
dict_pd_df_cohort_tables["Melbourne"] = pd_df_melbourne_without_cluster.merge(
    pd_df_melbourne_concentrations_pivoted[["patient number", "cluster"]],
    on = "patient number",
    how = "left"
)

# Adelaide patients are not clustered; give them one constant label so that
# plots coloured by "cluster" work for both cohorts.
# NOTE(review): the Melbourne labels were previously overwritten with the
# constant 1 straight after the merge, which discarded the clustering result.
# That overwrite is removed here; restore it only if it was intentional.
dict_pd_df_cohort_tables["Adelaide"]["cluster"] = 1

In [None]:
# Display the Melbourne table including its "cluster" column.
dict_pd_df_cohort_tables["Melbourne"]

In [None]:
dash_app_object_boxes = dash.Dash(__name__)

# Each dropdown sits in an identically styled inline container.
dict_dropdown_container_style_boxes = {'width': '20%', 'display': 'inline-block'}

# Cohort selector; defaults to the first cohort.
div_cohort_selector_boxes = html.Div(
    [
        html.Label("Cohort:"),
        dcc.Dropdown(
            id='cohort-dropdown',
            options=[{'label': str_name, 'value': str_name} for str_name in LIST_COHORT_NAMES],
            value=LIST_COHORT_NAMES[0],
        ),
    ],
    style=dict_dropdown_container_style_boxes,
)

# Plot type selector: box plot or strip plot.
div_plot_type_selector_boxes = html.Div(
    [
        html.Label("Plot type:"),
        dcc.Dropdown(
            id='plot-type-dropdown',
            options=[
                {'label': 'box', 'value': 'box'},
                {'label': 'strip', 'value': 'strip'},
            ],
            value='box',
        ),
    ],
    style=dict_dropdown_container_style_boxes,
)

# Data type selector: raw estimates or differences.
div_data_type_selector_boxes = html.Div(
    [
        html.Label("Data type:"),
        dcc.Dropdown(
            id='data-type-dropdown',
            options=[
                {'label': 'raw estimates', 'value': 'raw estimates'},
                {'label': 'differences', 'value': 'differences'},
            ],
            value='raw estimates',
        ),
    ],
    style=dict_dropdown_container_style_boxes,
)

# Analyte selector; defaults to the first analyte in the parameters.
div_analyte_selector_boxes = html.Div(
    [
        html.Label("Analyte:"),
        dcc.Dropdown(
            id='analyte-dropdown',
            options=[
                {'label': str_name, 'value': str_name}
                for str_name in dict_parameters["list of analytes"]
            ],
            value=dict_parameters["list of analytes"][0],
        ),
    ],
    style=dict_dropdown_container_style_boxes,
)

# Define the layout: title, the four selectors, then the graph.
dash_app_object_boxes.layout = html.Div(
    [
        html.H1("Plots of estimated concentrations"),
        div_cohort_selector_boxes,
        div_plot_type_selector_boxes,
        div_data_type_selector_boxes,
        div_analyte_selector_boxes,
        dcc.Graph(id='scatter-plot'),
    ],
    style={'backgroundColor': 'white', 'padding': '20px'},
)


@dash_app_object_boxes.callback(
    Output('scatter-plot', 'figure'),
    Input('cohort-dropdown', 'value'),
    Input('analyte-dropdown', 'value'),
    Input('data-type-dropdown', 'value'),
    Input('plot-type-dropdown', 'value'),
)
def update_graph(str_cohort_name, str_analyte, str_data_type, str_plot_type):
    """Redraw the box/strip plot whenever one of the four dropdowns changes.

    Args:
        str_cohort_name: key of dict_pd_df_cohort_tables selecting the cohort table.
        str_analyte: analyte whose column is plotted on the y axis.
        str_data_type: "raw estimates" or "differences".
        str_plot_type: "box" or "strip".

    Returns:
        A Plotly figure of the chosen column against "time code", coloured by "cluster".

    Raises:
        dash.exceptions.PreventUpdate: if any dropdown has been cleared.
        ValueError: if the data type or plot type is not one of the known values.
    """
    # A cleared dropdown sends None. Keep the current figure instead of failing
    # further down on a variable that was never assigned.
    if None in (str_cohort_name, str_analyte, str_data_type, str_plot_type):
        raise dash.exceptions.PreventUpdate

    # The column to plot and its axis label both follow from the data type.
    if str_data_type == "raw estimates":
        str_column_name = dict_parameters["column name prefix for estimated concentrations"] + str_analyte
        str_y_label = "concentration"
    elif str_data_type == "differences":
        str_column_name = str_analyte + " diff"
        str_y_label = "diff. in concentration"
    else:
        raise ValueError(f"Unknown data type: {str_data_type!r}")

    pd_df_cohort = dict_pd_df_cohort_tables[str_cohort_name]
    if str_plot_type == "box":
        fig = px.box(
            pd_df_cohort,
            x = "time code",
            y = str_column_name,
            color="cluster",
        )
    elif str_plot_type == "strip":
        fig = px.strip(
            pd_df_cohort,
            x = "time code",
            y = str_column_name,
            hover_name = "patient number",
            color="cluster",
        )
        # Semi-transparent markers keep overlapping points distinguishable.
        fig.update_traces(marker = dict(opacity = 0.75))
    else:
        raise ValueError(f"Unknown plot type: {str_plot_type!r}")

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis_title="time",
        yaxis_title=str_y_label,
        font=dict(
            family="Arial",
            size=16,
            color="black"
        )
    )
    return fig

# Run the box/strip app on port 3312 with debug mode on; jupyter_mode="inline" is passed so it displays in the notebook.
if __name__ == '__main__':
    dash_app_object_boxes.run(jupyter_mode="inline", debug = True, port=3312)

In [None]:
dash_app_object_lines = dash.Dash(__name__)

# Patient numbers of each cohort; the callback uses these lists to decide which
# cohort table a selected patient belongs to.
list_patient_ids_melbourne = list(dict_pd_df_cohort_tables["Melbourne"]["patient number"].unique())
list_patient_ids_adelaide = list(dict_pd_df_cohort_tables["Adelaide"]["patient number"].unique())

# Dropdown options: Adelaide patients first, then Melbourne patients.
list_patient_id_dropdown_dicts = []
for str_patient_id in list_patient_ids_adelaide + list_patient_ids_melbourne:
    list_patient_id_dropdown_dicts.append({'label': str_patient_id, 'value': str_patient_id})

# Each dropdown sits in an identically styled inline container.
dict_dropdown_container_style_lines = {'width': '20%', 'display': 'inline-block'}

# Patient selector; defaults to the first option.
div_patient_selector_lines = html.Div(
    [
        html.Label("Patient number:"),
        dcc.Dropdown(
            id='patient-number-dropdown',
            options=list_patient_id_dropdown_dicts,
            value=list_patient_id_dropdown_dicts[0]["value"],
        ),
    ],
    style=dict_dropdown_container_style_lines,
)

# Data type selector: raw, normalised raw, or differences.
div_data_type_selector_lines = html.Div(
    [
        html.Label("Data type:"),
        dcc.Dropdown(
            id='data-type-dropdown',
            options=[
                {'label': 'raw estimates', 'value': 'raw estimates'},
                {'label': 'normalised raw estimates', 'value': 'normalised raw estimates'},
                {'label': 'differences', 'value': 'differences'},
            ],
            value='raw estimates',
        ),
    ],
    style=dict_dropdown_container_style_lines,
)

# Define the layout: title, the two selectors, then the graph.
dash_app_object_lines.layout = html.Div(
    [
        html.H1("Line plots by patient number"),
        div_patient_selector_lines,
        div_data_type_selector_lines,
        dcc.Graph(id='line-plot'),
    ],
    style={'backgroundColor': 'white', 'padding': '20px'},
)


@dash_app_object_lines.callback(
    Output('line-plot', 'figure'),
    Input('patient-number-dropdown', 'value'),
    Input('data-type-dropdown', 'value'),
)
def update_graph(str_patient_number, str_data_type):
    """Redraw the per-patient line plot whenever a dropdown changes.

    One line is drawn per analyte (IP-10, IFN-gamma, IL-6, TNF-a) against "time int".

    Args:
        str_patient_number: patient to plot; looked up first among the Melbourne
            patients, then among the Adelaide patients.
        str_data_type: "raw estimates", "normalised raw estimates" or "differences".

    Returns:
        A Plotly figure with one trace per analyte and cohort-specific x tick labels.

    Raises:
        dash.exceptions.PreventUpdate: if either dropdown has been cleared.
        ValueError: if the patient or the data type is not recognised.
    """
    # A cleared dropdown sends None. Keep the current figure instead of failing
    # further down on a variable that was never assigned.
    if str_patient_number is None or str_data_type is None:
        raise dash.exceptions.PreventUpdate

    # The cohort fixes both the source table and the x-axis tick labels.
    if str_patient_number in list_patient_ids_melbourne:
        str_cohort_name = "Melbourne"
        dict_x_ticks = dict(
            tickmode='array',
            tickvals=[1, 2, 3, 4, 5],
            ticktext=['minus 1wk', 'minus 1hr', 'plus 3hr', 'plus 7hr', 'plus 25hr'],
        )
    elif str_patient_number in list_patient_ids_adelaide:
        str_cohort_name = "Adelaide"
        dict_x_ticks = dict(
            tickmode='array',
            tickvals=[1, 2, 3, 4, 5, 6, 7],
            ticktext=['minus 1hr', 'plus 15min', 'plus 30min', 'plus 1hr', 'plus 2hr', 'plus 4hr', 'plus 8hr'],
        )
    else:
        raise ValueError(f"Unknown patient number: {str_patient_number!r}")

    # The data type fixes the y-axis label.
    if str_data_type == "raw estimates":
        str_y_label = "concentration (pg/ml)"
    elif str_data_type == "normalised raw estimates":
        str_y_label = "normalised concentration"
    elif str_data_type == "differences":
        str_y_label = "diff. in concentration (pg/ml)"
    else:
        raise ValueError(f"Unknown data type: {str_data_type!r}")

    pd_df_cohort = dict_pd_df_cohort_tables[str_cohort_name]
    pd_df_patients_data = pd_df_cohort[pd_df_cohort["patient number"] == str_patient_number]

    fig = go.Figure()
    for str_analyte in ["IP-10", "IFN-gamma", "IL-6", "TNF-a"]:
        if str_data_type == "differences":
            str_column_name = str_analyte + " diff"
        else:
            # Raw and normalised plots both start from the estimated concentrations.
            str_column_name = dict_parameters["column name prefix for estimated concentrations"] + str_analyte
        pd_series_y = pd_df_patients_data[str_column_name]

        if str_data_type == "normalised raw estimates":
            # Centre on the mean and scale by the range. A constant series has a
            # range of zero: it is left as a flat line at 0 rather than divided
            # by zero.
            float_value_range = pd_series_y.max() - pd_series_y.min()
            pd_series_y = pd_series_y - pd_series_y.mean()
            if float_value_range > 0:
                pd_series_y = pd_series_y / float_value_range

        fig.add_trace(go.Scatter(x=pd_df_patients_data["time int"], y=pd_series_y, mode='lines', name=str_analyte))

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis_title="time",
        xaxis=dict_x_ticks,
        yaxis_title=str_y_label,
        font=dict(
            family="Arial",
            size=16,
            color="black"
        )
    )
    return fig

# Run the line-plot app on port 3311 with debug mode on; jupyter_mode="inline" is passed so it displays in the notebook.
if __name__ == '__main__':
    dash_app_object_lines.run(jupyter_mode="inline", debug = True, port=3311)