In [None]:
import sys
import os
import yaml
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import seaborn as sns

import pandas as pd
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Add ../python/ (resolved against the current working directory) to the module search path.
sys.path.append('../python/')

# Plotly Dash doesn't display error messages well, so records at DEBUG level
# and above are written to the file dash_logs.log instead.
import logging
logging.basicConfig(level=logging.DEBUG, filename="dash_logs.log")

In [None]:
# Load the run parameters (paths, file names, analyte list, ...) from YAML.
# The context manager guarantees the file handle is closed once parsing is
# done, instead of leaving it open for the lifetime of the kernel.
with open("../parameters/july_2024_data_parameters.yaml", "r") as file_parameters:
    dict_parameters = yaml.safe_load(file_parameters)

In [None]:
# Read the estimated-concentrations CSV whose location is given by the
# parameter file. The file is opened in binary mode exactly as before, but
# inside a `with` block so the handle is closed as soon as pandas has read it.
with open(
    os.path.join(
        dict_parameters["output directory path"],
        dict_parameters["estimated concentrations file name"],
    ),
    "rb",
) as file_estimated_concentrations:
    pd_df_estimated_concentrations = pd.read_csv(file_estimated_concentrations)

In [None]:
# Display the table exactly as it was read from the CSV file.
pd_df_estimated_concentrations

Keep only the Melbourne samples

In [None]:
# Keep only rows whose sample name contains a 3-4 digit number followed by a
# space or underscore and then a letter.
# The pattern is a raw string so that "\d" reaches the regex engine instead of
# being an invalid Python escape sequence, and na=False makes rows with a
# missing sample name drop out rather than producing a NaN mask that cannot be
# used for boolean indexing.
pd_df_estimated_concentrations = pd_df_estimated_concentrations[
    pd_df_estimated_concentrations["sample name annotations"].str.contains(
        r"\d{3,4}[ _][A-Za-z]", regex=True, na=False
    )
]

In [None]:
# Display the table after the sample-name filter has been applied.
pd_df_estimated_concentrations

Discard unnecessary columns

In [None]:
# Columns to retain: the sample name plus one estimated-concentration column
# per analyte (configured prefix + analyte name).
list_columns_to_keep = ["sample name annotations"]
list_columns_to_keep.extend(
    dict_parameters["column name prefix for estimated concentrations"] + str_analyte
    for str_analyte in dict_parameters["list of analytes"]
)

# Restrict the table to those columns and show the result.
pd_df_estimated_concentrations = pd_df_estimated_concentrations[list_columns_to_keep]
pd_df_estimated_concentrations

Take the mean if there are multiple reads per sample

In [None]:
# Collapse rows sharing the same sample name into one row holding the mean of
# every remaining column; the sample name is restored as an ordinary column.
pd_df_estimated_concentrations = (
    pd_df_estimated_concentrations
    .groupby(by="sample name annotations")
    .agg("mean")
    .reset_index(drop=False)
)

In [None]:
# Display the table after averaging rows that share a sample name.
pd_df_estimated_concentrations

Extract the patient number and time code from the sample name

In [None]:
# Split each (whitespace-trimmed) sample name at its FIRST space/underscore
# into the patient number (left part) and the time code (remainder).
# n=1 caps the split at two pieces, so a name containing a further separator
# cannot yield extra columns and break the two-column assignment below.
pd_df_estimated_concentrations[["patient number", "time code"]] = (
    pd_df_estimated_concentrations["sample name annotations"]
    .str.strip()
    .str.split(r"[_ ]", n=1, expand=True)
)

# The original sample name is no longer needed once it has been decomposed.
pd_df_estimated_concentrations = pd_df_estimated_concentrations.drop(
    columns=["sample name annotations"]
)

In [None]:
# Move the identifying columns to the front; every other column keeps its
# current relative order behind them.
list_first_columns = ["patient number", "time code"]
list_remaining_columns = [
    column_name
    for column_name in pd_df_estimated_concentrations.columns
    if column_name not in list_first_columns
]
pd_df_estimated_concentrations = pd_df_estimated_concentrations[
    list_first_columns + list_remaining_columns
]

In [None]:
# Show the table ordered by patient number (the sorted copy is only displayed,
# not stored). At this point the column still holds the strings produced by
# the split, so the sort key converts them to integers; otherwise the order
# would be lexicographic (e.g. "1000" before "620").
pd_df_estimated_concentrations.sort_values(
    by=["patient number"],
    key=lambda column: column.astype(int),
)

In [None]:
# Convert the patient numbers to integers.
pd_df_estimated_concentrations = pd_df_estimated_concentrations.astype(
    {"patient number": int}
)

In [None]:
# Shape of the array of distinct patient numbers, i.e. (number of distinct values,).
pd_df_estimated_concentrations["patient number"].unique().shape

In [None]:
# Read the "UofM" sheet of the plate-layout workbook located in the configured
# data directory; header=1 takes the column names from the second row.
str_patient_list_path = os.path.join(
    dict_parameters["data directory path"],
    "HISS_Multiplex_layouts_PLATES_8_TO_17_JB.xlsx",
)
pd_df_patient_list = pd.read_excel(
    str_patient_list_path, sheet_name="UofM", header=1
)

In [None]:
# Display the patient list as read from the workbook.
pd_df_patient_list

In [None]:
# Convert the participant IDs to integers.
pd_df_patient_list = pd_df_patient_list.astype({"Participant ID": int})

In [None]:
# Display the patient list after casting "Participant ID" to int.
pd_df_patient_list

In [None]:
# Shape of the array of distinct patient numbers in the concentrations table,
# i.e. (number of distinct values,).
pd_df_estimated_concentrations["patient number"].unique().shape

In [None]:
# Number the rows of each participant 1, 2, 3, ... in order of appearance, so
# that any value above 1 marks a repeated "Participant ID".
pd_df_patient_list["cum count"] = (
    pd_df_patient_list
    .groupby("Participant ID")
    .cumcount()
    .add(1)
)

In [None]:
# Show the rows that are a second or later occurrence of their participant ID.
pd_df_patient_list.loc[pd_df_patient_list["cum count"].gt(1)]

In [None]:
# List the distinct patient numbers present in the concentrations table.
pd_df_estimated_concentrations["patient number"].unique()

In [None]:
# Shape of the array of distinct participant IDs in the patient list,
# i.e. (number of distinct values,).
pd_df_patient_list["Participant ID"].unique().shape

In [None]:
# Patient numbers that have concentration data but do not appear in the patient list.
set(pd_df_estimated_concentrations["patient number"]).difference(
    pd_df_patient_list["Participant ID"]
)

In [None]:
# Participant IDs in the patient list that have no concentration data.
set(pd_df_patient_list["Participant ID"]).difference(
    pd_df_estimated_concentrations["patient number"]
)

In [None]:
# Participant IDs without concentration data, after additionally removing a
# hard-coded set of IDs.
# NOTE(review): the reason these particular IDs are excluded is not recorded here - confirm.
set(pd_df_patient_list["Participant ID"]).difference(
    pd_df_estimated_concentrations["patient number"],
    {1208, 1236, 1252, 1451, 1483, 1744, 3607, 620, 2848},
)
