In [None]:
import os
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from scipy import stats
pd.options.mode.chained_assignment = None

sys.path.append('../python/')


In [None]:
# Load the run parameters from the YAML file. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open("../parameters/july_2024_data_parameters.yaml", "r") as parameters_file:
    dict_parameters = yaml.safe_load(parameters_file)

In [None]:
# Read the plate data (first CSV column becomes the index). Handing pandas the
# path rather than an already-opened handle lets it open and close the file
# itself, so no file descriptor is leaked.
pd_df_plate_data = pd.read_csv(
    os.path.join(
        dict_parameters["output directory path"],
        dict_parameters["plate data with locations file name"],
    ),
    index_col=0,
)

In [None]:
# Read the estimated concentrations table. Handing pandas the path rather than
# an already-opened handle lets it open and close the file itself, so no file
# descriptor is leaked.
pd_df_estimated_concentrations = pd.read_csv(
    os.path.join(
        dict_parameters["output directory path"],
        dict_parameters["estimated concentrations file name"],
    )
)

In [None]:
# Read the quality-control reference table from the data directory. Handing
# pandas the path rather than an already-opened handle lets it open and close
# the file itself, so no file descriptor is leaked.
pd_df_quality_control_concentrations = pd.read_csv(
    os.path.join(
        dict_parameters["data directory path"],
        dict_parameters["quality control concentrations file name"],
    )
)

In [None]:
# Render the plate data as the cell output for visual inspection.
pd_df_plate_data

In [None]:
# Render the quality-control reference table as the cell output.
pd_df_quality_control_concentrations

In [None]:
def perform_t_test_on_paired_wells(pd_group):
    """Compare the two wells of a group with Welch's unequal-variance t-test.

    The test is computed from the per-well summary statistics stored in the
    "IFN-gamma Trimmed Mean", "IFN-gamma Trimmed Standard Deviation" and
    "IFN-gamma Count" columns.

    Parameters
    ----------
    pd_group : pandas.DataFrame
        Rows belonging to one group; one row per well.

    Returns
    -------
    float
        The two-sided p-value, or ``np.nan`` when the group does not contain
        exactly two rows.
    """
    # Only a true pair of wells can be compared.
    if len(pd_group) != 2:
        return np.nan

    means = pd_group["IFN-gamma Trimmed Mean"].to_numpy()
    std_devs = pd_group["IFN-gamma Trimmed Standard Deviation"].to_numpy()
    counts = pd_group["IFN-gamma Count"].to_numpy()

    welch_result = stats.ttest_ind_from_stats(
        mean1=means[0],
        std1=std_devs[0],
        nobs1=counts[0],
        mean2=means[1],
        std2=std_devs[1],
        nobs2=counts[1],
        equal_var=False,  # Welch's test: do not pool the two variances
    )
    return welch_result.pvalue

In [None]:

# One p-value per (sample, plate) group, then re-attach the per-well summary
# statistics. The left merge yields one output row per well of each group.
pair_keys = ["sample name plate", "plate number"]
well_stat_columns = [
    "IFN-gamma Trimmed Mean",
    "IFN-gamma Trimmed Standard Deviation",
    "IFN-gamma Count",
]

pd_df_pair_p_values = (
    pd_df_plate_data
    .groupby(pair_keys)
    .apply(perform_t_test_on_paired_wells, include_groups=False)
    .reset_index()
    .set_axis(pair_keys + ['t test p value'], axis="columns")
)
pd_df_tested = pd_df_pair_p_values.merge(
    pd_df_plate_data[pair_keys + well_stat_columns],
    on=pair_keys,
    how="left",
)

In [None]:
# Render the merged p-value / well-statistics table as the cell output.
pd_df_tested

In [None]:
# Collapse to a single row per (sample, plate) group. The first row of each
# group is the one kept, so the mean / standard deviation / count columns of
# the result describe that first well only.
pd_df_tested_unique = pd_df_tested.drop_duplicates(
    subset=["sample name plate", "plate number"],
    keep="first",
)

In [None]:
# Render the de-duplicated table (one row per group) as the cell output.
pd_df_tested_unique

In [None]:
# Histogram of the per-group t-test p-values (20 bins). seaborn is imported
# as `sns` in the notebook's import cell; the trailing semicolon keeps the
# Axes repr out of the cell output.
sns.histplot(pd_df_tested_unique, x = "t test p value", bins = 20);

In [None]:
def calculated_paired_intra_array_cv(pd_group):
    """Coefficient of variation (percent) between the two wells of a group.

    The previous body was a verbatim copy of the paired t-test and returned a
    p-value rather than a CV. This version computes
    ``100 * sample_std(means) / |mean(means)|`` over the two
    "IFN-gamma Trimmed Mean" values of the group.

    NOTE(review): the sample standard deviation (ddof=1) and percent scaling
    are assumed here; confirm against the lab's CV definition.

    Parameters
    ----------
    pd_group : pandas.DataFrame
        Rows belonging to one group; one row per well.

    Returns
    -------
    float
        The percent CV, or ``np.nan`` when the group does not contain exactly
        two rows or when the mean of the two values is zero or NaN.
    """
    # Only a true pair of wells has a paired CV.
    if len(pd_group) != 2:
        return np.nan

    pair_means = pd_group["IFN-gamma Trimmed Mean"].to_numpy(dtype=float)
    grand_mean = pair_means.mean()

    # A zero (or missing) mean makes the ratio undefined.
    if np.isnan(grand_mean) or grand_mean == 0:
        return np.nan

    return float(pair_means.std(ddof=1) / abs(grand_mean) * 100)

In [None]:
# Keep only rows whose annotation contains a "QC-<number>" label.
# - raw string: "\d" in a normal string literal is an invalid escape sequence
# - na=False: rows with a missing annotation are dropped rather than producing
#   a NaN in the boolean mask
# - .copy(): later cells add columns to this frame, so make it independent of
#   the source table
pd_df_estimated_qc_concentrations = pd_df_estimated_concentrations[
    pd_df_estimated_concentrations["sample name annotations"].str.contains(
        r"QC-\d+", regex=True, na=False
    )
].copy()

In [None]:
# Render the QC-only rows of the estimated concentrations as the cell output.
pd_df_estimated_qc_concentrations

In [None]:
# Keep the two identifier columns plus every column whose name contains
# "estimated".
selected_columns = ["sample name annotations", "plate number"] + [
    column
    for column in pd_df_estimated_qc_concentrations.columns
    if "estimated" in column
]
pd_df_estimated_qc_concentrations = pd_df_estimated_qc_concentrations[selected_columns]

In [None]:
# Render the QC table after narrowing it to identifier and "estimated" columns.
pd_df_estimated_qc_concentrations

In [None]:
def check_value_within_qc_bounds(pd_row, str_col_name, low_vals, high_vals):
    """Report whether a QC sample's value lies inside its accepted range.

    The QC level is taken from the first "QC-<number>" occurrence in the row's
    "sample name annotations" entry, so annotations carrying extra text around
    the label (for example "P1-QC-2") are handled. Level ``k`` uses
    ``low_vals[k - 1]`` and ``high_vals[k - 1]`` as inclusive bounds.

    Parameters
    ----------
    pd_row : mapping-like (e.g. pandas.Series)
        Must provide "sample name annotations" and ``str_col_name``.
    str_col_name : str
        Name of the entry holding the value to check.
    low_vals, high_vals : sequence of numbers
        Lower / upper bounds, indexed by QC level minus one.

    Returns
    -------
    str
        'within range', 'below range by X.XX', 'above range by X.XX', or
        'missing value' when the value is NaN / missing.

    Raises
    ------
    ValueError
        If the annotation contains no "QC-<number>" label.
    IndexError
        If the QC level has no corresponding entry in the bounds.
    """
    annotation = str(pd_row["sample name annotations"])
    qc_match = re.search(r"QC-(\d+)", annotation)
    if qc_match is None:
        raise ValueError(f"no 'QC-<number>' label found in annotation {annotation!r}")

    qc_number = int(qc_match.group(1)) - 1
    # Reject level 0 explicitly: index -1 would silently pick the last bound.
    if not 0 <= qc_number < min(len(low_vals), len(high_vals)):
        raise IndexError(f"no QC bounds available for annotation {annotation!r}")

    value = pd_row[str_col_name]
    # NaN compares False against everything and would otherwise be reported
    # as 'above range by nan'.
    if pd.isna(value):
        return 'missing value'

    low, high = low_vals[qc_number], high_vals[qc_number]
    if low <= value <= high:
        return 'within range'
    if value < low:
        return f'below range by {low - value:.2f}'
    return f'above range by {value - high:.2f}'


In [None]:
# For each analyte, collect the accepted low/high bounds of QC-1 and QC-2 from
# the reference table, then label every QC row with the result of the bounds
# check in a new "<analyte> check" column.
for str_analyte in dict_parameters["list of analytes"]:
    low_vals, high_vals = [], []
    for i in range(1, 3):
        # Reference rows for this QC level; the first match supplies the bounds.
        qc_rows = pd_df_quality_control_concentrations[
            pd_df_quality_control_concentrations["QC name"] == f"QC-{i}"
        ]
        low_vals.append(qc_rows[f"{str_analyte} low"].values[0])
        high_vals.append(qc_rows[f"{str_analyte} high"].values[0])

    new_col_name = f'{str_analyte} check'
    pd_df_estimated_qc_concentrations[new_col_name] = pd_df_estimated_qc_concentrations.apply(
        check_value_within_qc_bounds,
        axis=1,
        args=(f"estimated concentration {str_analyte}", low_vals, high_vals),
    )


In [None]:
# Render the QC table including the newly added "<analyte> check" columns.
pd_df_estimated_qc_concentrations

In [None]:
# Arrange the columns so each analyte's estimated concentration sits next to
# its range-check result, after the two identifier columns.
column_order = ["sample name annotations", "plate number"] + [
    column_name
    for str_analyte in dict_parameters["list of analytes"]
    for column_name in (
        f"estimated concentration {str_analyte}",
        f"{str_analyte} check",
    )
]
pd_df_estimated_qc_concentrations = pd_df_estimated_qc_concentrations[column_order]

In [None]:
# Render the final, re-ordered QC results table as the cell output.
pd_df_estimated_qc_concentrations