In [1]:
import pandas as pd
import scipy.stats

`find_best_pvalues()` is a Python function that, for each phenotype, selects the p-value threshold whose polygenic risk score (PRS) correlates best with the measured trait. Let's break it down:

1. **Getting Started:**

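We begin by defining three lookups: `filenames` (the phenotype files to process), `file2prs` (phenotype file name → PRS trait name), and `prs2col` (PRS trait name → column in the phenotype table). These help organize our data.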

2. **What It Does:**

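For each phenotype, this function loads the PRS computed at each of several p-value thresholds and keeps the threshold whose score has the highest squared Pearson correlation ($R^2$) with the trait. In other words, it finds the threshold at which the genetic risk score best tracks the trait.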

3. **Options for Details:**

There's a choice to turn on `verbose` mode for more detailed updates during the process.

4. **Revealing Insights:**

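For every threshold, the PRS and phenotype tables are merged on the animal ID (`IID`), Pearson's $r$ between the phenotype and the score is computed, and $r^2$ is compared against the best value seen so far.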

5. **Results Display:**

Once done, it shows which p-value is best for each trait, along with how much it explains (R-squared value).

6. **Easy Integration:**

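The code expects specific inputs: one whitespace-delimited `.profile` file per trait and threshold containing `IID` and `SCORE` columns, and a phenotype CSV keyed by `rat_rfid`. Beyond that it relies only on familiar tools, `pandas` and `scipy.stats`, so it fits smoothly into our workflow.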

With `find_best_pvalues()`, we can quickly identify the best-performing PRS threshold for each physiological trait.


In [10]:
# Phenotype result sets to evaluate. Each entry is a PRS trait name carrying a
# "physiological_" prefix.
filenames = [
    "physiological_bodylength_w_tail",
    "physiological_parafat",
    "physiological_bmi_bodylength_wo_tail",
    "physiological_bodyweight",
    "physiological_retrofat",
    "physiological_bmi_bodylength_w_tail",
    "physiological_epifat",
    "physiological_bodylength_wo_tail",
    "physiological_fasting_glucose",
]

# Phenotype file name -> PRS trait name, i.e. everything after the first
# underscore-delimited token ("physiological").
file2prs = {fname: fname.split("_", 1)[1] for fname in filenames}

# PRS trait name -> column holding that trait in the phenotype table.
prs2col = dict(
    bodylength_w_tail="length_w_tail_cm",
    parafat="parametrial_fat_weight_g",
    bmi_bodylength_wo_tail="bmi_wo_tail",
    bodyweight="body_weight_g",
    retrofat="retroperitoneal_fat_weight_g",
    bmi_bodylength_w_tail="bmi_w_tail",
    epifat="epididymis_fat_weight_g",
    bodylength_wo_tail="length_wo_tail_cm",
    fasting_glucose="glucose_reading_mg_dl",
)


def find_best_pvalues(
    verbose=False,
    pvalues=("0.000001", "0.00001", "0.0001", "0.001", "0.05", "0.1"),
    prs_root="../out",
    phenotype_data_path=(
        "../data/bmi/phenotype_data/Obesity_normalized_phenotypes_n3173.csv"
    ),
):
    """
    Find, for each phenotype, the p-value threshold with the highest R-squared.

    For every name in the module-level ``filenames`` list, the PRS file for each
    threshold (``{prs_root}/{trait}/{trait}.{pval}.profile``) is merged with the
    phenotype table on ``IID``. Pearson's r between the phenotype column and the
    PRS ``SCORE`` column is squared to give R-squared, and the threshold with the
    largest R-squared is kept.

    Args:
        verbose (bool): If True, print the R-squared of every threshold and the
            selected threshold for each phenotype.
        pvalues (iterable of str): Thresholds to evaluate. They are inserted
            verbatim into the PRS file name, so they must be spelled exactly as
            in the file names on disk.
        prs_root (str): Directory that holds one sub-directory per PRS trait.
        phenotype_data_path (str): CSV with a ``rat_rfid`` column and one column
            per phenotype (as named in ``prs2col``).

    Returns:
        dict: Maps each phenotype file name to ``[best_pval, best_r2]``. Both
        entries are None when no threshold produced a defined (non-NaN)
        R-squared, e.g. because every score column was constant.

    Raises:
        ValueError: If a PRS file shares fewer than two ``IID`` values with the
            phenotype table, so no correlation can be computed.
    """
    # The phenotype table does not depend on the phenotype or the threshold, so
    # read it once instead of on every inner-loop iteration.
    phenotypes = pd.read_csv(phenotype_data_path)
    best_p = {}

    for fname in filenames:
        if verbose:
            print(f"Processing {fname}")
        prs_name = file2prs[fname]
        phen_col = prs2col[prs_name]
        input_file_template = f"{prs_root}/{prs_name}/{prs_name}"

        # Keep only the ID and the current trait, under the names used for the
        # merge and the correlation, and drop animals with a missing value.
        phen = (
            phenotypes[["rat_rfid", phen_col]]
            .rename(columns={"rat_rfid": "IID", phen_col: "phen"})
            .dropna()
        )

        p_max = None
        r2_max = None
        for pval in pvalues:
            prs_path = f"{input_file_template}.{pval}.profile"
            # sep=r"\s+" replaces delim_whitespace=True, which is deprecated in
            # recent pandas releases.
            prs = pd.read_csv(prs_path, sep=r"\s+")

            merged_df = pd.merge(prs, phen, on=["IID"])
            if len(merged_df) < 2:
                raise ValueError(
                    f"Fewer than two samples shared between {prs_path} and "
                    f"{phenotype_data_path}; cannot compute a correlation"
                )

            # Squared Pearson correlation between phenotype and score.
            curr_r2 = (
                scipy.stats.pearsonr(merged_df["phen"], merged_df["SCORE"])[0] ** 2
            )
            if verbose:
                print(f"pval: {pval}\tcurr_r2: {curr_r2}")

            # A NaN R-squared (constant input) must never become the running
            # maximum: every later `>` comparison against NaN is False, which
            # would lock in that threshold regardless of the others.
            if pd.isna(curr_r2):
                continue
            if r2_max is None or curr_r2 > r2_max:
                r2_max = curr_r2
                p_max = pval

        if verbose:
            print(f"MAX_P: {p_max}")
            print()
        best_p[fname] = [p_max, r2_max]

    return best_p


# Run the threshold search with per-threshold progress output enabled
best_p_values = find_best_pvalues(verbose=True)

# Report the winning threshold and its R-squared for every phenotype
for name, (best_threshold, best_r2) in best_p_values.items():
    print(f"Best p-value for {name}: {best_threshold}, R-squared: {best_r2}")


Processing physiological_bodylength_w_tail
pval: 0.000001	curr_r2: 0.03295774003231058
pval: 0.00001	curr_r2: 0.048944454751620806
pval: 0.0001	curr_r2: 0.06445191229957449
pval: 0.001	curr_r2: 0.12169483656901386
pval: 0.05	curr_r2: 0.3208891010666829
pval: 0.1	curr_r2: 0.35016668169586007
MAX_P: 0.1

Processing physiological_parafat
pval: 0.000001	curr_r2: 0.022264264664412888
pval: 0.00001	curr_r2: 0.032840995726334275
pval: 0.0001	curr_r2: 0.08072737033242265
pval: 0.001	curr_r2: 0.1488676581461826
pval: 0.05	curr_r2: 0.3755319441719301
pval: 0.1	curr_r2: 0.4266713335718723
MAX_P: 0.1

Processing physiological_bmi_bodylength_wo_tail
pval: 0.000001	curr_r2: 0.01651802539395314
pval: 0.00001	curr_r2: 0.02220872664061226
pval: 0.0001	curr_r2: 0.050723044157946325
pval: 0.001	curr_r2: 0.08499428086252804
pval: 0.05	curr_r2: 0.21084501207747885
pval: 0.1	curr_r2: 0.23739378392724345
MAX_P: 0.1

Processing physiological_bodyweight
pval: 0.000001	curr_r2: 0.06661401623067977
pval: 0.00001