<a href="https://colab.research.google.com/github/comparativechrono/100KGP_enrichment/blob/main/PRS/case_control_PRS_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Load the per-variant weights: a headerless, tab-separated table whose two
# columns are named here as the variant ID and its weight.
weight_columns = ["ID", "Weight"]
variant_weights = pd.read_csv(
    "variant_weights.txt",
    sep="\t",
    header=None,
    names=weight_columns,
)

# Calculate Individual PRS

def _gt_allele_sum(gt):
    """Return the sum of the allele indices in one GT value.

    Both unphased ("0/1") and phased ("0|1") separators are accepted.
    Missing alleles (".") and non-string values (e.g. NaN for an empty
    field) contribute 0 instead of raising.
    """
    if not isinstance(gt, str):
        return 0
    alleles = gt.replace("|", "/").split("/")
    return sum(int(allele) for allele in alleles if allele.isdigit())


def parse_vcf_info_with_prs(file_path, weights):
    """Read a VCF-like table and attach a per-variant PRS contribution.

    Parameters
    ----------
    file_path
        Path (or anything else ``pd.read_csv`` accepts) of a headerless,
        tab-separated file with the columns Sample, Chrom, Pos, ID, Ref,
        Alt, Qual, Filter, GT, GQ, DP, in that order.
    weights : pandas.DataFrame
        Table with an "ID" column to join on and a "Weight" column.

    Returns
    -------
    pandas.DataFrame
        The input rows whose ID is present in ``weights`` (inner join),
        with the "Weight" column plus two added columns: "AlleleCount"
        (sum of the allele indices in GT) and "PRS"
        (AlleleCount * Weight).
    """
    # Read the VCF-like file. GT is forced to text so that a column made up
    # only of single-allele calls such as "1" is not inferred as integers.
    vcf_data = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=["Sample", "Chrom", "Pos", "ID", "Ref", "Alt", "Qual", "Filter", "GT", "GQ", "DP"],
        dtype={"GT": str},
    )

    # Split GT field and count alleles; phased and missing calls are handled
    # by the helper rather than crashing the whole run.
    vcf_data['AlleleCount'] = vcf_data['GT'].apply(_gt_allele_sum)

    # Join with weights to calculate PRS; variants without a weight are dropped.
    vcf_data = pd.merge(vcf_data, weights, on="ID", how="inner")
    vcf_data['PRS'] = vcf_data['AlleleCount'] * vcf_data['Weight']

    return vcf_data

# Score every variant row in the case file and in the control file against
# the same weights table.
cases_with_prs = parse_vcf_info_with_prs(
    file_path="cases.txt",
    weights=variant_weights,
)
controls_with_prs = parse_vcf_info_with_prs(
    file_path="control.txt",
    weights=variant_weights,
)

# Aggregate PRS and Calculate Mean and Standard Deviation

def aggregate_prs(data):
    """Return one row per sample holding the sum of its "PRS" values.

    ``data`` must have "Sample" and "PRS" columns. The result is a
    DataFrame with those same two columns, where "PRS" is the total over
    all of that sample's rows.
    """
    per_sample = data.groupby("Sample")
    prs_totals = per_sample['PRS'].sum()
    # Turn the "Sample" index back into an ordinary column.
    return prs_totals.reset_index()

# Collapse the per-variant rows into one total PRS per sample.
cases_prs_summary = aggregate_prs(cases_with_prs)
controls_prs_summary = aggregate_prs(controls_with_prs)

# Write the per-sample totals, one CSV per group.
cases_prs_summary.to_csv("cases_prs_summary.csv", index=False)
controls_prs_summary.to_csv("controls_prs_summary.csv", index=False)

# Mean and standard deviation of the per-sample totals within each group.
cases_scores = cases_prs_summary['PRS']
controls_scores = controls_prs_summary['PRS']
cases_stats = dict(mean=cases_scores.mean(), sd=cases_scores.std())
controls_stats = dict(mean=controls_scores.mean(), sd=controls_scores.std())

print("Cases PRS Mean and SD:", cases_stats)
print("Controls PRS Mean and SD:", controls_stats)

# Build the output table with one row per group.
summary_rows = [
    ("Cases", cases_stats['mean'], cases_stats['sd']),
    ("Controls", controls_stats['mean'], controls_stats['sd']),
]
summary_stats_df = pd.DataFrame(summary_rows, columns=["Group", "Mean_PRS", "SD_PRS"])

# Write the group-level statistics to CSV.
summary_stats_df.to_csv("summary_statistics_prs.csv", index=False)


Cases PRS Mean and SD: {'mean': 2.9, 'sd': 1.4730919862656235}
Controls PRS Mean and SD: {'mean': 1.52, 'sd': 0.6058052492344385}
