<a href="https://colab.research.google.com/github/comparativechrono/100KGP_enrichment/blob/main/case_control_enrichment_with_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from pathlib import Path

def parse_vcf_info(file_path):
    """Load a tab-separated, header-less genotype table and count alternate alleles.

    The file is read with the fixed column layout
    Sample, Chrom, Pos, ID, Ref, Alt, Qual, Filter, GT, GQ, DP.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the eleven input columns plus ``AlleleCount``, the
        number of non-reference alleles in each row's ``GT`` value.

    Raises:
        ValueError: If a GT allele is neither an integer index nor ``.``.
    """
    col_names = ["Sample", "Chrom", "Pos", "ID", "Ref", "Alt", "Qual", "Filter", "GT", "GQ", "DP"]
    # GT is forced to text so a column of haploid calls ("1") is not parsed as numbers.
    vcf_data = pd.read_csv(file_path, sep='\t', header=None, names=col_names, dtype={'GT': str})

    def count_alt_alleles(gt):
        # An absent GT field carries no alternate-allele evidence.
        if pd.isna(gt):
            return 0
        # Accept both unphased ("/") and phased ("|") separators.
        alleles = gt.replace("|", "/").split("/")
        # Count alleles whose index is non-zero; summing the indices would
        # score "1/2" as 3 alleles. "." marks a missing allele call.
        return sum(1 for allele in alleles if allele != "." and int(allele) > 0)

    vcf_data['AlleleCount'] = vcf_data['GT'].apply(count_alt_alleles)

    return vcf_data

def calculate_allele_frequencies(vcf_data):
    """Total the ``AlleleCount`` column per variant ``ID``.

    Despite the name, the result holds summed allele counts, not
    proportions.

    Args:
        vcf_data: DataFrame with at least ``ID`` and ``AlleleCount`` columns.

    Returns:
        A DataFrame with one row per distinct ``ID`` and two columns,
        ``ID`` and ``AlleleCount`` (the per-ID sum).
    """
    counts_by_variant = vcf_data.groupby('ID')['AlleleCount']
    totals = counts_by_variant.sum()
    # Turn the ID index back into an ordinary column.
    return totals.reset_index()

# Load cases and controls
cases_file = "cases.txt"
controls_file = "control.txt"

cases = parse_vcf_info(cases_file)
controls = parse_vcf_info(controls_file)

cases_freq = calculate_allele_frequencies(cases)
controls_freq = calculate_allele_frequencies(controls)

# Combine case and control counts per variant ID.
# An outer join keeps variants observed in only one cohort; an inner join
# would silently drop them. A variant absent from a cohort has a count of 0.
combined_freq = pd.merge(
    cases_freq,
    controls_freq,
    on='ID',
    how='outer',
    suffixes=('_cases', '_controls'),
)
count_columns = ['AlleleCount_cases', 'AlleleCount_controls']
# The outer join introduces NaN (and float dtype) for missing cohorts;
# restore integer counts because Fisher's test needs an integer table.
combined_freq[count_columns] = combined_freq[count_columns].fillna(0).astype(int)

# Assumed cohort sizes: 80 cases and 230 controls, two alleles per sample.
# These are fixed here rather than derived from the input files.
total_cases_alleles = 80 * 2  # Each case can contribute 2 alleles max
total_controls_alleles = 230 * 2

# Fail early with a clear message if any count exceeds the cohort total;
# the 2x2 table would otherwise contain a negative cell.
too_many = combined_freq[
    (combined_freq['AlleleCount_cases'] > total_cases_alleles)
    | (combined_freq['AlleleCount_controls'] > total_controls_alleles)
]
if not too_many.empty:
    raise ValueError(
        "Allele count exceeds the cohort allele total for ID(s): "
        + ", ".join(map(str, too_many['ID']))
    )

# Run Fisher's Exact Test for every variant
results = []
for variant_id, case_alt, control_alt in zip(
    combined_freq['ID'],
    combined_freq['AlleleCount_cases'],
    combined_freq['AlleleCount_controls'],
):
    # Construct 2x2 table: [case_present, case_absent], [control_present, control_absent]
    table = np.array([
        [case_alt, total_cases_alleles - case_alt],
        [control_alt, total_controls_alleles - control_alt],
    ])
    # Perform Fisher's Exact Test
    odds_ratio, p_value = fisher_exact(table, alternative='two-sided')
    results.append({'ID': variant_id, 'p_value': p_value, 'odds_ratio': odds_ratio})

# Explicit columns keep the CSV header stable even when there are no results.
results_df = pd.DataFrame(results, columns=['ID', 'p_value', 'odds_ratio'])

# Write results to CSV
output_file = "3A_fishers_results_python.csv"
results_df.to_csv(output_file, index=False)

print(f"Results written to {output_file}")


Results written to 3A_fishers_results_python.csv
