# 🧬 PheWAS Pipeline with PheTK and Hail-Extracted Variant Data
This notebook demonstrates how to perform a Phenome-Wide Association Study (PheWAS) using the PheTK toolkit, starting from a variant file extracted using Hail from the All of Us dataset.

In [None]:
# Install PheTK only when it is not already importable in this kernel.
# An unconditional `pip install --upgrade` would reinstall on every run and may
# pull in a newer release than the one this notebook was written against.
# NOTE(review): consider pinning an exact version (PheTK==<version>) for reproducibility.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("PheTK") is None:
    # Invoke pip through the kernel's own interpreter so the package is installed
    # into the environment this notebook actually imports from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PheTK"])

## Import Libraries

In [None]:
import pandas as pd
from PheTK.Cohort import Cohort
from PheTK.Phecode import Phecode
from PheTK.PheWAS import PheWAS
from PheTK.Plot import Plot

## Parameters

In [None]:
# Notebook configuration: storage location and the variant being analysed.
bucket_path = "your/local/or/gcs/path"  # Replace with your actual path
SNP_ID = "rs75853687"

# Input: genotype table for SNP_ID (read as tab-separated in Step 2).
variant_file = "/".join((bucket_path, SNP_ID + "_genotypes.tsv"))

# Outputs, in the order the pipeline steps produce them.
phecode_output = "/".join((bucket_path, "filtered_phecode_counts.csv"))
cohort_with_covariates = "/".join((bucket_path, SNP_ID + "_cohort_with_covariates.csv"))
phewas_results = "/".join((bucket_path, SNP_ID + "_phewas_results.csv"))



## Step 1: Generate Phecode Count Table

In [None]:
# Build the phecode count table on the All of Us platform and write it to
# `phecode_output`, which Step 4 reads back in.
phecode = Phecode(platform="aou")

phecode_count_options = {
    "phecode_version": "X",
    "icd_version": "US",
    "phecode_map_file_path": None,  # no custom mapping file is supplied
    "output_file_name": phecode_output,
}
phecode.count_phecode(**phecode_count_options)

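## Step 2: Load Variant File Extracted with Hail
The genotype file is read as tab-separated text; the `s` column is renamed to `person_id`, `n_alt` is renamed to `case`, and the `AD` column is dropped if present.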

In [None]:
# Load the genotype TSV (sample IDs kept as strings), rename the columns to the
# names used by the later steps, and discard 'AD' when the file contains it.
genotypes_df = (
    pd.read_csv(variant_file, sep='\t', dtype={'s': str})
    .rename(columns={'s': 'person_id', 'n_alt': 'case'})
    .drop(columns=['AD'], errors='ignore')
)

# Save the reshaped table; Step 3 reads this file back as its cohort input.
cohort_csv_path = f"{bucket_path}/{SNP_ID}_cohort.csv"
genotypes_df.to_csv(cohort_csv_path, index=False)


## Step 3: Add Covariates

In [None]:
# Attach covariates to the cohort file written in Step 2 and save the result to
# `cohort_with_covariates`.
cohort = Cohort(platform="aou", aou_db_version=8)

covariate_options = {
    "cohort_csv_path": f"{bucket_path}/{SNP_ID}_cohort.csv",
    "natural_age": False,
    "age_at_last_event": True,
    "sex_at_birth": True,
    "genetic_ancestry": True,
    "first_n_pcs": 10,  # matches the ten pc* covariates passed to PheWAS in Step 4
    "drop_nulls": True,
    "output_file_name": cohort_with_covariates,
}
cohort.add_covariates(**covariate_options)

## Step 4: Run PheWAS

In [None]:
# Covariates for the regression: age at last event, sex at birth, and the
# principal-component columns pc0 through pc9.
pc_covariates = [f"pc{idx}" for idx in range(10)]
adjustment_covariates = ["age_at_last_event", "sex_at_birth", *pc_covariates]

# Configure the PheWAS with the phecode counts (Step 1) and the covariate-annotated
# cohort (Step 3); 'case' is the genotype column renamed in Step 2.
phewas = PheWAS(
    phecode_version="X",
    phecode_count_csv_path=phecode_output,
    cohort_csv_path=cohort_with_covariates,
    sex_at_birth_col="sex_at_birth",
    male_as_one=True,
    covariate_cols=adjustment_covariates,
    independent_variable_of_interest="case",
    min_cases=10,
    min_phecode_count=2,
    output_file_name=phewas_results,
)

# Execute the analysis; results are written to `phewas_results`.
phewas.run()


## Step 5: Visualize Results

In [None]:
# Draw the Manhattan plot from the PheWAS results file written in Step 4.
# `manhattan` is an instance method, so a Plot object is created from the results
# path first; calling it on the class would bind the path string as `self`.
# NOTE(review): the `pval_col` argument was dropped because it does not appear to
# be a `manhattan` parameter; confirm the signature against the installed PheTK version.
phewas_plot = Plot(phewas_results)
phewas_plot.manhattan(title=f"PheWAS Manhattan Plot for {SNP_ID}")
