In [1]:
# install.packages(c("readr", "devtools"))
# devtools::install_github('PheWAS/PheWAS')


In [2]:
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(PheWAS)
library(jsonlite)



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggplot2

Loading required package: parallel

Welcome to the new version of PheWAS. This version has many updates; please see https://github.com/PheWAS/PheWAS/tree/legacy for the legacy release if needed. Check ?PheWAS for more documentation



# Create genotype file

In [3]:
# Build the per-gene carrier table for a single variant mask.
#
# Arguments:
#   burden_df      Variant-level data frame. Must contain the columns `gene`,
#                  `annotation`, `maf_max` and `samples`, where `samples` holds
#                  comma-separated sample IDs for each variant.
#   annotations    Character vector of annotation labels included in the mask.
#   maf            Inclusive upper bound applied to `maf_max`.
#   lf_samples_df  Extra rows (columns `gene`, `samples`) appended unchanged to
#                  the result; an empty data.frame() adds nothing.
#   hgnc_dict      Named list or named vector translating the identifiers in
#                  `burden_df$gene` into the gene names stored in the output.
#
# Returns a table with one row per mapped gene: `gene` (character) and
# `samples` (list column with the unique sample IDs pooled over all retained
# variants of that gene), followed by the rows of `lf_samples_df`.
# Identifiers absent from `hgnc_dict` map to NA and are pooled in one NA row.
create_gene_burden_table_helper <- function(burden_df, annotations, maf, lf_samples_df, hgnc_dict) {
  # Flatten the lookup to a named character vector. Indexing a list (which is
  # what jsonlite::fromJSON() returns for a JSON object) would otherwise turn
  # `gene` into a list column with NULL entries for unmapped identifiers.
  gene_lookup <- vapply(
    hgnc_dict,
    function(value) as.character(value)[1],
    character(1)
  )

  # as.character() guards against factor keys, which would index the lookup
  # by integer code instead of by label.
  burden_df$gene <- unname(gene_lookup[as.character(burden_df$gene)])

  # which() discards rows where the condition evaluates to NA (for example a
  # missing `maf_max`); plain logical indexing would insert all-NA rows.
  keep <- which(
    burden_df$annotation %in% annotations & burden_df$maf_max <= maf
  )
  masked_burden_df <- burden_df[keep, c("gene", "samples")]

  # Split every variant's sample string and pool the IDs, dropping missing or
  # empty entries so they never appear as carrier IDs.
  pool_sample_ids <- function(sample_strings) {
    ids <- unlist(
      strsplit(as.character(sample_strings), split = ",", fixed = TRUE)
    )
    unique(ids[!is.na(ids) & nzchar(ids)])
  }

  masked_burden_df <- masked_burden_df %>%
    group_by(gene) %>%
    summarise(samples = list(pool_sample_ids(samples)))

  # Append the caller-supplied rows (assumed to share the gene/samples layout).
  rbind(masked_burden_df, lf_samples_df)
}

# Build one carrier table per variant mask.
#
# The masks are cumulative: "pLoF" uses only "lof" annotations,
# "Missense_strict" adds "missense_strict", and "Missense_lenient" further
# adds "missense_lenient". Every table is produced by
# create_gene_burden_table_helper() with the same `burden_df`, `maf`,
# `lf_samples_df` and `hgnc_dict`.
#
# Returns a list named by mask ("pLoF", "Missense_strict",
# "Missense_lenient") holding the corresponding tables.
create_gene_burden_tables <- function(burden_df, maf, lf_samples_df, hgnc_dict) {
  # Naming the annotation sets up front lets lapply() carry the mask names
  # through to the result.
  mask_annotations <- list(
    pLoF = c("lof"),
    Missense_strict = c("lof", "missense_strict"),
    Missense_lenient = c("lof", "missense_strict", "missense_lenient")
  )

  lapply(mask_annotations, function(terms) {
    create_gene_burden_table_helper(
      burden_df, terms, maf, lf_samples_df, hgnc_dict
    )
  })
}

      
# Return the cohort samples that carry a qualifying variant in every gene of
# a combination.
#
# Arguments:
#   combos          Character vector of gene names forming the combination.
#   genotype_df     Table with a `gene` column and a `samples` list column
#                   (one vector of sample IDs per row).
#   cohort_samples  Sample IDs to restrict the result to.
#
# Returns the IDs in `cohort_samples` that are present in the `samples` entry
# of every row matching `combos`. Returns character(0) when `combos` is empty
# or when any requested gene is missing from `genotype_df`.
get_samples_helper <- function(combos, genotype_df, cohort_samples) {
  gene_col <- genotype_df[["gene"]]

  # all(... %in% ...) tolerates repeated gene names in `combos`; comparing
  # length(intersect()) with length(combos) would reject them because
  # intersect() de-duplicates.
  if (length(combos) == 0L || !all(combos %in% gene_col)) {
    return(character(0))
  }

  samples_per_gene <- genotype_df[["samples"]][gene_col %in% combos]

  # Carriers of the whole combination = intersection across the matched rows,
  # then restricted to the cohort.
  samples_per_combo <- Reduce(intersect, samples_per_gene)
  intersect(cohort_samples, samples_per_combo)
}

                                        
# Look up the carriers of a single gene under a given mask.
#
# Arguments:
#   gene              Gene name to look up.
#   mask              Name of the entry in `gene_burden_dict` to use.
#   gene_burden_dict  List of carrier tables keyed by mask name.
#   pop_samples       Sample IDs the result is restricted to.
#
# Returns whatever get_samples_helper() yields for the one-gene combination.
get_samples <- function(gene, mask, gene_burden_dict, pop_samples) {
  # A single gene is treated as a one-element combination.
  single_gene_combo <- c(gene)
  mask_table <- gene_burden_dict[[mask]]

  get_samples_helper(single_gene_combo, mask_table, pop_samples)
}

In [4]:
# Load the variant-level gene burden table.
gene_burden_df <- read.delim(
  file = "/mnt/project/notebooks/wes/burden_preparation/data/ukb_burden.tsv.gz",
  sep = "\t",
  header = TRUE
)

# Load the gnomAD annotations, keeping only the join keys and the popmax MAF.
gnomad_df <- read.delim(
  file = "/mnt/project/notebooks/wes/burden_preparation/data/gnomad_annot.tsv.gz",
  sep = "\t",
  header = TRUE,
  colClasses = c("locus" = "character", "alleles" = "character", "maf_gnomad_popmax" = "numeric")
)[, c("locus", "alleles", "maf_gnomad_popmax")]

# Inner join on the variant key (locus + alleles).
# NOTE(review): variants without a matching gnomAD row are dropped here;
# confirm that this is intended.
gene_burden_df <- merge(gene_burden_df, gnomad_df, by = c("locus", "alleles"))

# Per-variant maximum of the cohort MAF and the gnomAD popmax MAF. pmax() is
# vectorised; apply(..., 1, max) would coerce the data frame to a matrix and
# call max() once per row.
# NOTE(review): as before, an NA in either column gives an NA `maf_max`;
# confirm whether a missing gnomAD frequency should instead fall back to the
# cohort MAF.
gene_burden_df$maf_max <- pmax(
  gene_burden_df[["maf"]],
  gene_burden_df[["maf_gnomad_popmax"]]
)

# Read the gene identifier -> gene name map from JSON.
hgnc_dict <- fromJSON("/mnt/project/notebooks/bmi/data/hgnc_gene_map.json")

# Build the per-mask carrier tables with a MAF ceiling of 0.001 and no extra
# rows (empty data.frame()).
gene_burden_dict <- create_gene_burden_tables(gene_burden_df, 0.001, data.frame(), hgnc_dict)


# Get phecode file

In [5]:
# Path of the gzipped phecode table.
phecode_file <- "/mnt/project/notebooks/bmi/data/downstream/phecodes/phecodes.csv.gz"

# Read `id` as character; the remaining column types are left to readr.
phecode_df <- read_csv(
  phecode_file,
  col_types = cols(id = col_character())
)


# Run phewas per ancestry

In [6]:
# Standardise (centre to mean 0, scale to SD 1) selected covariate columns.
#
# Arguments:
#   pheno_df    Data frame holding the covariate columns.
#   covariates  Character vector of covariate column names.
#   exclude     Covariates to leave untouched (default: none).
#
# Returns a copy of `pheno_df` in which every covariate not listed in
# `exclude` has been replaced by its standardised values. Stops with an error
# if a covariate that should be scaled is not a column of `pheno_df`.
normalize_covariates <- function(pheno_df, covariates, exclude = character()) {
  to_scale <- setdiff(covariates, exclude)

  missing_cols <- setdiff(to_scale, names(pheno_df))
  if (length(missing_cols) > 0) {
    stop(
      "Covariate column(s) not found in pheno_df: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  norm_pheno_df <- pheno_df
  for (cov in to_scale) {
    # scale() returns a one-column matrix carrying scaled:center/scaled:scale
    # attributes; as.numeric() stores a plain vector instead of a matrix
    # column inside the data frame.
    norm_pheno_df[[cov]] <- as.numeric(scale(norm_pheno_df[[cov]]))
  }

  norm_pheno_df
}


In [8]:
# Ancestry groups to analyse; one phenotype file is read per group.
ancestries <- c("afr", "amr", "eas", "eur", "sas", "mid")

# genes[i] is tested under masks[i].
genes <- c("YLPM1", "RIF1", "GIGYF1", "SLC5A3", "GRM7")
masks <- c("pLoF", "pLoF", "pLoF", "Missense_lenient", "Missense_lenient")
# A length mismatch would silently pair genes with an NA mask.
stopifnot(length(genes) == length(masks))

# Settings that do not change between iterations.
covariates <- c(
  "age", "age_2", "age_sex", "genetic_sex", "exome_release_batch",
  paste0("genetic_pca", 1:10)
)
cores <- 24L

# Every phecode_df column except the sample identifier is a phenotype.
phenos <- colnames(phecode_df)
phenos <- phenos[phenos != "id"]

for (anc in ancestries) {
  pheno_file <- paste0("/mnt/project/notebooks/bmi/data/processed/", anc, "_phenotype.tsv.gz")

  pheno_df <- read.delim(
    pheno_file,
    sep = "\t", header = TRUE, colClasses = c("IID" = "character")
  )

  # Standardise the covariates, leaving genetic_sex and exome_release_batch
  # as they are, then keep only the sample id and the covariates.
  norm_pheno_df <- normalize_covariates(
    pheno_df, covariates,
    exclude = c("genetic_sex", "exome_release_batch")
  )
  norm_pheno_df <- norm_pheno_df %>%
    select(id = IID, all_of(covariates))

  # Attach the phecode columns; only samples present in both tables remain.
  norm_pheno_df <- merge(norm_pheno_df, phecode_df, by = "id")

  # Samples available for this ancestry after the merge.
  pop_samples <- unique(as.character(norm_pheno_df$id))

  for (i in seq_along(genes)) {
    gene <- genes[i]
    mask <- masks[i]

    # Carrier indicator: 1 if the sample is a carrier for this gene/mask.
    gene_samples <- get_samples(gene, mask, gene_burden_dict, pop_samples)
    norm_pheno_df$gene <- as.integer(norm_pheno_df$id %in% gene_samples)

    # Run PheWAS with the carrier indicator as a non-additive genotype.
    res <- phewas(
      data = norm_pheno_df, phenotypes = phenos, genotypes = "gene",
      covariates = covariates,
      min.records = 5, additive.genotypes = FALSE, cores = cores
    )

    # Write the results as gzipped CSV. The local file name depends only on
    # the ancestry, so it is overwritten for every gene; results are kept
    # apart by the gene-specific upload folder below.
    output_file <- paste0("phewas_", anc, "_ukb.csv.gz")
    write.csv(res, gzfile(output_file), row.names = FALSE)

    # Upload the results into the folder of the current gene.
    system(
      paste0('dx upload ', output_file, ' --path /notebooks/bmi/data/downstream/phewas/', gene, '/'),
      intern = TRUE
    )
  }
}


Starting cluster...

Cluster created, finding associations...

Compiling results...

Cleaning up...

Starting cluster...

Cluster created, finding associations...

Compiling results...

Cleaning up...

Starting cluster...

Cluster created, finding associations...

Compiling results...

Cleaning up...

Starting cluster...

Cluster created, finding associations...

Compiling results...

“Not all models converged, check the notes column for details.”
Cleaning up...

Starting cluster...

Cluster created, finding associations...

Compiling results...

Cleaning up...

Starting cluster...

Cluster created, finding associations...

Compiling results...

Cleaning up...

