In [1]:
######## snakemake preamble start (automatically inserted, do not edit) ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character",
        bench_iteration = "numeric",
        scriptdir = "character",
        source = "function"
    )
)
snakemake <- Snakemake(
    input = list(),
    output = list("resources/GWAS/MungedSumStats/done"),
    params = list("resources/GWAS/selected-gwas.yaml", 1e-05, "resources/GWAS/MungedSumStats", "gwas_yaml" = "resources/GWAS/selected-gwas.yaml", "min_p_val" = 1e-05, "out_dir" = "resources/GWAS/MungedSumStats"),
    wildcards = list(),
    threads = 1,
    log = list(),
    resources = list("tmpdir", "tmpdir" = "/scratch/midway3/chaodai/TMP"),
    config = list("Dataset" = list("Geuvadis" = list("Metadata" = "resources/geuvadis-sample-run-pop-lookup.tsv", "Linked_SampleIDs" = "resources/geuvadis-1kgp-common-sample-id.txt", "Linked_1to1_SampleIDs" = "resources/geuvadis-1kgp-common-1to1only-sample-id.txt"), "GTEx" = list("Metadata" = "resources/GTEx-SampleID-Tissue-Lookup.csv", "Junc_meta" = "resources/GTEx/juncs/sampid-smts-smtsd-subjid.tsv")), "junction_list" = list("test" = "resources/leafcutter_junc_files_list.txt", "Geuvadis" = "resources/Geuvadis_juncs.txt"), "intron_class" = c("resources/gencode_v43_plus_v37_productive.intron_by_transcript_BEDlike.txt.gz"), "annotation" = list("gencode" = "/project/yangili1/cdai/genome_index/hs38/gencode.v38.primary_assembly.annotation.dataframe.csv", "gencode_v26_genes" = "/project/yangili1/cdai/genome_index/hs38/gencode.v26.primary_assembly.annotation.genes.bed.gz", "gencode_v38_genes" = "/project/yangili1/cdai/genome_index/hs38/gencode_gene_v38_anno.bed", "gtf" = list("v43" = "/project/yangili1/cdai/annotations/hg38/gencode.v43.primary_assembly.annotation.gtf.gz")), "genome38" = "/project/yangili1/cdai/genome_index/hs38/GRCh38.primary_assembly.genome.fa", "VCF" = list("GTEx" = list("HG38_v7" = "/project/yangili1/cdai/genome_index/hs38/GTEx_v7/GTEx_Analysis_2017-06-05_v8_WGS_VCF_files_GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.SHAPEIT2_phased.vcf.gz", "HG38_v7_indivs" = "/project/yangili1/cdai/genome_index/hs38/GTEx_v7/GTEx_Analysis_2017-06-05_v8_WGS_VCF_Indiv_ids.txt"), "Geuvadis" = list("HG38_1kg_b38" = "/project/yangili1/cdai/SNP/1kg_b38", "HG38_1kg_b38_indivs" = "/project/yangili1/cdai/SNP/1kg_b38/CCDG_14151_B01_GRM_WGS_2020-08-05_individual-ids.txt")), "alignments" = list("LCL" = list("bam" = "/project2/yangili1/ankeetashah/hg38_LCL", "bigwig" = "/project2/yangili1/ankeetashah/hg38_LCL")), "HG38_CHROM_SIZES" = "/project/yangili1/cdai/annotations/hg38/hg38.chrom.sizes", "contrasts" = list("GTEx" = "config/ds_dge_tissue_contrast_levels.txt"), "crossspecies" = list("gtf" = list("Human" = "/project/yangili1/cdai/annotations/hg19/Homo_sapiens.GRCh37.75.gtf.gz", "Mouse" = "/project/yangili1/cdai/annotations/GRCm38/Mus_musculus.GRCm38.77.gtf.gz", "Rat" = "/project/yangili1/cdai/annotations/RGSC5.0/Rattus_norvegicus.Rnor_5.0.77.gtf.gz", "Macaque" = "/project/yangili1/cdai/annotations/mmul1.0/Macaca_mulatta.MMUL_1.77.gtf.gz"), "genome" = list("Human" = "/project/yangili1/cdai/annotations/hg19/hg19.fa.gz", "Mouse" = "/project/yangili1/cdai/annotations/GRCm38/mm10.fa.gz", "Rat" = "/project/yangili1/cdai/annotations/RGSC5.0/rn5.fa.gz", "Macaque" = "/project/yangili1/cdai/annotations/mmul1.0/Macaca_mulatta.MMUL_1.dna_rm.fa.gz"))),
    rule = "MungeGwasSummaryStats",
    bench_iteration = as.numeric(NA),
    scriptdir = "/project/yangili1/cdai/SpliFi/code/workflow/rules/../scripts",
    source = function(...) {
        wd <- getwd()
        setwd(snakemake@scriptdir)
        source(...)
        setwd(wd)
    }
)
setwd("/project/yangili1/cdai/SpliFi/code")
######## snakemake preamble end #########


In [11]:
# start coding here
getwd()
snakemake@params


In [5]:
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
library(glue)


In [6]:
library(furrr)
plan(multisession, workers = min(availableCores(), 6))


Loading required package: future



In [25]:
suppressMessages(library(GenomicRanges))

In [7]:
MinPVal <- snakemake@params$min_p_val %>% as.numeric()
OutDir <- snakemake@params$out_dir


In [8]:
gwas <- yaml::yaml.load_file(snakemake@params$gwas_yaml)


In [9]:
print(gwas)


$GWAS
$GWAS$AD
$GWAS$AD$trait
[1] "Alzheimer's disease"

$GWAS$AD$label
[1] "AD"

$GWAS$AD$path
[1] "/project/yangili1/cdai/SpliFi/code/resources/GWAS/zpmu-GWAS_loci/AD_sumstats_Jansenetal.txt.gz"

$GWAS$AD$isHg38
[1] FALSE


$GWAS$PD
$GWAS$PD$trait
[1] "Parkinson's disease"

$GWAS$PD$label
[1] "PD"

$GWAS$PD$path
[1] "/project/yangili1/cdai/SpliFi/code/resources/GWAS/zpmu-GWAS_loci/pkd_nallsEtAl2019_excluding23andMe_allVariants.tab.gz"

$GWAS$PD$isHg38
[1] FALSE


$GWAS$T2D
$GWAS$T2D$trait
[1] "Type 2 diabetes"

$GWAS$T2D$label
[1] "T2D"

$GWAS$T2D$path
[1] "/project/yangili1/cdai/SpliFi/code/resources/GWAS/zpmu-GWAS_loci/"

$GWAS$T2D$isHg38
[1] FALSE


$GWAS$HT
$GWAS$HT$trait
[1] "Height"

$GWAS$HT$label
[1] "HT"

$GWAS$HT$path
[1] "/project/yangili1/cdai/SpliFi/code/resources/GWAS/zpmu-GWAS_loci/Locke_height_UKBiobank_2018.txt.gz"

$GWAS$HT$isHg38
[1] FALSE





# Set up hg18 - to hg38 liftover


In [20]:
suppressMessages(library(rtracklayer))

chain = import.chain("/project/yangili1/cdai/annotations/liftover/hg19ToHg38.over.chain")

# Munge data

output consistent dataframe with

`CHR, POS, SNP, BETA, SE, P, DOF`


---


## Munge AD, Alzheimer's disease


In [12]:
g <- pluck(gwas, "GWAS", "AD")
g


In [13]:
df <- fread(g$path)


In [14]:
df[1:3]


uniqID.a1a2,CHR,BP,A1,A2,SNP,Z,P,Nsum,Neff,dir,MAF,BETA,SE
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1:715265_T_C,1,715265,T,C,rs12184267,2.121973,0.03384,381761,381761,??+?,0.0408069,0.01227458,0.005784513
1:715367_G_A,1,715367,G,A,rs12184277,1.957915,0.05024,382151,382151,??+?,0.0410687,0.01128522,0.005763896
1:717485_A_C,1,717485,A,C,rs12184279,1.912438,0.05582,382180,382180,??+?,0.0405759,0.01108656,0.005797083


In [15]:
# select snps based on min pval
print(glue("Select snps using min p value: p < {MinPVal}"))


Select snps using min p value: p < 1e-05


In [16]:
df <- df[P < MinPVal]

print(glue("dim after filtering with P value: {dim(df)}"))


dim after filtering with P value: 4968
dim after filtering with P value: 14


In [17]:
outdf <- df[, .(CHR = CHR, POS = BP, SNP = SNP, BETA = BETA, SE = SE, P = P, DOF = Nsum)][order(CHR, POS)]


In [29]:
leadSNPs <- df[, .(CHR = CHR, POS = BP, SNP = SNP)]
leadSNPs <- GRanges(
  seqnames = leadSNPs$CHR,
  ranges = IRanges(start = leadSNPs$POS, end = leadSNPs$POS),
  strand = "*",
  SNP = leadSNPs$SNP
)
seqlevelsStyle(leadSNPs) <- "UCSC"

In [31]:
# use leadSNPs to construct loci, each locus is a 1Mb window centered at the lead SNP
loci <- GenomicRanges::reduce(leadSNPs, min.gapwidth = 1e6)

In [30]:
leadSNPs

GRanges object with 4968 ranges and 1 metadata column:
         seqnames    ranges strand |         SNP
            <Rle> <IRanges>  <Rle> | <character>
     [1]     chr1   6989416      * | rs111677930
     [2]     chr1  31566864      * |  rs11336043
     [3]     chr1  31567151      * |  rs59705505
     [4]     chr1  95840521      * |  rs78722519
     [5]     chr1 161094834      * |  rs10908823
     ...      ...       ...    ... .         ...
  [4964]    chr20  55099470      * |   rs6024927
  [4965]    chr21  20170364      * | rs778350343
  [4966]    chr21  27522908      * |  rs12483003
  [4967]    chr21  27523988      * |    rs455047
  [4968]    chr21  27534261      * |   rs4817090
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths

In [40]:
findOverlaps(loci[2], leadSNPs, minoverlap = 1)

Hits object with 2 hits and 0 metadata columns:
      queryHits subjectHits
      <integer>   <integer>
  [1]         1           2
  [2]         1           3
  -------
  queryLength: 1 / subjectLength: 4968

In [55]:
loci[width(loci) < 2] %>% promoters(5e5, 5e5)
loci[width(loci) < 2] %>% head

GRanges object with 47 ranges and 0 metadata columns:
       seqnames              ranges strand
          <Rle>           <IRanges>  <Rle>
   [1]     chr1     6489416-7489415      *
   [2]     chr1   95340521-96340520      *
   [3]     chr2   71526348-72526347      *
   [4]     chr2 135161678-136161677      *
   [5]     chr2 186250120-187250119      *
   ...      ...                 ...    ...
  [43]    chr17    9675446-10675445      *
  [44]    chr18   55689459-56689458      *
  [45]    chr19     4603567-5603566      *
  [46]    chr19   14198231-15198230      *
  [47]    chr21   19670364-20670363      *
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths

GRanges object with 6 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]     chr1   6989416      *
  [2]     chr1  95840521      *
  [3]     chr2  72026348      *
  [4]     chr2 135661678      *
  [5]     chr2 186750120      *
  [6]     chr2 191095557      *
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths

In [56]:
6989416-6489416
7489415-6989416

In [22]:
head(outdf)
dim(outdf)

CHR,POS,SNP,BETA,SE,P,DOF
<int>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<int>
1,6989416,rs111677930,-0.01397147,0.00303309,4.098005e-06,390467
1,31566864,rs11336043,0.047737,0.0107656,9.24e-06,17477
1,31567151,rs59705505,0.0494909,0.0106984,3.728e-06,17477
1,95840521,rs78722519,0.04814822,0.010740759,7.368436e-06,457840
1,161094834,rs10908823,0.01021586,0.002182186,2.848093e-06,458400
1,161097241,rs12727614,0.01157593,0.002370435,1.042386e-06,458498


In [39]:
OutFile <- glue("{OutDir}/{g$label}.txt")
fwrite(outdf, OutFile, sep = " ")


---


## Munge HT, Height


In [None]:
g <- pluck(gwas, "GWAS", "PD")
g


In [None]:
df <- fread(g$path)


In [None]:
df[1:3]
dim(df)


SNPID,Chr,Position,Pval
<chr>,<int>,<int>,<dbl>
1:155135036,1,155135036,5.022e-30
1:156007988,1,156007988,9.46e-18
1:152192927,1,152192927,6.326e-14


In [None]:
# select snps based on min pval
print(glue("Select snps using min p value: p < {MinPVal}"))


Select snps using min p value: p < 1e-05


In [None]:
df <- df[P < MinPVal]

print(glue("dim after filtering with P value: {dim(df)}"))


dim after filtering with P value: 4968
dim after filtering with P value: 14


In [None]:
outdf <- df[, .(CHR = CHR, POS = BP, SNP = SNP, BETA = BETA, SE = SE, P = P, DOF = Nsum)][order(CHR, POS)]


In [None]:
OutFile <- glue("{OutDir}/{g$label}.txt")
fwrite(outdf, OutFile, sep = " ")


---


In [70]:
geno1 <- fread("results/geno/GTEx/Liver/chr1.pca")
pheno1 <- fread("results/eqtl/GTEx/Liver/qqnorm.sorted.bed.pca")


In [61]:
geno1[1:5, 1:5]
dim(geno1)


SampleID,GTEX-11DXZ,GTEX-11EQ9,GTEX-11GSP,GTEX-11NUK
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
results/geno/GTEx/Liver/chr1_1_1_svd_PC1,454.958,-46.6376,-52.4983,-60.3099
results/geno/GTEx/Liver/chr1_1_1_svd_PC2,-30.2854,-15.6233,15.786,-4.85989
results/geno/GTEx/Liver/chr1_1_1_svd_PC3,-3.58442,5.17735,-89.5607,-6.54335
results/geno/GTEx/Liver/chr1_1_1_svd_PC4,-8.37309,101.819,53.0242,68.913
results/geno/GTEx/Liver/chr1_1_1_svd_PC5,11.1468,49.2165,16.4646,20.6576


In [71]:
pheno1[1:5, 1:5]
dim(pheno1)


SampleID,GTEX-11DXZ,GTEX-11EQ9,GTEX-11GSP,GTEX-11NUK
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
PC1,-6.54117,50.09002,-43.9912,-81.67349
PC2,63.01419,45.85104,-32.92342,-2.26365
PC3,-43.55778,-7.79232,-48.03783,-7.64037
PC4,3.85122,3.03478,11.87543,12.90839
PC5,-7.12123,-0.09161,-26.82369,8.49484


In [68]:
pheno1[1:5, 1:5]
dim(pheno1)


SampleID,GTEX-11DXY,GTEX-11DXZ,GTEX-11EQ9,GTEX-11GSP
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
PC1,63.67026,4.73872,-50.98865,41.82375
PC2,-15.17553,-62.39309,-47.27571,35.455
PC3,34.56147,-42.4615,-8.78532,-52.34034
PC4,-11.20506,6.93315,-1.65863,-1.66468
PC5,20.06355,-4.57093,0.36809,-22.76012


In [65]:
s <- read_lines("results/pheno/noisy/GTEx/Liver/separateNoise/leafcutter_names.txt")


In [66]:
str(s)

 chr [1:208] "GTEX-11DXZ" "GTEX-11EQ9" "GTEX-11GSP" "GTEX-11NUK" ...
