# Eigengene Single-Variant Association Preparation

**Created**: 25 November 2021

## Environment

In [1]:
# BiocManager is the installer for Bioconductor packages; fetch it only when
# its namespace cannot be found.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

# ComplexHeatmap is installed through BiocManager, again only when it is
# missing.
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) {
    BiocManager::install("ComplexHeatmap")
}

In [2]:
# Packages used throughout the notebook: tidyverse verbs and pipes,
# RColorBrewer palettes, ComplexHeatmap for the correlation heatmaps, and
# data.table for fread().
library(tidyverse)
library(RColorBrewer)
library(ComplexHeatmap)
library(data.table)

# Relative paths used below (e.g. "04_Expression/...") are resolved against
# this project root.
setwd("~/eQTL_pQTL_Characterization/")

# Project-specific ggplot theme helpers.
source("04_Expression/scripts/utils/ggplot_theme.R")

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: grid

ComplexHeatmap version 2.6.2
Bioconductor page: http://bioconductor.org/packages

## Load Data

In [3]:
# Clinical demographics, read from the first sheet of the workbook. `sex` and
# `diagnosis` are coerced to numeric and a leading "GA" is stripped from
# SubjectBarCode so that it can be joined against the un-prefixed patient IDs.
demographics <- readxl::read_xls(
    "/nfs/team282/data/gains_team282/ClinicalData/DEMO_12jun2019.xls",
    sheet = 1
) %>%
    dplyr::mutate(
        sex = as.numeric(sex),
        diagnosis = as.numeric(diagnosis),
        SubjectBarCode = gsub("^GA", "", SubjectBarCode)
    )

In [4]:
# SRS predictions per sample. A leading "GA" is stripped from Sample_id and
# only rows whose Assay is "RNA-seq" are kept.
# `header = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
srs.info <- read.table(
    "/nfs/team282/data/gains_team282/full-gains-SRS-predictions_mNN-RF.tsv",
    header = TRUE
) %>%
    dplyr::mutate(Sample_id = gsub("^GA", "", Sample_id)) %>%
    dplyr::filter(Assay == "RNA-seq")

# Index the table by sample ID so it can later be merged on row names.
rownames(srs.info) <- srs.info$Sample_id

In [6]:
# Covariates and PEER factors, one row per sample (sample IDs are the row
# names). Sample.ID is the row name without a leading "GA"; GAinS.ID is the
# patient part of the sample ID, i.e. everything before the first underscore
# (sample IDs look like "UK01050129_5").
# The previous pattern "\\_." removed only a single character after the
# underscore, which would leave stray characters behind for any suffix longer
# than one character; "_.*$" drops the whole suffix.
covs <- read.table("~/gains_team282/eqtl/data/covs_and_peer_factors.txt") %>%
    dplyr::mutate(Sample.ID = gsub("^GA", "", rownames(.))) %>%
    dplyr::mutate(GAinS.ID = sub("_.*$", "", Sample.ID))

In [8]:
# Covariates that are inspected separately from the PEER factors.
held.out <- c(
    "Neutrophils", "Lymphocytes", "Monocytes",
    paste0("PC", 1:7),
    "Diagnosis", "SRSq", "sex"
)
# Names of the 30 PEER factor columns.
peer <- paste0("PEER_", 1:30)

# Attach demographics (by patient) and SRS predictions (by sample; srs.info is
# matched on its row names).
covs <- merge(covs, demographics, by.x = "GAinS.ID", by.y = "SubjectBarCode")
covs <- merge(covs, srs.info, by.x = "Sample.ID", by.y = 0)

# NOTE(review): `diagnosis = diagnosis` renames the column to itself, so this
# step only moves it to the front, while `held.out` asks for "Diagnosis"
# (capitalised). any_of() silently skips names that are absent -- confirm
# which column is intended.
covs <- covs %>%
    dplyr::select(diagnosis = diagnosis, everything()) %>%
    dplyr::select(Sample.ID, any_of(held.out), any_of(peer)) %>%
    as.data.frame()

# Use the sample ID as the row name and drop it from the columns so that only
# numeric covariates remain.
rownames(covs) <- covs$Sample.ID
covs$Sample.ID <- NULL

head(covs)

Unnamed: 0_level_0,Neutrophils,Lymphocytes,Monocytes,PC1,PC2,PC3,PC4,PC5,PC6,PC7,⋯,PEER_21,PEER_22,PEER_23,PEER_24,PEER_25,PEER_26,PEER_27,PEER_28,PEER_29,PEER_30
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
UK01050129_5,-1.7381318,1.8281826,1.2829455,0.177279994,0.051694602,0.0248603,-0.035102502,-0.00455149,-0.0603707,0.0396488,⋯,-0.001172263,-0.05533835,0.049742281,0.03725176,0.04495762,0.12696229,0.08175036,0.05295999,0.164950684,-0.10379386
UK01060123_3,-0.384537,0.3878375,0.4548023,-0.000388133,-0.000674099,0.00810357,0.000212791,0.0339615,0.0266317,-0.0188528,⋯,-0.012723563,0.02226736,-0.027520355,-0.05782387,0.03653939,0.05959492,-0.02611259,-0.01790827,0.012047987,0.05554044
UK01070117_3,-0.7738039,0.5846032,0.9325467,0.0164323,-0.048464298,0.00813966,-0.0476996,-0.0792683,0.0263629,-0.0822474,⋯,0.040162142,0.06840651,6.2529e-05,0.05292596,0.01705806,0.05931079,0.0896273,0.03772458,0.008254956,-0.01570964
UK01080111_1,0.1305646,0.2888638,-0.6232047,0.161789,0.036224801,0.00467903,-0.048611499,0.00180036,-0.0352357,0.0501251,⋯,-0.002928361,-0.03274319,0.088269033,0.07396927,-0.01906336,0.02454052,0.04243371,0.03055887,0.12338376,-0.02276623
UK01110093_1,1.0909779,-1.248816,-0.4429439,-0.00629672,0.00282151,0.00699239,0.00435165,0.00268802,0.0166922,-0.00733983,⋯,-0.108193189,0.01590979,-0.093242317,-0.03037383,-0.07385907,-0.05968965,0.0969543,-0.03975484,0.130198911,-0.05011296
UK01110093_5,0.2302416,0.0,-0.3306479,-0.00629672,0.00282151,0.00699239,0.00435165,0.00268802,0.0166922,-0.00733983,⋯,-0.018054215,0.02094322,-0.008623131,0.09560122,-0.07802416,0.02365689,0.08040638,-0.02349218,0.189585552,-0.07635343


In [9]:
# Module eigengenes: the first CSV column becomes the row names (sample IDs),
# and the remaining columns are the ME_* eigengene values.
eigengenes <- read.csv("~/gains_team282/nikhil/expression/gene_expression/eigengenes.csv", row.names=1)

head(eigengenes)

Unnamed: 0_level_0,ME_1,ME_2,ME_3,ME_4,ME_5,ME_6,ME_7,ME_8,ME_9,ME_10,⋯,ME_97,ME_98,ME_99,ME_100,ME_101,ME_102,ME_103,ME_104,ME_105,ME_106
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
UK02270173_3,0.008640296,0.007926439,0.0147219682,-0.011774847,0.029042358,-0.006588595,-0.004034583,-0.013647582,0.020445938,0.03657146,⋯,0.105461618,0.014671996,-0.0318624,-0.019044546,-0.001067989,0.03791366,0.008340969,-0.002505741,0.033258706,-0.003317747
UK15130120_3,-0.037073495,-0.03317947,-0.0311931107,-0.006189163,0.039065099,-0.011809051,0.037283445,-0.060021257,-0.036280957,0.03133065,⋯,-0.006444524,-0.019642856,0.01880373,-0.044684726,-0.043663066,-0.0255029,0.031624201,-0.011931533,0.051184001,-0.012327128
UK58000006_3,0.029670524,0.02149011,0.0369543066,-0.015010027,-0.00474517,0.02313339,-0.027585817,0.010804862,0.044649369,-0.01014143,⋯,0.085257258,-0.006499496,-0.0158215,0.030709065,0.040952839,0.04332857,-0.003376643,0.012633897,-0.00662284,-0.004740631
UK47010004_3,0.011954974,0.007360515,0.0073725862,-0.011608271,-0.011341294,-0.025127169,-0.009500677,0.013387153,0.007126337,-0.02607067,⋯,0.075920669,-0.016223383,0.02515734,-0.008023902,0.018023055,-0.01461801,0.010948287,0.023381207,-0.003072117,0.012014706
UK42020088_5,0.009818957,0.008322535,0.0004121376,-0.008882007,-0.028000468,0.002082996,-0.00953291,0.0114778,-0.001521594,-0.02223628,⋯,0.035896079,0.00927405,0.03406072,-0.005459722,0.037120618,-0.01089588,-0.008355454,-0.034402696,-0.032697222,0.007915598
UK47490007_3,-0.021930377,-0.024756322,-0.0294342468,-0.007742353,-0.005486842,-0.049842615,0.020825112,0.007441717,-0.030615907,-0.01853294,⋯,0.092538453,0.067616309,0.02790283,-0.036887001,-0.017742254,0.01562787,0.026717734,-0.009877409,-0.00360727,0.024871961


In [10]:
# Gene-to-module assignments (columns Gene and Module).
modules <- read.csv("~/gains_team282/nikhil/expression/gene_expression/modules.csv")

In [11]:
# Preview the first module assignments.
head(modules)

Unnamed: 0_level_0,Gene,Module
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000001167,Module_1
2,ENSG00000002330,Module_1
3,ENSG00000002822,Module_1
4,ENSG00000005175,Module_1
5,ENSG00000005194,Module_1
6,ENSG00000005893,Module_1


In [14]:
# Gene expression table with genes in rows and samples in columns.
gene.exp <- read.table(
    "/lustre/scratch119/humgen/projects/gains_team282/eqtl/data/logcpm_864_20412_hla.txt"
)

# Strip a leading "GA" from the sample (column) names so they match the IDs
# used by the other tables.
names(gene.exp) <- sub("^GA", "", names(gene.exp))

In [15]:
# Preview the first genes of the expression table.
head(gene.exp)

Unnamed: 0_level_0,UK02270173_3,UK15130120_3,UK58000006_3,UK47010004_3,UK42020088_5,UK47490007_3,UK02770164_3,UK02770164_5,UK02630151_3,UK42150107_1,⋯,UK59070043_3,UK59070043_5,UK02510223_3,UK02XX0336_5,UK29090086_3,UK02XX0335_1,UK02XX0334_3,UK01210130_3,UK01210130_5,UK01380125_1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000230521,0.6644339,0.58271604,0.4046756,0.20905157,0.2211571,0.4288519,0.5133516,0.57768877,0.6990766,0.3812037,⋯,0.3823184,0.5423684,0.8676263,0.73833461,0.6461583,0.1227063,0.9621386,0.65838459,1.03561474,0.4457301
ENSG00000225864,0.9817872,0.43879936,0.6022344,0.92340489,0.2211571,1.1450686,0.1921362,0.57768877,0.1974851,1.2290495,⋯,0.6842527,0.8539227,0.91886777,0.1258162,0.5511865,0.4384007,0.5465139,0.40803066,0.46899151,0.3544115
ENSG00000227766,1.2117156,0.30682669,0.662501,0.79963286,0.6445686,0.313029,0.1183231,0.1675045,0.15059,1.0533199,⋯,0.4901384,0.4340781,0.37504164,0.04316943,0.6151905,0.3406488,0.3147798,0.43511377,0.17382742,0.1157514
ENSG00000237669,0.6420763,0.6925683,0.9792012,0.69929584,0.3765156,1.0756817,0.891205,0.60129641,0.8534668,1.0533199,⋯,1.0790156,0.6102946,1.03953331,0.97965846,0.7061683,0.7489857,0.7919934,0.85168813,0.69120049,1.0798771
ENSG00000271581,2.4744895,1.3315314,1.8310464,2.47531701,1.8812042,2.0531366,0.5972544,1.00686213,1.2811361,2.4244938,⋯,2.4728831,1.5338898,1.59080138,0.62923925,1.2804253,1.4727511,1.218765,1.46471386,1.44139443,1.189625
ENSG00000285647,0.0,0.03379072,0.0,0.05517067,0.674827,0.0,0.0,0.03509293,0.6990766,1.6556266,⋯,3.4117627,2.4012963,0.04682116,0.0,0.0,0.4695662,0.3762952,0.07078527,0.07206309,0.4159282


In [16]:
# eigenMT-corrected cis-eQTL results; keep only the rows whose logical `Sig`
# column is TRUE.
lead.cis.eqtl <- read.table("/nfs/users/nfs_n/nm18/gains_team282/eqtl/cisresults/eigenMT/ciseqtl_eigenMT_corrected.txt") %>%
    dplyr::filter(Sig)

In [17]:
# Preview the significant lead cis-eQTL associations.
head(lead.cis.eqtl)

Unnamed: 0_level_0,snps,gene,statistic,pvalue,beta,se,chr,SNPpos,TSS,BF,TESTS,BF.FDR,Sig,threshold
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<lgl>,<dbl>
22,rs3131972,ENSG00000237491,5.643405,2.400875e-08,0.07174216,0.01271257,1,817341,778747,7.058573e-06,294,1.993414e-05,True,9.170798e-05
23,rs3131972,ENSG00000230092,4.756587,2.329109e-06,0.06239499,0.0131176,1,817341,817712,0.0006917454,297,0.001593483,True,9.078164e-05
25,rs3131972,ENSG00000225880,6.292961,5.949542e-10,0.1001677,0.01591742,1,817341,827522,1.767014e-07,297,5.668498e-07,True,9.078164e-05
28,rs2272757,ENSG00000188976,-7.026045,4.904859e-12,-0.05107767,0.007269761,1,946247,959309,1.545031e-09,315,5.762103e-09,True,8.559411e-05
29,rs13303327,ENSG00000187961,-7.175012,1.845538e-12,-0.1351992,0.01884306,1,960326,960584,5.813445e-10,315,2.228468e-09,True,8.559411e-05
30,rs13303056,ENSG00000187583,-9.081031,1.922802e-18,-0.2963377,0.03263261,1,953778,966482,6.056826e-16,315,3.262796e-15,True,8.559411e-05


In [18]:
# Results of the conditional cis-eQTL analysis (columns SNP, Gene, eQTL_beta,
# eQTL_SE, pvalue and Number).
conditional.cis.eqtl <- readRDS("/nfs/users/nfs_n/nm18/gains_team282/eqtl/cisresults/conditionalanalysis/conditional_eQTL_results_final.rds")

In [19]:
# Preview the conditional cis-eQTL results.
head(conditional.cis.eqtl)

Unnamed: 0_level_0,SNP,Gene,eQTL_beta,eQTL_SE,pvalue,Number
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,rs10753794,ENSG00000000457,0.0498587312408011,0.0070851585157937,5.19259471106013e-12,1
2,rs10919255,ENSG00000000460,-0.0726264894633498,0.0136853304567293,1.72815915933528e-07,1
3,rs77006036,ENSG00000000460,0.166236949102582,0.0334472538699365,8.47290238610159e-07,2
4,rs12406047,ENSG00000000971,0.17135374034767,0.0379807691167313,7.51523462853891e-06,1
5,rs6696136,ENSG00000001460,0.45884434698694,0.0871081626842823,1.94887370606405e-07,1
6,rs6676449,ENSG00000001460,-0.371531778822554,0.0880672792106829,2.8471651589813e-05,2


In [20]:
# GWAS Catalog study table. quote="" switches off quote handling in fread();
# the result is converted to a plain data.frame.
ebi.studies <- as.data.frame(
    fread(
        "04_Expression/data/gwas_catalog_v1.0.2-studies_r2022-02-21.tsv",
        header = TRUE,
        quote = ""
    )
)

In [21]:
# Show a single study record to inspect the available columns.
head(ebi.studies, n=1)

Unnamed: 0_level_0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,PLATFORM [SNPS PASSING QC],ASSOCIATION COUNT,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
Unnamed: 0_level_1,<date>,<int>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>
1,2021-08-24,34124712,Sherva R,2021-02-28,Explor Med,www.ncbi.nlm.nih.gov/pubmed/34124712,Genome-wide association study of phenotypes measuring progression from first cocaine or opioid use to dependence reveals novel risk genes.,Cocaine dependence (time to event),"3,554 African American cases, 478 African American controls, 2,712 European ancestry cases, 915 European ancestry controls","572 African American cases, 416 African American controls, 759 European ancestry cases, 1,620 European ancestry controls",Illumina [NR] (imputed),5,cocaine dependence,http://www.ebi.ac.uk/efo/EFO_0002610,GCST012225,Genome-wide genotyping array


In [22]:
# GWAS Catalog association table. quote="" switches off quote handling in
# fread(); the result is converted to a plain data.frame.
ebi.assoc <- as.data.frame(
    fread(
        "04_Expression/data/gwas_catalog_v1.0.2-associations_e105_r2022-02-21.tsv",
        header = TRUE,
        quote = ""
    )
)

In [23]:
# Show a single association record to inspect the available columns.
head(ebi.assoc, n=1)

Unnamed: 0_level_0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,⋯,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
Unnamed: 0_level_1,<date>,<int>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,2021-07-07,34127860,Robertson CC,2021-06-14,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/34127860,"Fine-mapping, trans-ancestral and genomic analyses identify causal variants, cells, genes and drug targets for type 1 diabetes.",Type 1 diabetes,"20,065 European ancestry cases, 33,065 European ancestry controls, 1,045 other-admixed ancestry cases, 1,103 other-admixed ancestry controls, 1,043 African ancestry cases, 3,206 African ancestry controls",,⋯,8.522879,,0.912,,Illumina [715631] (imputed),N,type 1 diabetes mellitus,http://purl.obolibrary.org/obo/MONDO_0005147,GCST90013445,Targeted genotyping array [ImmunoChip]


In [24]:
# Variant table (.bim) of the merged genotyping data, as a plain data.frame.
geno.bim <- as.data.frame(
    fread("~/gains_team282/Genotyping/All_genotyping_merged_filtered_b38_refiltered_rsID.bim")
)

# The file is read without meaningful column names; label the six columns.
names(geno.bim) <- c("chr", "snps", "cM", "pos", "minor", "major")

In [25]:
# Preview the first genotyped variants.
head(geno.bim)

Unnamed: 0_level_0,chr,snps,cM,pos,minor,major
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<chr>,<chr>
1,1,rs3131972,0,817341,A,G
2,1,rs546843995,0,818053,0,G
3,1,rs553916047,0,818359,0,A
4,1,1:818740_T_C,0,818740,T,C
5,1,rs145604921,0,819378,0,C
6,1,rs535256652,0,821053,0,T


## Heatmap

Before performing single-variant association analysis, we want to check if any covariates are strongly associated with the module eigengenes.

In [26]:
# Join covariates and eigengenes on their row names (sample IDs). merge()
# returns the key as a `Row.names` column, which is dropped so that only
# numeric variables remain.
all.vars <- merge(covs, eigengenes, by = "row.names")
all.vars$Row.names <- NULL

In [27]:
# 101-step palette interpolated from the 11-class RdBu Brewer scheme.
colors <- colorRampPalette(brewer.pal(11, "RdBu"))(101)

# Correlations between every pair of variables; only the covariate-by-eigengene
# sub-blocks are plotted (PEER factors on top, held-out covariates below).
cor.mtx <- cor(all.vars)
cor.mtx.peer <- cor.mtx[peer, colnames(eigengenes)]
cor.mtx.held.out <- cor.mtx[held.out, colnames(eigengenes)]

# NOTE(review): `col` receives a bare colour vector rather than an explicit
# value-to-colour mapping, so each heatmap presumably scales the palette to its
# own data range instead of [-1, 1] -- confirm against the ComplexHeatmap
# documentation (a colorRamp2 mapping would pin the scale).
h1 <- Heatmap(cor.mtx.peer, col=colors, heatmap_legend_param=list(
    title="Correlation",
    at=c(-1, 0, 1)
))

# Rows keep the order of `held.out` (no row clustering). `FALSE` is spelled
# out because `F` is an ordinary, reassignable variable.
h2 <- Heatmap(cor.mtx.held.out, col=colors, cluster_rows=FALSE, heatmap_legend_param=list(
    title="Correlation",
    at=c(-1, 0, 1)
))

# Draw explicitly instead of relying on auto-printing of the heatmap list, so
# the figure is rendered into the SVG device however this code is evaluated.
svg("04_Expression/results/eigengene_peer_factor_correlation.svg", width=12, height=8)
draw(h1 %v% h2)
dev.off()

The PEER factors are built using gene expression data. It's not surprising that many of the eigengenes are associated with the PEER factors. WGCNA discarded around 8000 genes. The residual gene expression variation from these genes is likely captured by some of the PEER factors. We model this specifically using 30 gene expression PCs from the discarded genes.

The eigengenes are not associated with genotyping PCs (which is good).

![](../results/eigengene_peer_factor_correlation.svg)

Ideally, I will include only sex and genotyping PCs as covariates in this mapping, since everything else (cell counts, SRS group) is an important biological signature that may be correlated with the module eigengenes. I will include the 30 PCs from the unassigned gene expression data to control for technical confounders.

## Save Covariates

Save list of patients (family ID in the first column and individual ID in the second column) to subset the genotyping data using PLINK. There are 638 unique patients with genotyping information.

In [28]:
# Patient ID of each eigengene sample: the part of the sample ID before the
# first underscore. vapply() guarantees a character vector (sapply() would
# return an empty list for empty input).
eigengene.patients <- vapply(
    strsplit(rownames(eigengenes), "_"),
    function(x) { x[1] },
    character(1)
)

# Family and individual IDs (first two .fam columns) of the genotyped
# individuals that also have eigengene data. GAinS.ID is the individual ID
# without a leading "GA", used for matching against the eigengene patients.
geno.fam <- fread("~/gains_team282/Genotyping/All_genotyping_merged_filtered_b38_refiltered_rsID.fam") %>%
    dplyr::select(Family.ID=1, Individual.ID=2) %>%
    dplyr::mutate(GAinS.ID=gsub("^GA", "", Individual.ID)) %>%
    dplyr::filter(GAinS.ID %in% eigengene.patients) %>%
    unique()

In [29]:
# Rows = genotyped individuals that also have eigengene data.
dim(geno.fam)

In [30]:
# Two-column, tab-separated, header-less list (family ID, individual ID) of the
# patients to keep. `FALSE` is spelled out because `F` is reassignable.
write.table(
    geno.fam %>% dplyr::select(Family.ID, Individual.ID),
    "~/gains_team282/nikhil/expression/eigengene_sva/mapping_patients.txt",
    row.names = FALSE,
    quote = FALSE,
    col.names = FALSE,
    sep = "\t"
)

Save the list of module eigengenes as a text file. This will be used by NextFlow to parallelize the association mapping.

In [31]:
# One eigengene name per line, without header or quoting. `FALSE` is spelled
# out because `F` is reassignable.
write.table(
    colnames(eigengenes),
    "~/gains_team282/nikhil/expression/eigengene_sva/mapping_eigengenes.txt",
    row.names = FALSE,
    quote = FALSE,
    col.names = FALSE,
    sep = "\t"
)

### Initial Pass

I will be using a linear mixed model (LMM) to test for association between genotypes and eigengene expression. In the past, the lab has used a likelihood test (F-Test) to compare a null model of the covariates against an alternative model where the genotype is included. I will be building the following model:

1. Let $\mathbf{E}_i\in\mathbb{R}^n$ be a vector representing the values of the $i$-th eigengene.
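2. Let $\mathbf{Y}\in\mathbb{R}^{n\times c}$ be a matrix of covariates. These covariates include Sex, 16 PEER factors, and 7 Genotyping PCs. Note that the design matrix saved below carries the first 20 PEER factors together with the Neutrophils, Lymphocytes, and Monocytes covariates.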
3. Let $\mathbf{Z}\in\mathbb{R}^n$ be a vector representing the random effects. The only random effect in this model is the Patient ID.
4. Let $\mathbf{X}\in\mathbb{R}^n$ be a vector representing the genotypes of the patients.
5. Let $\beta\in\mathbb{R}$ be a scalar value representing the genotypic effect on eigengene expression.
6. Let $\mathbf{\alpha}\in\mathbb{R}^c$ be a vector of covariate effects on eigengene expression.
7. Let $\gamma\in\mathbb{R}$ be a scalar value representing the random effect on eigengene expression.

The null model is:

$$\mathbf{E}_i \sim \mathbf{Y}\alpha + \mathbf{Z}\gamma$$

The alternative model is:

$$\mathbf{E}_i \sim \mathbf{X}\beta + \mathbf{Y}\alpha + \mathbf{Z}\gamma$$

There are 823 samples from the RNA-Seq data that have genotypes as well.

In [32]:
# Eigengene samples whose patient ID (the part before the first underscore) is
# among the genotyped individuals. vapply() guarantees a logical mask; sapply()
# would return an empty list for empty input, which cannot be used as an index.
samples.with.genotypes <- rownames(eigengenes)[
    vapply(
        strsplit(rownames(eigengenes), "_"),
        function(x) { x[1] %in% geno.fam$GAinS.ID },
        logical(1)
    )
]

length(samples.with.genotypes)

In [33]:
# First 20 PEER factors are carried into the design matrix.
peer.included <- peer[seq_len(20)]
peer.included

# Fixed-effect covariates carried into the design matrix.
cov.names <- c(
    "sex",
    paste0("PC", seq_len(7)),
    "Neutrophils", "Lymphocytes", "Monocytes"
)
cov.names

# Grouping variable used as the random effect.
rand.effect.names <- "GAinS.ID"
rand.effect.names

In [34]:
# Design matrix for the mixed model: one row per genotyped sample, holding the
# sample ID, the patient ID as recorded in the .fam file (random effect), all
# eigengenes, the selected covariates, and the selected PEER factors.
# The patient part of the sample ID is extracted with vapply() so the result is
# always a character vector (sapply() is not type-stable).
mapping.data <- merge(eigengenes, covs, by=0) %>%
    dplyr::filter(Row.names %in% samples.with.genotypes) %>%
    dplyr::select(Sample.ID=Row.names, everything()) %>%
    dplyr::mutate(
        GAinS.ID.NonPrefix=vapply(
            strsplit(Sample.ID, "_"),
            function(x) { x[1] },
            character(1)
        )
    ) %>%
    # Match on the un-prefixed patient ID, then expose the .fam individual ID
    # under the name GAinS.ID.
    merge(., geno.fam, by.x="GAinS.ID.NonPrefix", by.y="GAinS.ID") %>%
    dplyr::select(GAinS.ID=Individual.ID, everything()) %>%
    dplyr::select(Sample.ID, any_of(rand.effect.names), any_of(colnames(eigengenes)), any_of(cov.names), any_of(peer.included))

In [35]:
# Preview the design matrix.
head(mapping.data)

Unnamed: 0_level_0,Sample.ID,GAinS.ID,ME_1,ME_2,ME_3,ME_4,ME_5,ME_6,ME_7,ME_8,⋯,PEER_11,PEER_12,PEER_13,PEER_14,PEER_15,PEER_16,PEER_17,PEER_18,PEER_19,PEER_20
Unnamed: 0_level_1,<I<chr>>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,UK01050129_5,UK01050129,-0.011965012,-0.00813517,0.02189999,-0.01050068,0.0739432434,0.0443109416,0.014796688,-0.06393004,⋯,0.047811605,-0.016539395,0.05109594,-0.026148196,-0.021135228,0.002443923,0.032696567,0.035078023,0.03328779,-0.072080098
2,UK01060123_3,UK01060123,-0.002691514,-0.003642363,0.00948328,-0.01205738,0.0369338683,-0.0251717916,0.004446174,-0.033855082,⋯,-0.007224238,0.016796881,-0.010660484,-0.017610908,0.031190908,0.005303205,0.040227711,0.023455137,-0.06972641,-0.01642975
3,UK01070117_3,UK01070117,-0.027671656,-0.018676902,-0.02153764,0.02309788,0.0213639304,-0.0033192787,0.027935528,-0.043763863,⋯,0.002017805,0.013472254,0.044990435,0.017342746,0.040239949,-0.037767619,-0.002149181,0.010321584,-0.03437537,-0.008777559
4,UK01080111_1,UK01080111,0.021899313,0.011468622,0.02719409,-0.01318245,0.0004065718,-0.0360484365,-0.022309759,-0.001370201,⋯,-0.063317396,-0.072918773,-0.010724876,-0.016138442,-0.003775158,-0.032603394,-0.068149783,-0.007441163,-0.04162392,-0.056151949
5,UK01110093_1,UK01110093,-0.053133706,-0.054170247,-0.09207201,-0.00263561,-0.0545464726,-0.0527906474,0.047146445,0.024393979,⋯,-0.009035902,-0.001971741,-0.003907994,-0.001863737,-0.031021271,-0.001989635,0.010791964,-0.013796184,0.09261347,-0.072555415
6,UK01110093_5,UK01110093,-0.097745041,-0.09362011,-0.11078002,0.02619148,-0.0161876898,0.0004345917,0.08719428,-0.028478758,⋯,0.076265849,-0.071181759,0.045724034,-0.083036296,0.009541817,-0.043817975,0.079534695,0.008144007,0.09541912,-0.201759934


In [36]:
# List every column that ended up in the design matrix.
colnames(mapping.data)

Save the mapping data design matrix for linear mixed modeling.

In [37]:
# Persist the design matrix without row names. `FALSE` is spelled out because
# `F` is reassignable.
write.csv(
    mapping.data,
    "~/gains_team282/nikhil/expression/eigengene_sva/mapping_data.csv",
    row.names = FALSE
)

### Identify SNPs for mQTL Mapping

Since genome-wide *trans* mapping requires a very large number of tests (and therefore carries a high multiple-testing burden), we restrict the analysis to SNPs for which we have a stronger biological hypothesis. Specifically, I will include all lead cis-eSNPs, all lead eSNPs from the conditional cis-eQTL analysis, and genome-wide significant associations from the EBI GWAS Catalog.

In [38]:
# Number of distinct DISEASE/TRAIT labels across the catalog studies.
length(unique(ebi.studies[,"DISEASE/TRAIT"]))

After filtering for genome-wide significance and for overlap with the autosomal SNPs genotyped in GAinS, we have 124,524 associations and 56,218 unique SNPs.

In [39]:
# Accessions of all catalog studies (no trait or term filter is applied here).
studies <- ebi.studies[["STUDY ACCESSION"]]

# Catalog associations that
#   1. belong to one of those studies,
#   2. are genome-wide significant (P < 5e-8), and
#   3. involve a SNP genotyped in GAinS on an autosome (chromosomes 1-22), so
#      SNPs on the X chromosome are ignored.
studies.assoc <- ebi.assoc %>%
    dplyr::filter(`STUDY ACCESSION` %in% studies) %>%
    dplyr::mutate(`P-VALUE` = as.numeric(`P-VALUE`)) %>%
    dplyr::filter(`P-VALUE` < 5e-8) %>%
    dplyr::filter(SNPS %in% geno.bim$snps[geno.bim$chr %in% as.character(1:22)])

# Number of retained associations.
nrow(studies.assoc)

# Number of distinct SNPs among them.
length(unique(studies.assoc$SNPS))

In [40]:
# Candidate SNPs for module-QTL mapping, stacked from three sources. Every
# source is reduced to the same five columns; fields that do not apply to a
# source are filled with typed NAs so the column types agree when stacking.
mqtl.snp.table <- dplyr::bind_rows(
    # Significant lead cis-eQTL SNPs.
    lead.cis.eqtl %>%
        dplyr::mutate(source="Lead cis-eQTL SNP") %>%
        dplyr::mutate(conditional_number=NA_integer_, accession=NA_character_) %>%
        dplyr::select(snps, source, egene=gene, conditional_number, accession),
    # SNPs from the conditional cis-eQTL analysis; `Number` is kept as
    # conditional_number.
    conditional.cis.eqtl %>%
        dplyr::mutate(source="Conditional cis-eQTL SNP") %>%
        dplyr::mutate(accession=NA_character_) %>%
        dplyr::select(snps=SNP, source, egene=Gene, conditional_number=Number, accession),
    # GWAS Catalog associations. The study accession is selected by column name
    # rather than by position (previously column 37), so the selection does not
    # break if the catalog's column layout changes.
    studies.assoc %>%
        dplyr::mutate(source="EBI GWAS Catalog") %>%
        dplyr::mutate(egene=NA_character_, conditional_number=NA_integer_) %>%
        dplyr::select(snps=SNPS, source, egene, conditional_number, accession=`STUDY ACCESSION`)
)

In [41]:
# Persist the candidate SNP table without quoting or row names. `FALSE` is
# spelled out because `F` is reassignable.
write.csv(
    mqtl.snp.table,
    "~/gains_team282/nikhil/expression/eigengene_sva/mqtl_snp_table.csv",
    quote = FALSE,
    row.names = FALSE
)

In [42]:
# Number of distinct candidate SNPs across all three sources.
length(unique(mqtl.snp.table$snps))