# HDL in MVP

## Reference panel

`download_1000G()` in `bigsnpr`

The panel includes 503 (mostly unrelated) European individuals and ~1.7M SNPs in common with either HapMap3 or the UK Biobank. The classification of European populations can be found at [IGSR](https://www.internationalgenome.org/category/population/). The IDs of the European individuals are taken from the [IGSR data portal](https://www.internationalgenome.org/data-portal/sample).

## Base data: summary Statistics from MVP

Posterior betas for traits HDL.

## Target data: UK biobank

Covariates, HDL-related phenotypes, and genotypes of 2000 individuals (listed in `UKB.QC.fam`).

## Model

The auto model runs the algorithm for 30 different values of $p$ (the proportion of causal variants) ranging from $10^{-4}$ to 0.9, using the heritability $h^2$ estimated by LD score regression as the initial value.

The grid model tries a grid of parameters: $p$ ranging from 0 to 1, and three values of $h^2$ equal to 0.7, 1, and 1.4 times the initial $h^2$ estimated by LD score regression.

## Test genotype data preparation

Use `awk` to select columns from the phenotype file `UKB.cov` and save them to the trait file `UKB.hdl.cov` and the covariate file `UKB.ind.cov`.


In [1]:
suppressMessages(library(tidyverse))

# Read the UK Biobank PLINK .fam file and the phenotype/covariate table.
fam_UKB <- read.table("ukbiobank/UKB.fam", header = FALSE, stringsAsFactors = FALSE)
colnames(fam_UKB) <- c("FID", "IID", "paternal.ID", "maternal.ID", "sex", "affection")
covariates <- read.table("ukbiobank/UKBCauc_cholesterolandbloodpressurefields_inverseranknorm_covariatesage_sex_alcohol_smokingpackyears_foranalysis", header = TRUE, stringsAsFactors = FALSE)

# Draw a reproducible random subset of 2000 individuals among the rows
# without missing values, then sort the subset by its first column.
set.seed(2021)
covariate <- covariates %>%
  drop_na() %>%
  filter(FID %in% sample(FID, 2000))

covariate <- covariate[order(covariate[, 1]), ]

# Keep only the sampled individuals in the .fam table.
fam_UKB <- fam_UKB %>% filter(FID %in% covariate$FID)

# Put the .fam rows in the same order as the covariate rows. The previous
# code indexed fam_UKB with order(covariate[, 1]), i.e. the permutation of a
# different (already sorted) table, which left fam_UKB unsorted and produced
# NA or dropped rows whenever the two tables had different row counts.
fam_UKB <- fam_UKB[order(match(fam_UKB$FID, covariate$FID)), ]

# Surface sampled individuals that have no genotype record instead of
# silently writing files with different numbers of rows.
missing_ids <- setdiff(covariate$FID, fam_UKB$FID)
if (length(missing_ids) > 0) {
  warning(length(missing_ids),
          " sampled individual(s) in the covariate table are absent from UKB.fam",
          call. = FALSE)
}

# NOTE(review): `plink --keep` is expected to retain samples in the order of
# the original .bed/.fam rather than the order of the keep file -- confirm
# that the rows of UKB.cov line up with the subsetted genotypes downstream.
write.table(covariate, file = "ukbiobank/UKB.cov", sep = " ",
            row.names = FALSE, col.names = TRUE)
# col.names = FALSE already suppresses the header, so the column names do not
# need to be stripped from the data frame beforehand.
write.table(fam_UKB, file = "ukbiobank/UKB.QC.fam", sep = " ",
            row.names = FALSE, col.names = FALSE)

In [2]:
# Split UKB.cov into the covariate file (columns 3 and 4) and the trait file
# (column 14). Explicit paths replace the `cd ukbiobank` / `cd ..` pair so a
# failed `cd` can no longer leave the session in the wrong working directory.
# NOTE(review): columns 3-4 are presumably AGE and SEX and column 14 the
# inverse-rank-normalised HDL value -- confirm against the UKB.cov header.
awk '{print $3, $4}' ukbiobank/UKB.cov > ukbiobank/UKB.ind.cov
awk '{print $14}' ukbiobank/UKB.cov > ukbiobank/UKB.hdl.cov

[?2004h[?2004l[?2004l[?2004l

: 1

## Summary statistics data preparation

In [None]:
library(tidyverse)

#' Split merged lipid summary statistics into one RDS file per trait
#'
#' Joins `sums` with the variant annotation in `chrpos` on `chr` and `pos`.
#' Then, for each of the traits hdl, ldl, tc and tg, keeps the variant columns
#' plus that trait's effect size (renamed `beta`) and standard error (renamed
#' `beta_se`), adds a constant `n_eff` column and saves the table with
#' `saveRDS()`.
#'
#' @param sums Data frame with `chr`, `pos` and per-trait columns. With
#'   `betaty = "raw"` it must contain `<trait>_z` and `<trait>_se`; the effect
#'   size is computed as `<trait>_z * <trait>_se` and the `_z` columns are
#'   dropped. With `betaty = "pos"` it must already contain `<trait>` and
#'   `<trait>_se`.
#' @param chrpos Data frame with `chr`, `pos`, `rsid`, `a0` and `a1`.
#' @param betaty Either "pos" (files are named `pos_sumstats_<trait>.rds`) or
#'   "raw" (files are named `sumstats_<trait>.rds`). Defaults to "pos".
#' @param n_eff Value written to the `n_eff` column of every output table.
#' @param out_dir Directory the RDS files are written to.
#' @return Invisibly, a named list with one data frame per trait.
format_data <- function(sums, chrpos, betaty = c("pos", "raw"),
                        n_eff = 200000, out_dir = "mvpdata") {
  # Resolve the enumerated choice up front; comparing the length-2 default
  # inside `if ()` is an error in recent R versions.
  betaty <- match.arg(betaty)
  traits <- c("hdl", "ldl", "tc", "tg")
  n_eff_value <- n_eff

  sums <- inner_join(sums, chrpos, by = c("chr", "pos")) %>%
    select(chr, pos, rsid, a0, a1, everything())

  print("inner join completed")

  if (betaty == "raw") {
    # Convert z-scores to effect sizes: beta = z * se.
    sums <- sums %>%
      mutate(hdl = hdl_z * hdl_se,
             ldl = ldl_z * ldl_se,
             tc = tc_z * tc_se,
             tg = tg_z * tg_se) %>%
      select(-hdl_z, -ldl_z, -tc_z, -tg_z)
  }

  per_trait <- lapply(traits, function(trait) {
    # `<trait>_p` is kept only when present: the tables built in this
    # notebook carry no p-value columns, so requiring it made select() fail.
    trait_stats <- sums %>%
      select(chr, pos, rsid, a0, a1,
             beta = all_of(trait),
             beta_se = all_of(paste0(trait, "_se")),
             any_of(paste0(trait, "_p"))) %>%
      mutate(n_eff = n_eff_value)
    print(paste("finish", trait))
    trait_stats
  })
  names(per_trait) <- traits

  print("data ready to save")

  file_prefix <- if (betaty == "pos") "pos_sumstats_" else "sumstats_"
  for (trait in traits) {
    saveRDS(per_trait[[trait]],
            file = file.path(out_dir, paste0(file_prefix, trait, ".rds")))
  }

  invisible(per_trait)
}

In [None]:
# Load the per-variant identifying information and preview it.
chr_pos <- bigreadr::fread2("./all_MVP_posteriors/identifying_info/chr_pos_allele2_lfsr.txt")
head(chr_pos)

# Split the "chr:pos" marker in V1 into two columns; `pos` still carries the
# leading ":" after extraction, so it is stripped before numeric conversion.
chr_pos <- extract(chr_pos, V1, into = c("chr", "pos"),
                   regex = "([0-9]+)(\\:[0-9]+)")
chr_pos <- mutate(chr_pos,
                  chr = as.numeric(chr),
                  pos = as.numeric(str_replace(pos, ":", "")))

# Keep only the variant columns, renamed as rsid / a0 / a1.
chr_pos <- select(chr_pos, chr, pos, rsid = rsID, a0 = Allele1, a1 = Allele2)
head(chr_pos)

In [None]:
# Raw univariate estimates (an RDS object) and their standard errors (a text
# table), both read from the raw_univariate_estimates directory.
raw_beta <- readRDS("./all_MVP_posteriors/raw_univariate_estimates/zmash_raw_univariate_MVP.rds")
raw_beta_se <- bigreadr::fread2("./all_MVP_posteriors/raw_univariate_estimates/Merged_MVP_Full_se_raw.txt")

In [None]:
# Give both tables a common `marker` key: the standard-error table already has
# it as `m.Marker`, the estimate table carries it in its row names.
raw_beta_se <- rename(data.frame(raw_beta_se), marker = m.Marker)
raw_beta <- mutate(data.frame(raw_beta), marker = rownames(raw_beta))

# Join estimates with standard errors and rename the columns in one step
# (estimates become `<trait>_z`, standard errors `<trait>_se`).
sumstats <- raw_beta_se %>%
  inner_join(raw_beta, by = 'marker') %>%
  select(marker,
         hdl_z = hdl, hdl_se = HDLSe,
         ldl_z = ldl, ldl_se = LDLSe,
         tg_z = tg, tg_se = TGSe,
         tc_z = tc, tc_se = TCSe)

# Split the "chr:pos" marker into numeric `chr` and `pos` columns; the captured
# `pos` keeps its leading ":" until it is removed here.
sumstats <- extract(sumstats, marker, into = c("chr", "pos"),
                    regex = "([0-9]+)(\\:[0-9]+)")
sumstats <- mutate(sumstats,
                   chr = as.numeric(chr),
                   pos = as.numeric(str_replace(pos, ":", "")))

In [None]:
# Write the per-trait files for the raw estimates (betaty = "raw").
format_data(sums = sumstats, chrpos = chr_pos, betaty = "raw")

In [None]:
# Posterior effect estimates and their standard errors.
pos_raw_beta <- bigreadr::fread2("./all_MVP_posteriors/posterior_estimates/MVP_all_beta_posterior_beta.txt")
pos_raw_beta_se <- bigreadr::fread2("./all_MVP_posteriors/posterior_estimates/posterior_beta_se.txt")

# Assign column names positionally.
# NOTE(review): assumes both files list the traits as hdl, ldl, tg, tc after
# the marker column -- confirm against the file headers.
names(pos_raw_beta) <- c("marker", "hdl", "ldl", "tg", "tc")
names(pos_raw_beta_se) <- c("marker", "hdl_se", "ldl_se", "tg_se", "tc_se")

In [None]:
# Coerce both posterior tables to plain data frames.
pos_raw_beta_se <- data.frame(pos_raw_beta_se)
pos_raw_beta <- data.frame(pos_raw_beta)

In [None]:
# Join posterior estimates with their standard errors on `marker` and
# interleave each trait with its standard error.
sumstats_pos <- pos_raw_beta_se %>%
  inner_join(pos_raw_beta, by = 'marker') %>%
  select(marker, hdl, hdl_se, ldl, ldl_se, tg, tg_se, tc, tc_se)

head(sumstats_pos)

# Column names are re-assigned explicitly (same names as selected above).
names(sumstats_pos) <- c("marker", "hdl", "hdl_se", "ldl", "ldl_se", "tg", "tg_se", "tc", "tc_se")

# Split the "chr:pos" marker into numeric `chr` and `pos` columns; the captured
# `pos` keeps its leading ":" until it is removed here.
sumstats_pos <- extract(sumstats_pos, marker, into = c("chr", "pos"),
                        regex = "([0-9]+)(\\:[0-9]+)")
sumstats_pos <- mutate(sumstats_pos,
                       chr = as.numeric(chr),
                       pos = as.numeric(str_replace(pos, ":", "")))

In [None]:
# Write the per-trait files for the posterior estimates (betaty = "pos").
format_data(sums = sumstats_pos, chrpos = chr_pos, betaty = "pos")

## Step 1: common snps

In [6]:
# Run the `extract_snp` workflow of ldpred.ipynb on the 1000G reference panel,
# the UKB target data and the MVP HDL summary statistics.
# NOTE(review): presumably writes the three .snplist files named below --
# confirm in ldpred.ipynb.
sos run ldpred.ipynb extract_snp -v1 \
    --outpath res-data \
    --testpath ukbiobank \
    --ref_bed 1000G/1000G.EUR.bed \
    --test_bed ukbiobank/UKB.bed \
    --ref_snp 1000G/1000G.QC.snplist \
    --test_snp ukbiobank/UKB.QC.snplist \
    --summstats_file mvpdata/sumstats_hdl.rds \
    --stat_snp mvpdata/sumstats_hdl.snplist

[?2004h[?2004l[?2004l[?2004l[?2004l[?2004l[?2004l[?2004l[?2004l

: 1

In [7]:
# Run the `common_snp` workflow on the three SNP lists. Per the recorded
# output below, it reports the number of common SNPs and produces the file
# given to --sub_stats plus res-data/common.snplist.
sos run ldpred.ipynb common_snp \
    --outpath res-data \
    --testpath ukbiobank \
    --stat_snp mvpdata/sumstats_hdl.snplist \
    --ref_snp 1000G/1000G.QC.snplist \
    --test_snp ukbiobank/UKB.QC.snplist \
    --summstats_file mvpdata/sumstats_hdl.rds \
    --sub_stats mvpdata/sumstats_hdl.SUB.rds

INFO: Running [32mcommon_snp[0m: 2004l[?2004l[?2004l[?2004l
1: Setting LC_COLLATE failed, using "C" 
2: Setting LC_TIME failed, using "C" 
3: Setting LC_MESSAGES failed, using "C" 
4: Setting LC_MONETARY failed, using "C" 
[1] "There are  409486  common SNPs."
INFO: [32mcommon_snp[0m is [32mcompleted[0m.
INFO: [32mcommon_snp[0m output:   [32mmvpdata/sumstats_hdl.SUB.rds res-data/common.snplist[0m
INFO: Workflow common_snp (ID=wda5082e8178fcf44) is executed successfully with 1 completed step.
[?2004h

: 1

## Step 2: subsetting reference panel

In [9]:
# Subset the 1000G reference panel to the common SNPs. Per the recorded log
# below, this invokes PLINK with --bfile, --extract, --keep and --make-bed and
# writes the result under the 1000G/1000G.SUB prefix.
sos run ldpred.ipynb subsets \
    --outpath res-data \
    --testpath ukbiobank \
    --bed_file 1000G/1000G.EUR.bed \
    --fam_file 1000G/1000G.EUR.fam \
    --snp_file res-data/common.snplist \
    --sub_bedfile 1000G/1000G.SUB.bed

INFO: Running [32msubsets[0m: [?2004l[?2004l[?2004l
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 1000G/1000G.SUB.log.
Options in effect:
  --bfile 1000G/1000G.EUR
  --extract res-data/common.snplist
  --keep 1000G/1000G.EUR.fam
  --make-bed
  --out 1000G/1000G.SUB

8192 MB RAM detected; reserving 4096 MB for main workspace.
1664852 variants loaded from .bim file.
503 people (240 males, 263 females) loaded from .fam.
--extract: 409486 variants remaining.
--keep: 503 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 503 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
409486 variants and 503 people pass fi

: 1

31566 variants in total.

    ./plink \
        --bfile 1000G/1000G.EUR \
        --keep 1000G/1000G.EUR.fam \
        --extract res-data/common.snplist \
        --make-bed \
        --out 1000G/1000G.SUB


## Step 3: SNP Matching


In [None]:
# Run the `snp_match` workflow on the subsetted reference panel and the
# subsetted HDL summary statistics, with an effective sample size of 200000.
# NOTE(review): --test_snplist has no directory prefix here, while Step 5
# reads ukbiobank/UKB.SUB.snplist -- presumably the name is resolved relative
# to --testpath; confirm in ldpred.ipynb.
sos run ldpred.ipynb snp_match \
    --outpath res-data \
    --testpath ukbiobank \
    --ref_bfile 1000G/1000G.SUB.bed \
    --ref_file 1000G/1000G.SUB.rds \
    --summstats_file mvpdata/sumstats_hdl.SUB.rds \
    --n_eff 200000 \
    --test_snplist UKB.SUB.snplist

## Step 4: Quality control (or do not)

This step drops a large number of variants.

In [None]:
# Run the optional `QControl` workflow on the matched SNPs.
# NOTE(review): the input is spelled `MatchedSnp.RData` here but
# `MatchedSnp.Rdata` in the Step 6 LD command; on a case-sensitive filesystem
# only one spelling can match the file that exists -- confirm which one.
# NOTE(review): the SNP list named here (UKB.QC.SUB.snplist) differs from the
# one Step 5 reads (ukbiobank/UKB.SUB.snplist) -- confirm which list should be
# used when QC is applied.
sos run ldpred.ipynb QControl \
    --qc_in res-data/MatchedSnp.RData \
    --outpath res-data \
    --testpath ukbiobank \
    --test_snplist UKB.QC.SUB.snplist

## Step 5: subsetting target data

In [None]:
# Subset the UKB target genotypes with the same `subsets` workflow used for
# the reference panel in Step 2, here restricted to the individuals in
# UKB.QC.fam and the SNPs in UKB.SUB.snplist.
sos run ldpred.ipynb subsets \
    --outpath res-data \
    --testpath ukbiobank \
    --bed_file ukbiobank/UKB.bed \
    --fam_file ukbiobank/UKB.QC.fam \
    --snp_file ukbiobank/UKB.SUB.snplist \
    --sub_bedfile ukbiobank/UKB.SUB.bed

## Step 6: Calculate LD matrix and correlation

In [None]:
# Run the `LD` workflow on the matched SNPs.
# NOTE(review): presumably produces res-data/LdMatrix.Rdata, which the Step 7
# commands take as input -- confirm in ldpred.ipynb.
sos run ldpred.ipynb LD \
    --outpath res-data \
    --testpath ukbiobank \
    --ld_in res-data/MatchedSnp.Rdata

## Step 7: Estimate posterior effect sizes and PRS

In [None]:
# Run the `load_testdata` and `inf_prs` workflows in sequence: the subsetted
# UKB genotypes are the test data, and the LD matrix file is the input to the
# infinitesimal-model step.
sos run ldpred.ipynb load_testdata+inf_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --inf_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --test_file ukbiobank/UKB.SUB.rds

In [None]:
# Run the `grid_prs` workflow (grid model). Unlike the inf and auto runs, it
# is also given the covariate and trait files and a continuous response.
sos run ldpred.ipynb grid_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --grid_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --test_file ukbiobank/UKB.SUB.rds \
    --response continuous

In [None]:
# Run the `auto_prs` workflow (auto model) on the LD matrix file and the
# subsetted UKB genotypes.
sos run ldpred.ipynb auto_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --auto_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --test_file ukbiobank/UKB.SUB.rds

## Step 8: predict phenotypes

Null model: Traits ~ Sex + Age

In [5]:
# Fit the null model without a PRS term. Per the recorded output below, this
# is a linear model HDL_inverseranknorm ~ AGE + SEX fitted on the training
# rows.
sos run ldpred.ipynb null_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --response continuous

INFO: Running [32mnull_phenopred[0m: l[?2004l
1: Setting LC_COLLATE failed, using "C" 
2: Setting LC_TIME failed, using "C" 
3: Setting LC_MESSAGES failed, using "C" 
4: Setting LC_MONETARY failed, using "C" 
Loading required package: bigstatsr
HDL_inverseranknorm ~ AGE + SEX
<environment: 0x7fdc0ec54d20>

Call:
lm(formula = ., data = data[train.ind, ])

Residuals:
    Min      1Q  Median      3Q     Max 
-2.7973 -0.6148 -0.0072  0.5943  3.2185 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.083025   0.177526  -0.468   0.6401    
AGE          0.007225   0.003028   2.386   0.0171 *  
SEX         -0.857958   0.046395 -18.492   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9257 on 1597 degrees of freedom
Multiple R-squared:  0.177,	Adjusted R-squared:  0.176 
F-statistic: 171.7 on 2 and 1597 DF,  p-value: < 2.2e-16

[90m# A tibble: 1 x 3[39m
  model         R2   MSE
  [3m[90m<chr>[39m

: 1

Inf/grid/auto model: Traits ~ Sex + Age + PRS

In [4]:
# Fit the phenotype model with the infinitesimal-model PRS. Per the recorded
# output below, this is a linear model HDL_inverseranknorm ~ PRS + AGE + SEX
# fitted on the training rows.
sos run ldpred.ipynb inf_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/InfPred.Rdata \
    --mod_summary InfSummary.pdf \
    --model InfModel.Rdata \
    --response continuous

INFO: Running [32minf_phenopred[0m: 4l[?2004l[?2004l[?2004l[?2004l
1: Setting LC_COLLATE failed, using "C" 
2: Setting LC_TIME failed, using "C" 
3: Setting LC_MESSAGES failed, using "C" 
4: Setting LC_MONETARY failed, using "C" 
Loading required package: bigstatsr
HDL_inverseranknorm ~ PRS + AGE + SEX
<environment: 0x7fe452c226e0>

Call:
lm(formula = reg.formula, data = data[train.ind, ])

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9595 -0.5773 -0.0030  0.5575  3.8241 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.303484   0.166778  -1.820   0.0690 .  
PRS         -2.628765   0.174294 -15.082   <2e-16 ***
AGE          0.006512   0.002834   2.298   0.0217 *  
SEX         -0.839663   0.043436 -19.331   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8663 on 1596 degrees of freedom
Multiple R-squared:  0.2797,	Adjusted R-squared:  0.2783 
F-statistic: 206.5 on 3 and 1596 DF, 

: 1

In [None]:
# Fit the phenotype model with the grid-model PRS; same arguments as the
# inf_phenopred call, with the Grid* PRS, summary and model files.
sos run ldpred.ipynb grid_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/GridPred.Rdata \
    --mod_summary GridSummary.pdf \
    --model GridModel.Rdata \
    --response continuous

In [None]:
# Fit the phenotype model with the auto-model PRS; same arguments as the
# inf_phenopred call, with the Auto* PRS, summary and model files.
sos run ldpred.ipynb auto_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/AutoPred.Rdata \
    --mod_summary AutoSummary.pdf \
    --model AutoModel.Rdata \
    --response continuous

# Results

The following tables show the adjusted R-squared of the HDL prediction models. QC refers to the quality control in Step 4.

|   Betas   | QC? |   Null  |   Inf   |   Grid  |   Auto  |
|:---------:|:---:|:-------:|:-------:|:-------:|:-------:|
|  Original | Yes | 0.2901 | 0.2899 | 0.2901  | 0.2897 |
|  Original |  No$^*$ | - | - |    -  |    -   |
| Posterior | Yes | 0.2901 | 0.2897 | 0.2901| 0.29 |
| Posterior |  No$^*$ |  -  |  - |    -    |    -   |

|   Betas   | QC? |   Null  |   Inf   |   Grid  |   Auto  |
|:---------:|:---:|:-------:|:-------:|:-------:|:-------:|
|  Original | Yes | 0.176 | 0.1756 | 0.1757  | 0.1755 |
|  Original |  No$^*$ | - | - |    -  |    -   |
| Posterior | Yes | 0.176 | 0.1756 | 0.1758| 0.1756 |
| Posterior |  No$^*$ |  0.176  |  0.2783 |    -    |    -   |

$*$: Takes a very long time to get results (4+ hours).