# HDL in MVP

## Reference panel

`download_1000G()` in `bigsnpr`

The panel includes 503 (mostly unrelated) European individuals and ~1.7M SNPs in common with either HapMap3 or the UK Biobank. The classification of European populations can be found at [IGSR](https://www.internationalgenome.org/category/population/). The IDs of the European individuals are taken from the [IGSR data portal](https://www.internationalgenome.org/data-portal/sample).

## Base data: summary Statistics from MVP

Posterior betas for the HDL trait.

## Target data: UK biobank

Covariates, HDL-related phenotypes, and genotypes of 2000 individuals (listed in `UKB.QC.fam`).

## Model

The auto model runs the algorithm for 30 different initial values of $p$ (the proportion of causal variants), ranging from $10^{-4}$ to 0.9, and uses the heritability $h^2$ estimated by LD score regression as the initial value.

The grid model tries a grid of parameters: $p$ ranging from 0 to 1, and three values of $h^2$ equal to 0.7, 1 and 1.4 times the initial $h^2$ estimated by LD score regression.

## Test genotype data preparation

Use `awk` to select columns from the phenotype file and save them to the trait file `UKB.hdl.cov` and the covariate file `UKB.ind.cov`.


In [None]:
# Draw a reproducible subset of UK Biobank individuals with complete
# covariates, then write the matching covariate table (UKB.cov) and the
# matching PLINK .fam rows (UKB.QC.fam).
suppressMessages(library(tidyverse))

fam_UKB <- read.table("ukbiobank/UKB.fam", header = FALSE, stringsAsFactors = FALSE)
colnames(fam_UKB) <- c("FID", "IID", "paternal.ID", "maternal.ID", "sex", "affection")
covariates <- read.table("ukbiobank/UKBCauc_cholesterolandbloodpressurefields_inverseranknorm_covariatesage_sex_alcohol_smokingpackyears_foranalysis", header = TRUE, stringsAsFactors = FALSE)

# Fixed seed so the same FIDs are sampled on every run.
set.seed(2021)
covariate <- covariates %>%
  drop_na() %>%
  filter(FID %in% sample(FID, 2000))

# Sort the covariate rows by their first column.
# NOTE(review): the first column is presumably FID — confirm against the file header.
covariate <- covariate[order(covariate[, 1]), ]

fam_UKB <- fam_UKB %>% filter(FID %in% covariate$FID)

# Sampled individuals without a .fam entry would leave the covariate table
# longer than the genotype sample; surface that instead of failing silently.
if (!all(covariate$FID %in% fam_UKB$FID)) {
  warning("Some sampled FIDs are missing from ukbiobank/UKB.fam; ",
          "UKB.cov and UKB.QC.fam will not contain the same individuals.",
          call. = FALSE)
}

# Sort the .fam rows by their own FID. The previous code indexed fam_UKB with
# order(covariate[, 1]), i.e. the ordering of a different (already sorted)
# table, which left the .fam rows unsorted and could add NA rows or drop rows
# when the two tables differ in length.
fam_UKB <- fam_UKB[order(fam_UKB[, 1]), ]
colnames(fam_UKB) <- NULL

write.table(covariate, file = "ukbiobank/UKB.cov", sep = " ",
            row.names = FALSE, col.names = TRUE)
write.table(fam_UKB, file = "ukbiobank/UKB.QC.fam", sep = " ",
            row.names = FALSE, col.names = FALSE)

In [None]:
# Split UKB.cov (written by the R cell above, header row included) into two
# whitespace-separated files inside ukbiobank/.
cd ukbiobank
# Fields 3-6 and 8-11 of every line (the header line too) go to the covariate file.
awk '{print $3, $4, $5, $6, $8, $9, $10, $11}' UKB.cov > UKB.ind.cov
# Field 14 of every line goes to the trait file.
# NOTE(review): presumably field 14 is the HDL phenotype — confirm against the UKB.cov header.
awk '{print $14}' UKB.cov > UKB.hdl.cov
# Return to the notebook's working directory.
cd ..

## Summary statistics data preparation

In [None]:
# Load the raw univariate MVP tables: the standard-error table (text file)
# and the estimate table (RDS).
raw_beta_se <- bigreadr::fread2("./all_MVP_posteriors/raw_univariate_estimates/Merged_MVP_Full_se_raw.txt")
raw_beta <- readRDS("./all_MVP_posteriors/raw_univariate_estimates/zmash_raw_univariate_MVP.rds")
#chr_pos_allele2_lfsr = read.table("./all_MVP_posteriors/identifying_info/chr_pos_allele2_lfsr.txt")

In [None]:
# Give both raw tables a shared `marker` key: the SE table already has it as
# `m.Marker`, the estimate table carries it in its row names.
raw_beta_se <- rename(data.frame(raw_beta_se), marker = m.Marker)
raw_beta <- mutate(data.frame(raw_beta), marker = rownames(raw_beta))

# Keep only markers present in both tables, one estimate/SE pair per trait.
sumstats <- raw_beta_se %>%
  inner_join(raw_beta, by = "marker") %>%
  select(marker, hdl, HDLSe, ldl, LDLSe, tg, TGSe, tc, TCSe)

head(sumstats)

# Standardise the SE column names, then split "chr:pos" markers into two
# numeric columns (the second capture group keeps the ":" until it is stripped).
sumstats <- sumstats %>%
  rename(hdl_se = HDLSe, ldl_se = LDLSe, tg_se = TGSe, tc_se = TCSe) %>%
  extract(marker, into = c("chr", "pos"), regex = "([0-9]+)(\\:[0-9]+)") %>%
  mutate(chr = as.numeric(chr)) %>%
  mutate(pos = as.numeric(str_replace(pos, ":", "")))

In [None]:
# Load the variant-identifier table and preview its first rows.
chr_pos <- bigreadr::fread2("./all_MVP_posteriors/identifying_info/chr_pos_allele2_lfsr.txt")
head(chr_pos)

In [None]:
# Split the "chr:pos" identifier in V1 into numeric chr/pos columns and keep
# only the identifier and allele columns under short names.
chr_pos <- chr_pos %>%
  extract(V1, into = c("chr", "pos"), regex = "([0-9]+)(\\:[0-9]+)") %>%
  mutate(chr = as.numeric(chr)) %>%
  mutate(pos = as.numeric(str_replace(pos, ":", ""))) %>%
  select(chr, pos, rsid = rsID, a0 = Allele1, a1 = Allele2)

In [None]:
# Attach rsid/alleles by genomic position, move the identifier columns to the
# front, and add a two-sided normal p-value (from estimate / SE) per trait.
sumstats <- sumstats %>%
  inner_join(chr_pos, by = c("chr", "pos")) %>%
  select(chr, pos, rsid, a0, a1, everything()) %>%
  mutate(hdl_p = 2 * pnorm(-abs(hdl / hdl_se)),
         ldl_p = 2 * pnorm(-abs(ldl / ldl_se)),
         tc_p = 2 * pnorm(-abs(tc / tc_se)),
         tg_p = 2 * pnorm(-abs(tg / tg_se)))

head(sumstats)

In [None]:
# One table per trait, each with the same generic column names
# (beta, beta_se, p) next to the variant identifier columns.
sumstats_hdl <- select(sumstats, chr, pos, rsid, a0, a1,
                       beta = hdl, beta_se = hdl_se, p = hdl_p)
head(sumstats_hdl)

sumstats_ldl <- select(sumstats, chr, pos, rsid, a0, a1,
                       beta = ldl, beta_se = ldl_se, p = ldl_p)

sumstats_tc <- select(sumstats, chr, pos, rsid, a0, a1,
                      beta = tc, beta_se = tc_se, p = tc_p)

sumstats_tg <- select(sumstats, chr, pos, rsid, a0, a1,
                      beta = tg, beta_se = tg_se, p = tg_p)

In [None]:
# Write each per-trait table to its own RDS file under mvpdata/, then preview
# the TG table. invisible() keeps the list returned by Map() out of the output.
invisible(Map(
  saveRDS,
  list(sumstats_hdl, sumstats_ldl, sumstats_tc, sumstats_tg),
  file = c("mvpdata/sumstats_hdl.rds",
           "mvpdata/sumstats_ldl.rds",
           "mvpdata/sumstats_tc.rds",
           "mvpdata/sumstats_tg.rds")
))
head(sumstats_tg)

In [None]:
# Load the posterior SE and posterior estimate tables, then name their five
# columns: the marker plus one column per trait.
# NOTE(review): assumes the file column order is marker, HDL, LDL, TG, TC — confirm.
pos_raw_beta_se <- bigreadr::fread2("./all_MVP_posteriors/posterior_estimates/posterior_beta_se.txt")
pos_raw_beta <- bigreadr::fread2("./all_MVP_posteriors/posterior_estimates/MVP_all_beta_posterior_beta.txt")

colnames(pos_raw_beta_se) <- c("marker", "hdl_se", "ldl_se", "tg_se", "tc_se")
colnames(pos_raw_beta) <- c("marker", "hdl", "ldl", "tg", "tc")

In [None]:
# Preview the first rows of the posterior estimate table.
head(pos_raw_beta)

In [None]:
# Preview the first rows of the posterior standard-error table.
head(pos_raw_beta_se)

In [None]:
# Coerce both posterior tables to plain data frames before joining.
pos_raw_beta <- data.frame(pos_raw_beta)
pos_raw_beta_se <- data.frame(pos_raw_beta_se)

In [None]:
# Keep markers present in both posterior tables, with one estimate/SE pair
# per trait. The selected columns already carry their final names.
sumstats <- pos_raw_beta_se %>%
  inner_join(pos_raw_beta, by = "marker") %>%
  select(marker, hdl, hdl_se, ldl, ldl_se, tg, tg_se, tc, tc_se)

head(sumstats)

# Split "chr:pos" markers into two numeric columns (the second capture group
# keeps the ":" until it is stripped).
sumstats <- sumstats %>%
  extract(marker, into = c("chr", "pos"), regex = "([0-9]+)(\\:[0-9]+)") %>%
  mutate(chr = as.numeric(chr)) %>%
  mutate(pos = as.numeric(str_replace(pos, ":", "")))

In [None]:
# Reload the variant-identifier table and preview its first rows.
chr_pos <- bigreadr::fread2("./all_MVP_posteriors/identifying_info/chr_pos_allele2_lfsr.txt")
head(chr_pos)

## Step 1: common snps

In [None]:
# Run the `extract_snp` workflow of ldpred.ipynb on the reference (1000G),
# target (UKB) and summary-statistics inputs; -v1 sets SoS verbosity level 1.
# NOTE(review): no cell visible in this notebook writes
# mvpdata/pos_sumstats_hdl.rds (only mvpdata/sumstats_*.rds are saved) —
# confirm where this file is produced.
sos run ldpred.ipynb extract_snp -v1 \
    --outpath res-data \
    --testpath ukbiobank \
    --ref_bed 1000G/1000G.EUR.bed \
    --test_bed ukbiobank/UKB.bed \
    --ref_snp 1000G/1000G.QC.snplist \
    --test_snp ukbiobank/UKB.QC.snplist \
    --summstats_file mvpdata/pos_sumstats_hdl.rds \
    --stat_snp mvpdata/pos_sumstats_hdl.snplist

In [None]:
# Run the `common_snp` workflow on the three SNP lists (summary statistics,
# reference, target) and the summary-statistics file.
# Presumably it writes the SNPs shared by all three plus the subsetted
# summary statistics given by --sub_stats — confirm in ldpred.ipynb.
sos run ldpred.ipynb common_snp \
    --outpath res-data \
    --testpath ukbiobank \
    --stat_snp mvpdata/pos_sumstats_hdl.snplist \
    --ref_snp 1000G/1000G.QC.snplist \
    --test_snp ukbiobank/UKB.QC.snplist \
    --summstats_file mvpdata/pos_sumstats_hdl.rds \
    --sub_stats mvpdata/pos_sumstats_hdl.SUB.rds

## Step 2: subsetting reference panel

In [None]:
# Run the `subsets` workflow on the 1000G reference panel, restricting it to
# the SNPs in res-data/common.snplist and writing 1000G/1000G.SUB.bed.
sos run ldpred.ipynb subsets \
    --outpath res-data \
    --testpath ukbiobank \
    --bed_file 1000G/1000G.EUR.bed \
    --fam_file 1000G/1000G.EUR.fam \
    --snp_file res-data/common.snplist \
    --sub_bedfile 1000G/1000G.SUB.bed

A total of 31566 variants remain. The equivalent PLINK command is:

    ./plink \
        --bfile 1000G/1000G.EUR \
        --keep 1000G/1000G.EUR.fam \
        --extract res-data/common.snplist \
        --make-bed \
        --out 1000G/1000G.SUB


## Step 3: SNP Matching


In [None]:
# Run the `data_load` workflow on the subsetted reference panel and the
# subsetted summary statistics.
# NOTE(review): --n_eff 200000 is presumably the effective GWAS sample size —
# confirm it matches the MVP HDL study.
# NOTE(review): this step names UKB.SUB.snplist while the later QControl and
# subsets steps use UKB.QC.SUB.snplist — confirm which file is intended.
sos run ldpred.ipynb data_load \
    --outpath res-data \
    --testpath ukbiobank \
    --ref_bfile 1000G/1000G.SUB.bed \
    --ref_file 1000G/1000G.SUB.rds \
    --summstats_file mvpdata/pos_sumstats_hdl.SUB.rds \
    --n_eff 200000 \
    --test_snplist UKB.SUB.snplist

## Step 4: Quality control (or do not)

This step drops a large number of variants.

In [None]:
# Run the `QControl` workflow on the matched-SNP data from the previous step;
# the SNP list it names (UKB.QC.SUB.snplist) is the one used to subset the
# target data in Step 5.
sos run ldpred.ipynb QControl \
    --qc_in res-data/MatchedSnp.RData \
    --outpath res-data \
    --testpath ukbiobank \
    --test_snplist UKB.QC.SUB.snplist

## Step 5: subsetting target data

In [None]:
# Run the `subsets` workflow on the UK Biobank target data, keeping the
# individuals in UKB.QC.fam and the SNPs in UKB.QC.SUB.snplist, and writing
# ukbiobank/UKB.SUB.bed.
sos run ldpred.ipynb subsets \
    --outpath res-data \
    --testpath ukbiobank \
    --bed_file ukbiobank/UKB.bed \
    --fam_file ukbiobank/UKB.QC.fam \
    --snp_file ukbiobank/UKB.QC.SUB.snplist \
    --sub_bedfile ukbiobank/UKB.SUB.bed

## Step 6: Calculate LD matrix and correlation

In [None]:
# Run the `LD` workflow on the QC'd matched-SNP data.
# NOTE(review): the input here is spelled QcMatchedSnp.Rdata while Step 4
# reads MatchedSnp.RData (different extension case) — confirm the file names.
sos run ldpred.ipynb LD \
    --outpath res-data \
    --testpath ukbiobank \
    --ld_in res-data/QcMatchedSnp.Rdata

## Step 7: Estimate posterior effect sizes and PRS

In [None]:
# Run the `load_testdata` and `inf_prs` workflows back to back (joined with
# `+`), using the LD matrix from Step 6 and the subsetted target genotypes.
sos run ldpred.ipynb load_testdata+inf_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --inf_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --test_file ukbiobank/UKB.SUB.rds

In [None]:
# Run the `grid_prs` workflow. Unlike the inf/auto PRS steps it is also given
# the covariate and trait files and the response type.
sos run ldpred.ipynb grid_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --grid_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --test_file ukbiobank/UKB.SUB.rds \
    --response continuous

In [None]:
# Run the `auto_prs` workflow, using the LD matrix from Step 6 and the
# subsetted target genotypes.
sos run ldpred.ipynb auto_prs \
    --outpath res-data \
    --testpath ukbiobank \
    --auto_in res-data/LdMatrix.Rdata \
    --test_bfile ukbiobank/UKB.SUB.bed \
    --test_file ukbiobank/UKB.SUB.rds

## Step 8: predict phenotypes

Null model: Traits ~ Sex + Age + Smoking + Alcohol

In [None]:
# Run the `null_phenopred` workflow: the covariate-only baseline, so no PRS
# file is passed.
sos run ldpred.ipynb null_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --response continuous

Inf/grid/auto model: Traits ~ Sex + Age + Smoking + Alcohol + PRS

In [None]:
# Run the `inf_phenopred` workflow with the PRS from the infinitesimal model
# (InfPred.Rdata); outputs are named by --mod_summary and --model.
sos run ldpred.ipynb inf_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/InfPred.Rdata \
    --mod_summary InfSummary.pdf \
    --model InfModel.Rdata \
    --response continuous

In [None]:
# Run the `grid_phenopred` workflow with the PRS from the grid model
# (GridPred.Rdata); outputs are named by --mod_summary and --model.
sos run ldpred.ipynb grid_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/GridPred.Rdata \
    --mod_summary GridSummary.pdf \
    --model GridModel.Rdata \
    --response continuous

In [None]:
# Run the `auto_phenopred` workflow with the PRS from the auto model
# (AutoPred.Rdata); outputs are named by --mod_summary and --model.
# --response is `continuous`, matching the null/inf/grid phenotype-prediction
# steps for the same HDL trait (it was previously `1`).
sos run ldpred.ipynb auto_phenopred \
    --outpath res-data \
    --testpath ukbiobank \
    --cov_file ukbiobank/UKB.ind.cov \
    --trait_file ukbiobank/UKB.hdl.cov \
    --prs_file res-data/AutoPred.Rdata \
    --mod_summary AutoSummary.pdf \
    --model AutoModel.Rdata \
    --response continuous

# Results

The following table shows the adjusted $R^2$ of the HDL prediction models. QC refers to the quality control in Step 4.

|   Betas   | QC? |   Null  |   Inf   |   Grid  |   Auto  |
|:---------:|:---:|:-------:|:-------:|:-------:|:-------:|
|  Original | Yes | 0.2901 | 0.2909 | 0.2897  | 0.2909 |
|  Original |  No$^*$ | - | - |    -  |    -   |
| Posterior | Yes | 0.2901 | 0.2897 | 0.2901| 0.29 |
| Posterior |  No$^*$ |  -  |  - |    -    |    -   |

$*$: Takes a very long time to produce results (4+ hours).