# Notes

Ultimate goal: Predict phenotype

Q1: How to predict phenotype?

A1: linear model + covariates (testset) + PRS (testset)

Q2: Where does the PRS come from?

A2: 
 
    genotype + ref_panel (infos.pos) -> LD matrix + correlation matrix
   
    LD matrix + sumstats(raw beta) -> heritability

    Genotype + adjusted_beta -> PRS

    correlation matrix + heritability + sumstats -> adjusted betas
    
    adjusted betas + genotype (trainset) -> PRS (trainset)
    

Q3: Where does the linear model come from?

A3: Modeling on the `trainset`
   
    y (trainset) ~ covariates (trainset) + PRS (trainset)

# Command Interface

In [7]:
sos run ldpred.ipynb -h

usage: sos run ldpred.ipynb [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  inf
  auto
  grid

Global Workflow Options:
  --pheno-out 'PhenoOut.RData'
  --summary-stat 'SumStats.RData'
  --ld-in 'LdInput.RData'
  --ld-out 'LdOutput.RData'
  --ldreg-out 'LdRegOut.RData'
  --inf-beta 'InfBeta.RData'
  --grid-beta 'GridBeta.RData'
  --auto-beta 'AutoBeta.RData'
  --inf-prs 'InfPrs.RData'
  --grid-prs 'GridPrs.RData'
  --auto-prs 'AutoPrs.RData'
  --null-r2 'NullR2.RData'
  --inf-pheno 'InfPheno.RData'
  --grid-pheno 'GridPheno.RData'
  --auto-pheno 'AutoPheno.RData'
  --pheno-pred-xxx 'PhenoPredXxx.RData'

Sections
  inf_10, auto_10, grid_10:
  inf_20, auto_20, grid_20:
  inf_25, auto_25

: 1

# Global Parameter Setting

In [None]:
[global]
# Input data locations
refpanel_file = "https://github.com/privefl/bigsnpr/raw/master/data-raw/hm3_variants.rds"
summstats_file = "post-qc/Height.QC.gz"
bed_file = "post-qc/EUR.QC.bed"
geno_file = "post-qc/EUR.QC.rds"
pheno_file = "post-qc/EUR.height"
covariate_file = "post-qc/EUR.cov"
pcs_file = "post-qc/EUR.eigenvec"
# New data to predict on: contains covariates (sex and PCs in this case) and
# genotype data (used to calculate the PRS)
newdata = "path/to/new/data"

# Intermediate and result files exchanged between steps; each one can be
# overridden on the command line (e.g. --ld-out MyLd.RData)
parameter: pheno_out = 'PhenoOut.RData'
parameter: summary_stat = 'SumStats.RData'
parameter: ld_in = 'LdInput.RData'
parameter: ld_out = 'LdOutput.RData'
parameter: ldreg_out = 'LdRegOut.RData'
parameter: inf_beta = 'InfBeta.RData'
parameter: grid_beta = 'GridBeta.RData'
parameter: auto_beta = 'AutoBeta.RData'
parameter: inf_prs = 'InfPrs.RData'
parameter: grid_prs = 'GridPrs.RData'
parameter: auto_prs = 'AutoPrs.RData'
parameter: null_r2 = 'NullR2.RData'
parameter: inf_pheno = 'InfPheno.RData'
parameter: grid_pheno = 'GridPheno.RData'
parameter: auto_pheno = 'AutoPheno.RData'
parameter: pheno_pred_xxx = 'PhenoPredXxx.RData' #### fix this


# The LD scores (ld), the LD matrix (corr) and the sample ordering of the bed
# file (fam.order) are R objects; they are created inside the R script of the
# "Calculate LD" step and travel between steps through the .RData files above.
# They must not be initialised here: this section is evaluated as Python, where
# `NULL` is an undefined name and `fam.order = ...` is an attribute assignment
# on an undefined object.
# corr = NULL
# ld = NULL
# fam.order = NULL


# Workflow

## Prepare workflows

In [None]:
[inf_10, auto_10, grid_10]

R: expand=True
    ## prepare workplace ##
    # Attach, in this order, the packages used by the analysis.
    # NOTE(review): each R action appears to run as a separate R script, so
    # this only affects the current step -- confirm against the SoS docs.
    # options(bigstatsr.check.parallel.blas = FALSE)
    # options(default.nproc.blas = NULL)
    invisible(lapply(c("bigsnpr", "data.table", "magrittr"),
                     library,
                     character.only = TRUE))

## Load reference panel and summary statistics

In [None]:
[inf_20, auto_20, grid_20]

# Load the HapMap3 reference panel and the GWAS summary statistics, rename the
# columns to what LDpred2 expects and keep only the HapMap3 variants.
# The reference panel is a URL that R downloads itself, so it is interpolated
# directly instead of being declared as a (local file) input.
input: summstats_file
output: summary_stat

R: expand=True
    ## Prep data ##
    info <- readRDS(url("{refpanel_file}"))
    # Read in the summary statistic file
    sumstats <- bigreadr::fread2("{_input}")
    # LDpred2 requires the header to follow this exact naming; assigning fewer
    # names than there are columns would silently produce NA column names, so
    # check the layout first
    ldpred_names <- c("chr", "pos", "rsid", "a1", "a0", "n_eff",
                      "beta_se", "p", "OR", "INFO", "MAF")
    stopifnot(ncol(sumstats) == length(ldpred_names))
    names(sumstats) <- ldpred_names
    # Transform the OR into log(OR)
    sumstats$beta <- log(sumstats$OR)
    # Keep only the SNPs that are present in the HapMap3 reference panel
    sumstats <- sumstats[sumstats$rsid %in% info$rsid, ]
    save(sumstats, file = "{_output}")

## Read in phenotype and covariate files

In [None]:
[inf_25, auto_25, grid_25]

# Merge phenotype, covariates and principal components into one table
# (`pheno`) and store it for the regression steps.
input: phe = pheno_file, cov = covariate_file, pcs = pcs_file
output: pheno_out

R: expand=True
    # fread() and the pipe come from these packages; they have to be attached
    # in this script because it runs in its own R session
    library(data.table)
    library(magrittr)

    phenotype <- fread("{_input["phe"]}")
    covariate <- fread("{_input["cov"]}")
    pcs <- fread("{_input["pcs"]}")
    # rename columns: two ID columns followed by six principal components
    colnames(pcs) <- c("FID", "IID", paste0("PC", 1:6))
    # generate required table (merged on the columns the tables share)
    pheno <- merge(phenotype, covariate) %>%
        merge(., pcs)
    save(pheno, file = "{_output}")

## Preprocess the bed file

In [None]:
[inf_30, auto_30, grid_30]

# Convert the PLINK bed/bim/fam set into the bigSNP format used by bigsnpr.
# snp_readBed() writes <prefix>.bk and <prefix>.rds next to the bed file and
# returns the path of the .rds file.
# NOTE(review): assumes geno_file is that .rds file, i.e. bed_file and
# geno_file share the same prefix (true for the defaults) -- confirm.
input: bed_file
output: geno_file

R: expand=True
    library(bigsnpr)
    # preprocess the bed file (only need to do once for each data set);
    # snp_readBed() stops if the backing file already exists, so skip the
    # conversion when the result is already there
    if (!file.exists("{_output}")) snp_readBed("{_input}")

## SNP matching and get the CM information from 1000 Genome

* Perform SNP matching `snp_match(sumstats, map)` to get `info_snp`

Match alleles between the summary statistics `sumstats` and the SNP information from `obj.bigSNP`.

* CM information from 1000 Genome `snp_asGeneticPos(CHR, POS, dir = ".")`

Use genetic maps available at https://github.com/joepickrell/1000-genomes-genetic-maps/ to interpolate physical positions (in bp) to genetic positions (in cM).

In [None]:
[inf_31, auto_31, grid_31]

# Match the summary statistics to the genotyped SNPs and convert physical
# positions to genetic positions (cM).
input: geno = geno_file, sums = summary_stat
output: ld_in
 
R: expand=True 
    # snp_attach(), snp_match() and snp_asGeneticPos() come from bigsnpr, which
    # has to be attached in this script because it runs in its own R session
    library(bigsnpr)

    load("{_input["sums"]}")
    # now attach the genotype object
    obj.bigSNP <- snp_attach("{_input["geno"]}")
    
    # Assign the genotype to a variable for easier downstream analysis
    genotype <- obj.bigSNP$genotypes
    
    # extract the SNP information from the genotype
    # (the third column of the map is dropped before renaming)
    map <- obj.bigSNP$map[-3]
    names(map) <- c("chr", "rsid", "pos", "a1", "a0")  
    
    # Rename the data structures
    CHR <- map$chr
    POS <- map$pos   

    # perform SNP matching
    info_snp <- snp_match(sumstats, map)
    
    # get the CM information from 1000 Genome
    # will download the 1000G file to the current directory (".")
    POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
    
    # save data to Rdata file
    save(obj.bigSNP, genotype, map, CHR, POS, info_snp, POS2, file = "{_output}")

## Calculate LD

* calculate LD using genotype from `obj.bigSNP` and CM information (distance)

In [None]:
[inf_32, auto_32, grid_32]

# Compute, chromosome by chromosome, the SNP correlation (LD) matrix `corr`
# and the LD scores `ld`, and record the sample ordering of the genotype file.
input: ld_in
output: ld_out

# The R code below contains literal braces, so interpolation uses the
# dollar-brace sigil instead of plain braces.
R: expand = "${ }"
    library(bigsnpr)
    library(data.table)
    load("${_input}")
    # calculate LD
    # Get maximum amount of cores
    NCORES <- nb_cores()   ## can this be a parameter??? but it is a result from a function
    # Backing file of the on-disk LD matrix. It must outlive this script:
    # `corr` is saved below and re-used by the following steps, so the file is
    # NOT removed on exit. Make sure its directory exists first.
    dir.create("tmp-data", showWarnings = FALSE)
    tmp <- tempfile(tmpdir = "tmp-data") ## can this be a parameter??????????
    ld <- NULL
    corr <- NULL
    for (chr in 1:22) {
        # Extract SNPs that are included in the chromosome
        ind.chr <- which(info_snp$chr == chr)
        # Nothing to correlate on a chromosome without matched SNPs
        if (length(ind.chr) == 0) next
        ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
        # Calculate the LD
        corr0 <- snp_cor(
                genotype,
                ind.col = ind.chr2,
                ncores = NCORES,
                infos.pos = POS2[ind.chr2],
                size = 3 / 1000
            )
        if (is.null(corr)) {
            # first chromosome with data: create the on-disk matrix
            ld <- Matrix::colSums(corr0^2)
            corr <- as_SFBM(corr0, tmp)
        } else {
            # later chromosomes: append scores and extend the matrix
            ld <- c(ld, Matrix::colSums(corr0^2))
            corr$add_columns(corr0, nrow(corr))
        }
    }
    # We assume the fam order is the same across different chromosomes
    fam.order <- as.data.table(obj.bigSNP$fam)
    # Rename fam order
    setnames(fam.order,
            c("family.ID", "sample.ID"),
            c("FID", "IID"))
    # save results
    save(info_snp, ld, fam.order, corr, NCORES, genotype, file = "${_output}")

## Perform LD score regression 

Use the function `snp_ldsc()` to obtain $h^2$, the (SNP) heritability.

In [None]:
[inf_40, auto_40, grid_40]

# Estimate the SNP heritability (h2_est) by LD score regression.
input: ld_out
output: ldreg_out

R: expand=True
    # snp_ldsc() comes from bigsnpr, which has to be attached in this script
    # because it runs in its own R session
    library(bigsnpr)
    load("{_input}")
    df_beta <- info_snp[, c("beta", "beta_se", "n_eff", "_NUM_ID_")]
    # chi-squared statistics are the squared z-scores of the raw effects
    ldsc <- snp_ldsc(ld,
                     length(ld),
                     chi2 = (df_beta$beta / df_beta$beta_se)^2,
                     sample_size = df_beta$n_eff,
                     blocks = NULL)
    h2_est <- ldsc[["h2"]]
    save(h2_est, df_beta, corr, NCORES, info_snp, genotype, file = "{_output}")

## Get adjusted betas

### Infinitesimal model

In [None]:
[inf_50]

# Adjusted betas under the infinitesimal model.
input: ldreg_out
output: inf_beta

R: expand=True
    # snp_ldpred2_inf() comes from bigsnpr, which has to be attached in this
    # script because it runs in its own R session
    library(bigsnpr)
    load("{_input}")
    ## adjusted beta ##
    beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
    # save data
    save(beta_inf, df_beta, corr, NCORES, info_snp, genotype, file = "{_output}")

### Grid model

In [None]:
[grid_50]

# Adjusted betas under the grid model: one set of betas per combination of
# p, h2 and sparse.
input: ldreg_out
output: grid_beta

R: expand=True
    # seq_log() and snp_ldpred2_grid() become available by attaching bigsnpr,
    # which has to be done in this script because it runs in its own R session
    library(bigsnpr)
    load("{_input}")
    # Prepare data for grid model
    p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
    h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
    grid.param <-
        expand.grid(p = p_seq,
                h2 = h2_seq,
                sparse = c(FALSE, TRUE))
    # Get adjusted beta from grid model
    beta_grid <- snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
    # save data
    save(beta_grid, df_beta, corr, NCORES, info_snp, genotype, file = "{_output}")

### Auto model

In [None]:
[auto_50]

# Adjusted betas under the auto model: one chain per initial value of p.
input: ldreg_out
output: auto_beta

R: expand=True
    # seq_log() and snp_ldpred2_auto() become available by attaching bigsnpr,
    # which has to be done in this script because it runs in its own R session
    library(bigsnpr)

    load("{_input}")
    # Get adjusted beta from the auto model
    multi_auto <- snp_ldpred2_auto(
        corr,
        df_beta,
        h2_init = h2_est,
        vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES),
        ncores = NCORES
    )
    # One column of estimated betas per chain; cbind() over lapply() always
    # yields a matrix, whatever the number of chains
    beta_auto <- do.call(cbind, lapply(multi_auto, function(auto) auto$beta_est))
    # save data
    save(beta_auto, df_beta, corr, NCORES, info_snp, genotype, file = "{_output}")

## Get PRS

### Infinitesimal model

In [None]:
[inf_60]

# PRS of every sample from the infinitesimal-model betas.
input: inf_beta
output: inf_prs

R: expand=True
    # big_prodVec() becomes available by attaching bigsnpr, which has to be
    # done in this script because it runs in its own R session
    library(bigsnpr)

    load("{_input}")
    # calculate PRS for all samples
    ind.test <- seq_len(nrow(genotype))
    pred_inf <- big_prodVec(genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = info_snp$`_NUM_ID_`)
    save(pred_inf, file = "{_output}")

### Grid model

In [None]:
[grid_60]

# PRS of every sample for each set of grid-model betas (one column per set).
input: grid_beta
output: grid_prs

R: expand=True
    # big_prodMat() becomes available by attaching bigsnpr, which has to be
    # done in this script because it runs in its own R session
    library(bigsnpr)

    load("{_input}")
    # calculate PRS for all samples
    ind.test <- seq_len(nrow(genotype))
    pred_grid <- big_prodMat(genotype,
                             beta_grid,
                             ind.row = ind.test,
                             ind.col = info_snp$`_NUM_ID_`)
    save(pred_grid, file = "{_output}")

### Auto model

In [None]:
[auto_60]

# PRS of every sample from the auto model: chains whose PRS spread is an
# outlier are discarded and the betas of the remaining chains are averaged.
input: auto_beta
output: auto_prs

R: expand=True
    # big_prodMat() and big_prodVec() become available by attaching bigsnpr,
    # which has to be done in this script because it runs in its own R session
    library(bigsnpr)

    load("{_input}")
    # calculate PRS for all samples, one column per chain
    ind.test <- seq_len(nrow(genotype))
    pred_auto <-
        big_prodMat(genotype,
                    beta_auto,
                    ind.row = ind.test,
                    ind.col = info_snp$`_NUM_ID_`)
    # standard deviation of the PRS of each chain
    pred_scaled <- apply(pred_auto, 2, sd)
    # keep the chains within 3 MADs of the median spread
    keep <- which(abs(pred_scaled - median(pred_scaled)) <
                      3 * mad(pred_scaled))
    # fall back to every chain when the filter rejects all of them
    # (e.g. a single chain has a MAD of 0)
    if (length(keep) == 0) keep <- seq_len(ncol(beta_auto))
    # drop = FALSE keeps a matrix even when a single chain is left
    final_beta_auto <- rowMeans(beta_auto[, keep, drop = FALSE])
    pred_auto <-
        big_prodVec(genotype,
            final_beta_auto,
            ind.row = ind.test,
            ind.col = info_snp$`_NUM_ID_`)
    save(pred_auto, file = "{_output}")

## Performance of LDpred model

### Calculate null $R^2$

In [None]:
[inf_70, auto_70, grid_70]

# R2 of the null model (covariates only, no PRS), computed on the phenotype
# table re-ordered to follow the samples of the genotype file.
# The sample ordering (fam.order) is stored in the LD output file, so that
# file is needed here in addition to the phenotype table.
input: pheno = pheno_out, ld = ld_out
output: null_r2

R: expand=True
    # the data.table join and the pipe need these packages, which have to be
    # attached in this script because it runs in its own R session
    library(data.table)
    library(magrittr)

    load("{_input["pheno"]}")
    # Only fam.order is wanted from the LD file; load it into a scratch
    # environment so that the other objects do not clutter this session
    ld_env <- new.env()
    load("{_input["ld"]}", envir = ld_env)
    fam.order <- ld_env$fam.order
    stopifnot(!is.null(fam.order))
    # Reformat the phenotype file such that y is of the same order as the 
    # sample ordering in the genotype file
    y <- pheno[fam.order, on = c("FID", "IID")]
    # Calculate the null R2
    # use glm for binary trait 
    # (will also need the fmsb package to calculate the pseudo R2)
    null.model <- paste0("PC", 1:6, collapse = "+") %>%
        paste0("Height~Sex+", .) %>%
        as.formula %>%
        lm(., data = y) %>%
        summary
    null.r2 <- null.model$r.squared
    save(null.r2, y, file = "{_output}")

### calculate model $R^2$

#### Infinitesimal model

In [None]:
[inf_80]

# Fit phenotype ~ PRS + covariates with the infinitesimal-model PRS and report
# the R2 gained over the null model.
input: r2 = null_r2, prs = inf_prs
output: inf_pheno

R: expand=True
    # data.table() and the pipe need these packages, which have to be attached
    # in this script because it runs in its own R session
    library(data.table)
    library(magrittr)
    load("{_input["r2"]}")
    load("{_input["prs"]}")
    reg.formula <- paste0("PC", 1:6, collapse = "+") %>%
        paste0("Height~PRS+Sex+", .) %>%
        as.formula
    reg.dat <- y
    reg.dat$PRS <- pred_inf
    # keep the fitted lm object itself: it is saved for later prediction
    inf.model <- lm(reg.formula, data = reg.dat)
    # R2 is a component of the model summary, not of the lm object
    result <- data.table(
        infinitesimal = summary(inf.model)$r.squared - null.r2,
        null = null.r2)
    save(inf.model, result, file = "{_output}")

#### Grid model

In [None]:
[grid_80]

# Fit phenotype ~ PRS + covariates once per grid-model PRS, keep the fit with
# the highest R2 and report the R2 gained over the null model.
input: r2 = null_r2, prs = grid_prs
output: grid_pheno

# The R code below contains literal braces, so interpolation uses the
# dollar-brace sigil instead of plain braces.
R: expand = "${ }"
    # data.table() and the pipe need these packages, which have to be attached
    # in this script because it runs in its own R session
    library(data.table)
    library(magrittr)

    load("${_input["r2"]}")
    load("${_input["prs"]}")
    reg.formula <- paste0("PC", 1:6, collapse = "+") %>%
        paste0("Height~PRS+Sex+", .) %>%
        as.formula
    reg.dat <- y
    max.r2 <- 0
    # best fit found so far (stays NULL if no grid point can be fitted)
    grid.model <- NULL
    for (i in seq_len(ncol(pred_grid))) {
        # a grid point without any usable PRS value cannot be fitted
        if (all(is.na(pred_grid[, i]))) next
        reg.dat$PRS <- pred_grid[, i]
        fit <- lm(reg.formula, data = reg.dat)
        # R2 is a component of the model summary, not of the lm object
        fit.r2 <- summary(fit)$r.squared
        if (max.r2 < fit.r2) {
            max.r2 <- fit.r2
            grid.model <- fit
        }
    }
    result <- data.table(
        grid = max.r2 - null.r2,
        null = null.r2)
    save(grid.model, result, file = "${_output}")

#### Auto model

In [None]:
[auto_80]

# Fit phenotype ~ PRS + covariates with the auto-model PRS and report the R2
# gained over the null model.
input: r2 = null_r2, prs = auto_prs
output: auto_pheno

R: expand=True
    # data.table() and the pipe need these packages, which have to be attached
    # in this script because it runs in its own R session
    library(data.table)
    library(magrittr)

    load("{_input["r2"]}")
    load("{_input["prs"]}")
    reg.formula <- paste0("PC", 1:6, collapse = "+") %>%
        paste0("Height~PRS+Sex+", .) %>%
        as.formula
    reg.dat <- y
    reg.dat$PRS <- pred_auto
    # keep the fitted lm object itself: it is saved for later prediction
    auto.model <- lm(reg.formula, data = reg.dat)
    # R2 is a component of the model summary, not of the lm object
    result <- data.table(
            auto = summary(auto.model)$r.squared - null.r2,
            null = null.r2)
    save(auto.model, result, file = "{_output}")

## Predict phenotype

fix this. xxx stands for different model

In [None]:
[inf_90]

# Predict the phenotype of new samples with the infinitesimal-model fit.
# All three prediction steps write to pheno_pred_xxx; pass a different
# --pheno-pred-xxx per workflow to keep their results apart.
input: model = inf_pheno, newdf = newdata
output: pheno_pred_xxx

R: expand=True
    library(data.table)
    # brings the fitted lm object inf.model into scope
    load("{_input["model"]}")
    # NOTE(review): assumes the new data is a delimited text table that already
    # holds every predictor of the model (PRS, Sex, PC1-PC6) -- TODO confirm
    new_df <- fread("{_input["newdf"]}")
    predictors <- all.vars(formula(inf.model))[-1]
    stopifnot(all(predictors %in% names(new_df)))
    pheno_pred <- predict(inf.model, newdata = new_df)
    save(pheno_pred, file = "{_output}")

[grid_90]

# Predict the phenotype of new samples with the grid-model fit.
input: model = grid_pheno, newdf = newdata
output: pheno_pred_xxx

R: expand=True
    library(data.table)
    # brings the fitted lm object grid.model into scope
    load("{_input["model"]}")
    # NOTE(review): assumes the new data is a delimited text table that already
    # holds every predictor of the model (PRS, Sex, PC1-PC6) -- TODO confirm
    new_df <- fread("{_input["newdf"]}")
    predictors <- all.vars(formula(grid.model))[-1]
    stopifnot(all(predictors %in% names(new_df)))
    pheno_pred <- predict(grid.model, newdata = new_df)
    save(pheno_pred, file = "{_output}")

[auto_90]

# Predict the phenotype of new samples with the auto-model fit.
input: model = auto_pheno, newdf = newdata
output: pheno_pred_xxx

R: expand=True
    library(data.table)
    # brings the fitted lm object auto.model into scope
    load("{_input["model"]}")
    # NOTE(review): assumes the new data is a delimited text table that already
    # holds every predictor of the model (PRS, Sex, PC1-PC6) -- TODO confirm
    new_df <- fread("{_input["newdf"]}")
    predictors <- all.vars(formula(auto.model))[-1]
    stopifnot(all(predictors %in% names(new_df)))
    pheno_pred <- predict(auto.model, newdata = new_df)
    save(pheno_pred, file = "{_output}")

## Plots of results
...

# INF

In [None]:
sos run ldpred.ipynb inf