# Experiment
This notebook generates the experiment results, using both GEMMA and linear regression, in the output directory. For each algorithm, the complete list of SNPs and their computed p-values is generated, and a graphical representation of the SNPs is saved as a PNG.

This notebook is similar to map.ipynb, but it is specifically meant to be run on the new synthetic genome/phenome data.

In [None]:
# Ensure the R kernel is installed before running this notebook.
# This cell installs the R packages the notebook depends on. Only packages
# that are not already available are installed, so the cell is safe to re-run
# and does nothing on a machine that is already set up.

required_pkgs <- c("qtl", "qqman", "data.table", "stringr", "devtools")

# requireNamespace() returns FALSE (instead of erroring) for a missing package.
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]

if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs, repos = "http://cran.us.r-project.org")
}

In [None]:
# Establish the directory layout used by the rest of the notebook.
# The repository root is derived from the current directory by stripping the
# notebook sub-path; all other paths are expressed relative to that root.

library(stringr)

r_base         <- "research_paper_code"
experiment_dir <- "mice_data_set"

base_dir       <- str_replace(getwd(), "research_paper_code/notebooks", "")
real_gwas_path <- paste0(base_dir, "/mice_data_set/out")

# Work from the repository root, then show the resulting directory.
setwd(base_dir)
getwd()

In [None]:
# Name of the phenotype to analyze in this run.
phenotype_choice <- "abBMD"

In [None]:
# Map QTLs for phenotypes measured in CFW outbred mice using the linear
# mixed model (LMM) analysis implemented in GEMMA.
#
# This cell loads the required packages and the project helper scripts,
# selects the analysis definition for the chosen phenotype, and sets the
# script-wide parameters read by run_gwas().
library(qtl)
library(data.table)
library(qqman)
source(paste0(r_base, "/src/misc.R"))
source(paste0(r_base, "/src/gemma.R"))
source(paste0(r_base, "/src/read.data.R"))
source(paste0(r_base, "/src/data.manip.R"))
source(paste0(r_base, "/src/qtl.analyses.R"))

# `analyses` is expected to be defined by the sourced project scripts
# (presumably src/qtl.analyses.R -- confirm). Select the entry matching
# `phenotype_choice` rather than a hard-coded name, and fail early with a
# clear message if the phenotype has no analysis definition.
if (!(phenotype_choice %in% names(analyses))) {
    stop("No analysis is defined for phenotype '", phenotype_choice, "'",
         call. = FALSE)
}
analysis_selection <- analyses[phenotype_choice]
for (analysis in analysis_selection) {
    print(analysis)
}

# Bind `analysis` explicitly so the later run_gwas() calls do not depend on
# whatever value the loop variable above happened to be left with.
analysis <- analyses[[phenotype_choice]]

# SCRIPT PARAMETERS
# -----------------
chromosomes   <- NULL
gemmadir      <- paste0(experiment_dir, "/gemma")
gemma.exe     <- paste0("./", "gemma-0.98.4-linux-static-AMD64")
geno_txt_base <- paste0(experiment_dir, "/data/synthetic_genome_data")
map_txt_base  <- paste0(experiment_dir, "/data/genome_map_data")

In [None]:
# Load the synthetic phenotype table from the experiment's data directory.
# Quoting is disabled, column names are kept exactly as written in the file,
# strings stay as character vectors, and lines starting with "#" are skipped.
pheno_synth_file <- paste0(experiment_dir, "/data/phenome_alldata_synth.csv")
pheno_all <- read.csv(
    pheno_synth_file,
    quote = "",
    header = TRUE,
    check.names = FALSE,
    stringsAsFactors = FALSE,
    comment.char = "#"
)

In [1]:
# This is the main gwas function
#
# run_gwas: run a genome-wide association scan for one phenotype on one
# synthetic genome batch, once with GEMMA and once with a per-SNP linear
# model, and write the p-values sorted in ascending order to CSV files.
#
# Arguments:
#   geno_txt   Path to the genotype dosage file (read by read.geno.dosage).
#   map_txt    Path to the SNP map file (read by read.map). The code uses
#              its "id", "chr", "pos" and "quality" columns.
#   pheno_all  Phenotype table with an "id" column plus the phenotype and
#              covariate columns named in `analysis`.
#   analysis   List with elements `pheno` (phenotype column name), `cov`
#              (covariate column names) and `outliers` (passed on to
#              remove.outliers; may be NULL).
#   batch      Batch label used only to build the output file names.
#
# Reads the globals `experiment_dir`, `gemmadir` and `gemma.exe`, and calls
# the project helpers read.map, read.geno.dosage, compute.maf,
# remove.outliers, none.missing.row and run.gemma.
#
# Output files (each is only produced when it does not exist yet):
#   <experiment_dir>/out_synth/ge_batch<batch>_<pheno>_<min>_<max>.dat/.csv
#   <experiment_dir>/out_synth/lm_batch<batch>_<pheno>_<min>_<max>.csv
#
# Returns NULL invisibly; the function is called for its side effects.
run_gwas <- function(geno_txt, map_txt, pheno_all, analysis, batch) {

    # LOAD GENOTYPE DATA
    # ------------------
    # Load the "mean genotypes"; i.e., the mean alternative allele counts.
    map     <- read.map(map_txt)
    out     <- read.geno.dosage(geno_txt,nrow(map))
    discard <- out$discard
    X_all   <- out$geno
    rm(out)

    # Discard genotype samples from mislabeled flowcell samples.
    X_all <- X_all[which(discard == "no"),]

    # Discard SNPs with low "imputation quality" assessed by inspecting
    # the genotype probabilities. Retain SNPs for which: (1) at least 95%
    # of the samples have a maximum probability genotype greater than
    # 0.5; (2) the minor allele frequency is greater than 2%.
    maf     <- apply(X_all,2,compute.maf)
    markers <- which(map$quality > 0.95 & maf > 0.02)
    map     <- map[markers,]
    X_all   <- X_all[,markers]

    # min_var, max_var - range of genotype columns (of X_all above) analyzed;
    # currently the full range. The values are also part of the output file
    # names, so they are kept even though every column is used.
    min_var <- 1
    max_var <- dim(X_all)[2]

    chromosomes <- unique(map[min_var:max_var,"chr"])

    ##################################
    # Cleanup data
    phenotype  <- analysis$pheno
    covariates <- analysis$cov
    outliers   <- analysis$outliers

    # Work on a local table throughout, so that the outlier removal and the
    # missing-value filter below both take effect on the data that is
    # actually analyzed.
    pheno <- pheno_all
    if (!is.null(outliers))
      pheno <- remove.outliers(pheno,phenotype,covariates,outliers)

    # Only analyze samples (i.e. rows of the genotype and phenotype
    # matrices) for which the phenotype and all the covariates are
    # observed.
    pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),]

    # Align the phenotypes and genotypes on the shared sample ids.
    ids   <- intersect(pheno$id,rownames(X_all))
    pheno <- pheno[match(ids,pheno$id),]
    X     <- X_all[match(ids,rownames(X_all)),]

    # Make sure the output directory exists before anything is written.
    out_dir <- paste0(experiment_dir, "/out_synth")
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

    ###################################
    # Compute using gemma
    # MAP QTLs
    ge_out_dat <- paste0(out_dir, "/ge_batch", batch, "_", phenotype, "_",
                         min_var, "_", max_var, ".dat")
    ge_out_csv <- paste0(out_dir, "/ge_batch", batch, "_", phenotype, "_",
                         min_var, "_", max_var, ".csv")

    if (!file.exists(ge_out_csv)) {
        # Calculate p-values using GEMMA.
        gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,
                                  gemmadir,gemma.exe,chromosomes)

        # Save results to file.
        save(list = c("analysis","gwscan.gemma"),file = ge_out_dat)

        # Add the SNP id and the plain p-value, then write sorted by p.
        named_gws     <- gwscan.gemma
        named_gws$snp <- rownames(named_gws)
        named_gws$p   <- 10 ^ (-named_gws$log10p)

        write.csv(data.table(named_gws)[order(p)], ge_out_csv)
    }

    ###################################
    # Compute using linear model
    lm_out_csv <- paste0(out_dir, "/lm_batch", batch, "_", phenotype, "_",
                         min_var, "_", max_var, ".csv")

    if (!file.exists(lm_out_csv)) {
        n_snps  <- dim(X)[2]
        snp_ids <- colnames(X)
        print(n_snps)

        # The response and covariate columns are the same for every SNP, so
        # build them once. Each covariate keeps its own column name.
        design_base <- cbind(pheno_column = pheno[,phenotype])
        for (cov in covariates) {
            design_base <- cbind(design_base, pheno[,cov])
            colnames(design_base)[ncol(design_base)] <- cov
        }

        p_values <- rep(NA_real_, n_snps)
        for (i in seq_len(n_snps)) {
            X_variant   <- cbind(snp_dosage = X[,i], design_base)
            res_variant <- lm(pheno_column~., data = data.frame(X_variant))
            coefs       <- summary(res_variant)$coefficients

            # summary.lm omits aliased coefficients, so look the SNP term up
            # by name; a SNP that cannot be estimated (e.g. constant in the
            # aligned samples) keeps an NA p-value.
            if ("snp_dosage" %in% rownames(coefs)) {
                p_values[i] <- as.numeric(coefs["snp_dosage",4])
            }
        }

        # Look up chromosome and position of every SNP in one pass.
        map_rows <- match(snp_ids, map[["id"]])
        dt <- data.table(snp = snp_ids,
                         chr = as.numeric(map[map_rows,"chr"]),
                         pos = as.numeric(map[map_rows,"pos"]),
                         p   = p_values)

        # Print to file: every SNP, sorted by ascending p-value (NA last).
        write.csv(dt[order(p)], lm_out_csv)
        print(paste("Sorted p-values saved in: ", lm_out_csv, sep=""))
    }

    invisible(NULL)
}

In [None]:
# Run the GWAS on a single synthetic genome batch.
# Existing output for this batch in out_synth is not regenerated, so clear it
# first if you want fresh results.
# Restart the notebook for each call to run_gwas

batch    <- 2316
geno_txt <- paste0(geno_txt_base, "/synthetic_genomes_batch", batch, ".txt")
map_txt  <- paste0(map_txt_base, "/map_abBMD_batch", batch, ".txt")
run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)

In [None]:
# Alternative: run the GWAS on a merged, larger synthetic genome file.
# Output files already present in out_synth for this phenotype are not
# regenerated, so remove them first if you want new data.
# Restart the notebook for each call to run_gwas
# NOTE - YOU MUST USE THE FILENAMES THAT YOU CREATED AT THE END OF THE 04 NOTEBOOK

batch    <- "all"
geno_txt <- paste0(geno_txt_base, "/synthetic_genomes_allbatches.txt")
map_txt  <- paste0(map_txt_base, "/map_abBMD_allbatches.txt")
run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)