# Command interface 

In [None]:
# Show the command-line help for this workflow notebook.
sos run SOS_weight_cpt_template.ipynb -h

# Working example
The command below runs the pipeline on a minimal working example (MWE) dataset, which can be downloaded from the private repository:
    https://github.com/cumc/neuro-twas/blob/master/WIP/GD462.hsq_succ.test.txt

    sos run SOS_weight_cpt_template.ipynb \
      --GCTA "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/gcta_1.93.2beta_mac/gcta64"\
      --PLINK `which plink` \
      --GEMMA `which gemma` \
      --Rscp  "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/FUSION.compute_weights.mod.R" \
      --Datarep "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/Testing/Data/" \
      --wd  "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/SOS" \
      --OUT_DIR "./WEIGHTS" \
      --BATCH_START 1 \
      --BATCH_END 50 \
      --LDREF  "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/LDREF" \
      --PRE_GEXP "GD462.hsq_succ.test.txt" \
      --PRE_GENO "1000G.EUR"

# Global parameter settings

In [96]:
[global]
# Global configuration for the weight-computation workflow.
# Every value below is only a default and can be overridden on the command
# line (for example `--GCTA /path/to/gcta64`). The defaults are absolute paths
# on the original author's machine and must be adapted before running elsewhere.
#
# MAKE SURE FUSION.compute_weights.R IS IN YOUR PATH
# ALTERNATIVELY TO THE TOOL PATHS BELOW: ENSURE THAT plink, gcta, gemma CAN BE
# CALLED FROM PATH AND REMOVE THE --PATH_* FLAGS IN THE WEIGHT-COMPUTATION STEP.

# Path to the GCTA executable. For Mac users: the Mac version of GCTA must be
# downloaded separately; the one that comes with the FUSION package will not work.
parameter: GCTA = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/gcta_1.93.2beta_mac/gcta64"
# Path to the PLINK executable
parameter: PLINK = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/plink_mac_20200616/plink"
# Path to the GEMMA executable
parameter: GEMMA = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/GEMMA"

# Path to the customized FUSION.compute_weights.mod.R script; this modified
# script is required, otherwise the pipeline will not work.
parameter: Rscp = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/FUSION.compute_weights.mod.R"
# Path to the directory containing the input data
parameter: Datarep = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/Testing/Data/"

# PATH TO WORKING DIRECTORY
parameter: wd = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/SOS"

# PATH TO OUTPUT DIRECTORY (population-specific subdirs will be made)
parameter: OUT_DIR = "./WEIGHTS"

# First row in the matrix to analyze (for batched runs)
parameter: BATCH_START = 1
# Last row in the matrix to analyze (for batched runs)
parameter: BATCH_END = 3


# PATH TO DIRECTORY CONTAINING LDREF DATA (FROM FUSION WEBSITE or https://data.broadinstitute.org/alkesgroup/FUSION/LDREF.tar.bz2)
# THIS IS USED TO RESTRICT INPUT SNPS TO REFERENCE IDS ONLY
parameter: LDREF = "/Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/LDREF"

# FILE NAME OF THE GEUVADIS GENE EXPRESSION MATRIX (looked up inside Datarep).
# GEUVADIS DATA WAS DOWNLOADED FROM https://www.ebi.ac.uk/arrayexpress/experiments/E-GEUV-1/files/analysis_results/
parameter: PRE_GEXP = "GD462.hsq_succ.test.txt"

# PREFIX FOR GEUVADIS GENOTYPES SPLIT BY CHROMOSOME
# SUBSAMPLE THESE TO THE LDREF SNPS FOR EFFICIENCY
parameter: PRE_GENO = "1000G.EUR"

# NOTE: the pipeline steps treat the first four columns of the gene expression
# file as annotation columns and everything from column five onward as
# per-sample values. This is currently hard-coded in the steps; the unfinished
# (nameless) parameter declaration that used to sit here was removed because an
# empty `parameter:` line is not a valid declaration.
# TODO: expose the column layout as a named parameter if it needs to vary.


# Actual pipeline

In [102]:
# Make folder structure for the pipeline
[STEP_1]
bash: expand= "$[ ]"
    # Prepare the working directory: create the batch-specific scratch
    # directories, the output directory and the per-gene directory, then write
    # the sample IDs found in the header of the expression matrix to PRE_GEXPID
    # (one ID per line).

    # Stop immediately if the working directory is not reachable; otherwise
    # everything below would be created in the wrong place.
    cd "$[wd]" || exit 1

    NR="$[BATCH_START]_$[BATCH_END]"

    # -p turns each mkdir into a no-op when the directory already exists, so
    # the step can be re-run without "File exists" failures.
    mkdir -p "tmp/$NR" "hsq/$NR" "out/$NR"
    mkdir -p "$[OUT_DIR]"
    mkdir -p "$[wd]/$[PRE_GEXP]_per_gene"

    # The first four header fields are annotation columns; fields five onward
    # are sample IDs. Printing them one per line directly from awk avoids the
    # leading blank that the previous substr()/fmt combination left on every ID.
    # The explicit "/" makes the path valid whether or not Datarep ends in one.
    head -1 "$[Datarep]/$[PRE_GEXP]" | awk '{ for (i = 5; i <= NF; i++) print $i }' > PRE_GEXPID

    # Show the first few IDs as a sanity check.
    head -3 PRE_GEXPID

In [88]:
# Extract each gene (row) of the expression matrix into its own file so that genes can be processed independently
[STEP_2]
bash: expand= "$[ ]"
    # For every row of the expression matrix with BATCH_START < row number <= BATCH_END
    # (the header is row 1), write that row to <PRE_GEXP>_per_gene/<PRE_GEXP>_<gene>.txt,
    # where <gene> is the first field of the row.

    cd "$[wd]" || exit 1

    # Make sure the destination exists even if this step is run on its own.
    mkdir -p "$[wd]/$[PRE_GEXP]_per_gene"

    # read -r keeps backslashes literal; -a splits the row on whitespace into an
    # array, so the row can be re-joined with single spaces without relying on
    # unquoted expansion (which would also glob-expand characters such as "*").
    awk -v s=$[BATCH_START] -v e=$[BATCH_END] 'NR > s && NR <= e' "$[Datarep]/$[PRE_GEXP]" | while read -r -a FIELDS; do
        GNAME="${FIELDS[0]}"

        # Skip blank rows instead of creating a file with an empty gene name.
        [ -n "$GNAME" ] || continue

        # Fields are written space-separated, one row per file.
        printf '%s\n' "${FIELDS[*]}" > "$[wd]/$[PRE_GEXP]_per_gene/$[PRE_GEXP]_$GNAME.txt"
    done

In [100]:
# Actual weight computation analysis
[STEP_3]
depends: sos_step('STEP_2')
input: f'{wd}/{PRE_GEXP}_per_gene/',group_by=1
bash:expand= "$[ ]"
    # For every per-gene file in the input directory:
    #   1. build a PLINK phenotype file from the gene's expression values,
    #   2. extract the genotypes in a +/- 500kb window around the gene position,
    #   3. run the weight-computation R script on that locus,
    #   4. append the heritability output to hsq/hsq.
    # Each per-gene row is expected to hold: gene name (field 1), chromosome
    # (field 3), position (field 4) and the per-sample values from field 5 on.

    cd "$[wd]" || exit 1

    # Create the directories once, up front. Previously the ALL directory was
    # created inside the loop without -p, which failed from the second gene on.
    mkdir -p tmp hsq "$[OUT_DIR]/ALL"

    cat "$[_input]"/* | while read -r -a FIELDS; do
        GNAME="${FIELDS[0]}"
        CHR="${FIELDS[2]}"
        POS="${FIELDS[3]}"

        # Skip blank rows.
        [ -n "$GNAME" ] || continue

        # Get the gene positions +/- 500kb. The lower bound is clamped at 0 so
        # that genes close to the chromosome start do not yield a negative
        # --from-bp value.
        P0=$(awk -v p="$POS" 'BEGIN { lo = p - 0.5e6; if (lo < 0) lo = 0; printf "%d", lo }')
        P1=$(awk -v p="$POS" 'BEGIN { printf "%d", p + 0.5e6 }')

        OUT="tmp/$[PRE_GEXP].$GNAME"
        echo "$GNAME $CHR $P0 $P1 $OUT"

        head -1 PRE_GEXPID

        ## Pull out the current gene expression phenotype, adding the sample ID
        ## (used as both family ID and individual ID) to the expression profile
        printf '%s\n' "${FIELDS[@]:4}" | paste PRE_GEXPID PRE_GEXPID - > "$OUT.pheno"

        head -1 "$OUT.pheno"

        ## Get the locus genotypes for all samples and set current gene expression as the phenotype.
        ## stdin is redirected from /dev/null so the tool can never consume the
        ## remaining rows that the enclosing while-read loop is reading.
        "$[PLINK]" --bfile "$[LDREF]/$[PRE_GENO].$CHR" \
            --pheno "$OUT.pheno" \
            --make-bed \
            --out "$OUT" \
            --chr "$CHR" \
            --from-bp "$P0" \
            --to-bp "$P1" \
            --extract "$[LDREF]/$[PRE_GENO].$CHR.bim" \
            --keep "$OUT.pheno" \
            --allow-no-sex < /dev/null

        ## Process all samples together (for reference purposes only since this is multi-ethnic data)
        FINAL_OUT="$[OUT_DIR]/ALL/ALL.$GNAME"

        echo "$[Rscp]"

        Rscript "$[Rscp]" \
            --bfile "$OUT" \
            --tmp "$OUT.tmp" \
            --out "$FINAL_OUT" \
            --verbose 0 \
            --save_hsq \
            --PATH_gcta "$[GCTA]" \
            --PATH_gemma "$[GEMMA]" \
            --models blup,lasso,top1,enet < /dev/null

        ## Append heritability output to hsq file (only if the R script produced one)
        if [ -f "$FINAL_OUT.hsq" ]; then
            cat "$FINAL_OUT.hsq" >> hsq/hsq
        else
            echo "warning: no heritability output for $GNAME ($FINAL_OUT.hsq not found)" >&2
        fi

        ## Clean-up just in case
        rm -f "$FINAL_OUT.hsq" "$OUT".tmp.*

        # NOTE: the PLINK intermediates under tmp/ are intentionally left in place.
        echo "end of circle"
    done


In [104]:
%sosrun



 HG00096
 HG00097
 HG00099


ENSG00000136237.12 7 21896763 22896763 tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12 tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12
 HG00096
 HG00096	 HG00096	3.78850273139249
PLINK v1.90b6.18 64-bit (16 Jun 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12.log.
Options in effect:
  --allow-no-sex
  --bfile /Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/LDREF/1000G.EUR.7
  --chr 7
  --extract /Users/haosun/Documents/WG_Reasearch_Assisstant/Fusion/install/fusion_twas-master/LDREF/1000G.EUR.7.bim
  --from-bp 21896763
  --keep tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12.pheno
  --make-bed
  --out tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12
  --pheno tmp/GD462.hsq_succ.test.txt.ENSG00000136237.12.pheno
  --to-bp 22896763

16384 MB RAM detected; reserving 8192 MB for main workspace.
673 out of 66171 variants loaded fr

In [None]:
GD462.hsq_succ.test.txt_per_gene