## SoS Workflow:

These are the options and the SoS code for running the LDSC pipeline on your own data.

## Command Interface:

In [10]:
!sos run LDSC_DeepSea_Code.ipynb -h

No help information is available for script run: Failed to locate LDSC_DeepSea_Code.ipynb.sos


## Train Model:

In [93]:

[train_model]

# Launch the tutorial training script inside the deepsea_latest.sif container.
bash: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'

    train_script=/mnt/mfs/statgen/Anmol/training_files/tutorial/run_neuron_full_tutorial.py
    python3.7 "$train_script"

## Make Full Annotation File Based on Trained Model

In [None]:

[make_annot]

# Path to the feature list file
parameter: feature_list = str
# Path to the trained model file
parameter: model = str
# Path to the output directory
parameter: output = str

# expand="${ }" is required for SoS to substitute the step parameters into the
# script; the "${ }" sigil matches the R steps of this workflow. The `!r`
# conversion inserts each path as a quoted Python string literal.
python3: expand = "${ }", container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'

    from selene_sdk.utils import load_path
    from selene_sdk.utils import parse_configs_and_run
    from selene_sdk.predict import AnalyzeSequences
    from selene_sdk.sequences import Genome
    from selene_sdk.utils import load_features_list
    from selene_sdk.utils import NonStrandSpecific
    from selene_sdk.utils import DeeperDeepSEA
    import glob
    import os

    distinct_features = load_features_list(${feature_list!r})

    # The number of model outputs is taken from the feature list itself, so no
    # separate num_features parameter is needed for this step.
    model_predict = AnalyzeSequences(
        NonStrandSpecific(DeeperDeepSEA(1000, len(distinct_features))),
        ${model!r},
        sequence_length=1000,
        features=distinct_features,
        reference_sequence=Genome("/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta"),
        use_cuda=False  # set to True only if CUDA is available on your machine
    )

    # Chromosomes 1 through 22 inclusive (range's upper bound is exclusive);
    # the format_annot step expects output for all 22 chromosomes.
    for i in range(1, 23):
        model_predict.variant_effect_prediction(
            "/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_" + str(i) + ".vcf",
            save_data=["abs_diffs"],  # only want to save the absolute diff score data
            output_dir=${output!r})

## Format Annotation File

In [None]:

[format_annot]

# Path to the directory holding the per-chromosome *_abs_diffs.tsv files
parameter: tsv = path()
# Path to the directory where the annotation files are written
parameter: output = path()

R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif"
    library(data.table)
    library(tidyverse)

    # Build the feature-name -> feat_<k> lookup from the chromosome 22 file.
    # Columns 1-8 are treated as non-feature columns; features start at column 9.
    data = fread(paste0("${tsv}","/tutorial_1000G_chr_",22,"_abs_diffs.tsv"))
    features = colnames(data)[9:ncol(data)]
    features = data.frame(features)
    features$encoding = paste0("feat_",seq(1,nrow(features)))
    fwrite(features,paste0("${output}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T)

    for (i in seq(1,22)){
        data = fread(paste0("${tsv}","/tutorial_1000G_chr_",i,"_abs_diffs.tsv"))
        # Drop columns 4-8 so that columns 1-3 are the variant identifiers and
        # columns 4.. are the feature scores.
        data_2 = select(data,-seq(4,8))

        # Base annotation: a single column of 1s, one row per variant.
        base = data.frame(base=rep(1,nrow(data_2)))
        fwrite(base,paste0("${output}","/base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T)

        for (j in seq(4,ncol(data_2))){
            # j is a column position in data_2; the feature number is j - 3 so
            # that file and column names run feat_1..feat_N, matching
            # feature_encoding.txt and the feat_<1..num_features> files read by
            # the calc_ld_score step.
            feat_id = j - 3
            data_3 = select(data_2,c(1,2,3,j))
            colnames(data_3) = c("CHR","BP","SNP",paste0("feat_",feat_id))
            data_3 = setorder(data_3,BP)
            # Thin annotation: keep only the feature column.
            data_3 = select(data_3,-c("CHR","BP","SNP"))
            fwrite(data_3,paste0("${output}","/feat_",feat_id,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T)
        }
    }

## Munge Summary Statistics (Option 1: No Signed Summary Statistic):

In [None]:
# This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta).
# In this case, the program will calculate Z for you, assuming A1 is the risk allele.
[munge_sumstats_no_sign]

# Path to the summary statistic file
parameter: sumst = str
# Path to the HapMap3 SNPs file; keep all columns (SNP, A1, and A2) for the munge_sumstats program
parameter: alleles = "w_hm3.snplist"
# Path to the output file
parameter: output = str

# expand=True is required for SoS to substitute {sumst}, {alleles} and {output};
# without it the braces are passed to the shell literally.
# --a1-inc is passed because this step is for files without a signed statistic.
bash: expand = True
    python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc

## Munge Summary Statistics (Option 2: Signed Summary Statistic):

In [None]:
# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)
[munge_sumstats_sign]

# Path to the summary statistic file
parameter: sumst = str
# Path to the HapMap3 SNPs file; keep all columns (SNP, A1, and A2) for the munge_sumstats program
parameter: alleles = "w_hm3.snplist"
# Path to the output file
parameter: output = str

# expand=True is required for SoS to substitute {sumst}, {alleles} and {output};
# without it the braces are passed to the shell literally.
bash: expand = True
    python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output}

## Calculate LD Scores:

**Make sure to delete the SNP, CHR, and BP columns from the annotation files if they are present; otherwise this code will not work. If these columns are present, make sure the annotation file is sorted before deleting them.**

In [None]:

[calc_ld_score]

# Path to the directory with the bim files
parameter: bim = path()
# Path to the directory with the annotation files; output is written here too.
# Remove the SNP, CHR, and BP columns from the annotation files, if present, before running.
parameter: annot_files = path()
# Number of features (feat_1 .. feat_<num_features>)
parameter: num_features = int

# The script uses bash's own ${var} and {1..22} syntax, so SoS parameters are
# substituted with the non-conflicting "$[ ]" sigil: $[name] is filled in by SoS,
# while ${i}, ${j} and {1..22} are left for bash.
bash: expand = "$[ ]"
    # LD scores for every feature annotation, per chromosome.
    for i in $(seq 1 $[num_features]); do
        for j in {1..22}; do
            python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile $[bim]/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot $[annot_files]/feat_${i}_chr_${j}.annot.gz --thin-annot --out $[annot_files]/feat_${i}_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt
        done
    done
    # LD scores for the base annotation, per chromosome.
    for j in {1..22}; do
        python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile $[bim]/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot $[annot_files]/base_chr_${j}.annot.gz --thin-annot --out $[annot_files]/base_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt
    done

## Convert LD Score SNPs to AD Summary Statistic Format:

In [None]:

[convert_ld_snps]

# Path to the directory with the LD score files AND the annotation files
parameter: ld_scores = str
# Number of features (feat_1 .. feat_<num_features>)
parameter: num_features = int


R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif"
    library(tidyverse)
    library(data.table)

    # Rewrites one set of LD score files (<prefix>.l2.ldscore.gz, .l2.M_5_50,
    # .annot.gz) as AD_<prefix>.* with the SNP column replaced by "CHR:BP".
    convert_one = function(prefix){
        in_base = paste0("${ld_scores}/",prefix)
        out_base = paste0("${ld_scores}/AD_",prefix)
        # header=TRUE is required: without it the columns are named V1, V2, ...
        # so data$CHR / data$BP are NULL and the header line is treated as data.
        data = read.table(gzfile(paste0(in_base,".l2.ldscore.gz")),header=TRUE)
        data_2 = fread(paste0(in_base,".l2.M_5_50"))
        data_3 = read.table(gzfile(paste0(in_base,".annot.gz")),header=TRUE)
        data$SNP = paste0(data$CHR,":",data$BP)
        fwrite(data,paste0(out_base,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T)
        fwrite(data_2,paste0(out_base,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F)
        fwrite(data_3,paste0(out_base,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T)
    }

    for (i in seq(1,22)){
        convert_one(paste0("base_chr_",i))
        for (j in seq(1,${num_features})){
            convert_one(paste0("feat_",j,"_chr_",i))
        }
    }
  


## Calculate Functional Enrichment using Annotations:

In [None]:
#Calculate Enrichment Scores for Functional Annotations

[calc_enrichment]

# Path to the summary statistics file
parameter: sumstats = str
# Path to the reference LD score files (base annotation + annotation you want to analyze; format like the minimal working example)
parameter: ref_ld = str
# Path to the LD weight files (format like the minimal working example)
parameter: w_ld = str
# Path to the frequency files (format like the minimal working example)
parameter: frq_file = str
# Output name
parameter: output = str

# expand=True is required for SoS to substitute the {parameter} placeholders;
# without it the braces are passed to the shell literally.
bash: expand = True
    python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output}