In [1]:
import os
import shlex
import shutil
import subprocess
import sys

import pandas as pd

In [2]:
def shell_do(command, log=False, return_log=False):
    """Run a command without a shell and optionally print/return its stdout.

    Parameters
    ----------
    command : str
        Command line to execute. It is tokenised with ``shlex.split`` so that
        quoted arguments (for example paths containing spaces) stay intact.
        No shell is involved, so pipes, redirects and globs are not expanded.
    log : bool, default False
        If True, print the captured stdout.
    return_log : bool, default False
        If True, return the captured stdout as a ``str``.

    Returns
    -------
    str or None
        Decoded stdout when ``return_log`` is True, otherwise ``None``.

    Notes
    -----
    Only stdout is captured; the child's stderr goes wherever the parent's
    does. A non-zero exit status is reported on stderr but does not raise.
    """
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    # shlex honours quoting, unlike str.split which breaks '"a b"' in two.
    argv = shlex.split(command)
    res = subprocess.run(argv, stdout=subprocess.PIPE)

    if res.returncode != 0:
        # Surface failures instead of dropping them; callers that ignore the
        # return value would otherwise never notice.
        print(f'Warning: command exited with status {res.returncode}', file=sys.stderr)

    if not (log or return_log):
        return None

    # Decode lazily so undecodable output only matters when it is requested.
    output = res.stdout.decode('utf-8')
    if log:
        print(output)
    if return_log:
        return output

In [None]:
# One-time setup: clone the FUSION TWAS repository. With no target directory given,
# git creates ./fusion_twas, which matches the relative `fusiondir` used by later cells.
# NOTE(review): later cells also expect `LDREF/` and `gcta_nr_robust` under that
# directory; the clone alone may not provide all of them -- confirm.
!git clone https://github.com/gusevlab/fusion_twas.git

In [3]:
# --- Paths and external tools ---------------------------------------------
# Project root and the folder holding the prepared inputs.
basedir = '/data/songy4/TWAS'
datadir = f'{basedir}/data_folder'

# FUSION checkout, relative to the notebook's working directory.
fusiondir = 'fusion_twas'

# PLINK fileset prefix; the pipeline cell appends '_hg19_lifted' to it.
geno_path = f'{datadir}/id_comparison/qc_genotypes_twas'
# Tab-separated table whose `ID` column lists the genes to process.
gene_list_path = f'{datadir}/final_folder/gene_list.txt'
# Tab-separated expression matrix with FID, IID and one column per gene.
pheno_path = f'{datadir}/final_folder/expression_matrix_final.txt'
# Tab-separated gene coordinates (columns ID, X.Chr, start, end).
coord_path = f'{datadir}/final_folder/twas_coordinate.txt'

# Executables handed to FUSION.compute_weights.R.
gcta = f'{fusiondir}/gcta_nr_robust'
# Alternative: use a `gcta` binary resolved from PATH instead.
# gcta = 'gcta'
# NOTE(review): bare file name -- presumably resolved from the working
# directory or PATH when the job runs; confirm.
gemma = 'gemma-0.98.3-linux-static'

# LD reference prefix; per-chromosome files are '<prefix>.<chr>.bim'.
fusion_ldref_basename = f'{fusiondir}/LDREF/1000G.EUR'
fusion_compute_weights_script = f'{fusiondir}/FUSION.compute_weights.R'

# Output folders, relative to the working directory. os.makedirs replaces the
# GNU-only `mkdir --parents` shell-out and is a no-op when they already exist.
os.makedirs('output/weights', exist_ok=True)
os.makedirs('output/tmp', exist_ok=True)


# Pipeline for TWAS run

In [4]:
# Load the three tab-separated inputs used by the pipeline cell.
coords = pd.read_csv(coord_path, sep='\t')
pheno = pd.read_csv(pheno_path, sep='\t')
gene_list_df = pd.read_csv(gene_list_path, sep='\t')

# Genes to process, taken from the `ID` column of the gene list.
gene_list = [gene_id for gene_id in gene_list_df.ID]
# For a quick test run, override with a small subset, e.g.:
# gene_list = ['ENSG00000186092', 'ENSG00000187634', 'ENSG00000188976']
# gene_list = ['ENSG00000188976']

In [5]:
# Build the swarm file: one line per gene of the form
#   <plink: subset genotypes around the gene> && <FUSION: compute weights>
# and write one per-gene phenotype file alongside it.

compweights_swarmfile = f'{basedir}/compute_weights.swarm'

# Number of base pairs added on each side of the gene when selecting SNPs.
CIS_WINDOW_BP = 500_000

with open(compweights_swarmfile, 'w') as swarm_fh:

    for gene in gene_list:
        OUT = f'output/tmp/{gene}'            # per-gene working prefix
        FINAL_OUT = f'output/weights/{gene}'  # prefix given to FUSION --out

        # Scan `coords` once per gene and fail with a clear message when the
        # gene is missing or duplicated (Series.item() would raise a bare
        # ValueError in both cases).
        gene_coords = coords.loc[coords.ID == gene]
        if len(gene_coords) != 1:
            raise ValueError(
                f'expected exactly one coordinate row for {gene}, '
                f'found {len(gene_coords)}'
            )

        # NOTE(review): a chromosome without a matching LDREF .bim file
        # (e.g. MT, see the open questions below) presumably makes the plink
        # step fail -- confirm how such genes should be handled.
        _chr = gene_coords['X.Chr'].item()
        # Window around the gene, clamped so the lower bound is never negative.
        _start = max(0, int(gene_coords['start'].item() - CIS_WINDOW_BP))
        _stop = int(gene_coords['end'].item() + CIS_WINDOW_BP)

        # Headerless FID / IID / expression file, used for both --pheno and --keep.
        pheno[['FID', 'IID', gene]].to_csv(f'{OUT}.pheno', sep='\t', header=False, index=False)

        plink_cmd = ' '.join([
            f'plink --bfile {geno_path}_hg19_lifted',
            f'--pheno {OUT}.pheno',
            f'--keep {OUT}.pheno',
            f'--chr {_chr}',
            f'--from-bp {_start}',
            f'--to-bp {_stop}',
            f'--extract {fusion_ldref_basename}.{_chr}.bim',
            '--make-bed',
            f'--out {OUT}',
        ])

        fusion_cmd = ' '.join([
            f'Rscript {fusion_compute_weights_script}',
            f'--bfile {OUT}',
            f'--tmp {OUT}.tmp',
            f'--out {FINAL_OUT}',
            f'--PATH_gemma {gemma}',
            '--PATH_plink plink',
            f'--PATH_gcta {gcta}',
            '--verbose 2',
            '--save_hsq',
            '--models top1,lasso,enet',
        ])

        # FUSION only runs if the plink subset succeeded.
        swarm_fh.write(f'{plink_cmd} && {fusion_cmd}\n')
    # No explicit close(): the `with` block closes the file on exit.

In [7]:
# Submit the generated command file as a swarm job.
# NOTE(review): this loads the GEMMA/0.96 module while the FUSION command is
# given `gemma-0.98.3-linux-static` via --PATH_gemma -- confirm which binary
# is meant to be used.
swarm_args = [
    'swarm',
    f'-f {compweights_swarmfile}',
    '-g 16',
    '-t 16',
    '--time=10:00:00',
    '--logdir swarm',
    '--gres=lscratch:20',
    '--module plink,GEMMA/0.96',
    '--partition=norm',
]
swarm_cmd = ' '.join(swarm_args)
# Echo the exact command before submitting so it is recorded in the output.
print(swarm_cmd)
!{swarm_cmd}

swarm -f /data/songy4/TWAS/compute_weights.swarm -g 16 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --module plink,GEMMA/0.96 --partition=norm
5529448


# Running list of questions
1. What about non-autosomal transcripts (e.g., MT)?
2. How many SNPs are in common between the reference (LDREF) and the genotype data?
3. How many SNPs per transcript should we expect (0–500?)


In [44]:
# Scratch cell: render (without running) a GEMMA command line for one test gene.
p_in = 'tmp/ENSG00000188976.tmp.cv'
' '.join([
    'gemma',
    '-miss 1',
    '-maf 0',
    '-r2 1',
    '-rpace 1000',
    '-wpace 1000',
    f'-bfile {p_in}',
    '-bslmm 2',
    '-o ENSG00000188976',
])

'gemma -miss 1 -maf 0 -r2 1 -rpace 1000 -wpace 1000 -bfile tmp/ENSG00000188976.tmp.cv -bslmm 2 -o ENSG00000188976'