In [None]:
import subprocess
import sys
import os
import shutil
import pandas as pd

In [None]:
def shell_do(command, log=False, return_log=False):
    """Run an external command and optionally surface its captured stdout.

    The command string is tokenised with ``str.split`` and executed without a
    shell, so shell features (pipes, globbing, quoting) are not available and
    arguments containing whitespace cannot be expressed.

    Args:
        command: Full command line as a single whitespace-delimited string.
        log: If True, print the captured stdout after the command finishes.
        return_log: If True, return the captured stdout as a ``str``.

    Returns:
        The UTF-8 decoded stdout when ``return_log`` is True, otherwise None.
    """
    argv = command.split()
    print(f'Executing: {" ".join(argv)}', file=sys.stderr)

    res = subprocess.run(argv, stdout=subprocess.PIPE)

    # Surface failures: without this a tool that exits non-zero goes unnoticed
    # and later cells continue with missing or stale output files. Only a
    # warning is emitted (no exception) so existing callers keep working.
    if res.returncode != 0:
        print(
            f'Warning: command exited with status {res.returncode}: {argv[0]}',
            file=sys.stderr,
        )

    # Decode lazily, only when the text is actually requested.
    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return res.stdout.decode('utf-8')

In [None]:
# Filesystem layout used by the rest of the notebook.
# Everything under `datadir` hangs off `basedir`; `fusiondir` lives in a
# separate tree.
basedir = '/data/songy4/TWAS'
datadir = basedir + '/data_folder'
fusiondir = '/data/vitaled2/twas/fusion_twas'

# plink fileset prefix (the .bed/.bim/.fam suffixes are appended where needed)
geno_path = datadir + '/id_comparison/qc_genotypes_twas'

# inputs stored in the final_folder directory
gene_list_path = datadir + '/final_folder/gene_list.txt'
pheno_path = datadir + '/final_folder/expression_matrix_final.txt'
coord_path = datadir + '/final_folder/twas_coordinate.txt'

# Lift over genotype coordinates from hg38 to hg19 to match the 1000 Genomes (1kG) LD reference

In [None]:
# Export every variant in the genotype .bim as a BED-like interval
# (chrN, pos, pos + 1, rsid) so liftOver can map it from hg38 to hg19
# (the build of the 1kG LD reference).
# NOTE(review): BED intervals are conventionally 0-based half-open while .bim
# positions are presumably 1-based; here `pos` is used directly as the start
# column -- confirm this convention is intended before changing it, since the
# cell that reads the lifted file takes the start column as the new position.
lift_outname = f'{basedir}/geno_hg38_positions.bed'

bim = pd.read_csv(
    f'{geno_path}.bim',
    sep='\t',
    header=None,
    names=['chr', 'rsid', 'kb', 'pos', 'a1', 'a2'],
)
# Prefix the chromosome code with 'chr' and append the interval end.
bim = bim.assign(
    chr='chr' + bim['chr'].astype('str'),
    end=bim['pos'] + 1,
)

lift_out = bim.loc[:, ['chr', 'pos', 'end', 'rsid']].copy()
lift_out.to_csv(lift_outname, sep='\t', index=False, header=False)

In [None]:
# One-time setup: download the liftOver binary and the hg38-to-hg19 chain file from UCSC (uncomment to run)
# !wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver -P /data/vitaled2/twas/liftover/
# !wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz' -O /data/vitaled2/twas/liftover/hg38ToHg19.over.chain.gz
# !chmod +x /data/vitaled2/twas/liftover/liftOver

In [None]:
# Build and run the liftOver command (hg38 -> hg19).
# The binary is addressed by absolute path, in the same directory as the chain
# file and the download target of the setup cell. The previous value started
# with './', which only resolved when the working directory was '/'.
liftOver = '/data/vitaled2/twas/liftover/liftOver'
chainfile = '/data/vitaled2/twas/liftover/hg38ToHg19.over.chain.gz'
liftover_output = f'{basedir}/geno_hg19_positions.bed'  # successfully lifted intervals
unlifted = f'{basedir}/unlifted.bed'                    # intervals liftOver could not map

# Argument order passed to liftOver: input BED, chain file, lifted output, unmapped output.
liftover_cmd = f'{liftOver} {lift_outname} {chainfile} {liftover_output} {unlifted}'

shell_do(liftover_cmd)
liftover_cmd

In [None]:
# Build a .bim-shaped table carrying the hg19 positions.
# `lifted` holds the liftOver output; `bim` is the original genotype map.
lifted = pd.read_csv(
    liftover_output,
    sep='\t',
    header=None,
    names=['chr', 'pos', 'end', 'rsid'],
)
bim = pd.read_csv(
    f'{geno_path}.bim',
    sep='\t',
    header=None,
    names=['chr', 'rsid', 'kb', 'pos', 'a1', 'a2'],
)

# Right join keeps exactly the variants present in the liftOver output.
# Overlapping column names get the default suffixes: _x (bim) and _y (lifted).
bim_lift_merge = pd.merge(bim, lifted, how='right', on='rsid')

# Keep the original chromosome code (chr_x) and take the lifted start (pos_y)
# as the new position; all other columns come from the original .bim.
lifted_bim = (
    bim_lift_merge
    .loc[:, ['chr_x', 'rsid', 'kb', 'pos_y', 'a1', 'a2']]
    .rename(columns={'chr_x': 'chr', 'pos_y': 'pos'})
    .copy()
)

In [None]:
# Restrict the genotype fileset to the variants that were lifted:
# write their rsids (one per line) and have plink extract them into a new
# fileset with the _hg19_lifted suffix.
lifted_bim.to_csv(
    f'{geno_path}_hg19_lifted.snplist',
    columns=['rsid'],
    sep='\t',
    index=False,
    header=False,
)

plink_extract_cmd = (
    f'plink --bfile {geno_path}'
    f' --extract {geno_path}_hg19_lifted.snplist'
    ' --make-bed'
    f' --out {geno_path}_hg19_lifted'
)

shell_do(plink_extract_cmd)


In [None]:
# Keep the plink-written .bim (still carrying the pre-liftover positions) under a new name.
# shutil.move is used instead of a `!mv` shell magic so that a failed move
# raises here, rather than passing silently and letting the next cell
# overwrite the only copy of the old-position .bim.
shutil.move(
    f'{geno_path}_hg19_lifted.bim',
    f'{geno_path}_hg19_lifted_old_positions.bim',
)

In [None]:
# Write the lifted positions as the .bim of the _hg19_lifted fileset
# (tab-separated, no header, no index).
# NOTE(review): the rows presumably need to be in the same order as the
# variants in the plink-written .bed -- confirm rsids are unique and that the
# merge kept the original order.
lifted_bim.to_csv(
    path_or_buf=f'{geno_path}_hg19_lifted.bim',
    sep='\t',
    index=False,
    header=False,
)