In [7]:
import os
import re

from pyliftover import LiftOver


In [2]:
## Convert the hg19 VCF to hg38 coordinates (liftover)

In [3]:
# Input VCF in hg19 coordinates; this is the file whose records get lifted over.
hg19_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg19/all.coding.sorted.02maf.snps.vcf'

In [4]:
# hg19 -> hg38 coordinate converter, reused for every record in the conversion loop.
# NOTE(review): pyliftover presumably fetches the chain file on first use — confirm
# that the kernel has network access or a cached copy.
lo = LiftOver('hg19', 'hg38')

In [6]:
def get_headerless_vcf_fobj(fp):
    """Open a VCF and advance the file object past its header.

    Lines are consumed from ``fp`` and accumulated until the ``#CHROM`` column
    line has been read (that line is included in the returned header text).

    Args:
        fp: path of a plain-text VCF file.

    Returns:
        ``(f, header)`` where ``f`` is the still-open file object positioned on
        the first line after ``#CHROM`` (the caller is responsible for closing
        it) and ``header`` is the text consumed so far. Returns ``None`` when
        the file contains no ``#CHROM`` line; the file is closed in that case.
    """
    header = ''
    f = open(fp)
    try:
        for line in f:
            header += line
            if line.startswith('#CHROM\t'):
                # Hand the open handle to the caller, positioned on the first record.
                return f, header
    except BaseException:
        # Do not leak the handle if reading fails part-way through.
        f.close()
        raise

    # No column line found: nothing useful to hand back, so release the handle.
    f.close()
    return None

In [6]:
# Spot-check the converter on a single locus; the result is a list of hits and
# each hit starts with (chromosome, position), which is what the conversion loop uses.
lo.convert_coordinate('chr1', 1000000)

[('chr1', 1064620, '+', 20851231461)]

In [7]:
# Open the hg19 VCF: `header` holds every line through #CHROM, and `vcf_f_obj` is
# left open, positioned on the first record, for the conversion loop.
vcf_f_obj, header = get_headerless_vcf_fobj(hg19_vcf)

In [8]:
# Lift every record from hg19 to hg38; all columns after POS are carried over untouched.
# NOTE(review): the lifted chromosome keeps the converter's 'chr' prefix while `header`
# is written out unchanged — confirm the contig names still agree downstream.
# NOTE(review): hits on the '-' strand would presumably need REF/ALT
# reverse-complemented; that is not handled here — confirm whether it matters.
lines = []
n_unmapped = 0  # records dropped because they have no hg38 equivalent
for i, line in enumerate(vcf_f_obj):
    if i % 5000 == 0:
        print(i)
    chrom, pos, rest = line.split('\t', 2)

    # pyliftover uses 0-based coordinates whereas VCF POS is 1-based, so shift
    # down for the lookup and back up when writing the new position.
    hits = lo.convert_coordinate('chr' + chrom, int(pos) - 1)

    # None -> chromosome not present in the chain file; [] -> locus has no hg38 match.
    if not hits:
        n_unmapped += 1
        continue

    new_chrom, new_pos = hits[0][0], hits[0][1]
    lines.append(new_chrom + '\t' + str(new_pos + 1) + '\t' + rest)

# The records are all in memory now; release the input handle.
vcf_f_obj.close()
        

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000


In [9]:
# Destination for the lifted-over records (same file name, under the hg38 directory).
hg38_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg38/all.coding.sorted.02maf.snps.vcf'

In [10]:
# Write the original header followed by the lifted records. The context manager
# closes the file even if a write fails, and writelines avoids building one
# giant string out of all records first.
with open(hg38_vcf, 'w') as f:
    f.write(header)
    f.writelines(lines)

## Add the `chr` prefix to VCF chromosome names

In [8]:
# Same hg19 input path as in the liftover section; the hg38 counterpart is left disabled.
hg19_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg19/all.coding.sorted.02maf.snps.vcf'
# hg38_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg38/all.coding.sorted.02maf.snps.vcf'

In [9]:
# chr_regex = re.compile(r'^(^chr)')

def add_chr_to_lines(fobj):
    """Prefix each record line with ``chr`` so chromosome names read ``chrN``.

    Lines that already start with ``chr`` and blank lines are passed through
    unchanged, so running this on an already-prefixed file does not produce
    ``chrchrN`` and a stray empty line does not become a bogus ``chr`` record.

    Args:
        fobj: iterable of record lines (for example an open file positioned
            after the VCF header).

    Returns:
        A list with one output line per input line, in the same order.
    """
    lines = []
    for line in fobj:
        if line.startswith('chr') or not line.strip():
            lines.append(line)
        else:
            lines.append('chr' + line)
    return lines

def add_chr_to_vcf(vcf_fp):
    """Write a copy of a VCF whose chromosome names carry a ``chr`` prefix.

    Both the header's ``contig=<ID=...>`` entries and the record lines are
    prefixed. The result is written next to the input, with the ``.vcf`` in the
    file name replaced by ``.chr.vcf``.

    Args:
        vcf_fp: path of a plain-text VCF file.

    Raises:
        ValueError: if the file has no ``#CHROM`` header line.
    """
    parsed = get_headerless_vcf_fobj(vcf_fp)
    if parsed is None:
        raise ValueError(f'no #CHROM header line found in {vcf_fp}')
    f_obj, header = parsed

    # Read everything before opening the output; `with` closes the input even on error.
    with f_obj:
        # (?!chr) leaves contig IDs that are already prefixed alone.
        header = re.sub(r'contig=<ID=(?!chr)([^>]+)>', r'contig=<ID=chr\1>', header)
        body = ''.join(add_chr_to_lines(f_obj))

    # Rewrite only the last '.vcf' of the file name: a blanket str.replace would
    # also mangle directory names, and a path without '.vcf' would map onto
    # itself and overwrite the input.
    directory, file_name = os.path.split(vcf_fp)
    stem, ext, tail = file_name.rpartition('.vcf')
    if ext:
        out_name = stem + '.chr.vcf' + tail
    else:
        out_name = file_name + '.chr.vcf'
    out_fp = os.path.join(directory, out_name)

    with open(out_fp, 'w') as out_f:
        out_f.write(header + body)

In [10]:
# Writes the prefixed copy next to the input as '*.chr.vcf'; the hg38 run is left disabled.
add_chr_to_vcf(hg19_vcf)
# add_chr_to_vcf(hg38_vcf)

## Generate a BED file of variant positions from a VCF

In [15]:
# VCF whose positions are exported to BED: the un-prefixed hg19 file, not the
# '.chr.vcf' copy. The hg38 counterpart is left disabled.
hg19_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg19/all.coding.sorted.02maf.snps.vcf'
# hg38_vcf = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg38/all.coding.sorted.02maf.snps.chr.vcf'



In [16]:
# Output BED path for the hg19 positions; the hg38 counterpart is left disabled.
hg19_bed = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg19/all.coding.sorted.02maf.snps.bed'
# hg38_bed = '/gscmnt/gc2508/dinglab/estorrs/ancestry/data/hg38/all.coding.sorted.02maf.snps.chr.bed'





In [17]:
def bed_from_vcf(vcf_fp, bed_fp):
    """Write the distinct variant positions of a VCF as a sorted BED file.

    Each distinct ``(CHROM, POS)`` pair becomes one single-base BED interval.
    BED coordinates are 0-based and half-open, so a variant at 1-based VCF
    position ``P`` is written as ``P-1 .. P``.

    Args:
        vcf_fp: path of a plain-text VCF file to read.
        bed_fp: path of the BED file to create or overwrite.

    Raises:
        ValueError: if the VCF has no ``#CHROM`` header line.
    """
    parsed = get_headerless_vcf_fobj(vcf_fp)
    if parsed is None:
        raise ValueError(f'no #CHROM header line found in {vcf_fp}')
    f_obj, _header = parsed

    # A set collapses multiple records at the same site into one interval.
    positions = set()
    with f_obj:
        for line in f_obj:
            if not line.strip():
                # Tolerate stray blank lines rather than failing on the unpack.
                continue
            chrom, pos, _ = line.split('\t', 2)
            positions.add((chrom, int(pos)))

    # Tuples sort by chromosome name (as text), then numerically by position.
    with open(bed_fp, 'w') as f:
        for chrom, pos in sorted(positions):
            f.write(f'{chrom}\t{pos - 1}\t{pos}\n')

In [18]:
# Generate the hg19 BED; the hg38 run is left disabled.
bed_from_vcf(hg19_vcf, hg19_bed)
# bed_from_vcf(hg38_vcf, hg38_bed)