In [1]:
# Merges the per-sample VCFs output by SV2 into one table, keeping only DUP/DEL calls whose FILTER is PASS.

In [2]:
import pandas as pd

# Allow up to 1000 characters per column when displaying DataFrames so that
# long string values are not truncated in notebook output.
pd.set_option('display.max_colwidth', 1000)

In [3]:
# Pedigree table (tab-separated, no header row, so columns are numbered).
# Column 1 supplies the sample IDs that are used below to build VCF file names.
fam = pd.read_csv(
    '/data3/16p12_WGS/parsing_cnv_callers/sv2/all_batches.ped',
    sep='\t',
    header=None,
)

# Exclude SG047_batch3 from the cohort
# NOTE(review): the reason for dropping this sample is not recorded here.
fam = fam.loc[fam[1].ne('SG047_batch3')].copy()

# Plain Python list of the remaining sample IDs
samples = fam[1].tolist()

In [4]:
# load in all vcfs
# Collect one DataFrame per sample and concatenate once at the end.
# Growing a frame with DataFrame.append inside the loop re-copies everything
# accumulated so far on every iteration, and DataFrame.append no longer exists
# in pandas >= 2.0.
sample_vcfs = []

for samp in samples:
    vcffile = '/data5/16p12_WGS/structural_variants/sv2/sv2_genotypes/{}.vcf'.format(samp)

    # load in sample VCF file; the '#' header lines are skipped, so the columns
    # come back numbered by position.
    # NOTE(review): comment='#' also truncates a data line at any '#' it
    # contains -- confirm SV2 never writes '#' inside a record.
    samp_vcf = pd.read_csv(vcffile, sep='\t', comment='#', header=None)

    # add column for sample
    samp_vcf['sample'] = samp

    # queue for the single concat below
    sample_vcfs.append(samp_vcf)

# Each per-sample frame keeps its own row index, exactly as repeated append did.
# An empty sample list still yields an empty DataFrame instead of raising.
vcf = pd.concat(sample_vcfs) if sample_vcfs else pd.DataFrame()

In [5]:
# Give the positional VCF columns readable names (the trailing 'sample' column
# was added at load time). Assignment is positional, so the list must have
# exactly one name per existing column.
# NOTE(review): 'record' is presumably the per-sample genotype column -- confirm.
vcf.columns = [
    'chrom', 'pos', 'id', 'ref', 'alt',
    'qual', 'filter', 'info', 'format',
    'record',
    'sample',
]

In [6]:
# get END, SVLEN, SVTYPE, DENOVO_FILTER, GENES from info column
def get_info(s, info_item):
    """Return the value stored under one key of a VCF INFO string.

    Parameters
    ----------
    s : str
        Semicolon-separated INFO field, e.g. ``'END=200;SVTYPE=DEL'``.
    info_item : str
        Key to look up. The match is exact and case-sensitive.

    Returns
    -------
    str or None
        The text after the first ``=`` of the first entry whose key equals
        ``info_item``; ``''`` when that entry is a bare flag with no ``=``;
        ``None`` when the key is not present.
    """
    for entry in s.split(';'):
        # Split on the first '=' only, so values that contain '=' stay intact.
        key, _, value = entry.partition('=')
        # Compare the whole key: a prefix test would let 'END' match
        # 'ENDPOINT=...' and return a meaningless slice of that entry.
        if key == info_item:
            return value
    return None

# INFO keys to pull out; each becomes a new column named after the key.
info_items = [
    'END',
    'SVLEN',
    'SVTYPE',
    'DENOVO_FILTER',
    'GENES',
]
for info_item in info_items:
    # Series.apply forwards `args` after each value: get_info(value, info_item)
    vcf[info_item] = vcf['info'].apply(get_info, args=(info_item,))

In [7]:
# Rename every column by position: the first group repeats the existing
# names, the second gives lower-case names to the five INFO-derived columns
# (END, SVLEN, SVTYPE, DENOVO_FILTER, GENES, in that order).
vcf.columns = (
    ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'record', 'sample']
    + ['end', 'svlength', 'svtype', 'denovo_filter', 'genes']
)

In [8]:
# Restrict to duplications and deletions; rows with any other svtype
# (including rows where no SVTYPE was found) are dropped.
vcf = vcf.loc[vcf['svtype'].isin(['DUP', 'DEL'])]

In [9]:
# Restrict to calls whose FILTER column is exactly 'PASS'
vcf = vcf.loc[vcf['filter'].eq('PASS')]

In [10]:
# Order rows by chromosome, then by start position within each chromosome.
# NOTE(review): if chrom was read as strings the order is lexicographic
# (e.g. 'chr10' before 'chr2') -- confirm that is acceptable downstream.
vcf = vcf.sort_values(by=['chrom', 'pos'])

In [12]:
# Write the merged, filtered calls as a tab-separated file without the row
# index. The path is relative to the notebook's working directory.
vcf.to_csv(
    'output/merged.sv2.tsv',
    sep='\t',
    index=False,
)

In [13]:
# (rows, columns) of the merged table after the DUP/DEL and PASS filters
vcf.shape

(490298, 16)