In [1]:
#!/usr/bin/env python

import argparse
from datetime import datetime
import numpy as np
import scipy.stats
import pandas as pd
import subprocess
import vcf
import sys
import os
import io
import gzip

"""
    Merge VCF files, format them for downstream analysis, and filter out homopolymer and segmental-duplication STRs.
    The output is a VCF file to be indexed with tabix.
    Usage:
        STR_filter.py --vcf VCFs --homopolymers --seg_dup --hrun --hwe 0.1 --call_rate --heterozygosty 0.3  --out Filtered_STRs.vcf   
"""

# Absolute paths to hg19 annotation BED files. Judging by the file names these
# hold segmental duplications and HipSTR reference homopolymer runs.
# NOTE(review): neither constant is read by the code visible in this notebook.
SEGDUP="/storage/resources/dbase/human/hg19/hg19_segmentalduplications.bed"
HRUN = "/storage/resources/dbase/human/hg19/hg19.hipstr_reference_hrun.bed"
# Placeholder only; OUTPUTFILE is reassigned to the real output path in the
# run settings further down this cell.
OUTPUTFILE = ""

def PROGRESS(msg):
    """Write `msg`, stripped of surrounding whitespace, to stderr as one line."""
    line = "%s\n" % msg.strip()
    sys.stderr.write(line)

def removehomopolymers(Frame):
    """Return the rows of `Frame` whose UNIT column is not equal to 1."""
    not_unit_one = Frame["UNIT"] != 1
    return Frame.loc[not_unit_one]

def removeoverlap(Frame, feat ):
    """Drop every row of `Frame` that overlaps an interval listed in `feat`.

    Parameters
    ----------
    Frame : DataFrame with CHROM, POS and END columns. POS and END are cast
        to int on the returned rows; the input frame itself is not modified.
    feat : DataFrame with CHROM, START and END columns. Its CHROM values are
        matched against 'chr' + str(Frame CHROM), i.e. the feature table is
        expected to carry the 'chr' prefix while `Frame` does not.

    Returns
    -------
    DataFrame with the surviving rows, grouped by chromosome (in order of
    first appearance in `Frame`) and sorted by POS within each chromosome.
    An empty `Frame` yields an empty frame with the same columns.

    A row is kept when, for every feature on its chromosome, it ends at or
    before the feature START or starts at or after the feature END.
    The chromosome name and remaining shape are printed as progress output.
    """
    fragments = []
    for C in Frame['CHROM'].unique():
        # Explicit copy so the int casts below do not write into a slice of Frame.
        X = Frame.loc[Frame['CHROM'] == C].copy()
        X['POS'] = X['POS'].astype(int)
        X['END'] = X['END'].astype(int)
        Y = feat.loc[feat['CHROM'] == 'chr' + str(C)]
        # Filtering X in place of a second variable means a chromosome with no
        # features simply keeps all of its rows (previously this reused the
        # result of the preceding chromosome or failed outright).
        for start, end in zip(Y['START'], Y['END']):
            X = X.loc[(X['END'] <= start) | (X['POS'] >= end)]
        fragments.append(X.sort_values('POS'))
        print(C,'\t',X.shape)
    if not fragments:
        # pd.concat refuses an empty list; return an empty frame instead.
        return Frame.iloc[0:0].copy()
    return pd.concat(fragments)
   
def removelowcallrate(Frame, n_samples=650, max_missing_frac=0.2):
    """Return the rows of `Frame` whose no-call count is low enough.

    Parameters
    ----------
    Frame : DataFrame with one row per locus. Cells equal to './.:.' are
        counted as no-calls; null cells are counted separately.
    n_samples : total number of samples used as the denominator
        (default 650, the value previously hard-coded).
    max_missing_frac : a row is kept when its no-call count is strictly
        below this fraction of (n_samples - null cells). The default 0.2
        corresponds to the 80% call rate noted in the original code.

    Returns
    -------
    A new DataFrame holding the kept rows with the original columns.
    `Frame` itself is left unmodified (no helper columns are added to it).
    """
    null_cells = Frame.isnull().sum(axis=1)
    no_calls = Frame.isin(['./.:.']).sum(axis=1)
    usable = n_samples - null_cells
    keep = no_calls < usable * max_missing_frac
    # Copy so callers can modify the result without pandas slice warnings.
    return Frame.loc[keep].copy()
    
def GetLocusStats(record, samples=[]):
    """Compute per-locus genotype statistics from the GB field of each call.

    Parameters
    ----------
    record : iterable of calls. Each call must expose a `.sample` name and
        support `call["GB"]`, whose value is either None, "." or two integer
        alleles separated by "|" (for example "-3|0").
    samples : optional collection of sample names; when non-empty only those
        samples are used. The default list is never mutated.

    Returns
    -------
    (hwe_p, het, mean_allele, n_called)
        hwe_p       two-sided binomial p-value of the observed heterozygote
                    count against the expected heterozygote fraction
                    (reported as 1.0 when no sample has a genotype, since no
                    test can be performed);
        het         1 - sum of squared allele frequencies;
        mean_allele frequency-weighted mean of the allele values;
        n_called    number of samples with a usable GB value.
    """
    wanted = set(samples)
    allele_counts = {}
    obs_het = 0
    obs_hom = 0
    for sample in record:
        if wanted and sample.sample not in wanted:
            continue
        gb = sample["GB"]
        if gb is None or gb == ".":
            continue
        # A list, not map(): the alleles are indexed and iterated below.
        gt = [int(allele) for allele in gb.split("|")]
        if gt[0] == gt[1]:
            obs_hom += 1
        else:
            obs_het += 1
        for al in gt:
            allele_counts[al] = allele_counts.get(al, 0) + 1

    total = obs_het + obs_hom
    n_alleles = sum(allele_counts.values())
    allele_freqs = {al: count * 1.0 / n_alleles for al, count in allele_counts.items()}

    exp_hom_frac = sum(freq ** 2 for freq in allele_freqs.values())
    het = 1 - exp_hom_frac

    if total == 0:
        hwe_p = 1.0
    else:
        # Clamp against floating-point drift so the probability stays in [0, 1].
        exp_het_frac = min(1.0, max(0.0, het))
        if hasattr(scipy.stats, "binomtest"):
            # binom_test was removed from recent SciPy; binomtest replaces it.
            hwe_p = scipy.stats.binomtest(obs_het, n=total, p=exp_het_frac).pvalue
        else:
            hwe_p = scipy.stats.binom_test(obs_het, n=total, p=exp_het_frac)

    mean_allele = sum(al * freq for al, freq in allele_freqs.items())
    return (hwe_p, het, mean_allele, total)

def getper(a, infofield):
    """Return the value of `infofield` from a ';'-separated KEY=VALUE string.

    An entry whose key equals `infofield` exactly is preferred, so asking for
    "END" no longer picks up an earlier entry such as "SEND=...". If no key
    matches exactly, the first KEY=VALUE entry that merely contains
    `infofield` is used, which keeps older call styles (e.g. passing "HET=")
    working.

    Raises IndexError when no entry matches, as before.
    """
    entries = a.split(";")
    for entry in entries:
        key, sep, value = entry.partition("=")
        if sep and key == infofield:
            return value
    # Legacy behaviour: substring match, restricted to entries that have a value.
    for entry in entries:
        if infofield in entry and "=" in entry:
            return entry.split("=", 1)[1]
    raise IndexError("field %r not found in %r" % (infofield, a))

def addheader():
    """Build the extra VCF meta lines (INFO and FILTER definitions).

    The thresholds quoted in the FILTER descriptions come from the
    module-level HETZYG, HWE and ClR settings. The returned text ends with a
    newline followed by a lone '#', so whatever is written next starts right
    after that '#'.
    """
    info_lines = [
        '##INFO=<ID=HWE,Number=1,Type=Float,Description="HWE pvalue genotype frequencies not as expected">',
        '##INFO=<ID=HET,Number=1,Type=Float,Description="Heterozygosity">',
        '##INFO=<ID=CCOUNT,Number=1,Type=Float,Description="Number of samples with genotype information">',
    ]
    filter_lines = [
        ''.join(['##FILTER=<ID=HET,Description="Heterozygosity less than ', str(HETZYG), '">']),
        '##FILTER=<ID=HRUN,Description="Hrun greater than -1">',
        ''.join(['##FILTER=<ID=HWE,Description="HWE less than ', str(HWE), '">']),
        ''.join(['##FILTER=<ID=CALLRATE,Description="Callrate less than ', str(ClR), '">']),
        '##FILTER=<ID=HOM_POLY,Description="Homopolymer locus">',
        '##FILTER=<ID=SEGDUP,Description="Locus in a segmental duplication">',
    ]
    body = "\n".join(info_lines + filter_lines)
    return body + "\n#"

def rmstring(string, w):
    """Split `w` on ';' and return the pieces as a list, minus the first
    piece equal to `string` (if there is one)."""
    pieces = w.split(';')
    try:
        pieces.remove(string)
    except ValueError:
        # `string` is not among the pieces; return them untouched.
        pass
    return pieces

def meet(x):
    """Normalise a FILTER string to which 'HET' was appended without a ';'.

    * A string that is empty once ';' and surrounding whitespace are removed
      is returned stripped.
    * A string containing 'HET' is rebuilt as the remaining filter text
      followed by a single ';HET' (just 'HET' when nothing else remains), so
      'HRUNHET' becomes 'HRUN;HET'. Stray separators next to the removed
      'HET' are dropped, which avoids outputs such as ';HET' or 'HRUN;;HET'.
    * Any other string is returned unchanged.
    """
    stripped = x.strip()
    if stripped.replace(';', '') == '':
        return stripped
    if 'HET' not in stripped:
        return x
    # Cut out every 'HET', trim the separators left at the cut points and
    # keep only the non-empty remainders.
    remainders = [chunk.strip(';') for chunk in stripped.split('HET')]
    kept = [chunk for chunk in remainders if chunk]
    return ';'.join(kept + ['HET'])
    
def rmhom(w):
    """Return `w` (a ';'-separated string) without its first 'HOM_POLY' entry."""
    tokens = w.split(';')
    if tokens.count('HOM_POLY') > 0:
        del tokens[tokens.index('HOM_POLY')]
    return ';'.join(tokens)

# --- Run settings -----------------------------------------------------------
# HOM / SeD / HRN are boolean switches. NOTE(review): they are not read by any
# code visible here; presumably they mirror the --homopolymers / --seg_dup /
# --hrun options named in the usage text above - confirm.
HOM = True
SeD = True
HRN = True
# Thresholds quoted in the FILTER descriptions produced by addheader().
# HETZYG is also the cutoff below which a locus is tagged HET in the cell below.
HWE = 0.05
ClR = 0.8
HETZYG = 0.1
# Input: gzip-compressed VCF read by the following cells.
VCF = '/storage/szfeupe/Runs/650GTEx_estr/Filter_Merged_STRs_All_Samples.vcf.gz'
# Output: uncompressed VCF written by the following cell, then compressed and indexed.
OUTPUTFILE = '/storage/szfeupe/Runs/650GTEx_estr/Filter_Merged_STRs_All_Samples_New.vcf'
# NOTE(review): `chrom` is not referenced by the code visible here.
chrom = 'chrX'

In [4]:
# Re-derive the FILTER column of the merged VCF: drop the old HET and HOM_POLY
# tags, re-apply the heterozygosity cutoff (HETZYG), then write, compress and
# index the result.
PROGRESS('Starting ... ')

# Collect the '##' meta lines from the top of the file. The count is taken
# from the file itself instead of assuming a fixed number of header lines.
Head = []
with gzip.open(VCF, 'rt', encoding='utf-8') as vcf_in:
    for raw_line in vcf_in:
        if not raw_line.startswith('##'):
            break
        Head.append(raw_line.rstrip('\r\n'))
print(1, 'Opened')

# Everything after the meta lines is a tab-separated table whose header row
# is the '#CHROM ...' line; gzip compression is inferred from the extension.
Table = pd.read_table(
    VCF,
    skiprows=len(Head),
    dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
           'QUAL': str, 'FILTER': str, 'INFO': str},
).rename(columns={'#CHROM': 'CHROM'})
TABLE = Table
print(2, list(set(Table['FILTER'])))

# Heterozygosity is looked up by its INFO key rather than by position, so a
# change in INFO field order cannot silently select a different field.
het = pd.to_numeric(TABLE['INFO'].str.extract(r'(?:^|;)HET=([^;]*)', expand=False))
if het.isnull().any():
    raise ValueError('%d record(s) have no HET value in INFO' % int(het.isnull().sum()))


def _recompute_filter(old_filter, low_het):
    """Return the new FILTER value for one record.

    Existing tags are kept except HET and HOM_POLY (both are re-evaluated or
    dropped here) and the PASS / '.' / empty placeholders, which must not be
    combined with a real filter tag. 'HET' is appended when `low_het` is
    true; a record left without any tag becomes 'PASS'.
    """
    dropped = ('', '.', 'PASS', 'HET', 'HOM_POLY')
    tags = [tag for tag in old_filter.split(';') if tag not in dropped]
    if low_het:
        tags.append('HET')
    return ';'.join(tags) if tags else 'PASS'


TABLE['FILTER'] = [_recompute_filter(old, low)
                   for old, low in zip(TABLE['FILTER'], het < HETZYG)]
print(3, list(set(Table['FILTER'])))

print(4)
# Package and save the VCF. addheader() ends with a lone '#', which becomes
# the prefix of the 'CHROM' column name written by to_csv ('#CHROM').
PROGRESS("Saving to file")
Table = Table.sort_values(['CHROM','POS'])
print(5)
with open(OUTPUTFILE, 'w', encoding='utf-8') as vcf_out:
    vcf_out.writelines(meta_line + '\n' for meta_line in Head)
    vcf_out.write(addheader())
    Table.to_csv(vcf_out, sep='\t', index=False)

# Compress and index the VCF. Argument lists (no shell) keep paths with
# spaces or shell metacharacters safe.
PROGRESS("Indexing")
with open(OUTPUTFILE + '.gz', 'wb') as gz_out:
    subprocess.check_call(['bgzip', '-c', OUTPUTFILE], stdout=gz_out)
# -f: the .gz was just regenerated, so an index left by an earlier run is stale.
subprocess.check_call(['tabix', '-f', '-p', 'vcf', OUTPUTFILE + '.gz'])

Indexing


In [23]:
# Get all the additional data to be regressed: loci marked PASS whose
# heterozygosity lies in [0.1, 0.3).
# NOTE(review): this cell needs a 'FILTER1' column in Table, which no cell
# visible in this notebook creates - confirm where it comes from before
# relying on Restart & Run All.
def get0301(x):
    """Return 1 when the HET value of INFO string `x` is in [0.1, 0.3), else 0.

    HET is looked up by key rather than by position in the INFO string.
    Raises KeyError when `x` has no HET entry.
    """
    fields = dict(item.split('=', 1) for item in x.split(';') if '=' in item)
    H = float(fields['HET'])
    return 1 if 0.1 <= H < 0.3 else 0


# .copy() makes GG and OUT independent frames, so the column assignments
# below no longer trigger pandas' SettingWithCopyWarning.
GG = Table.loc[Table['FILTER1']=='PASS'].copy()
print(GG.shape)
GG['Select'] = GG['INFO'].apply(get0301)

OUT = GG.loc[GG['Select']==1].copy()
print(OUT.shape)
OUT['FILTER']=OUT['FILTER1']
del OUT['FILTER1']
del OUT['Select']
OUT

(53793, 661)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,GTEX-PLZ4,...,GTEX-1212Z,GTEX-14C39,GTEX-131XF,GTEX-111YS,GTEX-ZXES,GTEX-11WQK,GTEX-ZVP2,GTEX-Y8E4,GTEX-1GN2E,GTEX-14PJM
258,1,886040,STR_271,CACACACACACAG,.,.,PASS,PERIOD=2;START=886040;END=886051;HWE=0.4326264...,GT:GB,./.:-3|-3,...,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:-3|-3,./.:0|-3,./.:-3|-3
290,1,946128,STR_303,TGGGTTTTTTTTTTTTTTTTTTGA,.,.,PASS,PERIOD=1;START=946128;END=946145;HWE=0.5907473...,GT:GB,./.:0|-1,...,./.:-1|-1,./.:-1|-1,./.:.,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:.,./.:-1|-1,./.:-1|-1,./.:-1|-1
341,1,992119,STR_357,CGTGTGTGTGTGTGTGTGTGTG,.,.,PASS,PERIOD=2;START=992119;END=992139;HWE=0.5452202...,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:-2|-2,./.:0|0,./.:0|0,./.:0|0
342,1,993134,STR_358,TCTCAAAAAAAAAAAGAAAAAAGAAAA,.,.,PASS,PERIOD=1;START=993134;END=993156;HWE=0.8455015...,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0
428,1,1199212,STR_449,TATTTAAAAAAAAAAA,.,.,PASS,PERIOD=1;START=1199212;END=1199222;HWE=0.95818...,GT:GB,./.:0|0,...,./.:0|0,./.:.,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|1,./.:0|0,./.:.,./.:0|0
435,1,1209175,STR_456,AACCCGCCGCAGCGCCGCCGCCGCCGCC,.,.,PASS,PERIOD=3;START=1209175;END=1209199;HWE=0.20258...,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|0,./.:0|3,./.:0|0,./.:0|0,./.:0|0
438,1,1213211,STR_459,CTCAAAAAAAAAAAAA,.,.,PASS,PERIOD=1;START=1213211;END=1213223;HWE=0.09853...,GT:GB,./.:0|0,...,./.:0|0,./.:0|0,./.:.,./.:0|0,./.:0|0,./.:0|0,./.:0|1,./.:0|0,./.:.,./.:0|0
475,1,1298161,STR_496,TCTCAAAAAAAAAAAAAAAAAAAA,.,.,PASS,PERIOD=1;START=1298161;END=1298180;HWE=0.95230...,GT:GB,./.:-1|-1,...,./.:-1|-1,./.:-1|-1,./.:0|-1,./.:-1|-1,./.:-1|-1,./.:.,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1
492,1,1315275,STR_513,CAAAAAAAAAAAAAGAAAAAAA,.,.,PASS,PERIOD=1;START=1315275;END=1315295;HWE=0.55230...,GT:GB,./.:-1|-1,...,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1,./.:-1|-1
497,1,1323946,STR_518,GTGTCTCTCTCTCTCTCTCTCTATATATATATATACATAGACACAC...,.,.,PASS,PERIOD=2;START=1323946;END=1323999;HWE=0.25867...,GT:GB,./.:-2|-2,...,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2,./.:-2|-2


In [24]:
# Package and save the selected loci (OUT) as a VCF, then compress and index it.
output='/storage/szfeupe/Runs/650GTEx_estr/Filter_Additional_STR_HET_HOM.vcf' 
PROGRESS("Saving to file")

# Reuse the '##' meta lines from the top of the input VCF.
meta_lines = []
with gzip.open(VCF, 'rt', encoding='utf-8') as vcf_in:
    for raw_line in vcf_in:
        if not raw_line.startswith('##'):
            break
        meta_lines.append(raw_line.rstrip('\r\n'))

OUT = OUT.sort_values(['CHROM','POS'])
print(5)
# Written straight to the destination: no scratch 'tmp' / 'table.tab' files
# and no shell. addheader() ends with a lone '#', which becomes the prefix
# of the first column name written by to_csv.
with open(output, 'w', encoding='utf-8') as vcf_out:
    vcf_out.writelines(meta_line + '\n' for meta_line in meta_lines)
    vcf_out.write(addheader())
    OUT.to_csv(vcf_out, sep='\t', index=False)

# Compress and index the VCF.
PROGRESS("Indexing")
with open(output + '.gz', 'wb') as gz_out:
    subprocess.check_call(['bgzip', '-c', output], stdout=gz_out)
# -f: the .gz was just regenerated, so an index left by an earlier run is stale.
subprocess.check_call(['tabix', '-f', '-p', 'vcf', output + '.gz'])
print('END')

Saving to file


5


Indexing


END
