In [1]:
""" Module for writing artificial vcfs,
    for testing purposes """

' Module for writing artificial vcfs,\n    for testing purposes '

In [1]:
import os
import subprocess
import warnings

import pandas as pd

import VCF
# Ignore every FutureWarning for the rest of the session. The filter is
# process-wide: it applies to all libraries, not only pandas.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def get_random_intervals(genome_file='human.hg38.genome',
                         gtf_file='/Users/lincoln.harris/code/potLuck/hg38-plus.gtf',
                         n_loci=1000, seed=None):
    """ get a set of random intronic / intergenic loci, with bedtools

        Pipeline (each step writes a scratch file in the current directory):
          1. `bedtools random`    -> n_loci random 1-bp intervals
          2. `bedtools intersect` -> the intervals that overlap a feature in gtf_file
          3. `grep -F -xvf`       -> the random intervals NOT reported by step 2
        Loci on contigs whose name contains '_' are dropped.

        Parameters
        ----------
        genome_file : str
            chromosome-sizes file handed to `bedtools random -g`
        gtf_file : str
            annotation handed to `bedtools intersect -b`. The default is the
            original machine-specific path; pass your own location.
        n_loci : int
            number of random intervals to draw (before filtering)
        seed : int or None
            forwarded to `bedtools random -seed` when given, so the draw
            can be reproduced

        Returns
        -------
        list of [chrom, start_pos] pairs; empty if every random locus
        overlapped an annotated feature

        Raises
        ------
        FileNotFoundError
            if bedtools / grep is not on PATH
        subprocess.CalledProcessError
            if bedtools exits non-zero, or grep reports a real error
    """

    bed_rand = 'bedtools_random_out.csv'
    bed_inter = 'bedtools_intersect_out.csv'
    bed_unique = 'bed_unique_loci.csv'

    random_cmd = ['bedtools', 'random', '-l', '1', '-n', str(n_loci), '-g', genome_file]
    if seed is not None:
        random_cmd += ['-seed', str(seed)]

    try:
        with open(bed_rand, 'w') as out:
            subprocess.run(random_cmd, stdout=out, check=True)

        with open(bed_inter, 'w') as out:
            subprocess.run(['bedtools', 'intersect', '-a', bed_rand, '-b', gtf_file],
                           stdout=out, check=True)

        with open(bed_unique, 'w') as out:
            grep = subprocess.run(['grep', '-F', '-xvf', bed_inter, bed_rand], stdout=out)
        # grep exits 1 when it selects no lines; that is a valid (empty) result.
        # Only exit codes above 1 signal a real failure.
        if grep.returncode > 1:
            raise subprocess.CalledProcessError(grep.returncode, grep.args)

        # pd.read_csv raises on a zero-byte file, so short-circuit here
        if os.path.getsize(bed_unique) == 0:
            return []

        unique = pd.read_csv(bed_unique, sep='\t',
                             names=['chrom', 'start_pos', 'end_pos',
                                    'num', 'score', 'strand'])
    finally:
        # always remove the scratch files, including when a step above failed
        for scratch in (bed_rand, bed_inter, bed_unique):
            if os.path.exists(scratch):
                os.remove(scratch)

    # keep only contigs without '_' in their name
    on_plain_contig = ~unique['chrom'].astype(str).str.contains('_', regex=False)
    kept = unique[on_plain_contig]

    return [[chrom_, pos] for chrom_, pos in zip(kept['chrom'].tolist(),
                                                 kept['start_pos'].tolist())]

In [3]:
def add_garbage_lines(df_, random_state=None, include_manual=False):
    """ add a bunch of lines that don't correspond to any gene / amino acid
        sequence, then shuffle the rows

        One A>C record (fixed QUAL / INFO / FORMAT / sample fields) is added
        for every locus returned by get_random_intervals().

        Parameters
        ----------
        df_ : pandas.DataFrame
            table to extend; it is not modified in place
        random_state : int, numpy Generator/RandomState or None
            passed to DataFrame.sample so the shuffle can be reproduced.
            None (the default) keeps the original unseeded behaviour.
        include_manual : bool
            when True, also add the hand-curated intergenic loci listed in
            `garbage_positions`. Defaults to False because the original
            implementation built that list but never used it.

        Returns
        -------
        pandas.DataFrame
            df_ plus the garbage rows, in shuffled row order (the index is
            left as produced by the shuffle, not reset)
    """

    g_info = 'AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=3.00;QD=30.14;SOR=2.303'
    g_format = 'GT:AD:DP:GQ:PL'
    g_20 = '1/1:2,2:13:38:434,38,0'

    garbage_positions = [['chr12', 25284777],  # each of these has been evaluated by hand...
                         ['chr12', 24788289],     # intergenic regions, but close to some of the genes
                         ['chr7', 54880599],      #    we care about
                         ['chr7', 55271013],
                         ['chr2', 29954866],
                         ['chr2', 29081787],
                         ['chr7', 140944330],
                         ['chr7', 140238466],
                         ['chr17', 7636991],
                         ['chr2', 41732258]]

    positions = list(get_random_intervals())
    if include_manual:
        positions = garbage_positions + positions

    # Build every record first and concatenate once. DataFrame.append was
    # removed in pandas 2.0, and calling it per row copied the frame each time.
    records = [{'#CHROM': elm[0], 'POS': elm[1], 'ID': '.', 'REF': 'A', 'ALT': 'C',
                'QUAL': 60, 'FILTER': '.', 'INFO': g_info, 'FORMAT': g_format,
                '20': g_20}
               for elm in positions]

    # Skip the concat when there is nothing to add, so the caller's index is
    # left alone exactly as it was when the old append loop ran zero times.
    if records:
        df_ = pd.concat([df_, pd.DataFrame(records)], ignore_index=True)

    # random shuffle rows
    return df_.sample(frac=1, random_state=random_state)

In [4]:
def write_vcf(outStr_, chrom, pos, ref_='C', alt_='A', v_count=None, wt_count=None):
    """ routine for writing a single-record VCF file: the shared vcf header
        followed by a one-row, tab-separated variant table

        Parameters
        ----------
        outStr_ : str
            path of the VCF to write (overwritten if it exists)
        chrom : int or str
            chromosome label without the 'chr' prefix, e.g. 12 or 'X'
        pos : int
            value written to the POS column
        ref_ : str
            REF allele. Defaults to 'C', the value the previous implementation
            always wrote regardless of what was passed.
        alt_ : str
            ALT allele. Defaults to 'A', for the same reason.
        v_count, wt_count :
            accepted so existing call signatures keep working, but not used:
            the sample column is a fixed string.
            NOTE(review): presumably meant to drive the AD field -- confirm.

        Raises
        ------
        FileNotFoundError
            if ../cerebra/vcfheader.txt (relative to the current working
            directory) does not exist
    """

    header_path = os.path.join(os.getcwd(), '..', 'cerebra', 'vcfheader.txt')
    with open(header_path, 'r') as f:
        header = f.read()

    # the column line is appended straight after the header, so make sure the
    # header's last line is terminated or the two would be fused together
    if header and not header.endswith('\n'):
        header += '\n'

    columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', '20']
    record = {'#CHROM': 'chr' + str(chrom),
              'POS': pos,
              'ID': '.',
              'REF': ref_,
              'ALT': alt_,
              'QUAL': 60.28,
              'FILTER': '.',
              'INFO': 'AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=3.00;QD=30.14;SOR=2.303',
              'FORMAT': 'GT:AD:DP:GQ:PL',
              '20': '1/1:2,2:13:38:434,38,0'}

    # build the one-row frame directly; DataFrame.append was removed in pandas 2.0
    df = pd.DataFrame([record], columns=columns)

    with open(outStr_, 'w') as vcf:
        vcf.write(header)

    # append the table (column line + record) below the header
    df.to_csv(outStr_, sep="\t", mode='a', index=False)

In [5]:
# KRAS G13C test record, written as C>A at chr12:25245348.
# The last two arguments (v_count, wt_count) are accepted by write_vcf but
# never read by it, so the values passed here are placeholders.
write_vcf('test0_kras_G13C.vcf', 12, 25245348, 'C', 'A', 2, 2)

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002	ERCC	exon	1	1061	0.000000	+	.	gene_id "ERCC-00002"; transcript_id "DQ459430";

ERCC-00002

In [62]:
# Read one VCF from the test set back in through the project-local VCF module
# and show the result as the cell output, as a visual check of the record.
# NOTE(review): the path is relative to the notebook's working directory, and
# the file is presumably one written earlier with write_vcf -- confirm.
VCF.dataframe('../cerebra/wrkdir/vcf_test_set/test8_braf_V600E.vcf')

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,chr7,140753336,.,T,A,60.28,.,AC=2;AF=1.00;AN=2;DP=7;ExcessHet=3.0103;FS=0.0...
