# VCF

In [1]:
import pandas as pd
from io import StringIO
import gzip
import numpy as np
from saveAndLoad import *

# Load every pickled input in one ordered pass; pickleLoad comes from the
# star-import of saveAndLoad. The order below matches the names on the left.
tumors, data, seqs, ref_aa, mut_aa = [
    pickleLoad(pickle_path)
    for pickle_path in (
        '../aa/tumors.pkl',
        '../data_processing/consolidated_data.pkl',
        '../data_processing/dna_seq_by_hgncId.pkl',
        '../aa/canonical_ref.pkl',
        '../aa/canonical_mut.pkl',
    )
]

def parse_chrom(chrom):
    """
    Convert a chromosome label into an integer usable as a sort key.

    Numeric labels map to their own value ('chr1' -> 1, '10' -> 10),
    'X' -> 23, 'Y' -> 24 and the mitochondrial labels 'M' / 'MT' -> 25.
    A leading 'chr' prefix is ignored regardless of case, surrounding
    whitespace is stripped, and non-string input (e.g. the int 7) is
    converted with str() first.

    Raises:
        ValueError: if the label is neither numeric nor one of X, Y, M, MT.
    """
    label = str(chrom).strip().upper()
    # Strip the prefix only at the start of the label, not wherever it occurs.
    if label.startswith('CHR'):
        label = label[3:]

    named = {'X': 23, 'Y': 24, 'M': 25, 'MT': 25}
    if label in named:
        return named[label]

    try:
        return int(label)  # e.g., '1' -> 1, '10' -> 10
    except ValueError:
        # Same exception type as before, but naming the offending label.
        raise ValueError(f"Unrecognized chromosome label: {chrom!r}") from None

def to_vcf(tumors, barcode, data, seqs, ref_aa, mut_aa, save=False,
           out_dir="/data/dandreas/SomaticMutationsLLM/vcf_1_00percentMinCancerType"):
    """
    Build a minimal VCFv4.1 text for the mutations of one tumor barcode.

    Args:
        tumors: mapping barcode -> sequence of entries. For each entry,
            entry[4] indexes ref_aa, entry[5] indexes mut_aa and entry[-1]
            indexes data.
        barcode: key into tumors; also the file stem when saving.
        data: indexable collection of 13-field records unpacked as
            (start, end, chrom, build, variant_class, isoforms, hgncId,
            gene_start, gene_end, ref_allele, strand, mutation, bc).
        seqs: seqs[(build, chrom)][hgncId] is the gene's DNA string, read at
            offset ``position - gene_start``.
        ref_aa, mut_aa: per-index sequences (presumably amino-acid strings -
            confirm against the producer). A mutation is skipped when either
            value is None or when the reference sequence contains '*'.
        save: when true, also write the text gzip-compressed to
            ``<out_dir>/<barcode>.vcf.gz``.
        out_dir: directory used when save is true; it must already exist.
            The default is the location that was previously hard-coded.

    Returns:
        The VCF content as one string: two header lines followed by one line
        per kept mutation, sorted by (parse_chrom(chrom), position).

    Raises:
        AssertionError: if a single-base ref_allele differs from the base
            found in the gene sequence at that position.
    """
    records = []

    # One pass over the tumor entries, pulling the three indexes each needs.
    for entry in tumors[barcode]:
        data_idx, ref_idx, mut_idx = entry[-1], entry[4], entry[5]
        (start, end, chrom, build, variant_class, isoforms, hgncId,
         gene_start, gene_end, ref_allele, strand, mutation, bc) = data[data_idx]

        # Skip mutations without both sequences, or with '*' in the reference.
        ref_seq, mut_seq = ref_aa[ref_idx], mut_aa[mut_idx]
        if (ref_seq is None) or (mut_seq is None):
            continue
        if '*' in ref_seq:
            continue

        # Prepare the position, ref, alt, etc.
        pos = start
        ref = ref_allele
        alt = mutation
        record_id = "."
        qual = "."
        vcf_filter = "."
        info = f"Variant_Classification={variant_class}"

        # Retrieve the entire reference DNA sequence for this gene
        gene_dna_seq = seqs[(build, chrom)][hgncId]

        # Sanity-check that the base in gene_dna_seq matches ref_allele
        if len(ref_allele) == 1:
            assert gene_dna_seq[pos - gene_start] == ref_allele, data[data_idx]

        # An empty allele on either side (empty ref_allele = insertion, empty
        # mutation = deletion) is padded with the preceding base.
        if '' in [ref_allele, mutation]:
            pos -= 1
            # NOTE(review): if pos < gene_start this index is negative and
            # silently wraps to the end of the sequence - confirm it cannot
            # happen for the input data.
            anchor = gene_dna_seq[pos - gene_start]
            ref = anchor + ref
            alt = anchor + alt

        records.append((chrom, pos, record_id, ref, alt, qual, vcf_filter, info))

    # Sort by chromosome (numeric key) and then by position.
    sorted_records = sorted(
        records,
        key=lambda row: (parse_chrom(row[0]), row[1])
    )

    # Header lines first, then one tab-separated line per record.
    output = [
        "##fileformat=VCFv4.1",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
    ]
    for (chrom, pos, record_id, ref, alt, qual, vcf_filter, info) in sorted_records:
        output.append("\t".join([chrom, str(pos), record_id, ref, alt, qual, vcf_filter, info]))

    vcf_content = "\n".join(output) + "\n"

    # Optionally save as gzipped VCF
    if save:
        import os  # local import: the file's import block does not provide os

        out_path = os.path.join(out_dir, f"{barcode}.vcf.gz")
        with gzip.open(out_path, "wt") as gz_out:
            gz_out.write(vcf_content)

    return vcf_content


# -------------------------------------------------
# Example
# -------------------------------------------------

# Build the VCF for a single tumor barcode. save=True also makes to_vcf write
# the gzipped file to disk; the returned text is printed below.
barcode = 'GENIE-JHU-00006-00185'
vcf_string = to_vcf(tumors, barcode, data, seqs, ref_aa, mut_aa, save=True)
print(vcf_string)

  from pandas.core import (


loading data from ../aa/tumors.pkl
loading data from ../data_processing/consolidated_data.pkl
loading data from ../data_processing/dna_seq_by_hgncId.pkl
loading data from ../aa/canonical_ref.pkl
loading data from ../aa/canonical_mut.pkl
##fileformat=VCFv4.1
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
1	115256529	.	T	C	.	.	Variant_Classification=Missense_Mutation
3	41266080	.	A	G	.	.	Variant_Classification=Missense_Mutation
3	41266101	.	C	T	.	.	Variant_Classification=Missense_Mutation
3	178916944	.	A	G	.	.	Variant_Classification=Missense_Mutation
3	178952085	.	A	G	.	.	Variant_Classification=Missense_Mutation
4	55593673	.	A	G	.	.	Variant_Classification=Missense_Mutation
5	112175253	.	G	C	.	.	Variant_Classification=Missense_Mutation
7	55249071	.	C	T	.	.	Variant_Classification=Missense_Mutation
7	140453136	.	A	T	.	.	Variant_Classification=Missense_Mutation
9	21971153	.	C	A	.	.	Variant_Classification=Nonsense_Mutation
11	108123578	.	G	T	.	.	Variant_Classification=Missense_Mutation
12	25378636	.	

In [None]:
# Print variant class, gene id, reference allele and mutated allele for every
# mutation recorded for one tumor.
for tumor_entry in tumors['GENIE-UCSF-592864-44573T']:
    fields = data[tumor_entry[-1]]  # last element of the entry indexes data
    (start, end, chrom, build, variant_class, isoforms, hgncId,
     gene_start, gene_end, ref_allele, strand, mutation, bc) = fields
    print(variant_class, '\t', hgncId, ref_allele, mutation)

In_Frame_Ins 	 HGNC:11110  CCC
Missense_Mutation 	 HGNC:11110 A C
Frame_Shift_Ins 	 HGNC:11110 A CC
Frame_Shift_Ins 	 HGNC:11110  C
Frame_Shift_Ins 	 HGNC:11110 A CCC
Frame_Shift_Ins 	 HGNC:11110  CC
Silent 	 HGNC:23805 G A
Missense_Mutation 	 HGNC:6693 T C
Missense_Mutation 	 HGNC:5382 C T
Missense_Mutation 	 HGNC:7127 G A
Missense_Mutation 	 HGNC:30064 C T
Missense_Mutation 	 HGNC:11086 A G
Silent 	 HGNC:8803 A G
Silent 	 HGNC:8803 G A
Missense_Mutation 	 HGNC:6342 T G
Missense_Mutation 	 HGNC:6342 T G
Silent 	 HGNC:6119 C T
Missense_Mutation 	 HGNC:3236 G A
Missense_Mutation 	 HGNC:7029 C T
Missense_Mutation 	 HGNC:30939 TAT CAG
Nonsense_Mutation 	 HGNC:7881  T
Silent 	 HGNC:7881 G C
Missense_Mutation 	 HGNC:3587 A G
Missense_Mutation 	 HGNC:15998 T G
Missense_Mutation 	 HGNC:6126 G A
Silent 	 HGNC:6861 G A
Missense_Mutation 	 HGNC:391 C T
Silent 	 HGNC:14010 T C
Missense_Mutation 	 HGNC:2348 C T
Missense_Mutation 	 HGNC:11998 C T
Missense_Mutation 	 HGNC:5112 A T
Nonsense_Mutation 

In [None]:
from tqdm import tqdm
# Read the labeled dataset and take its 'barcode' column as the list of tumors to export.
class_data = pd.read_csv('../labeled_data/data_1_00percentMinCancerType.csv')
barcodes = class_data['barcode'].values

# Write one gzipped VCF per barcode (save=True); the returned text is not used here.
for barcode in tqdm(barcodes):
    vcf_string = to_vcf(tumors, barcode, data, seqs, ref_aa, mut_aa, save=True)

100%|██████████| 143530/143530 [00:12<00:00, 11237.05it/s]


In [None]:
import os  # not among the imports at the top of the file

# Directory holding the VCF files; defined once so the listing and the
# written paths cannot drift apart. It ends with '/', which the f-string
# below relies on.
path = '/data/dandreas/SomaticMutationsLLM/vcf_1_00percentMinCancerType/'
filenames = os.listdir(path)  # note: os.listdir returns entries in arbitrary order

# Write a one-column TSV (header 'path') with the full path of every entry.
with open('/data/dandreas/SomaticMutationsLLM/filenames_vcf_1_00percentMinCancerType.tsv', 'w') as f:
    f.write('path\n')
    f.writelines(f'{path}{filename}\n' for filename in filenames)

In [None]:
j = 0
# Report where one specific file sits in `filenames`: its index and that
# index as a percentage of the list length.
wanted = 'GENIE-DFCI-007014-8417.gc.genic.vcf.gz'
for position, name in enumerate(filenames):
    if name == wanted:
        print(position, f'{position/len(filenames)*100:.2f}%')

In [None]:
# Number of entries currently held in `filenames`.
len(filenames)

166854