In [1]:
import pandas as pd
from io import StringIO
import gzip
import numpy as np
from saveAndLoad import *

tumors = pickleLoad('../aa/tumors.pkl')
data = pickleLoad('../data_processing/consolidated_data.pkl')
seqs = pickleLoad('../data_processing/dna_seq_by_hgncId.pkl')
ref_aa = pickleLoad('../aa/canonical_ref.pkl')
mut_aa = pickleLoad('../aa/canonical_mut.pkl')

  from pandas.core import (


loading data from ../aa/tumors.pkl
loading data from ../data_processing/consolidated_data.pkl
loading data from ../data_processing/dna_seq_by_hgncId.pkl
loading data from ../aa/canonical_ref.pkl
loading data from ../aa/canonical_mut.pkl


In [15]:
def enumerate_150_mutation_types():
    """
    Return a list of all 150 mutation types:
      - 6 single-base changes
      - 48 double-base contexts
      - 96 triple-base contexts
    in strings like:
      'T>C'       # single
      'AT>C'      # double, 5' base
      'T>CG'      # double, 3' base
      'AT>CG'     # triple (5' + mutation + 3')
    """

    # We assume the 'pyrimidine-based' set of ref->alt:
    #   C->A, C->G, C->T, T->A, T->C, T->G
    single_changes = [
        ("C", "A"),
        ("C", "G"),
        ("C", "T"),
        ("T", "A"),
        ("T", "C"),
        ("T", "G"),
    ]
    # Possible flanking bases
    neighbors = ["A", "C", "G", "T"]

    # 1) Enumerate 6 single-base changes.
    singles = []
    for (ref, alt) in single_changes:
        singles.append(f"{ref}>{alt}")

    # 2) Enumerate 48 doubles:
    #    We split them into 24 "5' base + mutation" and
    #    24 "mutation + 3' base".
    doubles = []

    # 2a) 5′ neighbor + mutation, e.g. "AT>C"
    for five_prime in neighbors:
        for (ref, alt) in single_changes:
            doubles.append(f"{five_prime}{ref}>{alt}")

    # 2b) mutation + 3′ neighbor, e.g. "T>CG"
    for (ref, alt) in single_changes:
        for three_prime in neighbors:
            doubles.append(f"{ref}>{alt}{three_prime}")

    # 3) Enumerate 96 triples: "5' base + mutation + 3' base"
    #    e.g. "AT>CG" means five_prime="A", (ref="T", alt="C"), three_prime="G".
    triples = []
    for five_prime in neighbors:
        for (ref, alt) in single_changes:
            for three_prime in neighbors:
                triples.append(f"{five_prime}{ref}>{alt}{three_prime}")

    # Combine them: 6 + 48 + 96 = 150 total
    all_150 = singles + doubles + triples
    return all_150

def to_snv150(tumors, barcode, data, seqs, ref_aa, mut_aa, save=False):
    snv150 = enumerate_150_mutation_types()
    snv150_indexes = {snv:i for i,snv in enumerate(snv150)}
    instances = []
    counts = np.array([0]*150)
    comp = {'A':'T','T':'A','C':'G','G':'C'}

    # Grab the indexes from the tumors structure
    data_idxs = [i[-1] for i in tumors[barcode]]
    ref_idxs = [i[4] for i in tumors[barcode]]
    mut_idxs = [i[5] for i in tumors[barcode]]


    for data_idx, ref_idx, mut_idx in zip(data_idxs, ref_idxs, mut_idxs):
        start,end,chrom,build,variant_class,isoforms,hgncId,gene_start,gene_end,ref_allele,strand,mutation,bc = data[data_idx]

        # Pull reference and mutant amino acids
        ref_seq, mut_seq = ref_aa[ref_idx], mut_aa[mut_idx]
        if (ref_seq is None) or (mut_seq is None):
            continue
        if '*' in ref_seq: 
            continue

        # Prepare the position, ref, alt, etc.
        pos = start
        ref = ref_allele
        alt = mutation
        
        # Retrieve the entire reference DNA sequence for this gene
        gene_dna_seq = seqs[(build, chrom)][hgncId]

        # Sanity-check that the base in gene_dna_seq matches ref_allele
        if (len(ref) == 1) and (len(alt) == 1):
            r = pos - gene_start
            five,ref,three = gene_dna_seq[r-1:r+2]
            assert ref == ref_allele, (five,ref,three)
        else: continue

        if ref in 'AG':
            five,ref,alt,three = tuple(map(comp.get, [five,ref,alt,three]))

        triple = f'{five}{ref}>{alt}{three}'
        single = triple[1:4]
        double_5 = triple[:4]
        double_3 = triple[1:]

        instances+=[single, double_5, double_3, triple]
    
    for i in instances:
        counts[snv150_indexes[i]]+=1
    
    return counts
        



In [22]:
from tqdm import tqdm
class_data = pd.read_csv('../labeled_data/data_1_00percentMinCancerType.csv')
barcodes = class_data['barcode'].values

snv150_data = {}
for barcode in tqdm(barcodes):
    mut_type_vector = to_snv150(tumors, barcode, data, seqs, ref_aa, mut_aa, save=True)
    snv150_data[barcode] = mut_type_vector

snv150_df = pd.DataFrame(snv150_data).T
snv150_df.columns = enumerate_150_mutation_types()

100%|██████████| 143530/143530 [00:06<00:00, 21089.30it/s]


In [24]:
snv150_df

Unnamed: 0,C>A,C>G,C>T,T>A,T>C,T>G,AC>A,AC>G,AC>T,AT>A,...,TT>AG,TT>AT,TT>CA,TT>CC,TT>CG,TT>CT,TT>GA,TT>GC,TT>GG,TT>GT
GENIE-JHU-00006-00185,5,1,5,1,5,0,1,0,2,0,...,0,0,0,0,1,1,0,0,0,0
GENIE-JHU-00007-00187,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GENIE-JHU-00024-00426,2,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GENIE-JHU-00026-01223,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
GENIE-JHU-00032-00469,0,0,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GENIE-PROV-358b6384eb-e74ad60cbf,0,0,5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GENIE-PROV-a09878f384-bb63e20e8d,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GENIE-PROV-dab2e48af5-35f92dc0e9,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GENIE-PROV-95b5a6d4e0-2ee7ad86ea,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
