In [1]:
import os, sys, glob
import math
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the SNV (mutation) table. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks, so each column gets one inferred dtype.
df_snv = pd.read_csv("raw/SNVs.csv", low_memory=False)

In [3]:
# Preview the first rows of the SNV table.
df_snv.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,HIGH_INF_POS,MOTIF_SCORE_CHANGE,VAF,TCN,LCN,CCF,CLONALITY,Total_Index_Lesions_In_Patient,Num_Other_Mets_In_Patient,CLONALITY_In_Other_Mets
0,AKT1,207,MSKCC,GRCh37,14,105246551,105246551,+,Missense_Mutation,SNP,...,,,,,,,,,,
1,FOXA1,3169,MSKCC,GRCh37,14,38061240,38061240,+,Missense_Mutation,SNP,...,,,0.298731257,2.0,1.0,1.0,CLONAL,1.0,1.0,CLONAL
2,FOXA1,3169,MSKCC,GRCh37,14,38061240,38061240,+,Missense_Mutation,SNP,...,,,0.139386189,2.0,1.0,0.961,CLONAL,1.0,1.0,CLONAL
3,DICER1,23405,MSKCC,GRCh37,14,95574698,95574698,+,Missense_Mutation,SNP,...,,,,,,,,,,
4,AKT1,207,MSKCC,GRCh37,14,105246551,105246551,+,Missense_Mutation,SNP,...,,,,,,,,,,


In [14]:
# Group the tumor sample barcodes found in the SNV table by patient.
# The patient ID is the first two dash-separated fields of a barcode
# (e.g. "P-0011355-T01-IM5" -> "P-0011355").
patientToSamplesDict = {}
for barcode in set(df_snv.Tumor_Sample_Barcode):
    patient_id = "-".join(barcode.split('-')[:2])
    patientToSamplesDict.setdefault(patient_id, set()).add(barcode)

In [29]:
# Load the copy-number table: one row per gene (Hugo_Symbol) and one column
# per tumor sample barcode. low_memory=False parses the file in a single pass.
df_cna = pd.read_csv("raw/CNAs.csv", low_memory=False)

In [30]:
# Preview the first rows of the copy-number table.
df_cna.head()

Unnamed: 0,Hugo_Symbol,P-0004434-T01-IM5,P-0004314-T02-IM6,P-0004216-T01-IM5,P-0011462-T01-IM5,P-0000464-T01-IM3,P-0012362-T01-IM5,P-0009740-T01-IM5,P-0008538-T01-IM5,P-0001952-T01-IM3,...,P-0014854-T01-IM6,P-0006376-T03-IM6,P-0003224-T01-IM5,P-0000473-T01-IM3,P-0011072-T01-IM5,P-0009444-T02-IM6,P-0016472-T01-IM6,P-0017362-T01-IM6,P-0015582-T01-IM6,P-0014692-T01-IM6
0,RB1,0,0,0,0,0,0,-1.5,0,0,...,0,0,0,0,0,0,0,0,0,0
1,MDC1,0,0,0,0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ARAF,0,0,0,0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,PREX2,0,0,0,0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SOX17,0,0,0,0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
# Rebuild the patient -> samples mapping from the sample columns of the CNA
# table. This replaces the mapping previously derived from the SNV table.
# NOTE(review): assumes every column after the first is a sample barcode
# (i.e. the first column is Hugo_Symbol) -- confirm against df_cna.columns.
patientToSamplesDict = {}
for column in df_cna.columns[1:]:
    patient_id = "-".join(column.split('-')[:2])
    patientToSamplesDict.setdefault(patient_id, set()).add(column)

In [135]:
# Distinct genes that have at least one SNV row on chromosome X.
set(df_snv[df_snv['Chromosome'] == 'X'].Hugo_Symbol)

{'AMER1',
 'AR',
 'ARAF',
 'ATRX',
 'BCOR',
 'BTK',
 'CRLF2',
 'EIF1AX',
 'FAM58A',
 'GATA1',
 'KDM5C',
 'KDM6A',
 'MED12',
 'RBM10',
 'SH2D1A',
 'STAG2',
 'XIAP',
 'ZRSR2'}

In [136]:
# Patient IDs that generate_spruce_input skips.
# NOTE(review): "P-0000281" appeared twice in the original list; the resulting
# set is unchanged, but check whether a different ID was intended.
male_patients = {
    "P-0002574",
    "P-0000281",
    "P-0015988",
    "P-0005904",
    "P-0000694",
    "P-0001354",
    "P-0007684",
    "P-0000423",
    "P-0007700",
    "P-0016667",
}

In [141]:
def _variant_allele_frequency(sample_rows, symbol, chrom, pos):
    """Return t_alt_count / t_depth for one SNV within one sample's rows.

    Returns 0.0 when the SNV is not matched by exactly one row, or when the
    matched row has a zero or missing read depth.
    """
    hit = sample_rows[(sample_rows["Hugo_Symbol"] == symbol)
                      & (sample_rows["Chromosome"] == chrom)
                      & (sample_rows["Start_Position"] == pos)]
    if len(hit) != 1:
        # Not called in this sample, or ambiguous (several matching rows).
        return 0.0
    alt = float(hit["t_alt_count"].iloc[0])
    depth = float(hit["t_depth"].iloc[0])
    if not depth > 0:
        # Also catches NaN depth, which would otherwise be written as "nan".
        return 0.0
    return alt / depth


def generate_frequencies(df_snvs, df_cnas, patient, snvs, samples=None, out_dir=None):
    """Write the per-sample SNV frequency table of one patient to ``<patient>.tsv``.

    The file starts with three count lines (anatomical sites, samples,
    characters) and a header line, followed by one tab-separated row per
    (sample, SNV) pair. The variant allele frequency is written twice, under
    both the ``f-`` and ``f+`` columns.

    Parameters
    ----------
    df_snvs : pandas.DataFrame
        SNV table with the columns Tumor_Sample_Barcode, Hugo_Symbol,
        Chromosome, Start_Position, t_alt_count and t_depth.
    df_cnas : pandas.DataFrame
        Unused; kept so existing callers do not have to change.
    patient : str
        Patient ID; used as the output file stem.
    snvs : iterable of str
        SNV keys of the form ``"<chromosome>:<position>:<symbol>"``.
    samples : iterable of str, optional
        Sample barcodes of the patient. Defaults to
        ``patientToSamplesDict[patient]`` from the notebook namespace.
    out_dir : str, optional
        Directory to write into. Defaults to the current working directory.

    Raises
    ------
    KeyError
        If a required column is missing from ``df_snvs`` (previously this was
        swallowed and every frequency silently became 0).
    ValueError
        If an SNV key does not have three fields or a non-integer position.
    """
    if samples is None:
        # Backward-compatible default: the notebook-level mapping.
        samples = patientToSamplesDict[patient]

    # Sets have no stable order; sort so the sample and character indices in
    # the file are reproducible from one run to the next.
    sampleList = sorted(samples)
    snvList = sorted(snvs)

    path = "%s.tsv" % patient
    if out_dir is not None:
        path = os.path.join(out_dir, path)

    header = ["#sample_index", "sample_label",
              "anatomical_site_index", "anatomical_site_label",
              "character_index", "character_label",
              "f-", "f+"]

    with open(path, "w") as f:
        f.write("%d #anatomical sites\n" % 1)
        f.write("%d #samples\n" % len(sampleList))
        f.write("%d #characters\n" % len(snvList))
        f.write("\t".join(header) + "\n")
        for sampleIdx, sample in enumerate(sampleList):
            # Filter by sample once instead of once per SNV.
            sample_rows = df_snvs[df_snvs["Tumor_Sample_Barcode"] == sample]
            for snvIdx, snv in enumerate(snvList):
                chrom, pos, symbol = snv.split(":", 2)
                vaf = _variant_allele_frequency(sample_rows, symbol, chrom, int(pos))
                f.write("\t".join(map(str, [sampleIdx, sample, 0, "P", snvIdx, snv, vaf, vaf])) + "\n")


In [145]:
def _is_copy_neutral(df_cnas, symbol, cna_samples):
    """Return True if `symbol` has copy number 0 in every sample of `cna_samples`.

    A gene with no row in `df_cnas` counts as not neutral. If `cna_samples` is
    empty there is nothing to check and the gene counts as neutral.
    """
    if not cna_samples:
        return True
    gene_rows = df_cnas[df_cnas["Hugo_Symbol"] == symbol]
    if len(gene_rows) == 0:
        return False
    # Only the first row for the gene is consulted.
    first_row = gene_rows.iloc[0]
    return not any(first_row[sample] != 0 for sample in cna_samples)


def generate_spruce_input(df_snvs, df_cnas, patientToSamplesDict, male_patients):
    """Write one frequency file per patient via ``generate_frequencies``.

    For every patient that is not in ``male_patients``:

    1. collect the SNVs of all the patient's samples whose
       Variant_Classification is neither 'Silent' nor 'Intron';
    2. keep only SNVs whose gene has copy number 0 in every one of the
       patient's samples that has a column in ``df_cnas``;
    3. pass the surviving SNV keys to ``generate_frequencies``.

    Parameters
    ----------
    df_snvs : pandas.DataFrame
        SNV table with the columns Tumor_Sample_Barcode,
        Variant_Classification, Hugo_Symbol, Chromosome and Start_Position.
    df_cnas : pandas.DataFrame
        Copy-number table with a Hugo_Symbol column and one column per sample.
    patientToSamplesDict : dict
        Maps a patient ID to the collection of its sample barcodes.
    male_patients : collection of str
        Patient IDs to skip.
    """
    excluded_classes = ['Silent', 'Intron']
    cna_columns = set(df_cnas.columns)

    for patient, samples in patientToSamplesDict.items():
        if patient in male_patients:
            continue

        # Candidate SNVs: every non-excluded variant seen in any sample of
        # the patient, keyed as "<chromosome>:<position>:<symbol>".
        patient_rows = df_snvs[df_snvs["Tumor_Sample_Barcode"].isin(list(samples))]
        kept_rows = patient_rows[~patient_rows["Variant_Classification"].isin(excluded_classes)]
        snv_candidates = {
            "%s:%s:%s" % key
            for key in zip(kept_rows["Chromosome"],
                           kept_rows["Start_Position"],
                           kept_rows["Hugo_Symbol"])
        }

        # Samples without a CNA column cannot veto an SNV.
        cna_samples = [sample for sample in samples if sample in cna_columns]

        # Neutrality depends only on the gene, so look each gene up once.
        neutral_by_symbol = {}
        snvs = set()
        for snv in snv_candidates:
            symbol = snv.split(":")[2]
            if symbol not in neutral_by_symbol:
                neutral_by_symbol[symbol] = _is_copy_neutral(df_cnas, symbol, cna_samples)
            if neutral_by_symbol[symbol]:
                snvs.add(snv)

        generate_frequencies(df_snvs, df_cnas, patient, snvs)

In [143]:
# Spot-check: SNV rows for SMARCA4 on chromosome 19 in sample
# P-0003381-T01-IM5. The position filter is left commented out so that every
# SMARCA4 row of that sample is shown.
df_snv[(df_snv["Tumor_Sample_Barcode"] == "P-0003381-T01-IM5")
       & (df_snv["Hugo_Symbol"] == "SMARCA4")
       & (df_snv["Chromosome"] == "19")
#        & (df_snv["Start_Position"] == 11143994)
      ]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,HIGH_INF_POS,MOTIF_SCORE_CHANGE,VAF,TCN,LCN,CCF,CLONALITY,Total_Index_Lesions_In_Patient,Num_Other_Mets_In_Patient,CLONALITY_In_Other_Mets
6726,SMARCA4,6597,MSKCC,GRCh37,19,11143994,11143994,+,Missense_Mutation,SNP,...,,,,,,,,,,


In [146]:
# Write a "<patient>.tsv" file into the current working directory for every
# patient in patientToSamplesDict that is not listed in male_patients.
generate_spruce_input(df_snv, df_cna, patientToSamplesDict, male_patients)

In [40]:
# Distinct Variant_Classification values in the SNV table
# (generate_spruce_input drops 'Silent' and 'Intron').
set(df_snv["Variant_Classification"])

{"5'UTR",
 'Frame_Shift_Del',
 'Frame_Shift_Ins',
 'In_Frame_Del',
 'In_Frame_Ins',
 'Intron',
 'Missense_Mutation',
 'Nonsense_Mutation',
 'Nonstop_Mutation',
 'Silent',
 'Splice_Site',
 'Translation_Start_Site'}

In [49]:
# Inspect the sample barcodes recorded for a single patient.
patientToSamplesDict["P-0011355"]

{'P-0011355-CMO-P', 'P-0011355-T01-IM5'}