# Compile pre- and post-ICB CCFs for mutations

TODO: Need to get OncoKB and CADD annotations for these mutations

In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
# Mutation data (pickled table).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
mut_ccf_df = pd.read_pickle("data/icb_wes_mut_ccfs_maf.pickle")

# Participant and sample metadata (tab-separated cohort sheets)
pt_df = pd.read_table("data/participant_cohort_sheet.tsv")
sample_df = pd.read_table("data/sample_cohort_sheet.tsv")

# Deleteriousness annotations (tab-separated)
del_df = pd.read_table("data/annotated_nf1_mut_ccfs.tsv")

## Clean data

In [3]:
def clean_sample_id(sid):
    """Return ``sid`` without a trailing ``"_pair"`` suffix.

    Only a single trailing occurrence is removed; IDs that do not end in
    ``"_pair"`` are returned unchanged.
    """
    suffix = "_pair"
    return sid[:-len(suffix)] if sid.endswith(suffix) else sid

# Complementary base for each of the four DNA nucleotides.
complements = dict(zip("GCAT", "CGTA"))


def clean_ref_alt(ref, alt):
    """Rewrite a substitution so that its reference base is C or T.

    When ``ref`` is "G" or "A" and ``alt`` is not "-", both alleles are
    replaced by their complements. Every other combination is returned
    unchanged.
    NOTE(review): "-" is presumably the placeholder for a deleted allele;
    confirm against the input format.

    Returns:
        A ``(ref, alt)`` tuple.

    Raises:
        KeyError: if complementing is required and ``alt`` is not one of
            the four bases in ``complements``.
    """
    needs_complement = ref in ("G", "A") and alt != "-"
    if not needs_complement:
        return ref, alt
    return complements[ref], complements[alt]

In [4]:
# Drop the "_pair" suffix from sample IDs.
mut_ccf_df["Sample_ID"] = mut_ccf_df["Sample_ID"].map(clean_sample_id)

# Normalise every (reference, alternate) pair with clean_ref_alt; the
# resulting tuple is kept in the "sbs" column and then unpacked back into
# the two allele columns.
allele_cols = ["Reference_Allele", "Tumor_Seq_Allele"]
mut_ccf_df["sbs"] = mut_ccf_df[allele_cols].apply(
    lambda row: clean_ref_alt(*row), axis=1
)
mut_ccf_df["Reference_Allele"] = mut_ccf_df["sbs"].map(lambda pair: pair[0])
mut_ccf_df["Tumor_Seq_Allele"] = mut_ccf_df["sbs"].map(lambda pair: pair[1])

# Harmonise column names with those used for the joins later on.
maf_column_renames = {
    "Start_position": "Start_Position",
    "Tumor_Seq_Allele": "Tumor_Seq_Allele2",
}
mut_ccf_df = mut_ccf_df.rename(columns=maf_column_renames)

## Obtain latest pre- and earliest post-ICB WES samples for each participant

In [5]:
# Keep only tumor samples whose WES data is flagged as usable.
has_usable_wes = sample_df["usable_wes"]
is_tumor = sample_df["tumor_normal"] == "tumor"
sample_df = sample_df[has_usable_wes & is_tumor]

In [6]:
# Bring each participant's DaysDxtoICB value onto its sample rows.
icb_timing = pt_df[["participant_id_legacy", "DaysDxtoICB"]]
sample_df = sample_df.merge(icb_timing, on="participant_id_legacy", how="left")

In [7]:
# Collection day expressed relative to DaysDxtoICB; values <= 0 are treated
# as pre-ICB and values > 0 as post-ICB when the samples are split.
sample_df["collection_days_from_icb"] = sample_df["collection_date_dfd"].sub(
    sample_df["DaysDxtoICB"]
)

In [8]:
# Split samples by the sign of their offset from ICB. Rows with a missing
# offset satisfy neither condition and end up in neither frame.
days_from_icb = sample_df["collection_days_from_icb"]
pre_sample_df = sample_df[days_from_icb.le(0)]
post_sample_df = sample_df[days_from_icb.gt(0)]

In [9]:
# Per participant: the largest pre-ICB offset (closest to ICB) and the
# smallest post-ICB offset, each as a one-column frame indexed by participant.
latest_pre_dates = (
    pre_sample_df
    .groupby("participant_id_legacy")["collection_days_from_icb"]
    .max()
    .to_frame("latest_pre_date")
)
earliest_post_dates = (
    post_sample_df
    .groupby("participant_id_legacy")["collection_days_from_icb"]
    .min()
    .to_frame("earliest_post_date")
)

In [10]:
# Attach the per-participant reference date to every sample row; the date
# frames are indexed by participant, hence right_index=True.
pre_sample_df = pd.merge(
    pre_sample_df,
    latest_pre_dates,
    left_on="participant_id_legacy",
    right_index=True,
)
post_sample_df = pd.merge(
    post_sample_df,
    earliest_post_dates,
    left_on="participant_id_legacy",
    right_index=True,
)

In [11]:
# Keep only the samples collected on the participant's reference date.
# Several samples can share that date, so a participant may retain >1 row.
is_latest_pre = pre_sample_df["collection_days_from_icb"].eq(pre_sample_df["latest_pre_date"])
pre_sample_df = pre_sample_df[is_latest_pre]

is_earliest_post = post_sample_df["collection_days_from_icb"].eq(post_sample_df["earliest_post_date"])
post_sample_df = post_sample_df[is_earliest_post]

In [12]:
def _sample_ids_with_aliases(cohort_df):
    """Collect every identifier under which the samples in ``cohort_df`` may appear.

    The result contains each ``sample_id_legacy`` value plus every individual
    alias. A single ``aliases`` cell can list several names separated by
    commas (e.g. "GBM.ICB-1-15.228.B.Post.FFb, SM-JDD5Y"), so each cell is
    split on commas and the pieces are stripped of surrounding whitespace.
    Adding the raw cell as one string would create an entry that can never
    equal a single sample ID in a later ``isin`` lookup.

    Args:
        cohort_df: DataFrame with ``sample_id_legacy`` and ``aliases`` columns;
            missing aliases are skipped.

    Returns:
        A set of sample identifier strings.
    """
    sample_ids = set(cohort_df["sample_id_legacy"])
    for alias_cell in cohort_df["aliases"].dropna():
        for alias in str(alias_cell).split(","):
            alias = alias.strip()
            if alias:  # ignore empty fragments such as a trailing comma
                sample_ids.add(alias)
    return sample_ids


latest_pre_samples = _sample_ids_with_aliases(pre_sample_df)
earliest_post_samples = _sample_ids_with_aliases(post_sample_df)

# Both operands are already sets, so a plain union is enough.
relevant_samples = latest_pre_samples | earliest_post_samples

In [13]:
# Sample rows whose legacy ID belongs to the selected pre-/post-ICB ID sets.
is_selected_pre = pre_sample_df["sample_id_legacy"].isin(latest_pre_samples)
latest_pre_sample_df = pre_sample_df.loc[is_selected_pre]

is_selected_post = post_sample_df["sample_id_legacy"].isin(earliest_post_samples)
earliest_post_sample_df = post_sample_df.loc[is_selected_post]

## Get pre- and post-ICB mean CCFs for these mutations

In [14]:
# Mutation rows coming from the selected pre-ICB and post-ICB samples.
from_pre_sample = mut_ccf_df["Sample_ID"].isin(latest_pre_samples)
from_post_sample = mut_ccf_df["Sample_ID"].isin(earliest_post_samples)

pre_muts = mut_ccf_df.loc[from_pre_sample]
post_muts = mut_ccf_df.loc[from_post_sample]

In [15]:
# Average clust_ccf_mean per (participant, mutation) across the selected
# samples, yielding one row per pair with the mean in a named column.
pre_mut_ccfs = (
    pre_muts
    .groupby(["Patient_ID", "unique_mut_id"])["clust_ccf_mean"]
    .mean()
    .rename("pre_ccf_mean")
    .reset_index()
)
post_mut_ccfs = (
    post_muts
    .groupby(["Patient_ID", "unique_mut_id"])["clust_ccf_mean"]
    .mean()
    .rename("post_ccf_mean")
    .reset_index()
)

In [16]:
# Descriptive columns carried into the compiled mutation table.
kept_cols = [
    "Patient_ID",
    "Sample_ID",
    "Hugo_Symbol",
    "Chromosome",
    "Start_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "unique_mut_id",
    "Cluster_Assignment",
    "clonal_status",
]
mut_df = mut_ccf_df.loc[:, kept_cols].drop_duplicates()

In [17]:
# Columns identifying one mutation within one participant.
mutation_key = ["Patient_ID", "unique_mut_id"]
pre_dates = latest_pre_sample_df[["participant_id_legacy", "latest_pre_date"]]
post_dates = earliest_post_sample_df[["participant_id_legacy", "earliest_post_date"]]

# Step 1: attach mean pre- and post-ICB CCFs (NaN where a mutation has none).
with_ccfs = (
    mut_df
    .merge(pre_mut_ccfs, on=mutation_key, how="left")
    .merge(post_mut_ccfs, on=mutation_key, how="left")
)

# Step 2: attach the participant-level pre and post reference dates, dropping
# the helper key column after each merge.
with_pre_date = with_ccfs.merge(
    pre_dates, left_on="Patient_ID", right_on="participant_id_legacy", how="left"
).drop(columns=["participant_id_legacy"])
with_both_dates = with_pre_date.merge(
    post_dates, left_on="Patient_ID", right_on="participant_id_legacy", how="left"
).drop(columns=["participant_id_legacy"])

# Step 3: order by genomic position and keep one row per participant/mutation
# (the merges above can repeat rows when a participant has several samples).
mut_df = (
    with_both_dates
    .sort_values(["Patient_ID", "Chromosome", "Start_Position"])
    .reset_index(drop=True)
    .drop_duplicates(mutation_key)
)

In [18]:
# Sanity check: (rows, columns) of the NF1 subset of the compiled table.
mut_df.loc[mut_df["Hugo_Symbol"].eq("NF1")].shape

(54, 14)

## Join deleteriousness annotations

In [19]:
# Apply the same allele normalisation used for the mutation table, so both
# tables express substitutions identically before they are joined.
annot_allele_cols = ["Reference_Allele", "Tumor_Seq_Allele"]
del_df["sbs"] = del_df[annot_allele_cols].apply(
    lambda row: clean_ref_alt(*row), axis=1
)
del_df["Reference_Allele"] = del_df["sbs"].map(lambda pair: pair[0])
del_df["Tumor_Seq_Allele"] = del_df["sbs"].map(lambda pair: pair[1])

# Chromosome labels are compared as strings in the join.
del_df["Chromosome"] = del_df["Chromosome"].astype(str)

In [20]:
# Use the same column names as the mutation table.
annotation_column_renames = {
    "Start_position": "Start_Position",
    "Tumor_Seq_Allele": "Tumor_Seq_Allele2",
}
del_df = del_df.rename(columns=annotation_column_renames)

In [21]:
# Genomic coordinates and alleles that identify a variant in both tables.
join_cols = ["Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2"]
annotation_cols = ["CADD_PHRED", "ONCOKB_MUTATION_EFFECT", "cadd_oncokb_deleterious"]

# Cast the keys to str on both sides so the merge compares like with like,
# then reduce the annotation table to unique key + annotation rows.
del_df[join_cols] = del_df[join_cols].astype(str)
del_df = del_df.loc[:, join_cols + annotation_cols].drop_duplicates()

mut_df[join_cols] = mut_df[join_cols].astype(str)
mut_df = mut_df.merge(del_df, on=join_cols, how="left")

In [22]:
# NF1 mutations that have a mean CCF both before and after ICB.
is_nf1 = mut_df["Hugo_Symbol"] == "NF1"
has_pre_ccf = mut_df["pre_ccf_mean"].notna()
has_post_ccf = mut_df["post_ccf_mean"].notna()
nf1_paired_mut_df = mut_df[is_nf1 & has_pre_ccf & has_post_ccf]

In [23]:
# Write the paired NF1 table as a tab-separated file without the row index.
nf1_output_path = "mutations_w_prepost_ccfs_nf1.tsv"
nf1_paired_mut_df.to_csv(nf1_output_path, sep="\t", index=False)

In [24]:
# Display the paired NF1 mutation table for visual inspection.
nf1_paired_mut_df

Unnamed: 0,Patient_ID,Sample_ID,Hugo_Symbol,Chromosome,Start_Position,Reference_Allele,Tumor_Seq_Allele2,unique_mut_id,Cluster_Assignment,clonal_status,pre_ccf_mean,post_ccf_mean,latest_pre_date,earliest_post_date,CADD_PHRED,ONCOKB_MUTATION_EFFECT,cadd_oncokb_deleterious
8583,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,NF1,17,29550543,C,T,17:29550543G>A,9,subclonal,0.01,0.28,-1786.0,98.0,11.27,Unknown,False
8584,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,NF1,17,29553685,C,T,17:29553685G>A,6,subclonal,0.01,0.9,-1786.0,98.0,29.3,Unknown,True
8585,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,NF1,17,29556107,C,T,17:29556107C>T,8,subclonal,0.01,0.01,-1786.0,98.0,24.8,Unknown,True
8587,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,NF1,17,29663438,C,T,17:29663438G>A,9,subclonal,0.01,0.28,-1786.0,98.0,28.0,Unknown,True
8588,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,NF1,17,29664881,C,T,17:29664881G>A,9,subclonal,0.01,0.28,-1786.0,98.0,42.0,Unknown,True
26769,GBM.ICB-10,GBM.ICB-10-14.178.B.Pre2.FFc,NF1,17,29647432,C,T,17:29647432C>T,11,subclonal,0.01,0.01,-807.0,14.0,26.1,Unknown,True
26770,GBM.ICB-10,GBM.ICB-10-14.178.B.Pre2.FFc,NF1,17,29665798,C,T,17:29665798C>T,11,subclonal,0.01,0.01,-807.0,14.0,26.5,Unknown,True
43600,GBM.ICB-131,GBM.ICB-131-16.577.2-1.u.Pre,NF1,17,29486094,C,T,17:29486094G>A,9,subclonal,0.01,0.01,-83.0,355.0,25.5,Unknown,True
43601,GBM.ICB-131,GBM.ICB-131-16.577.2-1.u.Pre,NF1,17,29527470,C,T,17:29527470C>T,15,subclonal,0.01,0.01,-83.0,355.0,24.6,Unknown,True
43602,GBM.ICB-131,GBM.ICB-131-16.577.2-1.u.Pre,NF1,17,29533315,C,T,17:29533315C>T,12,subclonal,0.64,0.01,-83.0,355.0,38.0,Unknown,True


In [26]:
# Display the selected latest pre-ICB sample rows for visual inspection.
latest_pre_sample_df

Unnamed: 0,sample_id,Broad_Name_add,Histology,aliases,collection_date_dfd,has_RNA,has_WES,irb_protocols,no_normal,original_material_type,...,Exonic Rate,Mapping Rate,pdb_preservation_type,usable_wes,usable_rnaseq,participant_id_legacy,sample_id_legacy,DaysDxtoICB,collection_days_from_icb,latest_pre_date
0,GBM-001-T-1B,,Giant cell GBM,GBM.ICB-1-10.533.A2-Pre,0.0,True,True,10-417,False,tissue,...,0.880715,0.983749,,True,False,GBM.ICB-1,GBM.ICB-1-10.533.B1.Pre,1786.0,-1786.0,-1786.0
3,GBM-010-T-1,,gbm,,23.0,True,True,10-417,False,tissue,...,0.839242,0.990576,,True,False,GBM.ICB-10,GBM.ICB-10-14.178.B.Pre2.FFc,830.0,-807.0,-807.0
9,GBM-102-T-1,,gbm,,0.0,True,True,10-417,True,tissue,...,0.877586,0.98333,,True,False,GBM.ICB-102,GBM.ICB-102-14.186.Pre,307.0,-307.0,-307.0
20,GBM-131-T-1,,gliosarcoma,,0.0,False,True,10-417,False,tissue,...,,,,True,False,GBM.ICB-131,GBM.ICB-131-16.577.2-1.u.Pre,83.0,-83.0,-83.0
23,GBM-136-T-1,GBM.ICB-136-16.350.A2.Pre.add,,GBM.ICB-136-16.350.A2.Pre.add,0.0,True,True,10-417,True,tissue,...,0.916316,0.98638,,True,True,GBM.ICB-136,GBM.ICB-136-16.350.A2.Pre,49.0,-49.0,-49.0
26,GBM-139-T-1A,,,,0.0,True,True,10-417,False,tissue,...,0.887162,0.962082,,True,True,GBM.ICB-139,GBM.ICB-139-16.266-Pre.A2,35.0,-35.0,-35.0
27,GBM-139-T-1B,,,,0.0,True,True,10-417,False,tissue,...,0.846051,0.909367,,True,False,GBM.ICB-139,GBM.ICB-139-16.266-Pre.C5,35.0,-35.0,-35.0
28,GBM-139-T-1C,,,,0.0,True,True,10-417,False,tissue,...,0.851592,0.988466,,True,True,GBM.ICB-139,GBM.ICB-139-16.266.B.Pre.FFc,35.0,-35.0,-35.0
32,GBM-014-T-2,,gbm,,300.0,True,True,10-417,False,tissue,...,0.902156,0.979729,,True,True,GBM.ICB-14,GBM.ICB-14-16.702.B1.Pre2,425.0,-125.0,-125.0
35,GBM-141-T-1,GBM.ICB-141-15.700.1B.Pre.add,,GBM.ICB-141-15.700.1B.Pre.add,0.0,True,True,10-417,True,tissue,...,0.912824,0.980922,,True,False,GBM.ICB-141,GBM.ICB-141-15.700.1B.Pre,409.0,-409.0,-409.0


In [27]:
# Display the selected earliest post-ICB sample rows for visual inspection.
earliest_post_sample_df

Unnamed: 0,sample_id,Broad_Name_add,Histology,aliases,collection_date_dfd,has_RNA,has_WES,irb_protocols,no_normal,original_material_type,...,Exonic Rate,Mapping Rate,pdb_preservation_type,usable_wes,usable_rnaseq,participant_id_legacy,sample_id_legacy,DaysDxtoICB,collection_days_from_icb,earliest_post_date
1,GBM-001-T-2,,GBM,"GBM.ICB-1-15.228.B.Post.FFb, SM-JDD5Y",1884.0,True,True,"10-417, 17-682",False,tissue,...,0.820317,0.985425,,True,True,GBM.ICB-1,GBM.ICB-1-15.228.Post.FFbank,1786.0,98.0,98.0
4,GBM-010-T-2,,gbm,"GBM.ICB-10-17.305.Post.FFb, SM-JDD5R",844.0,True,True,"10-417, 17-682",False,tissue,...,0.852651,0.989419,,True,True,GBM.ICB-10,GBM.ICB-10-17.305.A.Post.FFb,830.0,14.0,14.0
10,GBM-102-T-2,,gbm,,397.0,True,True,10-417,True,tissue,...,0.895114,0.984699,,True,False,GBM.ICB-102,GBM.ICB-102-14.439.A.Post.FFc,307.0,90.0,90.0
12,GBM-107-T-1,,gbm,,265.0,True,True,10-417,False,tissue,...,0.879410,0.975860,,True,True,GBM.ICB-107,GBM.ICB-107-18.067.A2.Post,259.0,6.0,6.0
15,GBM-011-T-1A,,gbm,,606.0,True,True,10-417,True,tissue,...,0.891150,0.970226,,True,True,GBM.ICB-11,GBM.ICB-11-17.000-Post2.C1,36.0,570.0,570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,GBM-076-T-1,,gbm,GBM.ICB-76-20.769.A2-Post.extra,1807.0,True,True,10-417,False,tissue,...,0.873123,0.990121,,True,True,GBM.ICB-76,GBM.ICB-76-20.769.A2-Post,33.0,1774.0,1774.0
191,GBM-078-T-2,,glioma,"GBM.ICB-78-16.739.B.Post.FFb, SM-JDKOZ",348.0,True,True,"10-417, 17-682",False,tissue,...,0.909551,0.937919,,True,False,GBM.ICB-78,GBM.ICB-78-16.739.Post.FFbank,22.0,326.0,326.0
193,GBM-008-T-2,,gbm,"GBM.ICB-8-14.194.Post, SM-JDKOQ",339.0,True,True,"10-417, 17-682",False,tissue,...,0.456219,0.982462,,True,False,GBM.ICB-8,GBM.ICB-8-14.194.Post.FFbank,273.0,66.0,66.0
194,GBM-084-T-2,,gbm,"GBM.ICB-84-16.138.B.Pre2.FFb, GBM.ICB-84-16.13...",84.0,True,True,"10-417, 17-682",False,tissue,...,0.834741,0.983420,,True,True,GBM.ICB-84,GBM.ICB-84-16.138.Post.FFbank,29.0,55.0,55.0


In [28]:
# Write the latest pre-ICB sample rows as a tab-separated file, no row index.
pre_samples_path = "latest_pre_samples.tsv"
latest_pre_sample_df.to_csv(pre_samples_path, sep="\t", index=False)

In [29]:
# Write the earliest post-ICB sample rows as a tab-separated file, no row index.
post_samples_path = "earliest_post_samples.tsv"
earliest_post_sample_df.to_csv(post_samples_path, sep="\t", index=False)