In [1]:
# Import necessary libraries
import pandas as pd
import plotly.graph_objects as go
import dalmatian
from google.cloud import storage
import matplotlib.pyplot as plt
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

import numpy as np

import sys
import os
sys.path.append('..')  
import model_helper 
from helper import *
from plotly_helper import *

In [2]:
# Function to load data
def load_data(patient_id, wbc_file, treatment_file):
    # Load WBC and treatment data
    wbc_df = pd.read_csv(wbc_file)
    treatment_df = pd.read_csv(treatment_file, sep='\t')
    
    # Get input files from Terra through dalmatian
    workspace = 'broad-firecloud-ibmwatson/TAG_CLL_Clonal_Kinetic_UMI_PrAN'
    wm = dalmatian.WorkspaceManager(workspace)
    participants = wm.get_participants()
    
   # Load additional data for the patient
    cluster_CCF_df = pd.read_csv(participants.loc[patient_id]['cluster_ccfs'], sep='\t')
    
#     tree_df = pd.read_csv(participants.loc[patient_id]['tree_tsv'], sep='\t')
    
    tree_df = pd.read_csv(f"../Cell_Population/{patient_id}_build_tree_posteriors.tsv", sep = '\t')

    abundance_file = f"../Cell_Population/{patient_id}_cell_population_abundances.tsv"
    mcmc_file = f"../Cell_Population/{patient_id}_cell_population_mcmc_trace.tsv"  
    
    abundance_df = pd.read_csv(abundance_file, sep='\t')
    mcmc_df = pd.read_csv(mcmc_file, sep='\t')  
    
    return wbc_df, treatment_df, cluster_CCF_df, abundance_df, mcmc_df, tree_df

# Function to filter data for a specific patient
def filter_patient_data(wbc_df, patient_id):
    wbc_df_patient = wbc_df[wbc_df['Patient'] == patient_id]
    wbc_df_patient.reset_index(drop=True, inplace=True)
    
    # Filter out rows where ALC is empty or NaN
    wbc_df_patient = wbc_df_patient[wbc_df_patient['ALC'].notna() & (wbc_df_patient['ALC'] != '')]
    wbc_df_patient.reset_index(drop=True, inplace=True)
    
    times_sample = [int(i) for i in wbc_df_patient.loc[wbc_df_patient.Sample.notna(), 'Day'].tolist()]
    alc_all = [float(i) for i in wbc_df_patient['ALC'].tolist()]
    
    
    alc_count_sample = [float(i) for i in wbc_df_patient.loc[wbc_df_patient.Sample.notna(), 'CLL_est'].tolist()]
    all_times = [int(i) for i in wbc_df_patient['Day'].to_list()]
    
    return wbc_df_patient, times_sample, alc_all,alc_count_sample,all_times
    




In [3]:
end_treatment_df = pd.read_csv("../CLL8_end_treatment.csv")

In [5]:
# Input patient information
# Input patient information
patient_id = 'RP-1895_FI-13'
wbc_file = '../combined_FI_alc_updated.csv'
treatment_file = '../GCLL_treatment.txt'
UMI_start = 8
UMI_end = 9
treatment_start = 0


In [47]:
# Manually verify the sample list order is correct

In [6]:

sample_list = ['RP-1895_10004-C1D1_v1_Exome_OnPrem-RP-1895_Saliva-372_v1_Exome_OnPrem','RP-1895_10157-C2D28_v1_Exome_OnPrem-RP-1895_Saliva-372_v1_Exome_OnPrem', 'RP-1895_10227-C3D28_v1_Exome_OnPrem-RP-1895_Saliva-372_v1_Exome_OnPrem', 'RP-1895_10304-C4D28_v1_Exome_OnPrem-RP-1895_Saliva-372_v1_Exome_OnPrem', 'RP-1895_10428-C6D28_v1_Exome_OnPrem-RP-1895_Saliva-372_v1_Exome_OnPrem', 'SM-N57UY_RP-1895_Saliva-372_v1_Exome_OnPrem_pair', 'SM-N57UZ_RP-1895_Saliva-372_v1_Exome_OnPrem_pair', 'SM-N57UW_RP-1895_Saliva-372_v1_Exome_OnPrem_pair', 'SM-N57UX', 'SM-N57UU_RP-1895_Saliva-372_v1_Exome_OnPrem_pair']

In [26]:
# Output file

In [7]:
outputfile_name = "FI_subplot_combined_13_wbc_updated.html"

Update patient specific information in the below cell 

In [8]:
# Get the treatment duration 

treatment_end = 1980

In [9]:
treatment_end

1980

In [11]:

wbc_df, treatment_df, cluster_CCF_df, abundance_df, mcmc_df, tree_df = load_data(patient_id, wbc_file,treatment_file )

# Filter data for the specific patient
wbc_df_patient, times_sample,alc_all,alc_count_sample, all_times = filter_patient_data(wbc_df, patient_id)


# Plot CLL count over time and save as HTML
CLL_plot_html = plot_CLL_count(patient_id, times_sample, alc_count_sample, UMI_start, UMI_end, treatment_start, treatment_end, )
    





In [12]:
times_sample

[-1, 55, 88, 111, 167, 335, 706, 1076, 1440, 1798]

In [13]:
alc_count_sample

[111.0, 217.52, 41.44, 10.98, 4.75, 1.464, 1.09968, 1.03264, 0.44652, 1.4924]

In [14]:
wbc_table_html = plot_metadata_table(wbc_df_patient, patient_id )

In [15]:
tree_df

Unnamed: 0,n_iter,likelihood,edges
0,133,"[0.3333333333333332, 0.5, 0.5, 0.3333333333333...","1-2,1-3,1-4,None-1"
1,116,"[0.3333333333333332, 0.3333333333333332, 0.5, ...","1-2,1-3,2-4,None-1"


In [16]:
 plot_ccf_html = plot_ccf(cluster_CCF_df, times_sample, treatment_df,)

In [17]:
ccf_tree_html = plot_ccf_tree_combined(
    tree_df=tree_df,
    tree_selected=1
    ,
    ccf_df=cluster_CCF_df,
    times_sample=times_sample,
    treatment_df=treatment_df,


)

In [18]:
cluster_list, cluster_abundance = model_helper.get_abundance(abundance_df, mcmc_df, sample_list)
subclone_sample, log_subclone_sample = model_helper.calc_subclone(alc_count_sample, cluster_abundance, cluster_list)
all_abundance = model_helper.get_all_abundance(cluster_list, mcmc_df, sample_list, times_sample, )

In [19]:
#  Add noise to all abundance

import random 
random.seed(42)
def add_uniform_noise_and_normalize(lst, low=0, high=0.01):
    
    # Add uniform noise
    noisy_lst = [x + random.uniform(low, high) for x in lst]

    
    # Normalize the list
    total_sum = sum(noisy_lst)
    
    normalized_lst = [x / total_sum for x in noisy_lst]

    return normalized_lst


subclone_cluster_iter = {}
# Iterate through each cluster
for cluster, iterations in all_abundance.items():
   
    # Iterate through each iteration in the cluster
    for iteration, value in iterations.items():
        # If the iteration is not in the new dict, add it
        if iteration not in subclone_cluster_iter:
            subclone_cluster_iter[iteration] = {}
        # Add the cluster and its value to this iteration
        subclone_cluster_iter[iteration][cluster] = value

        
        
noise_added_iter = {}

for iteration, abundances in subclone_cluster_iter.items():
    
    transposed_data = {i: list(values) for i, values in enumerate(zip(*abundances.values()), start=1)}
    
    noise_added_iter[iteration] = {}
    for time_point, abundance in transposed_data.items():
        
#         print(time_point, abundance)
        noise_abundance = add_uniform_noise_and_normalize(abundance)
    
#         print(noise_abundance)

        noise_added_iter[iteration][time_point] = add_uniform_noise_and_normalize(abundance)

    
noise_added_all_abundance = {}
for iteration, abundances in noise_added_iter.items():
    original_format = {i + 1: list(values) for i, values in enumerate(zip(*abundances.values()))}
    noise_added_all_abundance[iteration] = original_format
    
    
noise_added_original_all_abundances = {}

for iteration, abundances in noise_added_all_abundance.items():
    
    for cluster, abundance_value in abundances.items():
        if cluster not in noise_added_original_all_abundances:
            noise_added_original_all_abundances[cluster] = {}
        noise_added_original_all_abundances[cluster][iteration] = abundance_value
            

In [20]:
subclone_sample_mcmc_with_uniform_noise, log_subclone_sample_mcmc_with_uniform_noise = model_helper.calc_subclone(alc_count_sample, noise_added_original_all_abundances, cluster_list, input_type = "mcmc")

In [41]:
times_aft_tx = [x for x in all_times if x > treatment_start]
times_aft_tx.insert(0, treatment_end)

# Set the extrapolate time after treatment
extrapolate_start_idx = 1

In [43]:
times_sample

[-1, 55, 88, 111, 167, 335, 706, 1076, 1440, 1798]

In [44]:
times_sample[extrapolate_start_idx:]

[55, 88, 111, 167, 335, 706, 1076, 1440, 1798]

In [45]:
# Generate Plotly plot
subclone_plot_html = plot_subclones(cluster_list, times_sample, alc_count_sample, log_subclone_sample, extrapolate_start_idx, times_aft_tx, treatment_df, treatment_end, CLL14_modeling=True)


In [46]:
linear_model_mcmc_html = plot_linear_model_mcmc(cluster_list,times_sample,  alc_count_sample, log_subclone_sample_mcmc_with_uniform_noise,extrapolate_start_idx,times_aft_tx,treatment_df, treatment_end, CLL14_modeling=True)  

In [47]:
times_sliced_aft = [int(i) for i in wbc_df_patient[wbc_df_patient['Day'] > 0]['Day'].values  if int(i) > 0]

In [48]:
# Needs to manually check times_sliced_aft

In [49]:
times_sliced_aft

[27,
 55,
 60,
 75,
 83,
 88,
 103,
 111,
 139,
 167,
 244,
 335,
 419,
 510,
 608,
 706,
 804,
 895,
 986,
 1076,
 1168,
 1259,
 1350,
 1440,
 1532,
 1616,
 1707,
 1798,
 1889,
 1980]

In [50]:
len(times_sliced_aft)

30

In [51]:
index_samples_model = slice(1,10)

# index_samples_model is the index of the samples after treatment. For instance, slice(1,7) means sample 2, 3, 4, 5,6, 7

X, y = create_inputs(times_sliced_aft, log_subclone_sample_mcmc_with_uniform_noise,2, index_samples_model, times_sample)



In [52]:
y

[None,
 array([4.50392074, 3.51450279, 4.14992211, 3.40810589]),
 None,
 None,
 None,
 array([2.46259825, 2.4251724 , 1.93721416, 2.44040975]),
 None,
 array([ 0.89763341,  1.4207696 , -0.51019188,  1.33234409]),
 None,
 array([ 0.3486148 ,  0.18933772, -0.43135825,  0.38982452]),
 None,
 array([-0.32420147, -1.96309158, -0.87389947, -1.69425318]),
 None,
 None,
 None,
 array([-0.282945  , -5.374412  , -1.13794844, -3.84206926]),
 None,
 None,
 None,
 array([-0.32888273, -5.1199764 , -1.23952502, -4.02699783]),
 None,
 None,
 None,
 array([-1.06929738, -5.44489236, -2.36646929, -5.23640698]),
 None,
 None,
 None,
 array([ 0.28174906, -5.58883674, -1.96575849, -3.73967195]),
 None,
 None]

In [53]:
wbc_aft_tx = wbc_df_patient[wbc_df_patient.Day > treatment_start]['CLL_est'].to_list()
wbc_model = [i for i in wbc_aft_tx if i > 0]

In [54]:
wbc_model

[182.49,
 217.52,
 179.97,
 69.06,
 59.78,
 41.44,
 14.31,
 10.98,
 9.46,
 4.75,
 1.88,
 1.464,
 1.43,
 1.52,
 2.06,
 1.09968,
 1.76,
 1.7,
 1.84,
 1.03264,
 2.2,
 1.68,
 1.98,
 0.44652,
 2.32,
 2.46,
 2.19,
 1.4924,
 2.06,
 3.43]

In [55]:
len(wbc_model)

30

In [56]:
n_clusters = len(cluster_list)
logsumexp_points = np.log(wbc_model)
model = MultiClusterLinearRegression(n_clusters, X, y)
model.fit(logsumexp_points)

In [57]:
# Generate Plotly plot
subclone_plot_new_model_html = plot_subclones_new_model(cluster_list, times_sample, wbc_model, log_subclone_sample, extrapolate_start_idx, times_aft_tx, times_sliced_aft, treatment_df,treatment_end, model, CLL14_modeling=True )


DEBUG: plot_subclones_new_model called with clusters=[1, 2, 3, 4], times_sample[1]=55, treatment_end=1980


In [58]:
plot_mcmc_new_model_html = plot_mcmc_model(cluster_list, index_samples_model, times_aft_tx, times_sample, times_sliced_aft, sample_list, wbc_model, log_subclone_sample_mcmc_with_uniform_noise, treatment_df, treatment_end, CLL14_modeling=True)


In [59]:
create_html_file([wbc_table_html, plot_ccf_html, ccf_tree_html, CLL_plot_html, subclone_plot_html,linear_model_mcmc_html,subclone_plot_new_model_html, plot_mcmc_new_model_html], output_file=outputfile_name)

HTML file saved as FI_subplot_combined_13_wbc_updated.html
