# MACH2

This segment includes code snippets for running MACH2.

## Prostate

In [None]:
import mach2
import time
import tracemalloc
import os

# Per-patient MACH2 results for the prostate cohort, keyed by patient number.
gundem_sols, gundem_time, gundem_memory = {}, {}, {}

for patient in (10, 12, 17, 21, 22, 24, 29, 31, 32, 34):
    prefix = f'../mach2/data/prostate/A{patient}'
    started = time.time()
    tracemalloc.start()
    # The tree is loaded inside the timed/traced window, so loading is part
    # of the recorded figures.
    tree = mach2.MultiLabeledTree.from_files(f'{prefix}.tree', f'{prefix}.observed.labeling')
    solver = mach2.MACH2(tree, primary_location='prostate', criteria_ordering='UMC')
    gundem_sols[patient] = solver.solve()
    # get_traced_memory() returns a (current, peak) pair of byte counts.
    gundem_memory[patient] = tracemalloc.get_traced_memory()
    gundem_time[patient] = time.time() - started
    tracemalloc.stop()

# Write every patient's solutions into a directory of its own.
# NOTE(review): this cell writes under 'results2/' while the other MACH2 cells
# use 'results/' — confirm that is intended.
for patient, solutions in gundem_sols.items():
    out_dir = f'results2/mach2/prostate/A{patient}'
    os.makedirs(out_dir, exist_ok=True)
    solutions.write(f'{out_dir}/')

# One summary line per patient: id, elapsed seconds, (current, peak) memory.
for patient, elapsed in gundem_time.items():
    print(patient, elapsed, gundem_memory[patient])

## Lung

In [None]:
import mach2
import time
import tracemalloc
import os

# Load the lung tree (with its coloring file) before the timer starts, so only
# the solve step below is timed and memory-traced.
tree = mach2.MultiLabeledTree.from_files('../mach2/data/lung/TRACERxTest.automatic.tree',
                                 '../mach2/data/lung/TRACERxTest.automatic.observed.labeling',
                                 '../mach2/data/lung/TRACERxTest.coloring.txt')
t = time.time()
tracemalloc.start()
sols = mach2.MACH2(tree, primary_location='p.lung', criteria_ordering='UMC').solve()
# Prints the (current, peak) traced-memory tuple followed by elapsed seconds.
print(tracemalloc.get_traced_memory(), time.time() - t)
tracemalloc.stop()

# NOTE(review): the directory name spells "CRUKOO63" with letter O's, whereas
# the Metient cells use "CRUK0063" — left unchanged in case other tooling
# already depends on this path.
out_dir = 'results/mach2/lung_CRUKOO63'
os.makedirs(out_dir, exist_ok=True)
# Trailing separator added so this call matches every other write() call in
# the notebook. NOTE(review): presumably write() treats its argument as a
# path prefix, in which case the slash is needed for the output to land
# inside the directory created above — confirm against the mach2 API.
sols.write(f'{out_dir}/')

## Ovarian

In [None]:
import mach2
import time
import tracemalloc
import os

# Solve ovarian patients with MACH2 once per candidate primary site ('LOv' and
# 'ROv'), keeping solutions, elapsed time and traced memory in dicts keyed by
# patient number.
mc_lov = {}
mc_rov = {}
mcl_time = {}
mcr_time = {}
mcl_mem = {}
mcr_mem = {}
for i in [3]:
    # The LOv run is best-effort: a failure is reported (instead of being
    # silently swallowed) and the ROv run below still happens.
    try:
        tree = mach2.MultiLabeledTree.from_files(f'../mach2/data/ovarian/patient{i}.tree',
                                        f'../mach2/data/ovarian/patient{i}.observed.labeling',
                                        f'../mach2/data/ovarian/coloring.txt' )
        t = time.time()
        tracemalloc.start()
        try:
            mc_lov[i] = mach2.MACH2(tree, primary_location='LOv', criteria_ordering='UMC').solve(threads=15)
            mcl_time[i] = time.time() - t
            mcl_mem[i] = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, so a failed LOv solve cannot leave the
            # tracer running and inflate the ROv memory figures.
            tracemalloc.stop()

        os.makedirs(f'results/mach2/ovarian/LOv/{i}/', exist_ok=True)
        mc_lov[i].write(f'results/mach2/ovarian/LOv/{i}/')
    except Exception as err:
        print(f'LOv run failed for patient {i}: {err!r}')

    # NOTE(review): unlike the LOv run, the timer and tracer start before the
    # tree is loaded here, so loading is included in the ROv figures — kept
    # as in the original; confirm which measurement is intended.
    t = time.time()
    tracemalloc.start()
    try:
        tree = mach2.MultiLabeledTree.from_files(f'../mach2/data/ovarian/patient{i}.tree',
                                                f'../mach2/data/ovarian/patient{i}.observed.labeling',
                                                f'../mach2/data/ovarian/coloring.txt' )
        mc_rov[i] = mach2.MACH2(tree, primary_location='ROv', criteria_ordering='UMC').solve(threads=15)
        mcr_time[i] = time.time() - t
        mcr_mem[i] = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()
    os.makedirs(f'results/mach2/ovarian/ROv/{i}/', exist_ok=True)
    mc_rov[i].write(f'results/mach2/ovarian/ROv/{i}/')

# Per-patient summary: id, elapsed seconds, (current, peak) traced memory.
# The memory entry is indexed by patient rather than printing the whole dict.
for i in mcl_time:
    print(i, mcl_time[i], mcl_mem[i])
for i in mcr_time:
    print(i, mcr_time[i], mcr_mem[i])

In [None]:
import mach2
import time
from collections import defaultdict
from collections import Counter
import glob
import os

# Criteria orderings to run on every simulated tree.
funcs = ['M', 'UM', 'UC', 'CM', 'CU', 'SC', 'SU', 'SM', 'USM', 'USC']
# Three-level nested mapping whose innermost default is a dict; it is not
# filled in by this cell.
sim_res = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

for treefile in glob.glob('../../../../mach2/data/sims/m[8]/*/T_seed*.tree'):
    # Skip any path that mentions 'refined'.
    if 'refined' in treefile:
        continue

    # Path layout: .../<dataset>/<pattern>/T_seed<treeid>.tree
    *_, dataset, pattern, filename = treefile.split('/')
    treeid = filename[6:-5]  # drop the 'T_seed' prefix and '.tree' suffix
    labelfile = treefile[:-4]+'observed.labeling'
    tree = mach2.MultiLabeledTree.from_files(treefile, labelfile, '../../../../mach2/data/sims/coloring.txt')

    for criteria in funcs:
        out_dir = f'{criteria}/{dataset}/{pattern}/{treeid}/'
        os.makedirs(out_dir, exist_ok=True)
        started = time.time()
        solver = mach2.MACH2(tree, primary_location='P', criteria_ordering=criteria)
        sols = solver.solve(threads=15)
        elapsed = time.time() - started
        sols.write(out_dir)
        print(dataset, pattern, treeid, criteria, solver.successfully_run, elapsed)

# Metient

This segment includes code snippets for data preprocessing and running Metient.

## Prostate

In [None]:
import mach2
import pandas as pd
import os

# Convert each prostate patient's tree + labeling into the two files the
# Metient cells read: a long-format TSV (one row per site/cluster pair) and an
# edge list written with integer node indices.
for i in [10, 12, 17, 21, 22, 24, 29, 31, 32, 34]:
    t = mach2.MultiLabeledTree.from_files(f'../mach2/data/prostate/A{i}.tree',
                                     f'../mach2/data/prostate/A{i}.observed.labeling')
    # Lookup: 'cluster.colour' value -> '# subs from WGS data' for this patient.
    subclones = pd.read_csv(f'misc/gundem_subclones/A{i}.csv')
    gd = subclones[['cluster.colour','# subs from WGS data']].set_index('cluster.colour').to_dict()['# subs from WGS data']

    # Positional indices for sites and tree nodes.
    loci = {site: idx for idx, site in enumerate(t.locations)}
    nodei = {node: idx for idx, node in enumerate(t.nodes)}

    table = {'anatomical_site_index': [], 'anatomical_site_label': [],
             'cluster_index': [], 'cluster_label': [],
             'present': [], 'site_category': [], 'num_mutations': []}
    for s, site_idx in loci.items():
        for u, node_idx in nodei.items():
            table['anatomical_site_index'].append(site_idx)
            table['anatomical_site_label'].append(s)
            table['cluster_index'].append(node_idx)
            table['cluster_label'].append(u)
            table['present'].append(1 if (s in t.get_labels(u)) else 0)
            table['site_category'].append('primary' if s=='prostate' else 'metastasis')
            if u != 'fake':
                # Drop a trailing capital letter from the node name, then
                # look the name up with underscores turned into spaces.
                uu = u[:-1] if 'A' <= u[-1] <= 'Z' else u
                table['num_mutations'].append(gd[uu.replace('_', ' ')])
            else:
                # The 'fake' node gets the string '0' as its count.
                table['num_mutations'].append('0')

    os.makedirs(f'metient_formatted_data/prostate', exist_ok=True)
    pd.DataFrame(table).to_csv(f'metient_formatted_data/prostate/A{i}.tsv', sep='\t', index=False)
    with open(f'metient_formatted_data/prostate/A{i}.tree', 'w') as f:
        f.writelines(f'{nodei[u]} {nodei[v]}\n' for u, v in t.edges)

In [None]:
from metient import metient as met
import os
import glob
import time
import pandas as pd

# Run Metient calibration over the whole prostate cohort, using the files
# written to metient_formatted_data/prostate/.
patients = [f'A{i}' for i in [10, 12, 17, 21, 22, 24, 29, 31, 32, 34]]
ref_var_fns = [f'metient_formatted_data/prostate/{pid}.tsv' for pid in patients]
clone_tree_fns = [f'metient_formatted_data/prostate/{pid}.tree' for pid in patients]

# Create the output directory up front, as the lung and simulated Metient
# cells do; exist_ok=True makes this a no-op when it already exists.
output_dir = 'results/metient/prostate/'
os.makedirs(output_dir, exist_ok=True)

t = time.time()
print_config = met.PrintConfig(visualize=False, verbose=False, k_best_trees=8192)
met.calibrate_label_clone_tree(clone_tree_fns, ref_var_fns, print_config, output_dir, patients, solve_polytomies=True, sample_size=8192)
# Elapsed wall-clock seconds for the calibration call (including config setup).
print(time.time() - t)

## Lung

In [None]:
import mach2
import pandas as pd
import os

# Convert the lung tree + labeling into Metient's inputs: a long-format TSV
# (one row per site/cluster pair) and an edge list of integer node indices.
t = mach2.MultiLabeledTree.from_files('../mach2/data/lung/TRACERxTest.automatic.tree',
                             '../mach2/data/lung/TRACERxTest.automatic.observed.labeling',
                             '../mach2/data/lung/TRACERxTest.coloring.txt')
# Number of 'MutationID' entries per 'NewTreeCluster' value.
mapping = pd.read_csv('misc/tracerx_analysis/CRUK0063_mutation_mapping.tsv', sep='\t')
gd = mapping.groupby('NewTreeCluster')['MutationID'].count().to_dict()

# Positional indices for sites and tree nodes.
loci = {site: idx for idx, site in enumerate(t.locations)}
nodei = {node: idx for idx, node in enumerate(t.nodes)}

table = {'anatomical_site_index': [], 'anatomical_site_label': [],
         'cluster_index': [], 'cluster_label': [],
         'present': [], 'site_category': [], 'num_mutations': []}
for s, site_idx in loci.items():
    for u, node_idx in nodei.items():
        table['anatomical_site_index'].append(site_idx)
        table['anatomical_site_label'].append(s)
        table['cluster_index'].append(node_idx)
        table['cluster_label'].append(u)
        table['present'].append(1 if (s in t.get_labels(u)) else 0)
        table['site_category'].append('primary' if s=='p.lung' else 'metastasis')
        # Drop a trailing capital letter from the node name; the remainder is
        # parsed as an integer cluster id. Clusters missing from the mapping
        # get a count of 1.
        uu = u[:-1] if 'A' <= u[-1] <= 'Z' else u
        table['num_mutations'].append(gd.get(int(uu), 1))

os.makedirs('metient_formatted_data/lung', exist_ok=True)
pd.DataFrame(table).to_csv('metient_formatted_data/lung/CRUK0063.tsv', sep='\t', index=False)
with open('metient_formatted_data/lung/CRUK0063.tree', 'w') as f:
    f.writelines(f'{nodei[u]} {nodei[v]}\n' for u, v in t.edges)

In [None]:
from metient import metient as met
from metient.util import data_extraction_util as dutil
import os
import glob
import time
import pandas as pd

# Evaluate the single lung patient with Metient using default-constructed
# weights, and report the elapsed wall-clock time.
weights = met.Weights()

os.makedirs('results/metient/lung/CRUK0063', exist_ok=True)
t = time.time()
ref_var_fn = 'metient_formatted_data/lung/CRUK0063.tsv'
clone_tree_fn = 'metient_formatted_data/lung/CRUK0063.tree'
print_config = met.PrintConfig(visualize=False, verbose=False, k_best_trees=8192)
met.evaluate_label_clone_tree(
    clone_tree_fn,
    ref_var_fn,
    weights,
    print_config,
    'results/metient/lung/CRUK0063',
    'CRUK0063',
    solve_polytomies=True,
)
elapsed = time.time() - t
print(elapsed)

## Ovarian

In [None]:
import mach2
import pandas as pd
import os

# Sample table with the 'normal_blood' rows removed. The .copy() makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the original (avoids pandas' SettingWithCopyWarning).
sample_df = pd.read_csv('misc/mcpherson_cluster_analysis/S1.csv')
sample_df = sample_df[sample_df['sample_id']!='normal_blood'].copy()
# Site label = the 'anatomy' text with digits and the word "Site" removed,
# then stripped of surrounding whitespace.
sample_df['anatomical_site_label'] = sample_df['anatomy'].map(
    lambda anatomy: ''.join([ch for ch in anatomy if not ch.isdigit()]).replace("Site", "").strip()
)

# Table with 'patient_id', 'clone_id', 'pyclone_cluster_id' and 'present'
# columns, queried by get_pyclone_clusters.
pyclone_to_clone_id_df = pd.read_csv('misc/mcpherson_cluster_analysis/S8.csv')

# Per-mutation table; 'mut_label' joins chrom, coord, ref and alt with commas
# to give each mutation a single string key.
pyclone_cluster_df = pd.read_csv('misc/mcpherson_cluster_analysis/S16.csv')
pyclone_cluster_df['mut_label'] = pyclone_cluster_df.apply(lambda row: f"{row['chrom']},{row['coord']},{row['ref']},{row['alt']}", axis=1)

prevalences_df = pd.read_csv('misc/mcpherson_cluster_analysis/S9.csv')

def get_pyclone_clusters(pid, clone_id):
    """Return the set of PyClone cluster ids marked present for one clone.

    Reads the module-level ``pyclone_to_clone_id_df`` table and keeps the rows
    whose ``clone_id`` and ``patient_id`` match the arguments and whose
    ``present`` column equals 1.
    """
    mapping = pyclone_to_clone_id_df
    is_clone = mapping['clone_id'] == clone_id
    is_patient = mapping['patient_id'] == pid
    is_present = mapping['present'] == 1
    selected = mapping[is_clone & is_patient & is_present]
    return set(selected['pyclone_cluster_id'])

def num_muts_in_clusters(pid, pyclone_clusters):
    """Count the distinct mutation labels of a patient within given clusters.

    Reads the module-level ``pyclone_cluster_df`` table, keeps the rows whose
    ``cluster_id`` is in ``pyclone_clusters`` and whose ``patient_id`` equals
    ``pid``, and returns the number of unique ``mut_label`` values among them.
    """
    clusters = pyclone_cluster_df
    in_clusters = clusters['cluster_id'].isin(pyclone_clusters)
    same_patient = clusters['patient_id'] == pid
    labels = clusters.loc[in_clusters & same_patient, 'mut_label']
    return len(set(labels))
                                  
def get_num_mutations(tree, pid, clone_id):
    """Return the mutation count attributed to ``clone_id`` for patient ``pid``.

    For clone 'A' (treated as the root) this is the number of mutations in the
    clone's own PyClone clusters. For any other clone it is the number of
    mutations in the clusters present in exactly one of the clone and its
    parent (``tree.get_parent(clone_id)``), i.e. clusters gained or lost on
    the incoming edge.
    """
    own_clusters = get_pyclone_clusters(pid, clone_id)
    if clone_id != 'A':
        parent_clusters = get_pyclone_clusters(pid, tree.get_parent(clone_id))
        # Symmetric difference: clusters in the parent or the child, not both.
        changed = parent_clusters.symmetric_difference(own_clusters)
        return num_muts_in_clusters(pid, changed)
    return num_muts_in_clusters(pid, own_clusters)

# Convert each ovarian patient's tree + labeling into Metient's inputs: a
# long-format TSV (one row per site/cluster pair) and an edge list of integer
# node indices.
for i in [1, 2, 3, 4, 7, 9, 10]:
    t = mach2.MultiLabeledTree.from_files(f'../mach2/data/ovarian/patient{i}.tree',
                                     f'../mach2/data/ovarian/patient{i}.observed.labeling')

    # Positional indices for sites and tree nodes.
    loci = {site: idx for idx, site in enumerate(t.locations)}
    nodei = {node: idx for idx, node in enumerate(t.nodes)}

    table = {'anatomical_site_index': [], 'anatomical_site_label': [],
             'cluster_index': [], 'cluster_label': [],
             'present': [], 'site_category': [], 'num_mutations': []}
    for s, site_idx in loci.items():
        for u, node_idx in nodei.items():
            table['anatomical_site_index'].append(site_idx)
            table['anatomical_site_label'].append(s)
            table['cluster_index'].append(node_idx)
            table['cluster_label'].append(u)
            table['present'].append(1 if (s in t.get_labels(u)) else 0)
            # Either ovary counts as a primary site.
            table['site_category'].append('primary' if (s=='ROv' or s=='LOv') else 'metastasis')
            table['num_mutations'].append(get_num_mutations(t, i, u))

    os.makedirs(f'metient_formatted_data/ovarian', exist_ok=True)
    pd.DataFrame(table).to_csv(f'metient_formatted_data/ovarian/patient{i}.tsv', sep='\t', index=False)
    with open(f'metient_formatted_data/ovarian/patient{i}.tree', 'w') as f:
        f.writelines(f'{nodei[u]} {nodei[v]}\n' for u, v in t.edges)

In [None]:
from metient import metient as met
import os
import glob
import time
import pandas as pd

# Run Metient calibration over the whole ovarian cohort, using the files
# written to metient_formatted_data/ovarian/.
patients = [f'patient{i}' for i in [1, 2, 3, 4, 7, 9, 10]]
ref_var_fns = [f'metient_formatted_data/ovarian/{pid}.tsv' for pid in patients]
clone_tree_fns = [f'metient_formatted_data/ovarian/{pid}.tree' for pid in patients]

# Create the ovarian output directory up front, as the lung and simulated
# Metient cells do (the previous commented-out line named the prostate
# directory); exist_ok=True makes this a no-op when it already exists.
output_dir = 'results/metient/ovarian/'
os.makedirs(output_dir, exist_ok=True)

t = time.time()
print_config = met.PrintConfig(visualize=False, verbose=False, k_best_trees=8192)
met.calibrate_label_clone_tree(clone_tree_fns, ref_var_fns, print_config, output_dir, patients, solve_polytomies=True, sample_size=8192)
# Elapsed wall-clock seconds for the calibration call (including config setup).
print(time.time() - t)

## Simulated

In [None]:
import mach2
import pandas as pd
import glob
import os

# Convert every simulated tree + labeling into Metient's inputs: a long-format
# TSV (one row per site/cluster pair) and an edge list of integer node indices.
for treefile in glob.glob('../mach2/data/sims/*/*/T_seed*.tree'):
    # Skip any path that mentions 'refined'.
    if 'refined' in treefile:
        continue

    # Path layout: .../<dataset>/<pattern>/T_seed<treeid>.tree
    *_, dataset, pattern, filename = treefile.split('/')
    treeid = filename[6:-5]  # drop the 'T_seed' prefix and '.tree' suffix
    labelfile = treefile[:-4]+'observed.labeling'
    t = mach2.MultiLabeledTree.from_files(treefile, labelfile)

    # Positional indices for sites and tree nodes.
    loci = {site: idx for idx, site in enumerate(t.locations)}
    nodei = {node: idx for idx, node in enumerate(t.nodes)}

    table = {'anatomical_site_index': [], 'anatomical_site_label': [],
             'cluster_index': [], 'cluster_label': [],
             'present': [], 'site_category': [], 'num_mutations': []}
    for s, site_idx in loci.items():
        for u, node_idx in nodei.items():
            table['anatomical_site_index'].append(site_idx)
            table['anatomical_site_label'].append(s)
            table['cluster_index'].append(node_idx)
            table['cluster_label'].append(u)
            # The 'GL' node is additionally marked present at site 'P'.
            table['present'].append(int(s in t.get_labels(u) or (u=='GL' and s=='P')))
            table['site_category'].append('primary' if s=='P' else 'metastasis')
            # Mutation count: 1 for names containing '_', 0 for 'GL',
            # otherwise the number of ';'-separated entries in the name.
            if '_' in u:
                table['num_mutations'].append(1)
            elif u == 'GL':
                table['num_mutations'].append(0)
            else:
                table['num_mutations'].append(len(u.split(';')))

    os.makedirs(f'metient_formatted_data/sims/{dataset}/{pattern}/', exist_ok=True)
    pd.DataFrame(table).to_csv(f'metient_formatted_data/sims/{dataset}/{pattern}/T_seed{treeid}.tsv', sep='\t', index=False)
    with open(f'metient_formatted_data/sims/{dataset}/{pattern}/T_seed{treeid}.tree', 'w') as f:
        f.writelines(f'{nodei[u]} {nodei[v]}\n' for u, v in t.edges)

In [None]:
from metient import metient as met
import os
import glob
import time
import pandas as pd

# Cohort definitions: each entry is unpacked below as a
# (dataset, pattern, treeid) triple.
# NOTE(review): both lists are empty as written — presumably they are meant to
# be filled in before running; confirm.
p2m = []
m2m = []

# Calibrate Metient separately on each cohort, timing each run.
for cohort, cohortname in ((p2m, 'primary_only'), (m2m, 'met_to_met')):
    ref_var_fns = []
    clone_tree_fns = []
    patients = []
    for (dataset, pattern, treeid) in cohort:
        ref_var_fns.append(f'metient_formatted_data/sims/{dataset}/{pattern}/T_seed{treeid}.tsv')
        clone_tree_fns.append(f'metient_formatted_data/sims/{dataset}/{pattern}/T_seed{treeid}.tree')
        patients.append(f'{dataset}_{pattern}_{treeid}')

    os.makedirs(f'results/metient/sims/{cohortname}', exist_ok=True)
    t = time.time()
    print_config = met.PrintConfig(visualize=False, verbose=False, k_best_trees=1500)
    met.calibrate_label_clone_tree(clone_tree_fns, ref_var_fns, print_config,
                                   f'results/metient/sims/{cohortname}/', patients, solve_polytomies=False)
    print(time.time() - t)