# Trees

## Set working directory
Warning: only run the cell below once per kernel session

In [1]:
import os
from pathlib import Path

# Move the kernel's working directory one level up; later cells open
# "config.toml" through a relative path.
# `project_root` only exists after the first execution in a kernel session,
# so the guard turns re-runs of this cell into a no-op instead of climbing
# one more directory level on every execution.
if "project_root" not in globals():
    project_root = Path.cwd().parent
    os.chdir(project_root)
print(os.getcwd())

/Users/denniswiersma/Library/Mobile Documents/com~apple~CloudDocs/Documents/School/Bioinformatica/internship


## Loading data

In [2]:
import tomllib

from data import Data

# Parse the project configuration; tomllib only accepts files opened in binary mode.
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

# Build the project's data object from the parsed configuration.
data = Data(config)

# The new mixing matrix uses a different sample separator, so switch to ".".
data.replace_sample_sep(".")

Reading data...


## Setup for TreeBuilder

In [3]:
# Fetch the current state of the mixing matrix together with a tumor types column.
# NOTE(review): later cells read a "response" column from splits of this frame;
# presumably that is the tumor type column added here — confirm in `Data`.
dataset = data.get_mm_with_tt()

In [4]:
# Fraction of the dataset assigned to each split; forwarded by keyword below.
split_fractions = {"train_size": 0.05, "test_size": 0.05, "val_size": 0.9}

# Partition the dataset into train, test, and validation sets.
# NOTE(review): no seed is passed here — confirm whether the split is reproducible.
train, test, val = data.get_train_test_val(data=dataset, **split_fractions)

In [5]:
# Report the shape of each split as a quick sanity check.
for split_name, split in (("train", train), ("test", test), ("val", val)):
    print(f"size of {split_name}:", split.shape)

# Instantiate the project's Ctree with the loaded data object; the cells
# below use it to fit, plot, save, predict, and assess.
from ml.ctree import Ctree

ctree = Ctree(data)

size of train: (443, 9710)
size of test: (443, 9710)
size of val: (7976, 9710)


## ctree: single tree

Plots and models are saved to a run-specific directory under `output/ctree/`; the exact paths are printed in the output of the cell below.

In [6]:
# Control settings for this tree: Bonferroni test type with alpha set to 0.01.
ctrl = ctree.CtreeControl(testtype="Bonferroni", alpha=0.01)

# Fit on the training split using the control settings above.
ctree.fit(train=train, ctree_control=ctrl)

# Write the plot and the fitted model to disk.
ctree.plot()
ctree.save()

building tree...
Plot saved to output/ctree/20231111170125_b1542f/ctree-testtype=Bonferroni-alpha=0_01-maxdepth=inf-minsplit=20-minbucket=7.png
Model saved to output/ctree/20231111170125_b1542f/ctree-testtype=Bonferroni-alpha=0_01-maxdepth=inf-minsplit=20-minbucket=7.pkl


In [7]:
# Predictor columns only: the test split without its "response" column.
test_features = test.drop(columns=["response"])

# Request predictions for the same rows twice: once as type "response"
# and once as type "prob".
ypredict = ctree.predict(newx=test_features, type="response")
ypredict_probs = ctree.predict(newx=test_features, type="prob")

# Compare the predictions against the true "response" values.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: 'NULLType' object is not iterable" after AUC-ROC and MCC were
# printed, so the failure is presumably inside `assess` — TODO confirm and fix there.
ctree.assess(
    ytrue=test["response"],
    ypredict=ypredict,
    ypredict_probs=ypredict_probs,
)

AUC-ROC: (0.4996102063701396,)
MCC: -0.010644916634548215
<rpy2.rinterface_lib.sexp.NULLType object at 0x117594110> [0]


TypeError: 'NULLType' object is not iterable


## ctree: multiprocessed

Note: this might take a while depending on the number of parameters and the size of the dataset.

All plots are saved in `ml/[predictor]/`, or, e.g., `ml/[predictor1]_[predictor3]/` when multiple predictors are defined.

### Prepare parameters

A tree will be built for each combination of parameters. The parameters are defined below.

In [None]:
import itertools

# rpy2's robjects is needed to build the R character vector used for the
# combined test type below.
import rpy2.robjects as robjects

# define parameters to be passed to ctree
teststats: list[str] = ["quad", "max"]
# Each entry is either a single test type name or an R character vector
# combining several of them.
testtypes: list[str | list] = [
    "Teststatistic",
    "Univariate",
    "Bonferroni",
    "MonteCarlo",
    robjects.r.c("MonteCarlo", "Bonferroni"),
]
splitstats: list[str] = ["quad", "max"]
# splittests: list[bool] = [True, False]
alphas: list[float] = [0.1, 0.05, 0.01]
# Each inner list names the predictor column(s) used for one tree.
predictors: list[list[str]] = [
    ["consensus independent component 1"],
    ["consensus independent component 2"],
    ["consensus independent component 3"],
]

# Make all possible combinations of the parameters above. The product is
# materialized as a list because a bare itertools.product iterator is
# exhausted after one pass, which would leave a re-run of the pool cell
# with no work to do.
# NOTE(review): the tuple order (teststat, testtype, splitstat, alpha,
# predictors) must match the positional parameters of the function the
# combinations are passed to — confirm against its signature.
arg_combos = list(
    itertools.product(teststats, testtypes, splitstats, alphas, predictors)
)


In [None]:
import multiprocessing as mp
from datetime import datetime

# Record the wall-clock start so the total duration can be reported at the end.
t0 = datetime.now()
print(f"starting time: {t0.time()}")

# Number of worker processes comes from the [execution] section of the config.
n_workers = config["execution"]["cores"]

# Build one tree per parameter combination, spread over the worker pool.
# NOTE(review): `treebuilder` is not defined in any cell shown in this
# notebook (the earlier cells create `ctree`) — confirm which object
# provides `build_ctree` before running this cell.
with mp.Pool(n_workers) as pool:
    pool.starmap(treebuilder.build_ctree, arg_combos)

elapsed = datetime.now() - t0
print("Time taken:", elapsed)