# Cforest

## Set working directory
Warning: only run the cell below once per kernel session
Warning: the cell below sets the working directory to the project root and should therefore be run only once per kernel session.

In [None]:
import os
from pathlib import Path

# Move to the project root so that relative paths such as "config.toml" resolve.
# The notebook is assumed to sit one directory below the root. If "config.toml"
# is already present in the current directory, this cell has run before, so the
# working directory is kept instead of climbing one more level on each rerun.
current_dir = Path.cwd()
already_at_root = (current_dir / "config.toml").is_file()
project_root = current_dir if already_at_root else current_dir.parent
os.chdir(project_root)
print(os.getcwd())

## Load the data

In [None]:
from src.data import Data
import tomllib

# Parse the project configuration; tomllib reads from a binary file handle.
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

# Build the data object from the parsed configuration.
data = Data(config)

# The new mixing matrix uses a different sample separator, so switch to ".".
data.replace_sample_sep(".")

In [None]:
# Start from the current state of the mixing matrix, including its tumor types column.
dataset = data.get_mm_with_tt()

# Subset arguments: every row, one tenth of the columns (truncated to an int),
# and the number of distinct values in the tumor types' "response" column.
row_count = dataset.shape[0]
col_count = int(dataset.shape[1] * 0.1)
label_count = len(data.tumor_types["response"].unique())

dataset = data.subset(
    dataset,
    n_rows=row_count,
    n_cols=col_count,
    n_labels=label_count,
)

## Split data into train, test, and validation sets

In [None]:
# Fractions of the dataset assigned to the train, test, and validation sets.
split_fractions = {
    "train_size": 0.7,
    "test_size": 0.15,
    "val_size": 0.15,
}

# Split the subsetted dataset according to the fractions above.
train, test, val = data.get_train_test_val(data=dataset, **split_fractions)

## Build the cforest model

The plot, save, and assess methods will save files to the output directory specified in the `config.toml` file.
Each model type (e.g., glm, ctree, cforest) will have its own subdirectory in the output directory.
Each model fit will have its own subdirectory in the model type directory consisting of the date, time, and a unique identifier.

In [None]:
from src.ml.cforest import Cforest
# Create the cforest model wrapper around the loaded data.
cforest = Cforest(data)

# (minsplit, minbucket) pairs to fit, in order. The first three vary minsplit
# with minbucket held at 7; the last two vary minbucket with minsplit held at 20.
control_grid = [(10, 7), (20, 7), (30, 7), (20, 3), (20, 14)]

# Every control shares the same test type and alpha; only the grid values differ.
controls = [
    cforest.CtreeControl(
        testtype="Univariate",
        alpha=0.05,
        minsplit=minsplit,
        minbucket=minbucket,
    )
    for minsplit, minbucket in control_grid
]

# The core count comes from the configuration and is the same for every fit.
n_cores = config["execution"]["cores"]

for ctrl in controls:
    cforest.fit(train=train, ctree_control=ctrl, ntree=10, cores=n_cores)

    # Save each fit right away: the next iteration refits the same `cforest` object.
    cforest.save()

    # Assess on the train split first, then on the validation split.
    for split_name, split in (("train", train), ("val", val)):
        # Drop the label column once and reuse the features for both predictions.
        newx = split.drop(columns=["response"])
        ypredict_probs = cforest.predict(newx=newx, type="prob")
        ypredict = cforest.predict(newx=newx, type="response")
        cforest.assess(
            ytrue=split["response"],
            ypredict=ypredict,
            ypredict_probs=ypredict_probs,
            name=f"clustermap_{split_name}",
        )