# GLMNET

## set working directory
Warning: only run the cell below once per kernel session

In [None]:
import os
from pathlib import Path

# Move the kernel's working directory one level up, to the parent of the
# directory the notebook starts in; later cells open files by relative path.
# The target is computed only on the first run: re-running this cell reuses
# the stored value instead of climbing one more directory level each time.
if "project_root" not in globals():
    project_root = Path.cwd().parent
os.chdir(project_root)
print(os.getcwd())

## loading data

In [None]:
from data import Data
import tomllib

# Parse the TOML configuration; tomllib.load() needs a binary file handle.
with open("config.toml", "rb") as config_handle:
    config = tomllib.load(config_handle)

# Build the project's data container from the parsed configuration.
data = Data(config)

# Post-processing steps.
# NOTE(review): the exact semantics live in data.Data — confirm there:
#   - replace_sample_sep("."): presumably normalises the separator in sample names
#   - filter_tt(100): presumably drops tumor types with fewer than 100 samples
data.replace_sample_sep(".")
data.filter_tt(100)

# Distinct values of the "response" column of data.tumor_types.
unique_tt = data.tumor_types["response"].unique()
n_tumor_types = len(unique_tt)
print("Number of tumor types:", n_tumor_types)

## Setup for GLMNET

Note: the subset may contain a different amount of data than requested in `data.get_subset()` if you request more data than the selected number of labels can provide.

In [None]:
# Fetch the table that the following cells subset, split and feed to GLMNET.
# NOTE(review): presumably the measurement matrix joined with the tumor-type
# labels (a "response" column is used further down) — confirm in data.Data.
dataset = data.get_mm_with_tt()

In [None]:
# Optional: work on a subset of the data. Enable exactly one of the variants.
#
# Variant A - does not guarantee an equal number of samples per label:
# dataset = data.subset(dataset, n_rows=250, n_cols=9709, n_labels=15)
#
# Variant B - guarantees an equal number of samples per label, uses all labels:
# dataset = data.subset_nrows_per_label(dataset, nrows_per_label=100, ncols=9709, nlabels=len(unique_tt))
#
# Variant C (active) - guarantees an equal number of samples per label, uses 12 labels:
dataset = data.subset_nrows_per_label(
    dataset,
    nrows_per_label=100,
    ncols=9709,
    nlabels=12,
)

The cell below is only necessary when you want to aggregate some of the labels into one label.

In [None]:
# Collapse every label from the 12th onward (index 11 and up in unique_tt)
# into the single label "other".
labels_to_replace = unique_tt[11:]
response_column = dataset["response"]
dataset["response"] = response_column.replace(labels_to_replace, "other")

In [None]:
from ml.glm import GLM
# Model wrapper, constructed from the shared data container.
glm = GLM(data)

print("size of dataset:", dataset.shape)

# Fractions of the dataset requested for each partition (70% / 15% / 15%).
split_fractions = {"train_size": 0.7, "test_size": 0.15, "val_size": 0.15}
train, test, val = data.get_train_test_val(data=dataset, **split_fractions)

print("size of train:", train.shape)
print("size of test:", test.shape)
print("size of val:", val.shape)

# Both return values are discarded; only the plot itself is of interest here.
# NOTE(review): presumably a (figure, axes) pair — confirm in ml.glm.
_, _ = glm.plot_label_distribution(train, test, val)


## Split response from predictors

In [None]:
# Separate the predictors (x) from the response (y) for each partition.
# The generator is consumed left to right, so split_xy runs on train, then
# test, then val.
(xtrain, ytrain), (xtest, ytest), (xval, yval) = (
    data.split_xy(partition) for partition in (train, test, val)
)

## Run GLMNET

In [None]:
from datetime import datetime

# Timestamp taken right before fitting.
start = datetime.now()
print(start)

# Fit the model on the training partition.
# NOTE(review): alpha and maxit are presumably forwarded to glmnet, where
# alpha=0 selects the ridge penalty and maxit caps the number of iterations —
# confirm against ml.glm.
fit_options = {"alpha": 0, "maxit": 1e6}
fit = glm.fit(xtrain, ytrain, **fit_options)

# Timestamp taken right after fitting, followed by the elapsed time.
end = datetime.now()
print(end)
elapsed = end - start
print("duration:", elapsed)

## Predict

In [None]:
import numpy as np

# Predictions for the test partition requested with type="class"
# (presumably the predicted labels — confirm in ml.glm).
ypredict = glm.predict(fit, newx=xtest, type="class")

# Predictions requested with type="response" (presumably per-class
# probabilities). np.squeeze(..., axis=-1) removes the trailing axis and
# raises if that axis does not have length 1.
raw_probs = glm.predict(fit, newx=xtest, type="response")
ypredict_probs = np.squeeze(raw_probs, axis=-1)

## Evaluate

In [None]:
# Evaluate the test-set predictions against the true test labels.
# NOTE(review): the two calls take their arguments in different orders
# (true labels first in assess, probabilities first in assess_cm) — confirm
# that both match the signatures in ml.glm.
# NOTE(review): assess_cm presumably builds a confusion matrix — confirm.
glm.assess(ytest, ypredict, ypredict_probs)
glm.assess_cm(ypredict_probs, ytest)