# GLMNET

## set working directory
Warning: only run the cell below once per kernel session

In [None]:
import os
from pathlib import Path

# Resolve the project root (the parent of the directory the kernel started in)
# only once per kernel session. Re-running this cell re-uses the stored path,
# so the working directory no longer climbs one level higher on every run.
if "project_root" not in globals():
    project_root = Path.cwd().parent
os.chdir(project_root)
print(os.getcwd())

## loading data

In [None]:
from data import Data
import tomllib

# Parse the TOML configuration (tomllib needs a binary file handle) and pass
# the parsed result to the project's Data class.
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

data = Data(config)

## Data preparation

In [None]:
# The new mixing matrix uses a different sample separator; switch to ".".
data.replace_sample_sep(".")

# Keep only the tumor types that have more than this many samples
# (the exact threshold semantics are defined by Data.filter_tt).
min_samples_per_tumor_type = 100
data.filter_tt(min_samples_per_tumor_type)

# Report how many distinct tumor types remain after filtering.
unique_tt = data.tumor_types["response"].unique()
print("Number of tumor types:", len(unique_tt))

In [None]:
# Fetch the current state of the data with a tumor types column.
# Later cells read the labels from its "response" column.
dataset = data.get_mm_with_tt()

## Optional: subset

Choose one of the cells below, or make your own.

In [None]:
# Generic subset: does not guarantee an equal number of samples per label.
subset = data.subset(
    dataset,
    n_rows=8510,
    n_cols=9709,
    n_labels=12,
)

In [None]:
# Balanced subset over all labels: guarantees an equal number of samples per label.
subset = data.subset_nrows_per_label(
    dataset,
    nrows_per_label=32,
    ncols=9709,
    nlabels=len(unique_tt),
)

In [None]:
# Balanced subset over a fixed number of labels: guarantees an equal number
# of samples per label, restricted to `nlabels` labels.
subset = data.subset_nrows_per_label(
    dataset,
    nrows_per_label=30,
    ncols=9709,
    nlabels=22,
)

In [None]:
# Continue the notebook with the subset in place of the full dataset.
# Skip this cell when the subset should serve as training data and the
# remaining rows as test data (the split cell that uses `subset.index`).
dataset = subset

## Optional: aggregate labels
The cell below is only necessary when you want to aggregate some of the labels into one label.

In [None]:
# Aggregate all labels from the 12th onward (in `unique_tt` order) into a
# single "other" label; the first 11 labels are left unchanged.
labels_to_replace = unique_tt[11:]
response_column = dataset["response"]
dataset["response"] = response_column.replace(labels_to_replace, "other")

## Split train, test, val
Split the data into train, test, and validation sets.
First print the dataset's size, then choose one of the two splitting cells that follow.

In [None]:
# Show the shape of the dataset that the split cells below operate on.
print("size of dataset:", dataset.shape)

In [None]:
# Fractional split of `dataset` into train, test, and validation sets.
split_fractions = {"train_size": 0.7, "test_size": 0.2, "val_size": 0.1}
train, test, val = data.get_train_test_val(data=dataset, **split_fractions)

In [None]:
# Train on the subset and test on the remaining rows.
# WARNING: do not run the `dataset = subset` cell in the `subset` section if you want to use this cell
subset_labels = subset["response"].unique()

# Keep only the rows whose label also occurs in the subset.
has_subset_label = dataset["response"].isin(subset_labels)
filtered_dataset = dataset[has_subset_label]

# Rows whose index appears in the subset form the training set; the rest is test.
in_subset = filtered_dataset.index.isin(subset.index)
train = filtered_dataset[in_subset]
test = filtered_dataset[~in_subset]

In [None]:
# Check the shapes and distributions of the train, test, and validation sets.
# The validation lines stay commented out because only the fractional split
# cell defines `val`; the subset-as-train cell produces train and test only.
print("size of train:", train.shape)
print("size of test:", test.shape)
# print("size of val:", val.shape)

# Create the project's GLM helper from the Data object; the cells below use it
# for fitting, predicting, and assessing.
from ml.glm import GLM
glm = GLM(data)
# _, _ = glm.plot_label_distribution(train, test, val)

## Split response from predictors

In [None]:
# Separate each split into predictors (x) and response (y).
xtrain, ytrain = data.split_xy(train)
xtest, ytest = data.split_xy(test)
# Enable the line below only when the fractional split cell, which also
# yields `val`, was used.
# xval, yval = data.split_xy(val)

## Run GLMNET

In [None]:
from datetime import datetime

# Record wall-clock timestamps around the fit to report its duration.
fit_started = datetime.now()
print(fit_started)

# NOTE(review): alpha=0 presumably selects the pure ridge penalty, as in
# glmnet's elastic-net mixing parameter — confirm in GLM.fit.
glm.fit(xtrain, ytrain, alpha=0)

fit_finished = datetime.now()
print(fit_finished)
print("duration:", fit_finished - fit_started)

# Plot and save through the GLM helper.
glm.plot()
glm.save()

## Predict

In [None]:
import numpy as np

# Class predictions for the test predictors.
ypredict = glm.predict(newx=xtest, type="class")

# The "response" prediction comes back as a 3D array whose 3rd dimension is 1;
# drop that axis to get the 2D array that the assessments expect.
ypredict_probs = np.squeeze(
    glm.predict(newx=xtest, type="response"),
    axis=-1,
)

## Evaluate

In [None]:
# Assess the predictions against the test response, passing both the
# predicted classes and the squeezed 2D "response" predictions.
glm.assess(ytest, ypredict, ypredict_probs)