# GLMNET

## set working directory
Warning: run the cell below only once per kernel session, because it changes the working directory to the parent of the current directory.

In [None]:
import os
from pathlib import Path

# Move the working directory to the parent of the current directory so that
# relative paths such as "config.toml" resolve from there.
# Guarded on `project_root` so that re-running this cell in the same kernel
# session is a no-op instead of climbing one more directory level per run.
if "project_root" not in globals():
    project_root = Path.cwd().parent
    os.chdir(project_root)
print(os.getcwd())

## loading data

In [None]:
from data import Data
import tomllib

# Parse the project configuration; tomllib.load() needs a binary file handle.
with open("config.toml", mode="rb") as config_file:
    config = tomllib.load(config_file)

# Build the project's Data object from the parsed configuration.
data = Data(config)

## Setup for GLMNET

Note: the size of the subset may differ from the size requested in `data.get_subset()` if you request more data than the given number of labels can provide.

In [None]:
# Load the dataset that the rest of this notebook works on.
# NOTE(review): "mm_with_tt" is not explained in this notebook — confirm what it contains in data.py.
dataset = data.get_mm_with_tt()

In [None]:
# Optional: replace the dataset with a subset of it
# (skip this cell to keep the dataset as loaded).
dataset = data.get_subset(
    dataset,
    n_rows=1500,
    n_cols=20,
    n_labels=5,
)

In [None]:
from ml.glm import GLM
# Create the GLM helper used below for plotting, fitting, predicting and assessing.
glm = GLM(data)

print("size of dataset:", dataset.shape)

# Fractions for the train / test / validation partition (70% / 15% / 15%).
split_fractions = {"train_size": 0.7, "test_size": 0.15, "val_size": 0.15}
train, test, val = data.get_train_test_val(data=dataset, **split_fractions)

# Report the shape of each partition.
for label, split in (
    ("size of train:", train),
    ("size of test:", test),
    ("size of val:", val),
):
    print(label, split.shape)

glm.plot_label_distribution(train, test, val)


## Split response from predictors

In [None]:
# Split each partition into its x and y parts, in train / test / val order.
(xtrain, ytrain), (xtest, ytest), (xval, yval) = map(
    data.split_xy, (train, test, val)
)

## Run GLMNET

In [None]:
# Fit the model on the training split; `fit` is what glm.predict() receives below.
fit = glm.fit_glm(xtrain, ytrain)

## Predict

In [None]:
import numpy as np

# Predictions for the test predictors, requested with type="class".
ypredict = glm.predict(fit, newx=xtest, type="class")

# Predictions requested with type="response", with the last axis removed.
# np.squeeze raises ValueError if that axis does not have length 1.
# NOTE(review): presumably these are class probabilities with a trailing
# singleton axis — confirm against GLM.predict.
ypredict_probs = np.squeeze(
    glm.predict(fit, newx=xtest, type="response"),
    axis=-1,
)

## Evaluate

In [None]:
# Assess the test-set predictions against the true test responses, passing both
# the type="class" and the squeezed type="response" predictions.
# NOTE(review): what assess() computes or displays is defined in ml.glm — confirm there.
glm.assess(ytest, ypredict, ypredict_probs)