# Project Notebook

Dear User,

This Jupyter notebook contains the code and the AI model for our challenge "Rare Diseases", set by the LMU children's hospital.
Here, our various functions come together to form a data pipeline and a readable, modular model architecture.
A few medical terms are used throughout, and understanding them is necessary:

- `HPO Features`: Human Phenotype Ontology Features provide a standardized vocabulary of phenotypic abnormalities encountered in human disease.
- `ICD-9`: International Classification of Diseases, Ninth Revision – an international standard for the classification of diseases.

Most of the parameters used to tune the training are found in the [Training Parameters](#training-parameters) section.


## Module Import

In [None]:
import nn_data
import torch
import torch.nn as nn
import numpy as np
import nn_files

## Dataset loading

In [None]:

# Labevents_HPO are the HPO features which were measured during an examination of a patient
# Diagnoses_HPO are the HPO features that were diagnosed for a patient
# NOTE(review): the original comment broke off after "diagnosed based"; judging by the file
# name the diagnoses are presumably derived from ICD codes - confirm
HPO_PATH = 'data/hp.obo'
LABEVENTS_HPO_PATH = 'data/OUT_LABEVENTS_HPO.csv'
DIAGNOSES_HPO_PATH = 'data/DIAGNOSE_ICD_hpo.csv'


Load the data (takes about 4 seconds)

If the data format differs, use the `labevents_hpo_column_name` and `diagnoses_hpo_column_name` parameters.

In [None]:
# Loads the HPO data, the labevents and the diagnoses from the three paths configured above
# and groups them by subject ID.
# NOTE(review): the grouping is done inside nn_data.LoadedData, which is not visible here.
data = nn_data.LoadedData(HPO_PATH, LABEVENTS_HPO_PATH, DIAGNOSES_HPO_PATH)


## Training Parameters

### Parameters for Dataset Loading

In [None]:
# If True, the targets are built with `nn_data.ICDDatasetCreator`;
# otherwise they are built from the 'diagnoses' HPO features.
use_ICD: bool = False
# Forwarded as `enable_parent_nodes` to the dataset creator of the input ('labevents') data.
enable_parent_nodes_hpo_input: bool = False
# Forwarded as `enable_parent_nodes` to the dataset creator of the target ('diagnoses') data.
enable_parent_nodes_hpo_target: bool = False # only used if `use_ICD == False`


### Parameters for the Autoencoder

In [None]:
# If True, an autoencoder is built and trained first and its latent space
# becomes the input of the main network.
use_autoencoder: bool = True

# set the parameters for the training of the autoencoder
batch_size_AE: int = 8
num_epochs_AE: int = 60
learning_rate_AE: float = 1e-2
beta_AE: tuple[float, float] = (0.9, 0.999)  # passed as `betas` to the Adam optimizer

# layer sizes for the autoencoder, given as fractions of the input size
reduction_factor_hidden: float = 0.7
reduction_factor_latent: float = 0.5

# best loss function determined experimentally
loss_func_AE: nn.Module = nn.MSELoss()


### Parameters for Main Neural Network

In [None]:
# Hyperparameters of the main neural network.
# The names are declared once here; the values are assigned per mode below.
# Both modes currently use identical values - presumably the branches are kept
# apart so that each mode can be tuned independently.
batch_size: int
num_epochs: int
learning_rate: float
betas: tuple[float, float]
enlarging_factor_NN: float
dropOutRatio: float
loss_func_NN: nn.Module

if use_autoencoder:
    # main network fed by the latent space of the autoencoder
    batch_size = 8
    num_epochs = 20
    learning_rate = 1e-4
    betas = (0.9, 0.999)
    enlarging_factor_NN = 1.4
    dropOutRatio = 0.0

    # best loss function determined experimentally
    loss_func_NN = nn.CrossEntropyLoss()
else:
    # main network fed by the raw input features
    batch_size = 8
    num_epochs = 20
    learning_rate = 1e-4
    betas = (0.9, 0.999)
    enlarging_factor_NN = 1.4
    dropOutRatio = 0.0

    # best loss function determined experimentally
    loss_func_NN = nn.CrossEntropyLoss()


## Model Creation

### Dataset Creation

In [None]:
# The loaded data is split into an input part and a target part.
# Input: the 'labevents' HPO features.
input_data_creator = nn_data.HPODatasetCreator(
    data,
    'labevents',
    enable_parent_nodes=enable_parent_nodes_hpo_input,
)
# Target: `nn_data.ICDDatasetCreator` when `use_ICD` is set,
# otherwise the 'diagnoses' HPO features.
target_data_creator = (
    nn_data.ICDDatasetCreator(data, batch=True)
    if use_ICD
    else nn_data.HPODatasetCreator(
        data,
        'diagnoses',
        enable_parent_nodes=enable_parent_nodes_hpo_target,
    )
)


In [None]:
# fetch the data of the input and target creators as nested lists
input_data: list[list[int]] = input_data_creator.data()
target_data: list[list[int]] = target_data_creator.data()

# PyTorch works on tensors, so both nested lists are converted to 32-bit float tensors
input_tensor = torch.tensor(input_data, dtype=torch.float32)
target_tensor = torch.tensor(target_data, dtype=torch.float32)


### Accuracy Functions

In [None]:
def calc_accuracy(output, target) -> float:
    """
    Return the mean per-row share of active target features hit by the output.

    For every row, the element-wise product of `target` and `output` is summed
    and divided by the row sum of `target`; the mean of these ratios over all
    rows is returned.
    """
    # small offset so that rows without any active target feature do not divide by zero
    epsilon = .00001
    hits_per_row = (target * output).sum(axis=1)
    active_per_row = target.sum(axis=1)
    return np.mean(hits_per_row / (active_per_row + epsilon))


In [None]:
def print_real_effect(outputs, targets, threshold: float = 0.5):
    """
    Print the correct diagnoses, false positives and false negatives of the first sample.

    Only row 0 of `outputs` and `targets` is evaluated. A feature counts as
    predicted when its output is at least `threshold`, and a target feature
    counts as active when it equals 1. Correctly neglected inactive features
    are not counted.

    outputs: 2-D structure of predicted scores; row 0 is evaluated
    targets: 2-D structure of ground-truth values (0 or 1); row 0 is evaluated
    threshold: minimal score at which a feature is treated as predicted
    """
    correct_diagnosed = 0
    false_positive = 0
    false_negative = 0
    total_to_diagnose = sum(targets[0])

    for score, actual in zip(outputs[0], targets[0]):
        # one shared decision rule: previously a score of exactly 0.5 counted as
        # predicted for active features but was ignored for inactive ones
        predicted = score >= threshold
        if actual == 1 and predicted:
            correct_diagnosed += 1
        elif actual == 1 and score < threshold:
            false_negative += 1
        elif actual == 0 and predicted:
            false_positive += 1

    print(f'Correct diagnoses:{correct_diagnosed}/{total_to_diagnose}')
    print(f'False positives:{false_positive}')
    print(f'False negatives:{false_negative}')


### Model generation

In [None]:
# the networks are trained on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


### Model Architectures

In [None]:
if use_autoencoder:
    # Dimensions of the autoencoder: the hidden and latent layers are
    # fractions of the input size (rounded down by int())
    input_size_AE = len(input_data[0])
    hidden_size_AE = int(input_size_AE*reduction_factor_hidden)
    latent_size_AE = int(input_size_AE*reduction_factor_latent)

    # Build the autoencoder with these dimensions and move it to the training device
    AE = nn_files.Autoencoder(input_size_AE, hidden_size_AE, latent_size_AE)
    AE.to(device)

    # Latent space of autoencoder is used as input for the FCN
    input_size_NN = latent_size_AE
else:
    # No autoencoder: the FCN receives the input features directly
    AE = None

    input_size_NN = len(input_data[0])

output_size_NN = len(target_data[0])
# the hidden layer is sized from the larger of input and output size, scaled by enlarging_factor_NN
hidden_size_NN = int(max(input_size_NN, output_size_NN) * enlarging_factor_NN)

# Build the main model; AE is None when no autoencoder is used
# NOTE(review): how FCNModel uses AE and enlarging_factor_NN is defined in nn_files - not visible here
model = nn_files.FCNModel(input_size_NN, hidden_size_NN,
                          output_size_NN, enlarging_factor_NN, AE, dropOutRatio=dropOutRatio)

_ = model.to(device) # `_ =` is used to suppress jupyter output


## Train Autoencoder (if enabled)

### Training and Validation

In [None]:
if use_autoencoder:
    assert AE is not None  # the linter is not smart enough to infer that
    # choose the optimizer, default: Adam
    optimizer_AE = torch.optim.Adam(
        AE.parameters(),
        lr=learning_rate_AE,
        betas=beta_AE,
    )

    # create a separate dataset for the autoencoder: its output is not compared
    # to the original target but to the input itself
    dataset_AE = torch.utils.data.TensorDataset(input_tensor, input_tensor)
    # split the dataset in 70% training data, 20% validation data, 10% test data
    # NOTE(review): the ratios are set inside nn_files.split_dataset - confirm there
    dataset_AE_split = nn_files.split_dataset(batch_size_AE, dataset_AE)

    # train the autoencoder with the general training function, which is also
    # used for the main model
    nn_files.training(
        AE, device, dataset_AE_split,
        optimizer=optimizer_AE, loss_func=loss_func_AE,  num_epochs=num_epochs_AE,
    )
else:
    # no autoencoder, hence no autoencoder dataset; the name is still defined for later cells
    dataset_AE_split = None


### Test

In [None]:
if use_autoencoder:
    assert AE is not None and dataset_AE_split is not None
    # the linter is not smart enough to infer that

    # show the output of the autoencoder to visualize its ability to decode and
    # reconstruct multi-hot encoded vectors
    # in the shown output an X marks an active feature and a blank marks a zero
    nn_files.test(
        AE, device, dataset_AE_split, data_creator=input_data_creator,
        plot_outputs=True,
    )


## Train Main Neural Network

In [None]:
# choose the optimizer for the main model, default: Adam
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    betas=betas,
)


In [None]:
# create a dataset that pairs each input tensor with its target (ground truth) tensor
dataset = torch.utils.data.TensorDataset(input_tensor, target_tensor)
# split the dataset in 70% training data, 20% validation data, 10% test data
# NOTE(review): the ratios are set inside nn_files.split_dataset - confirm there
dataset_split = nn_files.split_dataset(batch_size, dataset)


In [None]:
# train the main model; calc_accuracy is passed so that the accuracy is reported as well
# NOTE(review): the original comment mentions `log_rhythm` (after how many iterations
# within one epoch the loss and, if applicable, the accuracy are printed); it is not
# passed here, so presumably the default of nn_files.training applies - confirm
nn_files.training(
    model, device, dataset_split,
    num_epochs=num_epochs,
    optimizer=optimizer, loss_func=loss_func_NN,
    calc_accuracy=calc_accuracy,
)


### Testing

In [None]:
# Test the main model and print the real effect of the prediction in a
# human-readable form: correct diagnoses, false positives and false negatives
# (see print_real_effect); the accuracy is computed with calc_accuracy
nn_files.test(
    model, device, dataset_split,
    target_data_creator,
    print_real_effect=print_real_effect, calc_accuracy=calc_accuracy,
    plot_outputs=True, plot_decide=True,
)
