# Project Notebook

Dear User,

This Jupyter notebook contains the code and AI model for our challenge "Rare Diseases" by the LMU children's hospital.
Here, our various functions come together to create a data pipeline and a readable, modular model architecture.
A few medical terms are used whose understanding is necessary; they are the following:

- `HPO Features`: Human Phenotype Ontology features provide a standardized vocabulary of phenotypic abnormalities encountered in human disease.
- `ICD-9`: International Classification of Diseases, 9th revision, an international standard for the classification of diseases.

## Module Import

In [None]:
import nn_data
import torch
import torch.nn as nn
import numpy as np
import nn_files

In [None]:

# HPO_PATH points to the Human Phenotype Ontology definition file
# Labevents_HPO are the HPO features which were measured during an examination of a patient
# Diagnoses_HPO are the HPO features that belong to the diagnoses of a patient
# NOTE(review): the original comment ended at "diagnosed based" -- presumably
# "based on the ICD codes", judging by the file name; confirm
HPO_PATH = 'data/hp.obo'
LABEVENTS_HPO_PATH = 'data/OUT_LABEVENTS_HPO.csv'
DIAGNOSES_HPO_PATH = 'data/DIAGNOSE_ICD_hpo.csv'


## Dataset loading

In [None]:
# Build the shared data container from the three source files.
# Per the original note, this loads the HPO data, labevents and diagnoses and groups
# them by subject ID (the grouping happens inside nn_data.LoadedData, not shown here).
data = nn_data.LoadedData(HPO_PATH, LABEVENTS_HPO_PATH, DIAGNOSES_HPO_PATH)


In [None]:

# Dataset creator for the model input: HPO features taken in 'labevents' mode
# (the matching 'diagnoses' creator for the targets is built in a later cell).
# NOTE(review): enable_parent_nodes=True presumably also activates the ancestor
# terms of each HPO feature -- confirm in nn_data.HPODatasetCreator
input_data_creator = nn_data.HPODatasetCreator(
    data, mode='labevents', enable_parent_nodes=True)



#### Set `use_ICD` to `False` to use HPO features as the model output, or to `True` to use ICD codes

In [None]:

# Output selection: True -> ICD codes are the prediction target,
#                   False -> diagnosed HPO features are the prediction target
use_ICD = False

if use_ICD:
    target_data_creator = nn_data.ICDDatasetCreator(data, batch=True)
else:
    target_data_creator = nn_data.HPODatasetCreator(
        data,
        mode='diagnoses',
        enable_parent_nodes=True,
    )


## Model Creation

### Dataset Creation

In [None]:
# Pull the data out of the input and target creators as nested lists
# (one inner feature list per sample)
input_data: list[list[int]] = input_data_creator.data()
target_data: list[list[int]] = target_data_creator.data()

# PyTorch works on tensors, so both nested lists are converted to float tensors
input_tensor, target_tensor = map(torch.FloatTensor, (input_data, target_data))


### Accuracy Functions

In [None]:
def calc_accuracy(output, target) -> float:
    """
    Soft accuracy over a batch.

    For every row, the square roots of the outputs at the active target
    positions are summed up and divided by the number of active target
    features of that row; the per-row ratios are then averaged.

    Args:
        output: 2-D array of model outputs (reduced along axis 1).
        target: 2-D array of the same layout holding the ground truth.

    Returns:
        Mean of the per-row ratios.
    """
    # small offset so that rows without any active feature do not divide by zero
    eps = .00001
    active_per_row = target.sum(axis=1)
    weighted_hits = target * np.sqrt(output)
    hits_per_row = weighted_hits.sum(axis=1)
    return np.mean(hits_per_row / (active_per_row + eps))


In [None]:
def real_effect(outputs, targets):
    """
    Print the correct diagnoses, false positives and false negatives of one sample.

    Only the first row (index 0) of ``outputs`` and ``targets`` is evaluated.
    A feature counts as predicted when its output is >= 0.5 and as active when
    its target equals 1. Correct diagnoses count only recognized active
    features; correctly neglected inactive features are not reported.

    Args:
        outputs: 2-D collection of model outputs; only ``outputs[0]`` is read.
        targets: 2-D collection of ground-truth values laid out like ``outputs``;
            only ``targets[0]`` is read.
    """
    # One threshold for all three counters. Before, a false positive required
    # "> 0.5" while a positive was ">= 0.5", so an output of exactly 0.5 on an
    # inactive feature was silently dropped from the statistics.
    threshold = 0.5

    correct_diagnosed = 0
    false_positive = 0
    false_negative = 0
    total_to_diagnose = sum(targets[0])

    for output, target in zip(outputs[0], targets[0]):
        if output >= threshold and target == 1:
            correct_diagnosed += 1
        if output < threshold and target == 1:
            false_negative += 1
        if output >= threshold and target == 0:
            false_positive += 1

    print(f'Correct diagnoses:{correct_diagnosed}/{total_to_diagnose}')
    print(f'False positives:{false_positive}')
    print(f'False negatives:{false_negative}\n')


### Model generation

In [None]:
# Device selection: the networks are moved to and trained on the GPU (CUDA)
# when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


### Usage of Autoencoder

In [None]:
# True:  an autoencoder is built and trained first; its latent space is the input of the FCN
# False: no autoencoder is built and the FCN works directly on the input features
use_autoencoder = True  


### Model Architectures

In [None]:
# --- Settings shared by both variants ---------------------------------------
# Enlarging factor for the hidden layer of the FCN (Fully Connected Network)
enlarging_factor_NN = 1.4

# Loss function of the FCN. Per the authors' note, the choice was determined
# experimentally among the available PyTorch loss functions.
loss_func_NN = nn.CrossEntropyLoss()

if not use_autoencoder:
    # No encoder in front: the FCN consumes the input features directly
    AE = None
    input_size_NN = len(input_data[0])
else:
    # Reduction factors express how strongly the data is compressed
    # in the hidden layer and in the latent space of the autoencoder
    reduction_factor_hidden = 0.7
    reduction_factor_latent = 0.5

    # Layer widths of the autoencoder, derived from the input width
    input_size_AE = len(input_data[0])
    hidden_size_AE = int(input_size_AE * reduction_factor_hidden)
    latent_size_AE = int(input_size_AE * reduction_factor_latent)

    # Loss function used for the autoencoder training
    loss_func_AE = nn.MSELoss()

    # The architecture itself is assembled inside nn_files.Autoencoder
    AE = nn_files.Autoencoder(input_size_AE, hidden_size_AE, latent_size_AE)
    AE.to(device)

    # The latent space of the autoencoder is the input of the FCN
    input_size_NN = latent_size_AE

output_size_NN = len(target_data[0])
hidden_size_NN = int(max(input_size_NN, output_size_NN) * enlarging_factor_NN)

# Build the FCN; AE is either the autoencoder from above or None
model = nn_files.FCNModel(
    input_size_NN,
    hidden_size_NN,
    output_size_NN,
    enlarging_factor_NN,
    AE,
    dropOutRatio=0,
)
_ = model.to(device)


### Test with Training Function

### Train Autoencoder

In [None]:
if use_autoencoder:

    # Hyperparameters for the training of the autoencoder
    batch_size, learning_rate, num_epochs = 8, 1e-2, 60

    # Choice of the optimizer, standard: Adam
    optimizer = torch.optim.Adam(AE.parameters(), lr=learning_rate, betas=(0.9, 0.999))

    # The autoencoder gets its own dataset: its output is not compared to the
    # original target but to the input again, so the input tensor is used twice
    dataset_AE = torch.utils.data.TensorDataset(input_tensor, input_tensor)
    dataset_AE_split = nn_files.split_dataset(batch_size, dataset_AE)

    # The general training function handles the autoencoder as well as other models
    nn_files.training(
        AE,
        device,
        dataset_AE_split,
        num_epochs=num_epochs,
        loss_func=loss_func_AE,
        optimizer=optimizer,
    )


In [None]:
if use_autoencoder:
    
    # Run the trained autoencoder on the split autoencoder dataset to show its
    # ability to decode and reconstruct the multi-hot encoded vectors.
    # Per the original note, the printed output marks an active feature with an X
    # and a zero with a blank (the rendering happens inside nn_files.test, not shown here).
    nn_files.test(
        AE, device, dataset_AE_split, data_creator=input_data_creator,
    )


### Train NN

In [None]:
# Hyperparameters for the training of the FCN. The two variants are kept apart
# so that each of them can be tuned on its own.
if use_autoencoder:
    # FCN with pretrained encoder
    batch_size, learning_rate, num_epochs = 8, 1e-4, 20
else:
    # FCN without pretrained encoder
    batch_size, learning_rate, num_epochs = 8, 1e-4, 20

# Choice of the optimizer, standard: Adam (built the same way for both variants)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))


In [None]:
# Create a dataset in which each input tensor is paired with its target (ground-truth) tensor
dataset = torch.utils.data.TensorDataset(input_tensor, target_tensor)

# Split the dataset; per the original note into 70% training data, 20% validation data
# and 10% test data (the ratios are defined inside nn_files.split_dataset -- confirm there)
dataset_split = nn_files.split_dataset(batch_size, dataset)


In [None]:
# Train the model on the split dataset; real_effect and calc_accuracy are passed in
# so that the training routine can report them (presumably -- see nn_files.training).
# NOTE(review): the original comment describes a log_rhythm option (after how many
# iterations within one epoch the loss and, if applicable, the accuracy are printed).
# It is not passed here, so a default inside nn_files.training presumably applies -- confirm.
nn_files.training(
    model, device, dataset_split,
    num_epochs=num_epochs,
    optimizer=optimizer, loss_func=loss_func_NN,
    real_effect=real_effect, calc_accuracy=calc_accuracy,
)


### Testing

In [None]:
# Evaluate the trained model and print the real effect of its predictions in a
# human-readable form: correct diagnoses, false positives and false negatives.
# target_data_creator is handed over positionally; the autoencoder test passes its
# creator as data_creator=..., so this is presumably the same parameter -- confirm.
nn_files.test(
    model, device, dataset_split,
    target_data_creator,
    real_effect=real_effect, calc_accuracy=calc_accuracy,
    sort_output_by_confidence=True,
)
