In [1]:
import nn_data
import torch
import torch.nn as nn
import numpy as np
import nn_files

In [2]:
HPO_PATH = 'data/hp.obo'
LABEVENTS_HPO_PATH = 'data/OUT_LABEVENTS_HPO.csv'
DIAGNOSES_HPO_PATH = 'data/DIAGNOSE_ICD_hpo.csv'


## Dataset loading

In [3]:
data = nn_data.LoadedData(HPO_PATH, LABEVENTS_HPO_PATH, DIAGNOSES_HPO_PATH)


In [4]:
input_data_creator = nn_data.HPODatasetCreator(data, mode='labevents', enable_parent_nodes=False)
target_data_creator = nn_data.HPODatasetCreator(data, mode='diagnoses', enable_parent_nodes=False)


## Model Creation

### Dataset Creation

In [5]:
input_data: list[list[int]] = input_data_creator.data()
target_data: list[list[int]] = target_data_creator.data()

input_tensor = torch.FloatTensor(input_data)
target_tensor = torch.FloatTensor(target_data)


### Accuracy Functions

In [6]:
# definition of accuracy function
def calc_accuracy(output, target) -> float:
    result = np.zeros(output.shape)

    number_of_features = target.sum(axis=1)
    correctly_identified = (target * np.sqrt(output)).sum(axis=1)
    return np.mean(correctly_identified / (number_of_features + .00001))


In [7]:
# definition of real effect function
def real_effect(outputs, targets):
    correct_diagnosed = 0
    false_positive = 0
    false_negative = 0
    total_to_diagnose = sum(targets[0])

    for i in range(len(outputs[0])):
        if(outputs[0, i] >= 0.5 and targets[0, i] == 1):
            correct_diagnosed += 1
        if(outputs[0, i] < 0.5 and targets[0, i] == 1):
            false_negative += 1
        if(outputs[0, i] > 0.5 and targets[0, i] == 0):
            false_positive += 1

    print("Correct diagnoses:" f'{correct_diagnosed}/{total_to_diagnose}')
    print("False positives:" f'{false_positive}')
    print("False negatives:" f'{false_negative}\n')


### Model generation

In [8]:
# definition of loss function
loss_func_nn = nn.CrossEntropyLoss()
loss_func_auto = nn.MSELoss()
# usefullness od CrossEntropyLoss determined experimentally, best among available pytorch loss functions


### Usage of Autoencoder

In [9]:
# ste True or False
use_autoencoder = False

# device selection, where NN is trained
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Model Architectures

In [10]:
if use_autoencoder:
    # Model of the AutoEncoder
    reduction_factor_hidden = 0.7
    reduction_factor_latent = 0.5

    input_size_AE = len(input_data[0])
    hidden_size_AE = int(input_size_AE*reduction_factor_hidden)
    latent_size_AE = int(input_size_AE*reduction_factor_latent)
    
    AE = nn_files.Autoencoder(input_size_AE, hidden_size_AE, latent_size_AE)
    
    AE.to(device)
    
    # Model of NN with Encoder Structure
    enlarging_factor_NN = 1.4
    input_size_NN = latent_size_AE
else:
    AE = None
    
    # Model of NN without Encoder Structure
    enlarging_factor_NN = 1.4
    input_size_NN = len(input_data[0])
    
output_size_NN = len(target_data[0])
hidden_size_NN = int(max(input_size_NN, output_size_NN) * enlarging_factor_NN)

model = nn_files.NN(input_size_NN, hidden_size_NN, output_size_NN, enlarging_factor_NN, AE)
_ = model.to(device)


### Test with Training Function

### Train Autoencoder

In [11]:
if use_autoencoder:
    
    batch_size = 8
    learning_rate=1e-2
    num_epochs = 60
    
    # optimizer
    # Bestimmung des Optimizers, standard: Adam
    optimizer = torch.optim.Adam(
        AE.parameters(),
        lr=learning_rate,
        betas=(0.9, 0.999),
    )
    
    dataset_AE = torch.utils.data.TensorDataset(input_tensor, input_tensor)
    dataset_AE_split = nn_files.split_dataset(batch_size, dataset_AE)
    
    # train autoencoder
    nn_files.training(model = AE, optimizer=optimizer, loss_func= loss_func_auto, dataset_split=dataset_AE_split,
                     learning_rate=learning_rate, num_epochs=num_epochs, device=device, batch_size=batch_size)


In [12]:
if use_autoencoder:
    nn_files.test(model = AE,loss_func=loss_func_nn, device=device, dataset_split =dataset_AE_split,
              target_data_creator=target_data_creator)

### Train NN

In [13]:
if use_autoencoder:
    batch_size = 8
    learning_rate=1e-4
    num_epochs = 20
    
    # optimizer
    # Bestimmung des Optimizers, standard: Adam
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=learning_rate,
        betas=(0.9, 0.999),
    )
    
    # train NN with encoder structure
else:
    batch_size = 8
    learning_rate=1e-4
    num_epochs = 20
    
    # optimizer
    # Bestimmung des Optimizers, standard: Adam
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=learning_rate,
        betas=(0.9, 0.999),
    )
    
    # train NN without encoder Structure

In [14]:
dataset = torch.utils.data.TensorDataset(input_tensor, target_tensor)
dataset_split = nn_files.split_dataset(batch_size, dataset)

In [15]:
nn_files.training(model = model, optimizer=optimizer, loss_func=loss_func_nn, batch_size=batch_size,
                  dataset_split=dataset_split,
                 learning_rate=learning_rate, num_epochs=num_epochs, device=device, real_effect=real_effect, 
                 log_rhythm = 5, calc_accuracy = calc_accuracy)

[Epoch 1/20]
[Iteration 5]	TRAIN      loss/acc: 157.105	0.715
for this epoch:	TRAIN      loss/acc: 144.575	0.709
		VALIDATION loss/acc: 136.583	0.705
Correct diagnoses:5/5.0
False positives:212
False negatives:0



[Epoch 2/20]
[Iteration 5]	TRAIN      loss/acc: 145.009	0.731
for this epoch:	TRAIN      loss/acc: 145.405	0.751
		VALIDATION loss/acc: 135.572	0.735
Correct diagnoses:5/5.0
False positives:188
False negatives:0



[Epoch 3/20]
[Iteration 5]	TRAIN      loss/acc: 135.767	0.771
for this epoch:	TRAIN      loss/acc: 147.656	0.781
		VALIDATION loss/acc: 134.559	0.757
Correct diagnoses:5/5.0
False positives:159
False negatives:0



[Epoch 4/20]
[Iteration 5]	TRAIN      loss/acc: 137.609	0.798
for this epoch:	TRAIN      loss/acc: 140.176	0.809
		VALIDATION loss/acc: 133.660	0.772
Correct diagnoses:5/5.0
False positives:136
False negatives:0



[Epoch 5/20]
[Iteration 5]	TRAIN      loss/acc: 161.299	0.792
for this epoch:	TRAIN      loss/acc: 137.658	0.829
		VALIDATION loss/acc: 132.

### Testing

In [16]:
nn_files.test(model,loss_func=loss_func_nn, device=device, dataset_split =dataset_split,
              target_data_creator=target_data_creator, real_effect=real_effect, calc_accuracy=calc_accuracy)

Correct diagnoses:29/38.0
False positives:69
False negatives:9

Correct diagnoses:6/7.0
False positives:90
False negatives:1

Correct diagnoses:15/21.0
False positives:80
False negatives:6

Correct diagnoses:1/1.0
False positives:93
False negatives:0

Correct diagnoses:0/0.0
False positives:93
False negatives:0

Correct diagnoses:7/8.0
False positives:89
False negatives:1

Correct diagnoses:7/19.0
False positives:92
False negatives:12

Correct diagnoses:2/9.0
False positives:94
False negatives:7

Correct diagnoses:3/4.0
False positives:94
False negatives:1

Correct diagnoses:3/38.0
False positives:89
False negatives:35

Test Accuracy: 0.627
HP:0002621	0.27	 
HP:0000112	0.05	 
HP:0001251	0.13	 
HP:0000601	0.07	 
HP:0001335	0.07	 
HP:0002224	0.17	 
HP:0000407	0.08	 
HP:0000490	0.06	 
HP:0002929	0.12	 
HP:0004388	0.13	 
HP:0006698	0.17	 
HP:0002015	0.36	 
HP:0004409	0.38	 
HP:0000658	0.17	 
HP:0000270	0.08	 
HP:0001674	0.06	X
HP:0008213	0.09	 
HP:0004510	0.06	 
HP:0011462	0.23	 
HP:001240

In [17]:
AE.encoder == model.encoder

AttributeError: 'NoneType' object has no attribute 'encoder'