# Fitting Saturation-based Simulation Data Changes
Do models fitted on this simulation data also work on real-life data?

In this notebook, we fit a model to the changes (deltas) in the simulation data. The simulated detectors are placed at the same positions as the real-life detectors, so that the trained model can be saved and then applied to real-life data.

In [1]:
from torch.optim import Adam, SGD
import torch.nn as nn
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from inverse_modelling_tfo.models import train_model, train_model_wtih_reporting
from inverse_modelling_tfo.data import generate_data_loaders, equidistance_detector_normalization, constant_detector_count_normalization, generate_differential_data_loaders, DifferentialCombinationDataset
from inverse_modelling_tfo.data.intensity_interpolation import get_interpolate_fit_params_custom, interpolate_exp, interpolate_exp_transform
from inverse_modelling_tfo.data.interpolation_function_zoo import *
from inverse_modelling_tfo.models.custom_models import SplitChannelCNN, PerceptronReLU
from inverse_modelling_tfo.features.build_features import create_ratio, create_spatial_intensity
from inverse_modelling_tfo.misc.misc_training import set_seed
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import torchinfo
# Expose only physical GPU 2 to this process. The final training cell passes
# gpu_to_use=0, which then refers to this single visible device.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


In [2]:
import torch
import torch.nn as nn

class CustomLoss(nn.Module):
    """Mean absolute relative error: ``mean(|input - target| / max(|target|, eps))``.

    The error is normalized by the *magnitude* of the target. Dividing by the
    signed target would make the loss negative for negative targets (so that
    minimizing it increases the error) and NaN/inf for zero targets.

    Args:
        eps: Lower bound applied to ``|target|`` before dividing, which keeps
            the loss finite when a target is exactly zero.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.eps = eps

    def forward(self, input, target):
        """Return the scalar mean relative error between ``input`` and ``target``."""
        # Magnitude of the target, bounded away from zero
        denominator = torch.clamp(torch.abs(target), min=self.eps)
        return torch.mean(torch.abs(input - target) / denominator)
        


In [3]:
# Load the simulated intensity table.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
# Only unpickle files from a trusted source.
data = pd.read_pickle(r'/home/rraiyan/personal_projects/tfo_inverse_modelling/data/intensity/s_based_intensity_low_conc2.pkl')
# The return value is discarded, so this presumably normalizes `data` in place -- TODO confirm.
equidistance_detector_normalization(data)

# Drop Uterus Thickness for now
data = data.drop(columns='Uterus Thickness')
# Source-detector distances (SDD) used in the experiments
experiment_sdd = [1.5, 3.0, 4.5, 7.0, 10.0]

# Interpolate simulation data to have same SDD as experiments
# NOTE(review): the meaning of [1.0, 0.8] is not visible in this notebook -- confirm
# against interpolate_exp_transform.
data = interpolate_exp_transform(data, experiment_sdd, [1.0, 0.8])

# Manual log(intensity) normalization
# NOTE(review): the saved output shows log10(Intensity) of about 14 at SDD 1.5, which looks
# suspiciously large -- verify the interpolation/normalization above.
data['Intensity'] = np.log10(data['Intensity'])        # Far-detector values are too small to matter on a linear scale, so take log10
data.head()

Unnamed: 0,Wave Int,Maternal Wall Thickness,Maternal Hb Concentration,Maternal Saturation,Fetal Hb Concentration,Fetal Saturation,Intensity,SDD
0,1.0,6.0,12.0,0.9,0.11,0.1,14.032923,1.5
1,1.0,6.0,12.0,0.9,0.11,0.1,6.46192,3.0
2,1.0,6.0,12.0,0.9,0.11,0.1,2.244704,4.5
3,1.0,6.0,12.0,0.9,0.11,0.1,-1.975261,7.0
4,1.0,6.0,12.0,0.9,0.11,0.1,-4.967426,10.0


In [4]:
# Feature-building options -- enable exactly ONE of the three variants below.
#
# Variant 1 (disabled): ratio features merged with spatial-intensity features
# data1 = create_ratio(data, True)
# data2 = create_spatial_intensity(data)
# sim_params = ['Maternal Wall Thickness', "Maternal Hb Concentration", "Fetal Hb Concentration", "Fetal Saturation", "Maternal Saturation"]
# data = pd.merge(data1, data2, how='inner', on=sim_params)

# Variant 2 (disabled): ratio features only
# data = create_ratio(data, True)

# Variant 3 (active): spatial-intensity features. Judging by the saved output, this
# yields one column per '<SDD>_<Wave Int>' combination.
data = create_spatial_intensity(data)

data.head() 
# NOTE: Have only 1 on at the same time!

Unnamed: 0,Maternal Wall Thickness,Maternal Hb Concentration,Maternal Saturation,Fetal Hb Concentration,Fetal Saturation,1.5_1.0,3.0_1.0,4.5_1.0,7.0_1.0,10.0_1.0,1.5_2.0,3.0_2.0,4.5_2.0,7.0_2.0,10.0_2.0
0,2.0,15.0,0.9,0.11,0.1,-2.80538,-2.501517,-2.836565,-3.746699,-4.946812,15.166842,6.417421,1.824829,-2.473812,-5.260692
1,4.0,13.0,0.9,0.11,0.1,,,,,,21.665158,10.654919,4.61427,-1.324613,-5.43303
2,4.0,14.0,0.9,0.11,0.1,16.647799,7.441758,2.552808,-2.084165,-5.14461,,,,,
3,6.0,12.0,0.9,0.11,0.1,14.032923,6.46192,2.244704,-1.975261,-4.967426,10.41113,5.530323,2.174306,-1.865576,-5.339114
4,6.0,16.0,0.9,0.11,0.1,15.11656,6.530009,1.948905,-2.422432,-5.335436,5.407108,1.618792,-0.638019,-3.062143,-4.939586


In [5]:
# Cleanup: drop, in place, every row that contains a NaN in any column.
# In the saved output of the previous cell, such rows have intensities for only
# one of the two wavelengths.
data.dropna(inplace=True)
data.head()

Unnamed: 0,Maternal Wall Thickness,Maternal Hb Concentration,Maternal Saturation,Fetal Hb Concentration,Fetal Saturation,1.5_1.0,3.0_1.0,4.5_1.0,7.0_1.0,10.0_1.0,1.5_2.0,3.0_2.0,4.5_2.0,7.0_2.0,10.0_2.0
0,2.0,15.0,0.9,0.11,0.1,-2.80538,-2.501517,-2.836565,-3.746699,-4.946812,15.166842,6.417421,1.824829,-2.473812,-5.260692
3,6.0,12.0,0.9,0.11,0.1,14.032923,6.46192,2.244704,-1.975261,-4.967426,10.41113,5.530323,2.174306,-1.865576,-5.339114
4,6.0,16.0,0.9,0.11,0.1,15.11656,6.530009,1.948905,-2.422432,-5.335436,5.407108,1.618792,-0.638019,-3.062143,-4.939586
7,10.0,12.0,0.9,0.11,0.1,-2.03184,-1.291314,-1.736562,-3.122224,-4.971014,-16.80801,-8.090147,-5.106674,-4.179162,-5.423988
8,10.0,16.0,0.9,0.11,0.1,-3.845289,-1.682541,-1.711198,-3.082258,-5.28067,-23.094774,-11.515449,-7.080987,-4.90099,-5.490049


## Normalizing Features
`x_columns` are the input features and `y_column` is the target.

In [6]:
## Y -> Target
# Other targets that have been tried:
# y_columns = ['Maternal Wall Thickness', "Maternal Hb Concentration", "Maternal Saturation", "Fetal Hb Concentration", "Fetal Saturation"]
# y_columns = ['Maternal Saturation']
# y_columns = ['Maternal Hb Concentration']
# y_column = 'Fetal Hb Concentration'
y_column = 'Fetal Saturation'
fixed_columns = [
    'Maternal Wall Thickness',
    "Maternal Hb Concentration",
    "Maternal Saturation",
    "Fetal Hb Concentration",
]
# Sanity checks on the target / fixed-parameter split
assert len(fixed_columns) == 4, "4/5 TMPs should remain fixed. fixed_column length should be 4"
assert y_column not in fixed_columns, "y_column is the TMP that changes. It cannot be inside fixed_columns"

# Input features: columns with purely numeric names first, then columns whose name contains '_'
x_columns = [col for col in data.columns if col.isdigit()] + [col for col in data.columns if '_' in col]

## Standardize the target and the features, each with its own scaler
y_scaler = preprocessing.StandardScaler()
x_scaler = preprocessing.StandardScaler()
data[y_column] = y_scaler.fit_transform(data[y_column].to_numpy().reshape(-1, 1))
data[x_columns] = x_scaler.fit_transform(data[x_columns])



In [7]:
# Print Out Scaler values
# print(f'Y scale mean {y_scaler.mean_}')
# print(f'Y scale var {y_scaler.var_}')

In [8]:
# Sanity check: how many rows exist per distinct (scaled) target value.
# NOTE(review): the saved output shows a single distinct value with only 6 rows, i.e. the
# target does not vary in the cleaned data -- verify the data before training.
data[y_column].value_counts()

1.387779e-17    6
Name: Fetal Saturation, dtype: int64

## Model Configuration

In [9]:
# The network input is twice as wide as the number of feature columns.
# NOTE(review): presumably because the differential loader feeds the features of two
# samples side by side -- confirm against generate_differential_data_loaders.
IN_FEATURES = len(x_columns) * 2
OUT_FEATURES = 1
# Single place for everything that defines the model and the training/search budget.
model_config = {
    # 'model_class' : SplitChannelCNN,  # Class name
    'model_class' : PerceptronReLU,  # Class name
    # 'model_params' :  [2, IN_FEATURES, 4, 5, [2, OUT_FEATURES]],    # Input params as an array
    # 'model_params' :  [3, IN_FEATURES, 6, 5, [6, 3, OUT_FEATURES]],    # Input params as an array
    # 'model_params' :  [3, IN_FEATURES, 6, 7, [3, OUT_FEATURES]],    # Input params as an array
    # 'model_params' :  [4, IN_FEATURES, 8, 7, [4, 2, OUT_FEATURES]],    # Input params as an array
    # 'model_params' :  [[IN_FEATURES, 20, 8, OUT_FEATURES]],    # Input params as an array
    'model_params' :  [[IN_FEATURES, 10, 4, OUT_FEATURES]],    # Positional args, unpacked into model_class(*model_params)
    'train_split' : 0.8,    # Passed to generate_differential_data_loaders
    'epochs' : 25,    # Only referenced by a commented-out train_model call; the active call hard-codes epochs=20
    # 'total_data_len': 120000,
    'total_data_len': 80000,    # Passed to the differential loader / dataset
    'allow_zero_diff': False,    # Passed to the differential loader / dataset
    'hyperparam_search_count': 20,    # num_samples of the Ray Tune search
    'hyperparam_max_epoch': 10,    # Default number of epochs per search trial (train_model2)
    'seed': 42    # Passed to set_seed before every training run
}

In [10]:
# Custom Train Function 
def train_model2(iteration_config, epoch=None):
    """Train one model for a single hyper-parameter configuration.

    This function is handed to ``tune.run`` as the trainable of the
    hyper-parameter search.

    Args:
        iteration_config: Mapping with the sampled hyper-parameters. The keys
            read here are ``'batch_size'``, ``'lr'`` and ``'momentum'``.
        epoch: Number of epochs to train for. ``None`` (the default) resolves to
            ``model_config['hyperparam_max_epoch']`` at call time, so changes to
            ``model_config`` are picked up without re-running this cell.

    Returns:
        None. Losses are handled inside ``train_model_wtih_reporting``
        (presumably reported to Ray Tune -- confirm in that helper).
    """
    if epoch is None:
        epoch = model_config['hyperparam_max_epoch']

    # Same seed for every trial so that trials differ only in their hyper-parameters
    set_seed(model_config['seed'])
    loader_params = {
        'batch_size': iteration_config['batch_size'], 'shuffle': True, 'num_workers': 2
    }
    train, val = generate_differential_data_loaders(data, loader_params, fixed_columns, x_columns, y_column, model_config['total_data_len'], model_config["allow_zero_diff"], model_config['train_split'])
    model = model_config['model_class'](*model_config['model_params'])
    # Alternative criterion: nn.MSELoss()
    criterion = CustomLoss()
    # Alternative optimizer: Adam(model.parameters(), lr=..., betas=[..., ...])
    optimizer = SGD(model.parameters(), lr=iteration_config["lr"], momentum=iteration_config["momentum"])
    # The returned losses are not needed here, so they are not bound to names
    train_model_wtih_reporting(model, optimizer=optimizer, criterion=criterion, train_loader=train, validation_loader=val, epochs=epoch)

In [11]:
# Hyper Parameter Search 
# Search space sampled by Ray Tune; each sample is passed to train_model2 as `iteration_config`.
iteration_config = {
    "lr" : tune.loguniform(1e-5, 1e-3),
    # "b1" : tune.uniform(0.3, 1.0),
    # "b2" : tune.uniform(0.3, 1.0),
    "batch_size": tune.choice([32, 16, 8]),
    # "model": tune.choice([[40, 5, 1], [40, 10, 1], [40, 5, 2, 1]]),
    "momentum": tune.choice([0.93, 0.95, 0.97]),
}
# NOTE(review): max_t=40 exceeds model_config['hyperparam_max_epoch'] (10), the default epoch
# count of train_model2. If one training_iteration corresponds to one epoch, the rungs at 20
# and 40 can never be reached -- confirm and align the two values.
scheduler = ASHAScheduler(metric="combined_loss", mode="min", max_t=40, grace_period=5, reduction_factor=2)
reporter = CLIReporter(metric_columns=["train_loss", "val_loss", "combined_loss", "training_iteration"])
# NOTE(review): in the saved output every trial fails with a ValueError and tune.run raises
# TuneError, so `best_trial` below is never assigned and the following cells cannot run.
result = tune.run(train_model2, config=iteration_config, scheduler=scheduler, progress_reporter=reporter,
                  num_samples=model_config['hyperparam_search_count'], resources_per_trial={"cpu": 4, "gpu": 0.05},)

# Pick the trial whose last reported combined_loss is the smallest
best_trial = result.get_best_trial("combined_loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["val_loss"]))
print("Best trial final train loss: {}".format(
    best_trial.last_result["train_loss"]))


2023-08-29 16:26:53,444	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-29 16:26:54,752	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-29 16:26:54,764	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2023-08-29 16:26:54 (running for 00:00:00.21)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 40.000: None | Iter 20.000: None | Iter 10.000: None | Iter 5.000: None
Logical resource usage: 64.0/64 CPUs, 0.8000000000000002/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-08-29_16-26-54
Number of trials: 20/20 (20 PENDING)
+--------------------------+----------+-------+--------------+-------------+------------+
| Trial name               | status   | loc   |   batch_size |          lr |   momentum |
|--------------------------+----------+-------+--------------+-------------+------------|
| train_model2_8a49a_00000 | PENDING  |       |           32 | 4.8775e-05  |       0.95 |
| train_model2_8a49a_00001 | PENDING  |       |           16 | 0.000309078 |       0.93 |
| train_model2_8a49a_00002 | PENDING  |       |           32 | 2.31963e-05 |       0.93 |
| train_model2_8a49a_00003 | PENDING  |       |           3

2023-08-29 16:26:58,709	ERROR tune_controller.py:911 -- Trial task failed for trial train_model2_8a49a_00012
Traceback (most recent call last):
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=1692186, ip=169.237.32.34, actor_id=d17a86bfe08c7c518a98e4c301000000, repr=train_model2)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/tune/trainable/tra

Trial name
train_model2_8a49a_00000
train_model2_8a49a_00001
train_model2_8a49a_00002
train_model2_8a49a_00003
train_model2_8a49a_00004
train_model2_8a49a_00005
train_model2_8a49a_00006
train_model2_8a49a_00007
train_model2_8a49a_00008
train_model2_8a49a_00009


2023-08-29 16:26:58,728	ERROR tune_controller.py:911 -- Trial task failed for trial train_model2_8a49a_00003
Traceback (most recent call last):
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=1692145, ip=169.237.32.34, actor_id=79d1224c9acf81691e5348a101000000, repr=train_model2)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/tune/trainable/tra

== Status ==
Current time: 2023-08-29 16:26:59 (running for 00:00:05.21)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 40.000: None | Iter 20.000: None | Iter 10.000: None | Iter 5.000: None
Logical resource usage: 8.0/64 CPUs, 0.1/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-08-29_16-26-54
Number of trials: 20/20 (16 ERROR, 4 PENDING)
+--------------------------+----------+-----------------------+--------------+-------------+------------+
| Trial name               | status   | loc                   |   batch_size |          lr |   momentum |
|--------------------------+----------+-----------------------+--------------+-------------+------------|
| train_model2_8a49a_00016 | PENDING  |                       |           16 | 1.73971e-05 |       0.93 |
| train_model2_8a49a_00017 | PENDING  |                       |            8 | 0.000484642 |       0.97 |
| train_model2_8a49a_00018 | PENDING  |                       |           16

2023-08-29 16:27:03,194	ERROR tune_controller.py:911 -- Trial task failed for trial train_model2_8a49a_00018
Traceback (most recent call last):
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=1694278, ip=169.237.32.34, actor_id=f69ab764ffe46d4e39f3be5101000000, repr=train_model2)
  File "/home/rraiyan/cybercat/lib/python3.8/site-packages/ray/tune/trainable/tra

== Status ==
Current time: 2023-08-29 16:27:03 (running for 00:00:08.60)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 40.000: None | Iter 20.000: None | Iter 10.000: None | Iter 5.000: None
Logical resource usage: 4.0/64 CPUs, 0.05/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-08-29_16-26-54
Number of trials: 20/20 (20 ERROR)
+--------------------------+----------+-----------------------+--------------+-------------+------------+
| Trial name               | status   | loc                   |   batch_size |          lr |   momentum |
|--------------------------+----------+-----------------------+--------------+-------------+------------|
| train_model2_8a49a_00000 | ERROR    | 169.237.32.34:1692110 |           32 | 4.8775e-05  |       0.95 |
| train_model2_8a49a_00001 | ERROR    | 169.237.32.34:1692111 |           16 | 0.000309078 |       0.93 |
| train_model2_8a49a_00002 | ERROR    | 169.237.32.34:1692112 |           32 | 2.31963

TuneError: ('Trials did not complete', [train_model2_8a49a_00000, train_model2_8a49a_00001, train_model2_8a49a_00002, train_model2_8a49a_00003, train_model2_8a49a_00004, train_model2_8a49a_00005, train_model2_8a49a_00006, train_model2_8a49a_00007, train_model2_8a49a_00008, train_model2_8a49a_00009, train_model2_8a49a_00010, train_model2_8a49a_00011, train_model2_8a49a_00012, train_model2_8a49a_00013, train_model2_8a49a_00014, train_model2_8a49a_00015, train_model2_8a49a_00016, train_model2_8a49a_00017, train_model2_8a49a_00018, train_model2_8a49a_00019])

<!-- Best trial config: {'lr': 0.0010630834634709364, 'b1': 0.4282116859842134, 'b2': 0.3089991262211405, 'batch_size': 8, 'model': [20, 16, 8, 4, 2, 1]}
Best trial final validation loss: 0.09234625198878348
Best trial final train loss: 0.22368373312056064 -->

In [None]:
# Hyper-parameters of the best trial found by the search
best_trial.config

In [None]:
# Model / training configuration used for the final training run
model_config

In [None]:
# Final training run with the hyper-parameters of the best search trial.
# Seed first: loader creation and model initialisation both depend on the RNG state.
set_seed(model_config['seed'])
params = {'batch_size': best_trial.config['batch_size'], 'shuffle': True, 'num_workers': 2}
train, val = generate_differential_data_loaders(
    data,
    params,
    fixed_columns,
    x_columns,
    y_column,
    model_config['total_data_len'],     # previously also tried: 140000
    model_config["allow_zero_diff"],
    model_config['train_split'],
)

model = model_config['model_class'](*model_config['model_params'])
# Other criteria that have been tried: nn.MSELoss(), nn.HuberLoss()
criterion = CustomLoss()
# Other optimizers that have been tried:
#   Adam(model.parameters(), lr=0.0009, betas=[0.935, 0.701])
#   SGD(model.parameters(), lr=0.0004, momentum=0.9)
optimizer = SGD(
    model.parameters(),
    lr=best_trial.config['lr'],
    momentum=best_trial.config['momentum'],
)
# CUDA_VISIBLE is already set to only see one GPU
# (alternative epoch count: epochs=model_config['epochs'])
train_loss, validation_loss = train_model(model, optimizer, criterion, train, val, epochs=20, gpu_to_use=0)

# Learning curves
plt.figure()
plt.plot(train_loss, marker='x', label='Training Loss')
plt.plot(validation_loss, marker='x', label='Validation Loss')
# plt.yscale('log')
plt.legend()

In [None]:
# The criterion used for training is CustomLoss, not MSE, so label the values generically
print(f'Train Loss : {train_loss[-1]}, Val Loss : {validation_loss[-1]}')

In [None]:
# Score every sample individually and collect the results in one table.
# A fresh DifferentialCombinationDataset is built with the same settings as the loaders.
# NOTE(review): whether it contains the same sample pairs as the training/validation
# loaders depends on the dataset's internal randomization -- confirm.
x_data = DifferentialCombinationDataset(data, fixed_columns, x_columns, y_column, model_config['total_data_len'], model_config['allow_zero_diff'])
truth_column = []
prediction_column = []
loss_column = []
tissue_fixed_params = []

with torch.no_grad():
    for i, current_sample in enumerate(x_data):
        inputs, labels = current_sample
        # Batch of one sample, moved to the GPU (assumes train_model left `model` on the GPU -- TODO confirm)
        inputs = inputs.view(1, -1).cuda()
        labels = labels.view(1, -1).cuda()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Bookkeeping
        truth_column.append(labels.item())
        prediction_column.append(outputs.item())
        loss_column.append(loss.item())
        # Fixed tissue parameters belonging to the i-th sample of the dataset
        tissue_fixed_params.append(x_data.split_fixed_columns[x_data.randomized_indices_list[i]])


# Convert the collected lists into column vectors so they can be stacked side by side
tissue_fixed_params = np.array(tissue_fixed_params)
truth_column = np.array(truth_column).reshape(-1, 1)
prediction_column = np.array(prediction_column).reshape(-1, 1)
loss_column = np.array(loss_column).reshape(-1, 1)

# un-normalize
# Only the scale is undone, the scaler mean is not added back.
# NOTE(review): this is appropriate if the labels are differences of two scaled values -- confirm.
truth_column = y_scaler.scale_ * truth_column
prediction_column = y_scaler.scale_ * prediction_column
absolute_error = np.abs(truth_column - prediction_column)


# "Train Error" holds the per-sample criterion value (computed on the normalized values)
merged = np.hstack([tissue_fixed_params, truth_column, prediction_column, loss_column, absolute_error])
merged_df = pd.DataFrame(merged, columns=fixed_columns + ['Truth', 'Predicted', "Train Error", "Absolute Error"])
merged_df.head()

In [None]:
# Distribution of the un-normalized absolute errors over all evaluated samples.
# Only one figure is created; an extra bare plt.figure() would allocate a stray empty figure.
plt.figure('Error Distribution')
plt.hist(merged_df['Absolute Error'], 50)
plt.xlabel('Abs Error')
plt.ylabel('Count')

In [None]:
# Percentage error relative to the magnitude of the truth value. Dividing by the signed
# truth would yield negative percentages whenever the true delta is negative.
merged_df['%Error'] = np.abs(merged_df['Truth'] - merged_df['Predicted']) / np.abs(merged_df['Truth']) * 100.

In [None]:
# Top Bad Samples
# Number of worst-predicted samples to show
VIEW_TOP_N = 50
# Positions of the samples sorted by absolute error, largest first, cut to the top N
worst_errors = merged_df['Absolute Error'].argsort()[::-1][:VIEW_TOP_N]
# worst_errors = merged_df['%Error'].argsort()[::-1][:VIEW_TOP_N]
# Lift the row display limit only inside this block so that all N rows are shown
with pd.option_context("display.max_rows", None):
    display(merged_df.iloc[worst_errors, :])

In [None]:
# Rough MSE's in percentage
# Multiplies the final losses by the variance of the target scaler.
# NOTE(review): rescaling by the variance matches a squared-error criterion (nn.MSELoss).
# The training cell currently uses CustomLoss, a relative error, for which this factor
# looks inappropriate -- confirm which criterion is intended.
print(f'Train Error(non-normalized): {train_loss[-1] * y_scaler.var_ }')
print(f'Validation Error(non-normalized): {validation_loss[-1] * y_scaler.var_ }')

In [None]:
# Model Info: torchinfo summary of the trained model
torchinfo.summary(model)

In [None]:
# How often each distinct (un-normalized) truth value occurs among the evaluated samples
merged_df['Truth'].value_counts()