# Ray Tune Test
Ray Tune is a hyperparameter tuning library. This notebook is an experiment to figure out how it works.

In [1]:
from torch.optim import Adam, SGD
import torch.nn as nn
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from inverse_modelling_tfo.models import train_model, create_perceptron_model, train_model_wtih_reporting
from inverse_modelling_tfo.data import generate_data_loaders, equidistance_detector_normalization, constant_detector_count_normalization
from inverse_modelling_tfo.data.intensity_interpolation import get_interpolate_fit_params_custom
from inverse_modelling_tfo.data.interpolation_function_zoo import *
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [2]:
# Load the simulated intensity table and apply the detector normalization.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
data = pd.read_pickle(r'/home/rraiyan/personal_projects/tfo_inverse_modelling/data/intensity/intensity_summed_sim_data_equidistance_detector_extended.pkl')
# The return value is not used, so this presumably modifies `data` in place -- TODO confirm.
equidistance_detector_normalization(data)

# Keep only 1 wavelength. The explicit .copy() makes the filtered frame independent,
# so the column assignment below does not raise SettingWithCopyWarning.
data = data[data["Wave Int"] == 1.0].copy()
# Far-detector values are way too small to affect anything on a linear scale. Take log.
data['Intensity'] = np.log10(data['Intensity'])

# From long to wide: one row per parameter combination, one column per SDD.
data = pd.pivot(data, index=['Wave Int', 'Uterus Thickness', 'Maternal Wall Thickness', "Maternal Mu_a", "Fetal Mu_a"], columns="SDD", values="Intensity").reset_index()
# SDDs have integer values, so the pivoted columns get integer names.
# Convert every column name to str so they can all be handled uniformly.
data.columns = [str(column) for column in data.columns]
# Input features: the columns whose name is purely numeric (the SDD columns).
x_columns = [column for column in data.columns if column.isdigit()]

# Regression target.
y_columns = ["Maternal Wall Thickness"]

# Standardize the target; y_scaler is kept so the scaling statistics can be inspected later.
y_scaler = preprocessing.StandardScaler()
data[y_columns] = y_scaler.fit_transform(data[y_columns])

# Alternative (disabled): use interpolation fit parameters as input features instead.
# fitting_param_table = get_interpolate_fit_params_custom(data, exponenet_4, weights=[1, 0])
# x_columns = list(filter(lambda X: 'alpha' in X, fitting_param_table.columns))
# filtered_fitting_param_table = fitting_param_table[fitting_param_table['Wave Int'] == 2.0]
# x_scaler = preprocessing.StandardScaler()
# filtered_fitting_param_table[fitting_param_columns] = x_scaler.fit_transform(filtered_fitting_param_table[fitting_param_columns])
# filtered_fitting_param_table[y_columns] = y_scaler.fit_transform(filtered_fitting_param_table[y_columns])

In [3]:
# Show the statistics the target scaler was fitted with (one per line).
print(
    f'Y scale mean {y_scaler.mean_}',
    f'Y scale var {y_scaler.var_}',
    sep='\n',
)

Y scale mean [20.]
Y scale var [120.]


In [4]:
def train_model2(config, epoch=100):
    """Train one perceptron model for a single set of hyperparameters.

    Reads the notebook-level `data`, `x_columns` and `y_columns`, builds the
    train/validation loaders and the model, and hands them to
    `train_model_wtih_reporting`.

    Args:
        config: Mapping of hyperparameters. The keys read here are
            'batch_size', 'model' (passed to `create_perceptron_model`),
            'lr' and 'momentum'.
        epoch: Forwarded as `epochs` to `train_model_wtih_reporting`.

    Returns:
        None. NOTE(review): the losses are presumably reported to Ray Tune
        from inside `train_model_wtih_reporting` -- confirm.
    """
    # Set seeds for consistency. Seeding numpy alone does not cover torch's
    # default generator, which torch uses for parameter initialisation and
    # shuffled sampling, so seed both.
    np.random.seed(70)
    torch.manual_seed(70)

    params = {
        'batch_size': config['batch_size'], 'shuffle': True, 'num_workers': 2
    }
    # 0.8 is presumably the train fraction of the split -- TODO confirm.
    train, val = generate_data_loaders(data, params, x_columns, y_columns, 0.8)
    model = create_perceptron_model(config['model'])
    criterion = nn.MSELoss()
    optimizer = SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])
    # optimizer = Adam(model.parameters(), lr=config["lr"], betas=[config["b1"], config["b2"]])
    # The helper's return value is not needed here, so it is not bound to unused locals.
    train_model_wtih_reporting(model, optimizer=optimizer, criterion=criterion, train_loader=train, validation_loader=val, epochs=epoch)

In [5]:
import os

# Limit CUDA device visibility to device index 3.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Hyperparameter search space sampled by Ray Tune for every trial.
# (Adam betas "b1"/"b2" were tried as tune.uniform(0.3, 1.0) and are currently disabled.)
# NOTE(review): momentum=1.0 means past gradients are never decayed in SGD --
# confirm that this value is intended to be part of the search.
config = dict(
    lr=tune.loguniform(1e-4, 1e-1),
    batch_size=tune.choice([64, 32, 16, 8, 4]),
    model=tune.choice([
        [20, 10, 1],
        [20, 10, 5, 1],
        [20, 16, 8, 4, 2, 1],
        [20, 8, 2, 1],
    ]),
    momentum=tune.choice([0.7, 0.8, 0.9, 1.0]),
)

# ASHA scheduler minimising "combined_loss" with the settings below.
scheduler = ASHAScheduler(
    metric="combined_loss",
    mode="min",
    max_t=10,
    grace_period=5,
    reduction_factor=2,
)

# Columns shown in the periodic command-line status table.
reporter = CLIReporter(
    metric_columns=["train_loss", "val_loss", "combined_loss", "training_iteration"],
)

# Run 50 sampled trials, each given 4 CPUs and 1 GPU.
result = tune.run(
    train_model2,
    config=config,
    scheduler=scheduler,
    progress_reporter=reporter,
    num_samples=50,
    resources_per_trial={"cpu": 4, "gpu": 1},
)

# Pick the trial whose last reported combined_loss is the smallest and summarise it.
best_trial = result.get_best_trial(metric="combined_loss", mode="min", scope="last")
print(
    "Best trial config: {}".format(best_trial.config),
    "Best trial final validation loss: {}".format(best_trial.last_result["val_loss"]),
    "Best trial final train loss: {}".format(best_trial.last_result["train_loss"]),
    sep="\n",
)


2023-05-31 03:17:32,319	INFO worker.py:1625 -- Started a local Ray instance.
2023-05-31 03:17:32,982	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.


== Status ==
Current time: 2023-05-31 03:17:33 (running for 00:00:00.18)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 10.000: None | Iter 5.000: None
Logical resource usage: 4.0/64 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-05-31_03-17-32
Number of trials: 50/50 (49 PENDING, 1 RUNNING)
+--------------------------+----------+-----------------------+--------------+-------------+----------------------+------------+
| Trial name               | status   | loc                   |   batch_size |          lr | model                |   momentum |
|--------------------------+----------+-----------------------+--------------+-------------+----------------------+------------|
| train_model2_5b49e_00000 | RUNNING  | 169.237.32.34:1085853 |           32 | 0.00448831  | [20, 16, 8, 4, 2, 1] |        1   |
| train_model2_5b49e_00001 | PENDING  |                       |            4 | 0.00370686  | [20, 8, 2, 1]        |        0.7 |

Trial name,combined_loss,date,done,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,train_loss,training_iteration,trial_id,val_loss
train_model2_5b49e_00000,1.18203,2023-05-31_03-17-40,True,blueberry,10,169.237.32.34,1085853,6.17038,0.473485,6.17038,1685528260,0.922227,10,5b49e_00000,0.259806
train_model2_5b49e_00001,1.55619,2023-05-31_03-17-47,True,blueberry,5,169.237.32.34,1088675,4.77055,0.693412,4.77055,1685528267,0.626174,5,5b49e_00001,0.930016
train_model2_5b49e_00002,1.74481,2023-05-31_03-17-53,True,blueberry,5,169.237.32.34,1090081,3.84965,0.507346,3.84965,1685528273,0.636988,5,5b49e_00002,1.10782
train_model2_5b49e_00003,0.925035,2023-05-31_03-18-01,True,blueberry,10,169.237.32.34,1091494,6.13014,0.46382,6.13014,1685528281,0.383247,10,5b49e_00003,0.541788
train_model2_5b49e_00004,1.62749,2023-05-31_03-18-07,True,blueberry,5,169.237.32.34,1094249,3.89911,0.497214,3.89911,1685528287,0.674214,5,5b49e_00004,0.953276
train_model2_5b49e_00005,1.74205,2023-05-31_03-18-15,True,blueberry,5,169.237.32.34,1095657,5.25285,0.749683,5.25285,1685528295,0.641249,5,5b49e_00005,1.1008
train_model2_5b49e_00006,1.88462,2023-05-31_03-18-20,True,blueberry,5,169.237.32.34,1097082,4.12924,0.569051,4.12924,1685528300,0.680117,5,5b49e_00006,1.2045
train_model2_5b49e_00007,0.202201,2023-05-31_03-18-31,True,blueberry,10,169.237.32.34,1098497,8.48911,0.732832,8.48911,1685528311,0.157859,10,5b49e_00007,0.0443414
train_model2_5b49e_00008,7.48388,2023-05-31_03-18-36,True,blueberry,5,169.237.32.34,1101251,3.77377,0.480526,3.77377,1685528316,1.95464,5,5b49e_00008,5.52924
train_model2_5b49e_00009,1.78195,2023-05-31_03-18-42,True,blueberry,5,169.237.32.34,1102652,3.78888,0.481225,3.78888,1685528322,0.641359,5,5b49e_00009,1.14059


== Status ==
Current time: 2023-05-31 03:17:38 (running for 00:00:05.18)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 10.000: None | Iter 5.000: -1.1147647246718406
Logical resource usage: 4.0/64 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-05-31_03-17-32
Number of trials: 50/50 (49 PENDING, 1 RUNNING)
+--------------------------+----------+-----------------------+--------------+-------------+----------------------+------------+--------------+------------+-----------------+----------------------+
| Trial name               | status   | loc                   |   batch_size |          lr | model                |   momentum |   train_loss |   val_loss |   combined_loss |   training_iteration |
|--------------------------+----------+-----------------------+--------------+-------------+----------------------+------------+--------------+------------+-----------------+----------------------|
| train_model2_5b49e_00000 | RUNNIN

2023-05-31 03:23:39,996	INFO tune.py:945 -- Total run time: 367.01 seconds (366.98 seconds for the tuning loop).


== Status ==
Current time: 2023-05-31 03:23:39 (running for 00:06:06.98)
Using AsyncHyperBand: num_stopped=50
Bracket: Iter 10.000: -1.4036726597679026 | Iter 5.000: -1.7434291086530422
Logical resource usage: 0/64 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/rraiyan/ray_results/train_model2_2023-05-31_03-17-32
Number of trials: 50/50 (50 TERMINATED)
+--------------------------+------------+-----------------------+--------------+-------------+----------------------+------------+--------------+------------+-----------------+----------------------+
| Trial name               | status     | loc                   |   batch_size |          lr | model                |   momentum |   train_loss |   val_loss |   combined_loss |   training_iteration |
|--------------------------+------------+-----------------------+--------------+-------------+----------------------+------------+--------------+------------+-----------------+----------------------|
| train_model2_5b49e_0000

<!-- Best trial config: {'lr': 0.0010630834634709364, 'b1': 0.4282116859842134, 'b2': 0.3089991262211405, 'batch_size': 8, 'model': [20, 16, 8, 4, 2, 1]}
Best trial final validation loss: 0.09234625198878348
Best trial final train loss: 0.22368373312056064 -->

In [6]:
# Train a single model with hand-picked params (currently disabled; uncomment to run).
# np.random.seed(12)  # Set seed for consistency
# params = {
#     'batch_size': 16, 'shuffle': True, 'num_workers': 2
# }
# train, val = generate_data_loaders(data, params, x_columns, y_columns, 0.8)
# model = create_perceptron_model([20, 10, 1])
# criterion = nn.MSELoss()
# # optimizer = Adam(model.parameters(), lr=0.1)
# optimizer = SGD(model.parameters(), lr=0.0007, momentum=0.9)
# train_loss, validation_loss = train_model(model, optimizer, criterion, train, val, epochs=50)
# plt.figure()
# plt.plot(train_loss, label='Training Loss', marker='x')
# plt.plot(validation_loss, label='Validation Loss', marker='x')
# plt.legend()