<a href="https://colab.research.google.com/github/goldjunge3010/masterpraktikum/blob/main/Example_Comparing_two_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install the DLOmix package in the current environment using pip

!python -m pip install -q dlomix==0.0.4

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone


In [2]:
!python -m pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.6/215.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [3]:
# import necessary packages
import numpy as np
import pandas as pd
import tensorflow as tf
import re

import wandb
from wandb.keras import WandbCallback
from wandb.keras import WandbMetricsLogger
import wandb.apis.reports as wr

import dlomix
from dlomix import constants, data, eval, layers, models, pipelines, reports, utils
from dlomix.data import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor
from dlomix.models import RetentionTimePredictor
from dlomix.eval import TimeDeltaMetric




[34m[1mwandb[0m: Thanks for trying out the Report API!
[34m[1mwandb[0m: For a tutorial, check out https://colab.research.google.com/drive/1CzyJx1nuOS4pdkXa2XPaRQyZdmFmLmXV
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Try out tab completion to see what's available.
[34m[1mwandb[0m:   ∟ everything:    `wr.<tab>`
[34m[1mwandb[0m:       ∟ panels:    `wr.panels.<tab>`
[34m[1mwandb[0m:       ∟ blocks:    `wr.blocks.<tab>`
[34m[1mwandb[0m:       ∟ helpers:   `wr.helpers.<tab>`
[34m[1mwandb[0m:       ∟ templates: `wr.templates.<tab>`
[34m[1mwandb[0m:       
[34m[1mwandb[0m: For bugs/feature requests, please create an issue on github: https://github.com/wandb/wandb/issues


In [13]:
# Hyperparameters shared by the dataset, the model and every W&B run config.
config = {
  "seq_length" : 30,
  "batch_size" : 64,
  "val_ratio" : 0.2,
  "lr" : 0.001,
  "optimizer" : "Adam",
  "loss" : "mse",
  "seed" : 42
}

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(config["seed"])

# Initialize WANDB
PROJECT = 'compare_two_models'
RUN = "run_12"
wandb.init(project = PROJECT, name = RUN, config = config)

In [14]:
# Location of the small train/validation dataset.
# Alternative source kept for reference:
#   'https://raw.githubusercontent.com/goldjunge3010/masterpraktikum/main/third_pool_tresh_1_0_train.csv'
TRAIN_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_train_val.csv'

# Gather the dataset options first, then build the dataset from them.
train_dataset_options = dict(
    data_source = TRAIN_DATAPATH,
    seq_length = config["seq_length"],
    batch_size = config["batch_size"],
    val_ratio = config["val_ratio"],
    test = False,
    sequence_col = "sequence",
    target_col = "irt",
)
rtdata = RetentionTimeDataset(**train_dataset_options)

# The sample counts below are (number of batches) x (batch size).
n_train_batches = len(rtdata.train_data)
n_val_batches = len(rtdata.val_data)

print(f"Batch size: {rtdata.batch_size}")
print(f"Number training samples : {n_train_batches * rtdata.batch_size}")
print(f"Number validation samples : {n_val_batches * rtdata.batch_size}")

Batch size: 64
Number training samples : 27136
Number validation samples : 6784


In [15]:
# create the retention time predictor
# (this is the basic RetentionTimePredictor, not PrositRetentionTimePredictor)
model = RetentionTimePredictor(seq_length = config["seq_length"])

# create the optimizer object
optimizer = tf.keras.optimizers.Adam(learning_rate = config["lr"])

# compile the model with the optimizer and the metrics we want to use, we can add our custom timedelta metric
model.compile(optimizer = optimizer,
              loss = config["loss"],
              metrics=['mean_absolute_error', TimeDeltaMetric()])

# number of passes over the training data
EPOCHS = 15

# train the model; the W&B run is closed in `finally` so that a failed or
# interrupted training does not leave the run open
try:
  history = model.fit(rtdata.train_data,
                      validation_data=rtdata.val_data,
                      epochs=EPOCHS, callbacks=[WandbMetricsLogger(log_freq = "batch")] )
finally:
  wandb.finish()

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch/batch_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/loss,█▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/mean_absolute_error,█▅▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/timedelta,█▅▄▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁
epoch/mean_absolute_error,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
epoch/timedelta,█▃▃▂▂▂▁▁▁▁▁▁▁▁▁

0,1
batch/batch_step,6359.0
batch/learning_rate,0.001
batch/loss,55.946
batch/mean_absolute_error,5.02792
batch/timedelta,5.22672
epoch/epoch,14.0
epoch/learning_rate,0.001
epoch/loss,55.946
epoch/mean_absolute_error,5.02792
epoch/timedelta,5.22672


In [16]:
# Persist the trained weights; the comparison step reloads them from this path.
save_path = "./output/rtmodel_2"
model.save_weights(filepath = save_path)

In [27]:
class Report:
  """Evaluate several models on one test set and publish a W&B report about them.

  Typical use: call `compare_models()` to log one W&B run per model, then
  `create_report()` to build and save a report whose panels read those runs.

  Attributes:
    entity: default W&B entity of the logged-in user.
    project: W&B project that receives the runs and the report.
    title: title of the report.
    description: description of the report.
    models: dict mapping a model name to a model object exposing `predict`.
    test_set: dataset providing `test_data`, `sequences`, `main_split` and
      `get_split_targets`.
    api: W&B public API client.
  """

  # Placeholder body text shown under every section heading of the report.
  _PLACEHOLDER_TEXT = "Lorem ipsum dolor sit amet. Aut laborum perspiciatis sit odit omnis aut aliquam voluptatibus ut rerum molestiae sed assumenda nulla ut minus illo sit sunt explicabo? Sed quia architecto est voluptatem magni sit molestiae dolores. Non animi repellendus ea enim internos et iste itaque quo labore mollitia aut omnis totam."

  def __init__(self, models: dict, test_set: "dlomix.data.RetentionTimeDataset", project: str, title: str, description: str):
    """Store the report settings and look up the default W&B entity.

    Args:
      models: dict mapping a model name to a model object.
      test_set: test dataset the models are evaluated on.
      project: W&B project name.
      title: report title.
      description: report description.
    """
    # one API client serves both the entity lookup and the `api` attribute
    self.api = wandb.Api()
    self.entity = self.api.default_entity
    self.project = project
    self.title = title
    self.description = description
    self.models = models
    self.test_set = test_set

  def create_report(self, add_residuals_section = True, add_r2_section = True, add_density_section = True, irt_delta95 = 5):
    """Assemble the selected sections into a W&B report and save it.

    Args:
      add_residuals_section: include the residual histograms.
      add_r2_section: include the R2 bar plot.
      add_density_section: include the density plots.
      irt_delta95: value forwarded to `density_section`.

    Returns:
      The saved `wr.Report` object.
    """
    report = wr.Report(
        project = self.project,
        title = self.title,
        description = self.description
    )

    blocks = [wr.TableOfContents()]
    if add_residuals_section:
      blocks += self.residuals_section()
    if add_r2_section:
      blocks += self.r2_section()
    if add_density_section:
      blocks += self.density_section(irt_delta95 = irt_delta95)

    report.blocks = blocks
    report.save()
    return report

  def calculate_r2(self, targets, predictions):
    """Return the R2 score of `predictions` with respect to `targets`."""
    # imported locally, as in the original notebook, so that scikit-learn is
    # only needed when R2 is actually computed
    from sklearn.metrics import r2_score
    return r2_score(targets, predictions)

  def calculate_residuals(self, targets, predictions):
    """Return `targets - predictions` element-wise as a NumPy array.

    Both inputs are converted with `np.asarray`, so plain Python lists are
    accepted as well as arrays.
    """
    return np.asarray(targets) - np.asarray(predictions)

  def _section(self, heading, panels):
    """Return the blocks of one report section: heading, text, panel grid, rule."""
    return [
        wr.H1(text = heading),
        wr.P(self._PLACEHOLDER_TEXT),
        wr.PanelGrid(
          runsets=[
            wr.Runset(self.entity, self.project),
          ],
          panels = panels
        ),
        wr.HorizontalRule(),
    ]

  def residuals_section(self):
    """Return the "Residuals" section with one custom histogram chart per model.

    Each chart reads the summary table `results_table_<model name>`.
    """
    panel_list_models = [
        wr.CustomChart(
          query = {'summaryTable': {"tableKey" : f"results_table_{model}"}},
          chart_name='master_praktikum/hist_residuals',
          chart_fields={'value': "residuals", "name": model}
        )
        for model in self.models
    ]
    return self._section("Residuals", panel_list_models)

  def r2_section(self):
    """Return the "R2" section: a horizontal bar plot of the logged `r2` metric."""
    r2_panels = [
        wr.BarPlot(
            title="R2",
            metrics=["r2"],
            orientation='h',
            title_x="R2",
            # title_y="y axis title",
            max_runs_to_show=20,
            max_bars_to_show=3,
            font_size="auto",
        ),
    ]
    return self._section("R2", r2_panels)

  def density_section(self, irt_delta95 = 5 ):
    """Return the "Density" section with one custom density chart per model.

    Args:
      irt_delta95: value passed to every chart as the `irt_delta95` chart field.
    """
    panel_list_models = [
        wr.CustomChart(
          query = {'summaryTable': {"tableKey" : f"results_table_{model}"}},
          chart_name='master_praktikum/density_plot',
          chart_fields={'measured': "irt", "predicted": "predicted_irt", "name": model, "irt_delta95": irt_delta95}
        )
        for model in self.models
    ]
    return self._section("Density", panel_list_models)

  def compare_models(self, run_config = None):
    """Evaluate every model on the test set and log the results to W&B.

    One W&B run named after the model key is created per model. Each run logs
    a table `results_table_<name>` (columns sequence, irt, predicted_irt,
    residuals) and the scalar `r2`.

    Args:
      run_config: config dict attached to every run. When omitted, the
        notebook-level `config` dict is used, as before.
    """
    if run_config is None:
      run_config = config

    # the targets do not depend on the model, so fetch them once
    targets = self.test_set.get_split_targets(split = self.test_set.main_split)

    for name, model in self.models.items():
      # the context manager finishes the run even if prediction or logging fails
      with wandb.init(project = self.project, name = name, config = run_config) as run:
        # predict on test set
        predictions = model.predict(self.test_set.test_data).ravel()

        # create result df
        results_df = pd.DataFrame({"sequence": self.test_set.sequences,
                                   "irt": targets,
                                   "predicted_irt": predictions,
                                   "residuals": self.calculate_residuals(targets, predictions)})

        # log df as table to wandb
        run.log({f"results_table_{name}": wandb.Table(dataframe = results_df)})

        # log r2 to wandb
        run.log({"r2": self.calculate_r2(targets, predictions)})

# TODO
# Density plot: see DLOmix

In [22]:
def load_predictor(weights_path, seq_length):
  """Create a RetentionTimePredictor and load the weights stored at `weights_path`."""
  predictor = RetentionTimePredictor(seq_length = seq_length)
  predictor.load_weights(weights_path)
  return predictor

# create predictors from saved weights, using the same sequence length as in training
# NOTE: "./output/rtmodel_1" is not written anywhere in this notebook (only
# "./output/rtmodel_2" is saved); it has to exist from an earlier training run.
pre_trained_model_1 = load_predictor("./output/rtmodel_1", config["seq_length"])
pre_trained_model_2 = load_predictor("./output/rtmodel_2", config["seq_length"])

models = {"model_1":pre_trained_model_1, "model_2":pre_trained_model_2}

# create testset
TEST_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix-resources/main/example_datasets/RetentionTime/proteomeTools_test.csv'
test_set = RetentionTimeDataset(data_source=TEST_DATAPATH,
                              seq_length = config["seq_length"],
                              batch_size = 32,
                              test = True,
                              sequence_col = "sequence",
                              target_col = "irt")




In [27]:
# Read the CSV and report the range of its "indexed_retention_time" column.
THIRD_POOL_TEST_URL = 'https://raw.githubusercontent.com/goldjunge3010/masterpraktikum/main/third_pool_tresh_1_0_test.csv'
df = pd.read_csv(THIRD_POOL_TEST_URL)

irt_column = df["indexed_retention_time"]
min_target, max_target = irt_column.min(), irt_column.max()

print(f"min: {min_target}")
print(f"max: {max_target}")

min: -9.54625219353221
max: 88.7671581087951


In [28]:
# Set up the comparison of the loaded models on the test set.
report = Report(
    models = models,
    test_set = test_set,
    project = PROJECT,
    title = "Comparison of models",
    description = "A quick comparison of some models",
)

# Log one W&B run per model first; the report is built afterwards.
report.compare_models()

# Only these two flags are passed explicitly; everything else keeps its default.
section_flags = {"add_residuals_section": True, "add_r2_section": True}
report.create_report(**section_flags)



VBox(children=(Label(value='0.882 MB of 0.882 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
r2,▁

0,1
r2,0.95032






VBox(children=(Label(value='0.884 MB of 0.884 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
r2,▁

0,1
r2,0.96572


In [None]:
1  # NOTE(review): leftover scratch cell; it only evaluates the literal 1 and looks safe to delete