In [None]:
# install necessary packages
!python -m pip install -q dlomix==0.0.4
!python -m pip install -q wandb

In [None]:
# import necessary packages
import numpy as np
import pandas as pd
import tensorflow as tf
import re

import wandb
from wandb.keras import WandbCallback
from wandb.keras import WandbMetricsLogger
import wandb.apis.reports as wr

import dlomix
from dlomix import constants, data, eval, layers, models, pipelines, reports, utils
from dlomix.data import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor
from dlomix.eval import TimeDeltaMetric




[34m[1mwandb[0m: Thanks for trying out the Report API!
[34m[1mwandb[0m: For a tutorial, check out https://colab.research.google.com/drive/1CzyJx1nuOS4pdkXa2XPaRQyZdmFmLmXV
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Try out tab completion to see what's available.
[34m[1mwandb[0m:   ∟ everything:    `wr.<tab>`
[34m[1mwandb[0m:       ∟ panels:    `wr.panels.<tab>`
[34m[1mwandb[0m:       ∟ blocks:    `wr.blocks.<tab>`
[34m[1mwandb[0m:       ∟ helpers:   `wr.helpers.<tab>`
[34m[1mwandb[0m:       ∟ templates: `wr.templates.<tab>`
[34m[1mwandb[0m:       
[34m[1mwandb[0m: For bugs/feature requests, please create an issue on github: https://github.com/wandb/wandb/issues


In [None]:
# Hyperparameters of this experiment; the same dict is passed to wandb.init
# below so it is recorded together with the run.
config = dict(
    seq_length=64,
    batch_size=32,
    val_ratio=0.2,
    lr=0.001,
    optimizer="Adam",
    loss="mse",
)

# Weights & Biases project and run name
PROJECT = "rt_report_2"
RUN = "run_3"

# Start the W&B run
wandb.init(config=config, name=RUN, project=PROJECT)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667077395001494, max=1.0)…

In [None]:
# CSV with the small training set
TRAIN_DATAPATH = 'https://raw.githubusercontent.com/goldjunge3010/masterpraktikum/main/third_pool_tresh_1_0_train.csv'
# alternative source, currently disabled:
#TRAIN_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_train_val.csv'

# Build the retention time dataset from the CSV; sequence length, batch size
# and validation ratio all come from `config`.
rtdata = RetentionTimeDataset(
    data_source=TRAIN_DATAPATH,
    sequence_col="modified_sequence",
    target_col="indexed_retention_time",
    seq_length=config["seq_length"],
    batch_size=config["batch_size"],
    val_ratio=config["val_ratio"],
    test=False,
)

# NOTE(review): the counts below are len(split) * batch_size. Presumably
# len(split) is the number of batches, so the figure is an upper bound when
# the last batch is not full - TODO confirm.
print(f"Batch size: {rtdata.batch_size}")
print(f"Number training samples : {len(rtdata.train_data) * rtdata.batch_size}")
print(f"Number validation samples : {len(rtdata.val_data) * rtdata.batch_size}")

Batch size: 32
Number training samples : 1440
Number validation samples : 384


In [None]:
# function to count sequence length
def count_seq_length(df: pd.DataFrame, seq_col: str) -> pd.Series:
    """Return the length of every sequence in ``df[seq_col]`` without UNIMOD tags.

    Each ``[UNIMOD:...]`` annotation (matched case-insensitively) is removed
    before counting, so only the remaining characters contribute to the length.

    Args:
        df: Frame holding the sequences.
        seq_col: Name of the column in ``df`` that contains the sequence strings.

    Returns:
        A Series aligned with ``df`` holding the tag-free length of each
        sequence. ``df`` itself is left untouched.
    """
    # ``[^\]]*`` stops at the first closing bracket, so every tag is removed on
    # its own. A greedy ``.*`` would also swallow the residues located between
    # the first and the last tag of a sequence and undercount its length.
    pattern = re.compile(r"\[UNIMOD:[^\]]*\]", re.IGNORECASE)
    # Work on a derived Series instead of replacing in place on df[seq_col]:
    # the caller's data stays intact and the result does not depend on whether
    # pandas hands out a view or a copy of the column.
    stripped = df[seq_col].str.replace(pattern, "", regex=True)
    return stripped.str.len()


In [None]:
# Prosit retention time predictor for sequences of the configured length
model = PrositRetentionTimePredictor(seq_length=config["seq_length"])

# Adam optimizer with the configured learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=config["lr"])

# Compile with the configured loss; mean absolute error and the custom
# timedelta metric are tracked in addition to the loss.
model.compile(
    optimizer=optimizer,
    loss=config["loss"],
    metrics=['mean_absolute_error', TimeDeltaMetric()],
)

# Train for 6 epochs with validation; the W&B callback is configured with
# log_freq="batch".
history = model.fit(
    rtdata.train_data,
    validation_data=rtdata.val_data,
    epochs=6,
    callbacks=[WandbMetricsLogger(log_freq="batch")],
)

# Close the W&B run
wandb.finish()

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.095998…

0,1
batch/batch_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/loss,▁▁▁▁▂▂▂▇▄▃▂▂▃██▄▃▃▃▃▇▆▃▃▃▃▃█▅▃▃▃▃▃▇▄▃▃▃▃
batch/mean_absolute_error,▁▁▁▁▁▂▂▇▄▃▂▂▃██▄▃▃▃▃▇▆▃▃▃▃▃█▄▃▃▃▃▃▇▄▃▃▃▃
batch/timedelta,▃▃▁▁▁▂▂▅▄▃▃▃▃▅█▄▂▂▃▃▂▄▂▂▂▃▃▆▃▂▂▂▃▃▆▂▂▂▂▃
epoch/epoch,▁▂▄▅▇█
epoch/learning_rate,▁▁▁▁▁▁
epoch/loss,▁▅▇███
epoch/mean_absolute_error,▁▆▇███
epoch/timedelta,▁█▆▅▅▄

0,1
batch/batch_step,269.0
batch/learning_rate,0.001
batch/loss,1490.35388
batch/mean_absolute_error,32.02077
batch/timedelta,30.51258
epoch/epoch,5.0
epoch/learning_rate,0.001
epoch/loss,1490.35388
epoch/mean_absolute_error,32.02077
epoch/timedelta,30.51258


In [None]:
class Report():
  """Assemble a Weights & Biases report for one project.

  The report consists of a table of contents followed by optional sections
  (config, data, training metrics, validation metrics). The metric panels are
  derived from the column names of a run's logged history.
  """

  def __init__(self, project:str, title: str, description: str):
    """Store the report settings and create the W&B API client.

    Args:
      project: Name of the W&B project the report belongs to.
      title: Title of the report.
      description: Description shown with the report.
    """
    self.entity = wandb.apis.PublicApi().default_entity
    self.project = project
    self.title = title
    self.description = description
    self.api = wandb.Api()

  def create_report(self, add_config_section = True, add_data_section = True, add_train_section = True, add_val_section = True):
    """Build the report from the selected sections and save it.

    Args:
      add_config_section: Include the "Config" section.
      add_data_section: Include the "Data" section.
      add_train_section: Include the "Training metrics" section.
      add_val_section: Include the "Validation metrics" section.

    Returns:
      The saved ``wr.Report`` object, so callers can inspect or extend it.
    """
    report = wr.Report(
        project = self.project,
        title = self.title,
        description = self.description
    )

    # Look the metric names up once and share them between the train and the
    # validation section; both sections then describe the same run and the
    # run history is requested only a single time.
    metrics_names = None
    if add_train_section or add_val_section:
      metrics_names = self.get_metrics_names()

    report.blocks = [
        wr.TableOfContents()
    ]
    if add_config_section:
      report.blocks += self.config_section()
    if add_data_section:
      report.blocks += self.data_section()
    if add_train_section:
      report.blocks += self.train_section(metrics_names)
    if add_val_section:
      report.blocks += self.val_section(metrics_names)

    report.save()
    return report

  def get_metrics(self, run_id = None):
    """Return the logged history of one run of the project.

    Args:
      run_id: Id of the run to read. If omitted (or falsy), the most recently
        created run of the project is used.

    Returns:
      Whatever ``run.history()`` returns for the selected run.

    Raises:
      IndexError: If no ``run_id`` is given and the project has no runs.
    """
    project_path = f"{self.entity}/{self.project}"
    if run_id:
      # run is specified by <entity>/<project>/<run_id>
      run = self.api.run(path = f"{project_path}/{run_id}")
    else:
      # Ask for newest-first explicitly so that index 0 is the latest run,
      # independent of the default ordering of the installed wandb client.
      # (Original author's note: api.runs seems to have a delay.)
      runs = self.api.runs(path = project_path, order = "-created_at")
      try:
        run = runs[0]
      except IndexError as exc:
        raise IndexError(f"No runs found in W&B project '{project_path}'") from exc
    return run.history()

  def get_metrics_names(self):
    """Split the metric names of the latest run into three groups.

    Names starting with "_" are ignored. Of the remaining names, those
    containing "val" are validation metrics; the others are training metrics,
    listed as per-batch metrics unless they contain "epoch" and as per-epoch
    metrics unless they contain "batch". All checks are case-insensitive.

    Returns:
      Tuple ``(batch_train_metrics_names, epoch_train_metrics_names,
      epoch_val_metrics_names)`` of lists of metric names.
    """
    batch_train_metrics_names = []
    epoch_train_metrics_names = []
    epoch_val_metrics_names = []
    for name in self.get_metrics():
      if name.startswith("_"):
        continue
      lowered = name.lower()
      if "val" in lowered:
        epoch_val_metrics_names.append(name)
        continue
      # A training metric naming neither "batch" nor "epoch" lands in both lists.
      if "epoch" not in lowered:
        batch_train_metrics_names.append(name)
      if "batch" not in lowered:
        epoch_train_metrics_names.append(name)
    return batch_train_metrics_names, epoch_train_metrics_names, epoch_val_metrics_names

  def _panel_grid(self, panels):
    """Return a panel grid showing ``panels`` for all runs of the project."""
    return wr.PanelGrid(
      runsets=[
        wr.Runset(self.entity, self.project),
      ],
      panels = panels
    )

  @staticmethod
  def _line_plots(names):
    """Return one line plot over 'Step' for every metric name in ``names``."""
    return [wr.LinePlot(x='Step', y=[name]) for name in names]

  def config_section(self):
    """Return the blocks of the "Config" section (a run comparer)."""
    config_block = [
        wr.H1(text = "Config"),
        self._panel_grid([wr.RunComparer(layout = {'w': 24})]),
        wr.HorizontalRule(),
    ]
    return config_block

  def data_section(self):
    """Return the blocks of the "Data" section (batch step and learning rate plots)."""
    data_block = [
        wr.H1(text = "Data"),
        self._panel_grid(self._line_plots(['batch/batch_step', 'batch/learning_rate'])),
        wr.HorizontalRule(),
    ]
    return data_block

  def train_section(self, metrics_names = None):
    """Return the blocks of the "Training metrics" section.

    Args:
      metrics_names: Optional result of ``get_metrics_names()``; it is looked
        up when omitted.
    """
    if metrics_names is None:
      metrics_names = self.get_metrics_names()
    batch_train_metrics_names, epoch_train_metrics_names, _ = metrics_names
    train_block = [
        wr.H1(text = "Training metrics"),
        wr.H2(text = "per batch"),
        self._panel_grid(self._line_plots(batch_train_metrics_names)),
        wr.H2(text = "per epoch"),
        self._panel_grid(self._line_plots(epoch_train_metrics_names)),
        wr.HorizontalRule(),
    ]
    return train_block

  def val_section(self, metrics_names = None):
    """Return the blocks of the "Validation metrics" section.

    Args:
      metrics_names: Optional result of ``get_metrics_names()``; it is looked
        up when omitted.
    """
    if metrics_names is None:
      metrics_names = self.get_metrics_names()
    _, _, epoch_val_metrics_names = metrics_names
    val_block = [
        wr.H1(text = "Validation metrics"),
        wr.H2(text = "per epoch"),
        self._panel_grid(self._line_plots(epoch_val_metrics_names)),
        wr.HorizontalRule(),
    ]
    return val_block

In [None]:
# Build the report helper for the project, then generate and save the report
report = Report(
    project = "rt_report_2",
    title = "Comparison of learning rates",
    description = "A quick comparison of the influence of  learning rates using the ADAM optimizer",
)
report.create_report(
    add_data_section = True,
    add_train_section = True,
    add_val_section = True,
)

<Run master_praktikum/rt_report_2/69bvctj7 (finished)>
<Run master_praktikum/rt_report_2/4wmc0bmu (finished)>
<Run master_praktikum/rt_report_2/69bvctj7 (finished)>
<Run master_praktikum/rt_report_2/4wmc0bmu (finished)>


# **Tryout Zone**

In [None]:
# Keep the names that neither start with "_" nor contain "val".
# NOTE(review): `l` is not assigned in any cell shown in this notebook, so
# this cell depends on hidden kernel state - define it before running.
filtered_list = [
    name
    for name in l
    if not name.startswith("_") and "val" not in name.lower()
]
print(filtered_list)

['batch/timedelta', 'epoch/timedelta', 'batch/mean_absolute_error', 'epoch/epoch', 'batch/batch_step', 'batch/learning_rate', 'epoch/learning_rate', 'epoch/mean_absolute_error', 'batch/loss', 'epoch/loss']


In [None]:
# List the runs of the project through the W&B API.
# The client is created in this cell so that it no longer depends on `api`
# being defined by a cell further down (that dependency raised a NameError).
api = wandb.Api()
entity = wandb.apis.PublicApi().default_entity
project = "rt_report_2"
runs = api.runs(path = f"{entity}/{project}")

NameError: ignored

In [None]:
# Show the column names of `l`.
# NOTE(review): `l` is not assigned in any cell shown in this notebook - it
# comes from hidden kernel state and must be defined before this cell runs.
l.columns

Index(['_runtime', '_timestamp', 'batch/timedelta', '_step', 'epoch/timedelta',
       'epoch/val_mean_absolute_error', 'batch/mean_absolute_error',
       'epoch/epoch', 'epoch/val_loss', 'batch/batch_step',
       'batch/learning_rate', 'epoch/learning_rate', 'epoch/val_timedelta',
       'epoch/mean_absolute_error', 'batch/loss', 'epoch/loss'],
      dtype='object')

In [None]:
# client for the W&B API
api = wandb.Api()

# run is specified by <entity>/<project>/<run_id>
run = api.run("master_praktikum/rt_report_2/rz569mhe")

# fetch the logged history of the run (nothing is written to a file here)
metrics_dataframe = run.history()


AttributeError: ignored

In [None]:
# Show the column names of the run history fetched with run.history()
metrics_dataframe.columns

Index(['_runtime', '_timestamp', 'batch/timedelta', '_step', 'epoch/timedelta',
       'epoch/val_mean_absolute_error', 'batch/mean_absolute_error',
       'epoch/epoch', 'epoch/val_loss', 'batch/batch_step',
       'batch/learning_rate', 'epoch/learning_rate', 'epoch/val_timedelta',
       'epoch/mean_absolute_error', 'batch/loss', 'epoch/loss'],
      dtype='object')

In [None]:
# Print the path of every run in `runs`.
# Note: the loop variable rebinds the global name `run`.
for run in runs:
  print(run.path)

['master_praktikum', 'rt_report_2', 'rz569mhe']
['master_praktikum', 'rt_report_2', '8q12bkse']
['master_praktikum', 'rt_report_2', 'udpmrl5f']


In [None]:
# Default W&B entity, used for the run sets below
ENTITY = wandb.apis.PublicApi().default_entity



# Panel grid with the per-batch training metrics of all runs in PROJECT
pg_train = wr.PanelGrid(
    runsets=[
        wr.Runset(ENTITY, PROJECT),
    ],
    panels=[
        wr.LinePlot(x='Step', y=['batch/loss']),
        wr.LinePlot(x='Step', y=['batch/timedelta']),
        wr.LinePlot(x='Step', y=['batch/mean_absolute_error'])
    ]
)

# Panel grid with the per-epoch validation metrics of all runs in PROJECT
pg_val = wr.PanelGrid(
    runsets=[
        wr.Runset(ENTITY, PROJECT),
    ],
    panels=[
        wr.LinePlot(x='Step', y=['epoch/val_loss']),
        wr.LinePlot(x='Step', y=['epoch/val_timedelta']),
        wr.LinePlot(x='Step', y=['epoch/val_mean_absolute_error'])
    ]
)

# Insert the two metric sections right after the first block of the report.
# NOTE(review): in this notebook `report` is assigned an instance of the
# custom `Report` class, which defines neither `blocks` nor `save()` - that is
# presumably the cause of the AttributeError. This cell needs a `wr.Report`
# object instead.
report.blocks = report.blocks[:1] +[wr.H1("Training metrics"), pg_train] + [wr.H1("Validation metrics"), pg_val] + report.blocks[1:]

report.save()


AttributeError: ignored

In [None]:
# Histogram of peptide lengths

# Histogram of retention times