In [None]:
# install necessary packages
!python -m pip install -q dlomix==0.0.4
!python -m pip install -q wandb

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.7 requires jax>=0.4.6, but you have jax 0.3.25 which is incompatible.
flax 0.6.11 requires jax>=0.4.2, but you have jax 0.3.25 which is incompatible.
orbax-checkpoint 0.2.7 requires jax>=0.4.9, but you have jax 0.3.25 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.7/214.7 kB[0m [

In [None]:
# import necessary packages
import numpy as np
import pandas as pd
import tensorflow as tf
import re

import wandb
from wandb.keras import WandbCallback
from wandb.keras import WandbMetricsLogger
import wandb.apis.reports as wr

import dlomix
from dlomix import constants, data, eval, layers, models, pipelines, reports, utils
from dlomix.data import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor
from dlomix.eval import TimeDeltaMetric




[34m[1mwandb[0m: Thanks for trying out the Report API!
[34m[1mwandb[0m: For a tutorial, check out https://colab.research.google.com/drive/1CzyJx1nuOS4pdkXa2XPaRQyZdmFmLmXV
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Try out tab completion to see what's available.
[34m[1mwandb[0m:   ∟ everything:    `wr.<tab>`
[34m[1mwandb[0m:       ∟ panels:    `wr.panels.<tab>`
[34m[1mwandb[0m:       ∟ blocks:    `wr.blocks.<tab>`
[34m[1mwandb[0m:       ∟ helpers:   `wr.helpers.<tab>`
[34m[1mwandb[0m:       ∟ templates: `wr.templates.<tab>`
[34m[1mwandb[0m:       
[34m[1mwandb[0m: For bugs/feature requests, please create an issue on github: https://github.com/wandb/wandb/issues


In [None]:
# load small train dataset
TRAIN_DATAPATH = 'https://raw.githubusercontent.com/goldjunge3010/masterpraktikum/main/third_pool_tresh_1_0_train.csv'
#TRAIN_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_train_val.csv'
BATCH_SIZE = 64

# create dataset
rtdata = RetentionTimeDataset(data_source=TRAIN_DATAPATH,
                              seq_length=30, batch_size=BATCH_SIZE, val_ratio=0.2, test=False, sequence_col= "sequence", target_col= "irt")

print(f"Batch size: {rtdata.batch_size}")
print(f"Number training samples : {len(rtdata.train_data) * rtdata.batch_size}")
print(f"Number validation samples : {len(rtdata.val_data) * rtdata.batch_size}")

Batch size: 64
Number training samples : 1088
Number validation samples : 320


In [None]:
# Initialize WANDB
PROJECT = 'retention_time_report'
RUN = "run_7"
wandb.init(project = PROJECT, name = RUN)

# create report
report = wr.Report(project = PROJECT,
                   title = "Retention time report_data",
                   description = "A first try creating a WANDB report using DLOmix")
report.save()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# function to count sequence length
def count_seq_length(df: pd.DataFrame, seq_col: str) -> pd.Series:
    pattern = re.compile(r"\[UNIMOD:.*\]", re.IGNORECASE)
    df[seq_col].replace(pattern, "", inplace= True)
    return df[seq_col].str.len()


In [None]:
counts = count_seq_length(pd.read_csv(TRAIN_DATAPATH), "sequence")

In [None]:
c = counts.to_list()
data = [[s] for s in c]
table = wandb.Table(data=data, columns=["scores"])
wandb.log({'my_histogram': wandb.plot.histogram(table, "scores",
 	  title="Sequence length distribution")})
wandb.finish()


VBox(children=(Label(value='0.012 MB of 0.022 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.528234…

In [None]:
# WORKS!
c = counts.to_list()
data = [[s] for s in c]
table = wandb.Table(data=data, columns=["scores"])
hist = wandb.plot_table(
    vega_spec_name="wandb/simple_hist",
    data_table = table,
    fields = "scores")
wandb.log({'my_histogram': hist})

pg_val = wr.PanelGrid(
    runsets=[
        wr.Runset(ENTITY, PROJECT),
    ],
    panels=[
        wr.CustomChart().from_table(
          table,
        chart_fields = {"fields": "scores"}
)
    ]
)


wandb.finish()


VBox(children=(Label(value='0.022 MB of 0.031 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.701718…

In [None]:
c = counts.to_list()
data = [[s] for s in c]
custom_histogram = wandb.plot_table(
    vega_spec_name="wandb/simple_hist",
    data_table = table,
    fields = "scores")
wandb.log({"custom_id" : custom_histogram})


AssertionError: ignored

VBox(children=(Label(value='12.841 MB of 12.841 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

In [None]:
c = counts.to_list()
c

[10,
 11,
 11,
 9,
 11,
 6,
 11,
 9,
 10,
 9,
 11,
 9,
 10,
 11,
 11,
 11,
 11,
 11,
 8,
 8,
 11,
 9,
 11,
 11,
 10,
 11,
 8,
 11,
 11,
 10,
 4,
 9,
 10,
 10,
 10,
 10,
 11,
 11,
 10,
 8,
 11,
 8,
 2,
 10,
 4,
 9,
 11,
 10,
 8,
 10,
 9,
 10,
 8,
 9,
 11,
 6,
 11,
 9,
 11,
 11,
 10,
 10,
 9,
 10,
 9,
 9,
 9,
 11,
 10,
 11,
 8,
 8,
 4,
 9,
 10,
 11,
 9,
 8,
 10,
 10,
 8,
 10,
 11,
 8,
 8,
 6,
 11,
 11,
 9,
 10,
 11,
 8,
 10,
 11,
 10,
 8,
 9,
 11,
 11,
 9,
 11,
 9,
 8,
 10,
 8,
 8,
 10,
 9,
 11,
 8,
 8,
 8,
 8,
 8,
 6,
 10,
 10,
 9,
 10,
 10,
 4,
 8,
 9,
 8,
 9,
 10,
 10,
 11,
 9,
 4,
 10,
 10,
 10,
 9,
 11,
 8,
 9,
 8,
 11,
 10,
 9,
 8,
 8,
 8,
 9,
 9,
 9,
 11,
 8,
 9,
 11,
 10,
 8,
 8,
 9,
 6,
 9,
 11,
 8,
 9,
 11,
 10,
 9,
 10,
 11,
 8,
 8,
 8,
 9,
 8,
 10,
 9,
 9,
 8,
 10,
 11,
 8,
 9,
 11,
 7,
 8,
 4,
 10,
 10,
 10,
 8,
 9,
 9,
 8,
 8,
 11,
 10,
 8,
 10,
 11,
 9,
 8,
 8,
 8,
 10,
 11,
 9,
 10,
 10,
 9,
 8,
 8,
 8,
 8,
 7,
 10,
 11,
 8,
 11,
 2,
 9,
 8,
 9,
 9,
 10,
 9,
 9,
 8,
 11,


array([[-0.05196425, -0.11119605,  1.0417968 , ...,  2.60281328,
        -1.98327783, -0.92686366],
       [ 0.94589607, -0.75058415, -0.79426004, ...,  0.41139831,
         0.88277695,  0.45908868],
       [ 2.56976704, -0.67412361, -0.35723887, ...,  0.07543409,
         0.54942243,  0.02074964],
       ...,
       [ 0.10565442, -1.04317779,  0.33991146, ...,  0.72611056,
        -0.00575399,  0.20698523],
       [-0.42645527, -1.66107771,  0.46212168, ..., -0.48419356,
         1.48999011,  0.46971444],
       [-0.47319654,  0.46891723,  0.07037226, ...,  0.22503128,
        -1.02666071, -0.38592419]])

In [None]:
# create Prosit retention time predictor
model = PrositRetentionTimePredictor(seq_length=30)

# create the optimizer object
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# compile the model  with the optimizer and the metrics we want to use, we can add our custom timedelta metric
model.compile(optimizer=optimizer,
              loss='mse',
              metrics=['mean_absolute_error', TimeDeltaMetric()])

In [1]:
# config the model
config = wandb.config
config.seq_length = 30
config.batch_size = BATCH_SIZE
config.val_ratio = 0.2
config.lr = 0.0001
config.optimizer = "adam"

NameError: ignored

In [None]:
# train the model
history = model.fit(rtdata.train_data,
                    validation_data=rtdata.val_data,
                    epochs=2, callbacks=[WandbMetricsLogger(log_freq = "batch")] )
wandb.finish()

Epoch 1/2
Epoch 2/2


0,1
batch/batch_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
batch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/loss,▁▂▂▂▄▄▅▄▄▅▅▆▆▆▇██▁▂▂▂▃▄▅▄▄▅▅▅▆▆▇▇█
batch/mean_absolute_error,▁▂▂▃▄▅▅▅▅▆▆▆▆▇▇██▁▂▂▃▄▅▅▅▅▅▆▆▆▇▇██
batch/timedelta,▁▁▁▃▄▄▅▅▅▆▇▇▇▆▇██▁▁▁▃▄▄▅▅▅▆▇▇▇▆▇▇█
epoch/epoch,▁█
epoch/learning_rate,▁▁
epoch/loss,█▁
epoch/mean_absolute_error,█▁
epoch/timedelta,█▁

0,1
batch/batch_step,33.0
batch/learning_rate,0.0001
batch/loss,3399.56177
batch/mean_absolute_error,47.85365
batch/timedelta,57.26155
epoch/epoch,1.0
epoch/learning_rate,0.0001
epoch/loss,3399.56177
epoch/mean_absolute_error,47.85365
epoch/timedelta,57.26155


In [None]:
ENTITY = wandb.apis.PublicApi().default_entity



pg_train = wr.PanelGrid(
    runsets=[
        wr.Runset(ENTITY, PROJECT),
    ],
    panels=[
        wr.LinePlot(x='Step', y=['batch/loss']),
        wr.LinePlot(x='Step', y=['batch/timedelta']),
        wr.LinePlot(x='Step', y=['batch/mean_absolute_error'])
    ]
)

pg_val = wr.PanelGrid(
    runsets=[
        wr.Runset(ENTITY, PROJECT),
    ],
    panels=[
        wr.LinePlot(x='Step', y=['epoch/val_loss']),
        wr.LinePlot(x='Step', y=['epoch/val_timedelta']),
        wr.LinePlot(x='Step', y=['epoch/val_mean_absolute_error'])
    ]
)

report.blocks = report.blocks[:1] +[wr.H1("Training metrics"), pg_train] + [wr.H1("Validation metrics"), pg_val] + report.blocks[1:]

report.save()


In [None]:
# Histogramm over peptide lengths

# Histogramm over retention times