# Data per task

We create and train models with MLflow. Here we develop some functions that help us load and process the results.

In [1]:
import os
import itertools
from pathlib import Path
from typing import List
import warnings
import json


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import NegativeClassOptimization.config as config
import NegativeClassOptimization.utils as utils
import NegativeClassOptimization.preprocessing as preprocessing
from NegativeClassOptimization import ml
from NegativeClassOptimization import datasets, pipelines, visualisations

  from .autonotebook import tqdm as notebook_tqdm


## Collecting mlflow results

In [2]:
# MLflow experiments to collect; each id corresponds to one family of tasks
# (see `_task_label` below for the mapping).
experiment_ids = ["11", "13", "14"]
df = utils.MLFlowTaskAPI.mlflow_results_as_dataframe(
    experiment_ids,
    run_name="dev-v0.1.2-3-with-replicates",
)


def _task_label(exp: str, ag_neg: str) -> str:
    """Return the task name for one run.

    Args:
        exp: MLflow experiment id of the run ("11", "13" or "14").
        ag_neg: Negative-class identifier of the run. Only inspected for
            experiment "14", where the token after the first underscore
            ("looser" or "95low") selects the task.

    Returns:
        One of "1v1", "1v9", "high_vs_looser", "high_vs_95low".

    Raises:
        ValueError: If the experiment id is unknown, or if an experiment "14"
            run has a negative class that is neither "*_looser" nor "*_95low".
    """
    if exp == "11":
        return "1v1"
    if exp == "13":
        return "1v9"
    if exp == "14":
        parts = ag_neg.split("_")
        suffix = parts[1] if len(parts) > 1 else None
        if suffix == "looser":
            return "high_vs_looser"
        if suffix == "95low":
            return "high_vs_95low"
        # Fail loudly here: silently skipping the row would leave `tasks`
        # shorter than `df` and only surface as a length mismatch later.
        raise ValueError(
            f"Negative class {ag_neg} not recognized for experiment {exp}."
        )
    raise ValueError(f"Experiment {exp} not recognized.")


# One label per row, in row order, so the list lines up with the frame.
tasks = [
    _task_label(exp, ag_neg)
    for exp, ag_neg in zip(df["experiment"], df["ag_neg"])
]
df["task"] = tasks

KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.
KeyError in `mlflow_record_data`.


In [3]:
# Quick sanity check of the collected results: column names and
# (rows, columns) as plain text, then the first rows as the cell output.
print(df.columns, df.shape, sep="\n")
df.head()

Index(['sample_train', 'load_from_miniabsolut_split_seed', 'optimizer_type',
       'ag_neg', 'batch_size', 'epochs', 'num_hidden_units', 'N_closed',
       'N_train', 'split_id', 'ag_pos', 'swa', 'shuffle_antigen_labels',
       'input_dim', 'learning_rate', 'weight_decay', 'momentum', 'seed_id',
       'load_from_miniabsolut', 'acc_closed', 'train_loss', 'roc_auc_closed',
       'f1_closed', 'avg_precision_closed', 'precision_closed', 'test_loss',
       'recall_closed', 'mlflow.source.git.commit', 'mlflow.note.content',
       'mlflow.source.type', 'mlflow.runName', 'mlflow.source.name',
       'mlflow.log-model.history', 'mlflow.user', 'experiment', 'run_id',
       'task'],
      dtype='object')
(1200, 37)


Unnamed: 0,sample_train,load_from_miniabsolut_split_seed,optimizer_type,ag_neg,batch_size,epochs,num_hidden_units,N_closed,N_train,split_id,...,mlflow.source.git.commit,mlflow.note.content,mlflow.source.type,mlflow.runName,mlflow.source.name,mlflow.log-model.history,mlflow.user,experiment,run_id,task
0,,4,Adam,1H0D,64,50,10,10000,30000,0,...,4a705699e06b6c53e8799c26571fc6c360ed4f22,1OB1 vs 1H0D,LOCAL,dev-v0.1.2-3-with-replicates,scripts/script_12a_train_SN10_clean.py,"[{""run_id"": ""45e7f26f9d494b7cb2302f6f13c4bf55""...",eugen,11,45e7f26f9d494b7cb2302f6f13c4bf55,1v1
1,,4,Adam,3VRL,64,50,10,10000,30000,0,...,4a705699e06b6c53e8799c26571fc6c360ed4f22,1OB1 vs 3VRL,LOCAL,dev-v0.1.2-3-with-replicates,scripts/script_12a_train_SN10_clean.py,"[{""run_id"": ""32f8fd48823c40c5bba365c9243e02de""...",eugen,11,32f8fd48823c40c5bba365c9243e02de,1v1
2,,4,Adam,1FBI,64,50,10,10000,30000,0,...,4a705699e06b6c53e8799c26571fc6c360ed4f22,1OB1 vs 1FBI,LOCAL,dev-v0.1.2-3-with-replicates,scripts/script_12a_train_SN10_clean.py,"[{""run_id"": ""4f0eb2a80be645e3b6852c649537a693""...",eugen,11,4f0eb2a80be645e3b6852c649537a693,1v1
3,,4,Adam,1WEJ,64,50,10,10000,30000,0,...,4a705699e06b6c53e8799c26571fc6c360ed4f22,1OB1 vs 1WEJ,LOCAL,dev-v0.1.2-3-with-replicates,scripts/script_12a_train_SN10_clean.py,"[{""run_id"": ""6c6537db1ea246da8ab39ab5b5a8749b""...",eugen,11,6c6537db1ea246da8ab39ab5b5a8749b,1v1
4,,4,Adam,1NSN,64,50,10,10000,30000,0,...,4a705699e06b6c53e8799c26571fc6c360ed4f22,1OB1 vs 1NSN,LOCAL,dev-v0.1.2-3-with-replicates,scripts/script_12a_train_SN10_clean.py,"[{""run_id"": ""87a555c5f8fb4a92b497369e3113f76b""...",eugen,11,87a555c5f8fb4a92b497369e3113f76b,1v1


In [4]:
# Number of runs per positive-class identifier.
df["ag_pos"].value_counts()

1OB1         100
1FBI         100
1ADQ         100
1WEJ         100
1H0D         100
3RAJ         100
5E94         100
3VRL         100
1NSN         100
2YPV         100
1NSN_high     20
5E94_high     20
2YPV_high     20
1FBI_high     20
3VRL_high     20
3RAJ_high     20
1OB1_high     20
1H0D_high     20
1WEJ_high     20
1ADQ_high     20
Name: ag_pos, dtype: int64

In [5]:
# Number of runs per negative-class identifier.
df["ag_neg"].value_counts()

1H0D                                                                        90
1FBI                                                                        90
1WEJ                                                                        90
1NSN                                                                        90
1OB1                                                                        90
5E94                                                                        90
2YPV                                                                        90
3RAJ                                                                        90
1ADQ                                                                        90
3VRL                                                                        90
1OB1_looser                                                                 10
3RAJ_looser                                                                 10
1NSN_95low                                          

In [6]:
# Root directory under which the per-task results are organised.
base_p = Path(config.DATA_BASE_PATH) / "Frozen_MiniAbsolut_ML"
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target exists; otherwise this cell fails on a fresh checkout.
base_p.mkdir(parents=True, exist_ok=True)
# Persist the labelled run table as a tab-separated file (without the index).
df.to_csv(base_p / "df_task_data.tsv", sep='\t', index=False)

## Collect results and organize in a directory

In [7]:
# Pre-create <base_p>/<task>/seed_<seed_id>/split_<split_id>/ for every
# combination of the task, seed and split values present in the results.
for task, seed_id, split_id in itertools.product(
    df["task"].unique(),
    df["seed_id"].unique(),
    df["load_from_miniabsolut_split_seed"].unique(),
):
    # Runs without an explicit split seed are filed under 42. Besides the
    # string "None", also treat a missing value (NaN/None) this way so that
    # no "split_nan" directory is created.
    # NOTE(review): confirm that a missing value always means the default split.
    if pd.isna(split_id) or split_id == "None":
        split_id = 42
    dir_p = base_p / task / f"seed_{seed_id}" / f"split_{split_id}"
    dir_p.mkdir(parents=True, exist_ok=True)

In [13]:
# Copy the files of every run into
#   <base_p>/<task>/seed_<seed_id>/split_<split_id>/<ag_pos>__vs__<ag_neg>/
for row_idx, row in df.iterrows():
    exp = row["experiment"]
    task = row["task"]
    run_id = row["run_id"]
    seed_id = row["seed_id"]

    ag_pos = row["ag_pos"]
    ag_neg = row["ag_neg"]
    if "(" in ag_neg:
        # For 1v9: the negative identifier contains a parenthesis; use the
        # short label "9" in the directory name instead.
        ag_neg = "9"

    split_id = row["load_from_miniabsolut_split_seed"]
    # 42 was default seed used for splitting. Besides the string "None", also
    # treat a missing value (NaN/None) as "not set" so that no "split_nan"
    # directory is created.
    # NOTE(review): confirm that a missing value always means the default split.
    if pd.isna(split_id) or split_id == "None":
        split_id = 42

    # Directory for this task and replicate (seed x split).
    dir_p = base_p / task / f"seed_{seed_id}" / f"split_{split_id}"

    # Directory for this positive-vs-negative pair; `parents=True` also creates
    # `dir_p` when it does not exist yet.
    dest_dir = dir_p / f"{ag_pos}__vs__{ag_neg}"
    dest_dir.mkdir(exist_ok=True, parents=True)

    # Copy files. The first two compiled paths are skipped.
    # NOTE(review): document what `compile_paths` returns at positions 0 and 1.
    selected_paths: List[Path] = datasets.Task.compile_paths(exp, run_id)
    datasets.Task.copy_pathlist_to_dest(dest_dir, selected_paths[2:])



## [Deprecated] High vs looser|95low

In [None]:
# api = utils.MLFlowTaskAPI()
# run_name = "dev-v0.1.2-3-with-replicates"


# # High vs looser
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_looser"
# if not out_dir.exists():
#     out_dir.mkdir()
# for antigen in config.ANTIGENS:
#     print(antigen)
#     task = datasets.Task(
#         ag_pos=f"{antigen}_high",
#         ag_neg=f"{antigen}_looser",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # High vs 95low
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_95low"
# if not out_dir.exists():
#     out_dir.mkdir()
# for antigen in config.ANTIGENS:
#     print(antigen)
#     task = datasets.Task(
#         ag_pos=f"{antigen}_high",
#         ag_neg=f"{antigen}_95low",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # 1 vs 1
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_1"
# if not out_dir.exists():
#     out_dir.mkdir()
# permutations = itertools.permutations(config.ANTIGENS, 2)
# for ag_pos, ag_neg in permutations:
#     print(ag_pos, ag_neg)
#     task = datasets.Task(
#         ag_pos=f"{ag_pos}",
#         ag_neg=f"{ag_neg}",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # # 1 vs 9
# # out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_9"
# # if not out_dir.exists():
# #     out_dir.mkdir()
# # for ag in config.ANTIGENS:
# #     print(ag)
# #     task = datasets.Task(
# #         ag_pos=ag,
# #         ag_neg=f"9",
# #         shuffle_antigen_labels=False,
# #         run_name=run_name,
# #     )
# #     task.copy_files_to_dir(out_dir)

3VRL
1NSN
3RAJ
5E94
1H0D
1WEJ
1ADQ
1FBI
2YPV
1OB1
3VRL
1NSN
3RAJ
5E94
1H0D
1WEJ
1ADQ
1FBI
2YPV
1OB1


In [None]:
# run_id, exp_id = api.get_experiment_and_run(
#     {
#         "ag_pos": "3VRL_high",
#         "ag_neg": "3VRL_looser",
#         "shuffle_antigen_labels": False,
#     }
# )

AssertionError: 

In [None]:
# Inspect which columns are available for experiment "14" on its own.
api = utils.MLFlowTaskAPI()
# NOTE(review): presumably this loads the runs of experiment "14" into `api`
# for the next call — confirm against `utils.MLFlowTaskAPI`.
api.mlflow_request("14")
# Last expression of the cell, so the column index is shown as the cell output.
api.build_mlflow_results_df().columns

Index(['sample_train', 'optimizer_type', 'ag_neg', 'batch_size', 'epochs',
       'num_hidden_units', 'N_closed', 'N_train', 'split_id', 'ag_pos', 'swa',
       'shuffle_antigen_labels', 'input_dim', 'learning_rate', 'weight_decay',
       'momentum', 'seed_id', 'load_from_miniabsolut', 'acc_closed',
       'train_loss', 'roc_auc_closed', 'f1_closed', 'avg_precision_closed',
       'precision_closed', 'test_loss', 'recall_closed',
       'mlflow.source.git.commit', 'mlflow.note.content', 'mlflow.source.type',
       'mlflow.runName', 'mlflow.source.name', 'mlflow.log-model.history',
       'mlflow.user'],
      dtype='object')