# Data per task

We create and train models with mlflow. Here we develop some functions to help us loading the results and processing them.

In [33]:
import os
import itertools
from pathlib import Path
from typing import List
import warnings
import json


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import NegativeClassOptimization.config as config
import NegativeClassOptimization.utils as utils
import NegativeClassOptimization.preprocessing as preprocessing
from NegativeClassOptimization import ml
from NegativeClassOptimization import datasets, pipelines, visualisations

## High vs looser|95low

In [32]:
from dataclasses import dataclass


@dataclass
class Task:
    ag_pos: str
    ag_neg: str
    shuffle_antigen_labels: str = "False"

    def __post_init__(self):
        if isinstance(self.shuffle_antigen_labels, bool):
            self.shuffle_antigen_labels = str(self.shuffle_antigen_labels)

    def get_exp_and_run_ids(self) -> str:
        api = utils.MLFlowTaskAPI()
        exp_id, run_id = api.get_experiment_and_run(self.__dict__)
        return exp_id, run_id
    
    def get_paths_to_data(self) -> List[Path]:
        exp_id, run_id = self.get_exp_and_run_ids()

        self.artifacts_path = config.DATA_BASE_PATH / Path(f"nco_mlflow_runs/ftp/artifacts_store/{exp_id}/{run_id}/artifacts/")

        # This is a hack to correct for a bug in folder/file namiang
        glob_list = list((self.artifacts_path / "dataset/train_dataset.tsv").glob("*tsv"))
        self.dataset_hash = glob_list[0].stem.split("_")[0]
        self.df_train_path = self.artifacts_path / f"dataset/train_dataset.tsv/{self.dataset_hash}_train_dataset.tsv"
        self.df_test_path = self.artifacts_path / f"dataset/test_dataset.tsv/{self.dataset_hash}_test_dataset.tsv"

        self.metrics_path = self.artifacts_path / "eval_metrics.json"
        self.model_path = self.artifacts_path / f"models/trained_model"
        self.swa_model_path = self.artifacts_path / f"models/swa_model"
    
    def copy_files_to_dir(self, dest_dir: Path):
        self.get_paths_to_data()

        dest_dir = Path(dest_dir) / f"{self.ag_pos}__vs__{self.ag_neg}"
        dest_dir.mkdir(exist_ok=True, parents=True)
        
        for path in [self.df_train_path, self.df_test_path, self.metrics_path, self.model_path, self.swa_model_path]:
            dest_path = dest_dir / path.name
            if dest_path.exists():
                warnings.warn(f"File {dest_path} already exists. Skipping copy.")
            else:
                os.system(f"cp -r {path} {dest_path}")

In [34]:
api = utils.MLFlowTaskAPI()


# High vs looser
out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_looser"
if not out_dir.exists():
    out_dir.mkdir()
for antigen in config.ANTIGENS:
    print(antigen)
    task = Task(
        ag_pos=f"{antigen}_high",
        ag_neg=f"{antigen}_looser",
        shuffle_antigen_labels=False,
    )
    task.copy_files_to_dir(out_dir)


# High vs 95low
out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_95low"
if not out_dir.exists():
    out_dir.mkdir()
for antigen in config.ANTIGENS:
    print(antigen)
    task = Task(
        ag_pos=f"{antigen}_high",
        ag_neg=f"{antigen}_looser",
        shuffle_antigen_labels=False,
    )
    task.copy_files_to_dir(out_dir)


# 1 vs 1
out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_1"
if not out_dir.exists():
    out_dir.mkdir()
permutations = itertools.permutations(config.ANTIGENS, 2)
for ag_pos, ag_neg in permutations:
    print(ag_pos, ag_neg)
    task = Task(
        ag_pos=f"{ag_pos}",
        ag_neg=f"{ag_neg}",
        shuffle_antigen_labels=False,
    )
    task.copy_files_to_dir(out_dir)


# 1 vs 9
out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_9"
if not out_dir.exists():
    out_dir.mkdir()
for ag in config.ANTIGENS:
    print(ag)
    task = Task(
        ag_pos=ag,
        ag_neg=f"9",
        shuffle_antigen_labels=False,
    )
    task.copy_files_to_dir(out_dir)

3VRL
1NSN
3RAJ
5E94
1H0D
1WEJ
1ADQ
1FBI
2YPV
1OB1
3VRL
1NSN
3RAJ
5E94
1H0D
1WEJ
1ADQ
1FBI
2YPV
1OB1
3VRL 1NSN
3VRL 3RAJ
3VRL 5E94
3VRL 1H0D
3VRL 1WEJ
3VRL 1ADQ
3VRL 1FBI
3VRL 2YPV
3VRL 1OB1
1NSN 3VRL
1NSN 3RAJ
1NSN 5E94
1NSN 1H0D
1NSN 1WEJ
1NSN 1ADQ
1NSN 1FBI
1NSN 2YPV
1NSN 1OB1
3RAJ 3VRL
3RAJ 1NSN
3RAJ 5E94
3RAJ 1H0D
3RAJ 1WEJ
3RAJ 1ADQ
3RAJ 1FBI
3RAJ 2YPV
3RAJ 1OB1
5E94 3VRL
5E94 1NSN
5E94 3RAJ
5E94 1H0D
5E94 1WEJ
5E94 1ADQ
5E94 1FBI
5E94 2YPV
5E94 1OB1
1H0D 3VRL
1H0D 1NSN
1H0D 3RAJ
1H0D 5E94
1H0D 1WEJ
1H0D 1ADQ
1H0D 1FBI
1H0D 2YPV
1H0D 1OB1
1WEJ 3VRL
1WEJ 1NSN
1WEJ 3RAJ
1WEJ 5E94
1WEJ 1H0D
1WEJ 1ADQ
1WEJ 1FBI
1WEJ 2YPV
1WEJ 1OB1
1ADQ 3VRL
1ADQ 1NSN
1ADQ 3RAJ
1ADQ 5E94
1ADQ 1H0D
1ADQ 1WEJ
1ADQ 1FBI
1ADQ 2YPV
1ADQ 1OB1
1FBI 3VRL
1FBI 1NSN
1FBI 3RAJ
1FBI 5E94
1FBI 1H0D
1FBI 1WEJ
1FBI 1ADQ
1FBI 2YPV
1FBI 1OB1
2YPV 3VRL
2YPV 1NSN
2YPV 3RAJ
2YPV 5E94
2YPV 1H0D
2YPV 1WEJ
2YPV 1ADQ
2YPV 1FBI
2YPV 1OB1
1OB1 3VRL
1OB1 1NSN
1OB1 3RAJ
1OB1 5E94
1OB1 1H0D
1OB1 1WEJ
1OB1 1ADQ
1OB1 1FBI
1OB1 2YPV


In [16]:
run_id, exp_id = api.get_experiment_and_run(
    {
    "ag_pos": "3VRL_high",
    "ag_neg": "3VRL_looser",
    "shuffle_antigen_labels": False,
    }
)

AssertionError: 

In [22]:
api = utils.MLFlowTaskAPI()
api.mlflow_request("14")
api.build_mlflow_results_df().columns

Index(['sample_train', 'optimizer_type', 'ag_neg', 'batch_size', 'epochs',
       'num_hidden_units', 'N_closed', 'N_train', 'split_id', 'ag_pos', 'swa',
       'shuffle_antigen_labels', 'input_dim', 'learning_rate', 'weight_decay',
       'momentum', 'seed_id', 'load_from_miniabsolut', 'acc_closed',
       'train_loss', 'roc_auc_closed', 'f1_closed', 'avg_precision_closed',
       'precision_closed', 'test_loss', 'recall_closed',
       'mlflow.source.git.commit', 'mlflow.note.content', 'mlflow.source.type',
       'mlflow.runName', 'mlflow.source.name', 'mlflow.log-model.history',
       'mlflow.user'],
      dtype='object')

In [None]:
# Given a task specification, fetch experiment_id and run_id

# # Binders
# task1 = {
#     "ag_pos": "1ADQ_high",
#     "ag_neg": "1ADQ_looser",  # or "1ADQ_95low"
#     "shuffle_antigen_labels": "False",
# }

# 1v1
task2 = {
    "ag_pos": "1ADQ",
    "ag_neg": "3VRL",
    "shuffle_antigen_labels": "False",
}

## 1v2 - computed, but loading not implemented
# task3 = {
#     "ag_pos": "1ADQ",
#     "ag_neg": ("3VRL", "5E94"),
#     "shuffle_antigen_labels": "False",
# }

# 1v9
task4 = {
    "ag_pos": "1ADQ",
    "ag_neg": "9",
    "shuffle_antigen_labels": "False",
}

api = utils.MLFlowTaskAPI()
api.get_experiment_and_run(task1)
api.get_experiment_and_run(task2)
# api.get_experiment_and_run(task3)
api.get_experiment_and_run(task4)