# Data per task

We create and train models with mlflow. Here we develop some functions to help us loading the results and processing them.

In [1]:
import os
import itertools
from pathlib import Path
from typing import List
import warnings
import json


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import NegativeClassOptimization.config as config
import NegativeClassOptimization.utils as utils
import NegativeClassOptimization.preprocessing as preprocessing
from NegativeClassOptimization import ml
from NegativeClassOptimization import datasets, pipelines, visualisations

  from .autonotebook import tqdm as notebook_tqdm


## Collecting mlflow results

- dev-v0.1.2-3-with-replicates - main results (latest)
- dev-v0.1.2-3-with-replicates-linear - main linear results (latest)
- dev-v0.2-shuffled - shuffled (latest)

In [2]:
# experiment_ids = ["11", "13", "14"]
# experiment_ids = ["13", "14"]
# run_name = "dev-v0.2-shuffled"
# dir_name = "Frozen_MiniAbsolut_ML_shuffled"

# experiment_ids = ["11", "13", "14"]
# run_name = "dev-v0.1.2-3-with-replicates-linear"
# dir_name = "Frozen_MiniAbsolut_Linear_ML"

experiment_ids = ["11", "13", "14"]
run_name = "dev-v0.2-shuffled"
dir_name = "Frozen_MiniAbsolut_ML_shuffled"


df = utils.MLFlowTaskAPI.mlflow_results_as_dataframe(experiment_ids, run_name=run_name)
tasks = []
for i, row in df.iterrows():
    exp: str = row["experiment"]
    ag_neg: str = row["ag_neg"]
    if exp == "11":
        tasks.append("1v1")
    elif exp == "13":
        tasks.append("1v9")
    elif exp == "14":
        if ag_neg.split("_")[1] == "looser":
            tasks.append("high_vs_looser")
        elif ag_neg.split("_")[1] == "95low":
            tasks.append("high_vs_95low")
    else:
        raise ValueError(f"Experiment {exp} not recognized.")
df["task"] = tasks

In [9]:
print(df.columns)
print(df.shape)
df.head()

Index(['model_type', 'sample_train', 'load_from_miniabsolut_split_seed',
       'optimizer_type', 'ag_neg', 'batch_size', 'epochs', 'num_hidden_units',
       'N_closed', 'N_train', 'split_id', 'ag_pos', 'swa',
       'shuffle_antigen_labels', 'input_dim', 'learning_rate', 'weight_decay',
       'momentum', 'seed_id', 'load_from_miniabsolut', 'acc_closed',
       'train_loss', 'roc_auc_closed', 'f1_closed', 'avg_precision_closed',
       'precision_closed', 'test_loss', 'recall_closed',
       'mlflow.source.git.commit', 'mlflow.note.content', 'mlflow.source.type',
       'mlflow.runName', 'mlflow.source.name', 'mlflow.user',
       'mlflow.log-model.history', 'experiment', 'run_id', 'split_seed',
       'task'],
      dtype='object')
(39, 39)


Unnamed: 0,model_type,sample_train,load_from_miniabsolut_split_seed,optimizer_type,ag_neg,batch_size,epochs,num_hidden_units,N_closed,N_train,...,mlflow.note.content,mlflow.source.type,mlflow.runName,mlflow.source.name,mlflow.user,mlflow.log-model.history,experiment,run_id,split_seed,task
90,SNN,,,Adam,1OB1,64,50,10,10000,30000,...,1ADQ vs 1OB1,LOCAL,dev-v0.2-shuffled,scripts/script_12a_train_SN10_clean.py,eugen,"[{""run_id"": ""1921c5bc0b7145ce9b93792bed6acb1b""...",11,1921c5bc0b7145ce9b93792bed6acb1b,42,1v1
91,SNN,,,Adam,2YPV,64,50,10,10000,30000,...,1ADQ vs 2YPV,LOCAL,dev-v0.2-shuffled,scripts/script_12a_train_SN10_clean.py,eugen,"[{""run_id"": ""94bc48f602584b669b04c54ae6630540""...",11,94bc48f602584b669b04c54ae6630540,42,1v1
92,SNN,,,Adam,1FBI,64,50,10,10000,30000,...,1ADQ vs 1FBI,LOCAL,dev-v0.2-shuffled,scripts/script_12a_train_SN10_clean.py,eugen,"[{""run_id"": ""beb35d3c4f6f405795c1f03e75352f70""...",11,beb35d3c4f6f405795c1f03e75352f70,42,1v1
93,SNN,,,Adam,1H0D,64,50,10,10000,30000,...,1ADQ vs 1H0D,LOCAL,dev-v0.2-shuffled,scripts/script_12a_train_SN10_clean.py,eugen,"[{""run_id"": ""8a34e380be0a436d902644c625f9d686""...",11,8a34e380be0a436d902644c625f9d686,42,1v1
94,SNN,,,Adam,1WEJ,64,50,10,10000,30000,...,1ADQ vs 1WEJ,LOCAL,dev-v0.2-shuffled,scripts/script_12a_train_SN10_clean.py,eugen,"[{""run_id"": ""a07762780f5e4b639424d1e1b64004ad""...",11,a07762780f5e4b639424d1e1b64004ad,42,1v1


In [8]:
df.ag_pos.value_counts()

1ADQ         10
2YPV_high     2
3RAJ_high     2
1NSN_high     2
5E94_high     2
1H0D_high     2
1WEJ_high     2
1ADQ_high     2
1FBI_high     2
1OB1_high     2
3VRL_high     2
1OB1          1
3RAJ          1
1NSN          1
3VRL          1
5E94          1
1H0D          1
1WEJ          1
2YPV          1
1FBI          1
Name: ag_pos, dtype: int64

In [5]:
df.ag_neg.value_counts()

1OB1                                                                        1
1H0D_95low                                                                  1
1FBI_95low                                                                  1
1FBI_looser                                                                 1
2YPV_looser                                                                 1
1ADQ_looser                                                                 1
1ADQ_95low                                                                  1
1WEJ_95low                                                                  1
1WEJ_looser                                                                 1
1H0D_looser                                                                 1
1OB1_looser                                                                 1
5E94_95low                                                                  1
1NSN_95low                                                      

In [6]:
base_p = Path(config.DATA_BASE_PATH) / dir_name
df.to_csv(base_p / "df_task_data.tsv", sep='\t', index=False)

OSError: Cannot save file into a non-existent directory: '/data/sources/eugen/negative-class-optimization/data/Frozen_MiniAbsolut_ML_shuffled'

## Collect results and organize in a directory

In [None]:
# Create the directory structure for the dataset.
# task: The task of the dataset. E.g., "absa".
# seed_id: The seed of the dataset. E.g., 42.
# split_id: The id of the split. E.g., 0.
# base_p: The base path to the dataset. E.g., "/path/to/dataset".

for task in df["task"].unique():
    for seed_id in df["seed_id"].unique():
        for split_id in df["load_from_miniabsolut_split_seed"].unique():
            if split_id == "None":
                split_id = 42
            dir_p = base_p / task / f"seed_{seed_id}" / f"split_{split_id}"
            dir_p.mkdir(parents=True, exist_ok=True)

In [None]:
## The functions below are copied from 
## previous datasets.Task, which is deprecated
## for now.
def compile_paths(exp_id, run_id) -> List[Path]:
    artifacts_path = config.DATA_BASE_PATH / Path(f"nco_mlflow_runs/ftp/artifacts_store/{exp_id}/{run_id}/artifacts/")

    # This is a hack to correct for a bug in folder/file namiang
    glob_list = list((artifacts_path / "dataset/train_dataset.tsv").glob("*tsv"))
    dataset_hash = glob_list[0].stem.split("_")[0]
    df_train_path = artifacts_path / f"dataset/train_dataset.tsv/{dataset_hash}_train_dataset.tsv"
    df_test_path = artifacts_path / f"dataset/test_dataset.tsv/{dataset_hash}_test_dataset.tsv"

    metrics_path = artifacts_path / "eval_metrics.json"
    model_path = artifacts_path / f"models/trained_model"
    swa_model_path = artifacts_path / f"models/swa_model"
    return [artifacts_path, dataset_hash, df_train_path, df_test_path, metrics_path, model_path, swa_model_path]


def copy_pathlist_to_dest(dest_dir: Path, list_of_paths: List[Path]):
    for path in list_of_paths:
        dest_path = dest_dir / path.name
        if dest_path.exists():
            warnings.warn(f"File {dest_path} already exists. Skipping copy.")
        else:
            os.system(f"cp -r {path} {dest_path}")

In [None]:
for i, row in df.iterrows():
    exp = row["experiment"]
    task = row["task"]
    run_id = row["run_id"]
    seed_id = row["seed_id"]

    ag_pos = row["ag_pos"]
    ag_neg = row["ag_neg"]
    if "(" in ag_neg:
        # For 1v9
        ag_neg = "9"

    split_id = row["load_from_miniabsolut_split_seed"]
    if split_id == "None":
        # 42 was default seed used for splitting
        split_id = 42
    
    # Create dir for task by replicates
    dir_p = base_p / row["task"] / f"seed_{seed_id}" / f"split_{split_id}"
    if not dir_p.exists():
        dir_p.mkdir(parents=True, exist_ok=True)

    # Create dir for task
    dest_dir = dir_p / f"{ag_pos}__vs__{ag_neg}"
    dest_dir.mkdir(exist_ok=True, parents=True)

    # Copy files
    ## this section depends on datasets.Task
    selected_paths: List[Path] = compile_paths(exp, run_id)
    copy_pathlist_to_dest(dest_dir, selected_paths[2:])

## [Deprecated] High vs looser|95low

In [None]:
# api = utils.MLFlowTaskAPI()
# run_name = "dev-v0.1.2-3-with-replicates"


# # High vs looser
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_looser"
# if not out_dir.exists():
#     out_dir.mkdir()
# for antigen in config.ANTIGENS:
#     print(antigen)
#     task = datasets.Task(
#         ag_pos=f"{antigen}_high",
#         ag_neg=f"{antigen}_looser",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # High vs 95low
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "high_vs_95low"
# if not out_dir.exists():
#     out_dir.mkdir()
# for antigen in config.ANTIGENS:
#     print(antigen)
#     task = datasets.Task(
#         ag_pos=f"{antigen}_high",
#         ag_neg=f"{antigen}_95low",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # 1 vs 1
# out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_1"
# if not out_dir.exists():
#     out_dir.mkdir()
# permutations = itertools.permutations(config.ANTIGENS, 2)
# for ag_pos, ag_neg in permutations:
#     print(ag_pos, ag_neg)
#     task = datasets.Task(
#         ag_pos=f"{ag_pos}",
#         ag_neg=f"{ag_neg}",
#         shuffle_antigen_labels=False,
#         run_name=run_name,
#     )
#     task.copy_files_to_dir(out_dir)


# # # 1 vs 9
# # out_dir = config.DATA_BASE_PATH / "Frozen_MiniAbsolut_ML" / "1_vs_9"
# # if not out_dir.exists():
# #     out_dir.mkdir()
# # for ag in config.ANTIGENS:
# #     print(ag)
# #     task = datasets.Task(
# #         ag_pos=ag,
# #         ag_neg=f"9",
# #         shuffle_antigen_labels=False,
# #         run_name=run_name,
# #     )
# #     task.copy_files_to_dir(out_dir)

In [None]:
# run_id, exp_id = api.get_experiment_and_run(
#     {
#         "ag_pos": "3VRL_high",
#         "ag_neg": "3VRL_looser",
#         "shuffle_antigen_labels": False,
#     }
# )

In [None]:
# api = utils.MLFlowTaskAPI()
# api.mlflow_request("14")
# api.build_mlflow_results_df().columns