# Does the resampling experiment help with predicting CtD sentences?

In [1]:
import os
from pathlib import Path
from urllib.parse import quote_plus
import warnings

import numpy as np
import pandas as pd
import plydata as ply
from sqlalchemy import create_engine

from snorkel.labeling.analysis import LFAnalysis
from snorkeling_helper.generative_model_helper import (
    sample_lfs,
    run_generative_label_function_sampler,
)

# Suppress every warning category for the rest of the session (global filter).
warnings.filterwarnings("ignore")

In [2]:
# Connection settings are read from the standard libpq environment variables
# (PGUSER / PGPASSWORD / PGDATABASE) so credentials do not have to live in the
# notebook. The fallbacks reproduce the previously hardcoded values, so the
# notebook behaves exactly as before when the variables are unset.
username = os.environ.get("PGUSER", "danich1")
password = os.environ.get("PGPASSWORD", "snorkel")
dbname = os.environ.get("PGDATABASE", "pubmed_central_db")

# Percent-encode the credentials so characters such as "@" or "/" cannot
# corrupt the URL; the connection goes through the local unix socket directory.
database_str = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@/{dbname}?host=/var/run/postgresql"
)
conn = create_engine(database_str)

## Load the data

In [3]:
# Directory holding the label-candidate TSV files loaded below.
label_candidates_dir = Path("..") / "label_candidates" / "output"
# Directory for the CtD files this notebook reads and writes.
notebook_output_dir = Path("output") / "CtD"

In [4]:
# Load the abstract training label matrix (tab-separated).
abstract_candidates_file = (
    label_candidates_dir / "cd_abstract_train_candidates_resampling.tsv"
)
L_abstracts = pd.read_csv(str(abstract_candidates_file), sep="\t")

# Show the dimensions, then a transposed preview of the first rows.
print(L_abstracts.shape)
L_abstracts.head().T

(971820, 103)


Unnamed: 0,0,1,2,3,4
LF_HETNET_PHARMACOTHERAPYDB,-1,-1,-1,-1,-1
LF_HETNET_CD_ABSENT,0,0,0,0,0
LF_CD_CHECK_DISEASE_TAG,0,0,0,0,-1
LF_DG_IS_BIOMARKER,-1,-1,-1,-1,-1
LF_DaG_ASSOCIATION,-1,-1,-1,-1,-1
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,0,-1,-1,-1


In [5]:
# Load the full-text training label matrix (tab-separated).
full_text_candidates_file = (
    label_candidates_dir / "cd_full_text_train_candidates_resampling.tsv"
)
L_full_text = pd.read_csv(str(full_text_candidates_file), sep="\t")

# Show the dimensions, then a transposed preview of the first rows.
print(L_full_text.shape)
L_full_text.head().T

(1247188, 103)


Unnamed: 0,0,1,2,3,4
LF_HETNET_PHARMACOTHERAPYDB,-1,-1,-1,-1,-1
LF_HETNET_CD_ABSENT,0,0,0,0,0
LF_CD_CHECK_DISEASE_TAG,0,0,0,0,0
LF_DG_IS_BIOMARKER,-1,-1,-1,-1,-1
LF_DaG_ASSOCIATION,-1,-1,-1,-1,-1
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,-1,-1,-1,-1


In [6]:
# Read the combined dev/test label matrix and keep the rows with split == 10.
dev_test_candidates_file = (
    label_candidates_dir / "cd_dev_test_candidates_resampling.tsv"
)
dev_test_label_matrix = pd.read_csv(str(dev_test_candidates_file), sep="\t")
L_dev = dev_test_label_matrix >> ply.query("split==10")
print(L_dev.shape)
L_dev.head().T

(500, 105)


Unnamed: 0,1,2,3,4,6
LF_HETNET_PHARMACOTHERAPYDB,-1.0,-1.0,-1.0,1.0,1.0
LF_HETNET_CD_ABSENT,0.0,0.0,0.0,-1.0,-1.0
LF_CD_CHECK_DISEASE_TAG,-1.0,0.0,0.0,-1.0,-1.0
LF_DG_IS_BIOMARKER,-1.0,-1.0,-1.0,-1.0,-1.0
LF_DaG_ASSOCIATION,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,-1.0,-1.0,-1.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,-1.0,-1.0,-1.0
split,10.0,10.0,10.0,10.0,10.0
document_id,162660.0,146415.0,213795.0,33875.0,15752.0


In [7]:
# Read the combined dev/test label matrix and keep the rows with split == 11.
L_test = pd.read_csv(
    str(label_candidates_dir / Path("cd_dev_test_candidates_resampling.tsv")), sep="\t"
) >> ply.query("split==11")
# Report the shape of the test split itself; this line used to print
# L_dev.shape, a copy-paste slip from the dev-split cell.
print(L_test.shape)
L_test.head().T

(500, 105)


Unnamed: 0,0,5,8,9,17
LF_HETNET_PHARMACOTHERAPYDB,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_CD_ABSENT,0.0,0.0,0.0,0.0,0.0
LF_CD_CHECK_DISEASE_TAG,0.0,0.0,-1.0,0.0,0.0
LF_DG_IS_BIOMARKER,-1.0,-1.0,-1.0,-1.0,-1.0
LF_DaG_ASSOCIATION,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,-1.0,-1.0,-1.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,-1.0,-1.0,-1.0
split,11.0,11.0,11.0,11.0,11.0
document_id,207885.0,291052.0,165878.0,186450.0,212095.0


## Re-sort the Candidates Based on Abstract

In [8]:
# Grab the document ids for resampling
# The query maps each compound-disease candidate to its sentence id and to the
# document id of that sentence.
sql = """
select cd_candidates.sentence_id, document_id, cd_candidates.candidate_id
from sentence
inner join (
  select candidate.candidate_id, compound_disease.sentence_id from compound_disease
  inner join candidate on candidate.candidate_id=compound_disease.candidate_id
  ) as cd_candidates
on sentence.sentence_id = cd_candidates.sentence_id
"""
# Reuse the existing SQLAlchemy engine (`conn`) instead of passing the URL
# string, which makes pandas construct a second engine for the same database.
candidate_doc_df = pd.read_sql(sql, conn)
candidate_doc_df.head()

Unnamed: 0,sentence_id,document_id,candidate_id
0,578151345,24670128,8160
1,568187139,27952167,9876
2,561956917,8224816,20087
3,244310297,28588380,45149
4,593065266,15261556,89224


In [9]:
# Distinct document ids that appear in the dev split.
# NOTE(review): despite the name, only L_dev is consulted here; documents from
# L_test are not included -- confirm whether the test split should be added.
dev_test_ids = (
    L_dev >> ply.select("document_id") >> ply.distinct() >> ply.pull("document_id")
)

# Candidate ids whose document id is one of the ids collected above.
# NOTE(review): filtered_candidate_id is not referenced again in the visible
# cells -- verify whether it was meant to be excluded from the training data.
filtered_candidate_id = (
    candidate_doc_df
    >> ply.query(f"document_id in {list(dev_test_ids)}")
    >> ply.pull("candidate_id")
)

In [10]:
# Load the table that assigns each document id to a dataset label.
dataset_mapper_file = notebook_output_dir / "ctd_dataset_mapper.tsv"
sorted_train_df = pd.read_csv(str(dataset_mapper_file), sep="\t")
sorted_train_df.head()

Unnamed: 0,document_id,dataset
0,24670128,train
1,27952167,train
2,8224816,train
3,28588380,tune
4,15261556,train


In [11]:
# Join the dataset assignment of every document onto its candidates, keep the
# rows labelled "train", and extract their candidate ids.
# Note: despite its name, `trained_documents` holds candidate ids.
candidates_with_dataset = sorted_train_df >> ply.inner_join(
    candidate_doc_df, on="document_id"
)
train_candidates = candidates_with_dataset >> ply.query("dataset=='train'")
trained_documents = train_candidates >> ply.pull("candidate_id")

In [12]:
# Keep only the abstract rows whose candidate id is in the training set.
train_candidate_ids = list(trained_documents)
filtered_L_abstracts = L_abstracts >> ply.query(
    f"candidate_id in {train_candidate_ids}"
)
print(filtered_L_abstracts.shape)
filtered_L_abstracts.head()

(680470, 103)


Unnamed: 0,LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSENT,LF_CD_CHECK_DISEASE_TAG,LF_DG_IS_BIOMARKER,LF_DaG_ASSOCIATION,LF_DaG_WEAK_ASSOCIATION,LF_DaG_NO_ASSOCIATION,LF_DaG_CELLULAR_ACTIVITY,LF_DaG_DISEASE_SAMPLE,LF_DG_METHOD_DESC,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
0,-1,0,0,-1,-1,-1,-1,-1,1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,52
3,-1,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,408
4,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,409
5,-1,0,0,-1,-1,-1,-1,-1,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,410
6,-1,0,-1,-1,-1,-1,-1,-1,1,0,...,-1,-1,-1,1,-1,-1,-1,-1,-1,413


In [13]:
# Keep only the full-text rows whose candidate id is in the training set.
train_candidate_ids = list(trained_documents)
filtered_L_full_text = L_full_text >> ply.query(
    f"candidate_id in {train_candidate_ids}"
)
print(filtered_L_full_text.shape)
filtered_L_full_text.head()

(873241, 103)


Unnamed: 0,LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSENT,LF_CD_CHECK_DISEASE_TAG,LF_DG_IS_BIOMARKER,LF_DaG_ASSOCIATION,LF_DaG_WEAK_ASSOCIATION,LF_DaG_NO_ASSOCIATION,LF_DaG_CELLULAR_ACTIVITY,LF_DaG_DISEASE_SAMPLE,LF_DG_METHOD_DESC,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
30,-1,0,0,-1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,233
31,-1,0,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,234
32,-1,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,420
33,-1,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,422
34,-1,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,424


## Construct the Grid Search

In [14]:
# Global Grid
# One fixed epoch count and learning rate, combined with five evenly spaced
# l2 values between 0.01 and 5.
epochs_grid = [250]
l2_param_grid = np.linspace(0.01, 5, num=5)
lr_grid = [1e-3]

# Each entry is an (epochs, l2, lr) tuple.
grid = [
    (epochs, l2_value, lr)
    for epochs in epochs_grid
    for l2_value in l2_param_grid
    for lr in lr_grid
]

# Abstracts

In [15]:
# Label matrix without the candidate_id column; the remaining column names are
# used to label the rows of the summary table.
abstract_label_matrix = filtered_L_abstracts >> ply.select(
    "candidate_id", drop=True
)

analysis_module = LFAnalysis(abstract_label_matrix)
abstract_lf_summary = analysis_module.lf_summary()
abstract_lf_summary.index = list(abstract_label_matrix.columns)

abstract_lf_summary

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
LF_HETNET_PHARMACOTHERAPYDB,[1],0.263468,0.263468,0.263468
LF_HETNET_CD_ABSENT,[0],0.736532,0.736532,0.736532
LF_CD_CHECK_DISEASE_TAG,[0],0.736955,0.736955,0.736955
LF_DG_IS_BIOMARKER,[1],0.070260,0.070260,0.070260
LF_DaG_ASSOCIATION,[1],0.091335,0.091335,0.091335
...,...,...,...,...
LF_GG_BICLUSTER_AFFECTS_EXPRESSION,[1],0.014528,0.014528,0.014528
LF_GG_BICLUSTER_INCREASES_EXPRESSION,[0],0.033060,0.033060,0.033060
LF_GG_BICLUSTER_SIGNALING,[1],0.033065,0.033065,0.033065
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,[0],0.019811,0.019811,0.019811


# Set up fields for resampling

In [16]:
# The first three columns of the label matrix are passed as the base columns
# to every sampler call below.
lf_columns_base = L_abstracts.columns[:3].tolist()
# Accumulates the records returned by each sampler call.
data_columns = list()

# Abstracts

## baseline

In [17]:
# The baseline draws a single sample that spans the whole index range [0, 3).
ctd_start, ctd_end = 0, 3
ctd_lf_range = range(ctd_start, ctd_end)

number_of_samples = 1
size_of_samples = [ctd_end - ctd_start]

In [18]:
# Map each sample size to the output of sample_lfs for the baseline index
# range, always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    baseline_lf_indices = list(ctd_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        baseline_lf_indices,
        len(baseline_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [19]:
# Run the sampler helper for the baseline on the abstract label matrix and
# append the returned records to data_columns. The marginals file path is
# passed through to the helper.
baseline_marginals_file = notebook_output_dir / "ctd_training_marginals_baseline.tsv"
data_columns.extend(
    run_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        L_test,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        grid_param=grid,
        marginals_df_file=str(baseline_marginals_file),
        curated_label="curated_ctd",
        entity_label="CtD_baseline",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:05<00:00,  5.04s/it]


## DaG

In [20]:
# Index range [dag_start, dag_end) sampled in this section.
dag_start, dag_end = 3, 32
dag_lf_range = range(dag_start, dag_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(dag_lf_range)]
number_of_samples = 50

In [21]:
# Map each sample size to the output of sample_lfs for the DaG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    dag_lf_indices = list(dag_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        dag_lf_indices,
        len(dag_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [22]:
# Run the sampler helper for the DaG samples on the abstract label matrix and
# append the returned records to data_columns (empty marginals file name).
dag_abstract_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="DaG",
    data_source="abstract",
)
data_columns.extend(dag_abstract_records)

100%|██████████| 50/50 [01:45<00:00,  2.10s/it]
100%|██████████| 50/50 [02:13<00:00,  2.68s/it]
100%|██████████| 50/50 [02:48<00:00,  3.36s/it]
100%|██████████| 50/50 [03:42<00:00,  4.44s/it]
100%|██████████| 50/50 [05:30<00:00,  6.61s/it]


## CtD

In [23]:
# Index range [ctd_start, ctd_end) sampled in this section.
ctd_start, ctd_end = 32, 54
ctd_lf_range = range(ctd_start, ctd_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(ctd_lf_range)]
number_of_samples = 50

In [24]:
# Map each sample size to the output of sample_lfs for the CtD index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    ctd_lf_indices = list(ctd_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        ctd_lf_indices,
        len(ctd_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [25]:
# Run the sampler helper for the CtD samples on the abstract label matrix and
# append the returned records to data_columns. The marginals file path is
# passed through to the helper.
ctd_marginals_file = notebook_output_dir / "ctd_predicts_ctd_training_marginals.tsv"
data_columns.extend(
    run_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        L_test,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        grid_param=grid,
        marginals_df_file=str(ctd_marginals_file),
        curated_label="curated_ctd",
        entity_label="CtD",
        data_source="abstract",
    )
)

100%|██████████| 50/50 [04:07<00:00,  4.95s/it]
100%|██████████| 50/50 [04:37<00:00,  5.56s/it]
100%|██████████| 50/50 [05:16<00:00,  6.32s/it]
100%|██████████| 50/50 [06:10<00:00,  7.40s/it]
100%|██████████| 50/50 [06:59<00:00,  8.40s/it]


## CbG

In [26]:
# Index range [cbg_start, cbg_end) sampled in this section.
cbg_start, cbg_end = 54, 74
cbg_lf_range = range(cbg_start, cbg_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(cbg_lf_range)]
number_of_samples = 50

In [27]:
# Map each sample size to the output of sample_lfs for the CbG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    cbg_lf_indices = list(cbg_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        cbg_lf_indices,
        len(cbg_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [28]:
# Run the sampler helper for the CbG samples on the abstract label matrix and
# append the returned records to data_columns (empty marginals file name).
cbg_abstract_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="CbG",
    data_source="abstract",
)
data_columns.extend(cbg_abstract_records)

100%|██████████| 50/50 [01:46<00:00,  2.13s/it]
100%|██████████| 50/50 [02:13<00:00,  2.68s/it]
100%|██████████| 50/50 [02:48<00:00,  3.37s/it]
100%|██████████| 50/50 [03:40<00:00,  4.40s/it]
100%|██████████| 50/50 [04:10<00:00,  5.01s/it]


## GiG

In [29]:
# Index range [gig_start, gig_end) sampled in this section.
gig_start, gig_end = 74, 102
gig_lf_range = range(gig_start, gig_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(gig_lf_range)]
number_of_samples = 50

In [30]:
# Map each sample size to the output of sample_lfs for the GiG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    gig_lf_indices = list(gig_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        gig_lf_indices,
        len(gig_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [31]:
# Run the sampler helper for the GiG samples on the abstract label matrix and
# append the returned records to data_columns (empty marginals file name).
gig_abstract_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="GiG",
    data_source="abstract",
)
data_columns.extend(gig_abstract_records)

100%|██████████| 50/50 [01:46<00:00,  2.12s/it]
100%|██████████| 50/50 [02:14<00:00,  2.69s/it]
100%|██████████| 50/50 [02:49<00:00,  3.38s/it]
100%|██████████| 50/50 [03:42<00:00,  4.44s/it]
100%|██████████| 50/50 [05:26<00:00,  6.53s/it]


# Full Text

## DaG

In [32]:
# Index range [dag_start, dag_end) sampled in this section.
# The range starts at 3, matching the abstract DaG section: index 2 is one of
# the three base columns (columns 0:3) that are already passed to every sampler
# call via lf_columns_base, so starting at 2 would sample a base column again
# and inflate the "total" sample size from 29 to 30.
dag_start = 3
dag_end = 32

# Spaced out number of samples including total
size_of_samples = [1, 6, 11, 16, dag_end - dag_start]
number_of_samples = 50
dag_lf_range = range(dag_start, dag_end)

In [33]:
# Map each sample size to the output of sample_lfs for the DaG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    dag_lf_indices = list(dag_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        dag_lf_indices,
        len(dag_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [34]:
# Run the sampler helper for the DaG samples on the full-text label matrix and
# append the returned records to data_columns (empty marginals file name).
dag_full_text_records = run_generative_label_function_sampler(
    filtered_L_full_text,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="DaG",
    data_source="full_text",
)
data_columns.extend(dag_full_text_records)

100%|██████████| 50/50 [02:02<00:00,  2.44s/it]
100%|██████████| 50/50 [02:36<00:00,  3.14s/it]
100%|██████████| 50/50 [03:37<00:00,  4.36s/it]
100%|██████████| 50/50 [04:24<00:00,  5.28s/it]
100%|██████████| 50/50 [06:46<00:00,  8.13s/it]


## CtD

In [35]:
# Index range [ctd_start, ctd_end) sampled in this section.
ctd_start, ctd_end = 32, 54
ctd_lf_range = range(ctd_start, ctd_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(ctd_lf_range)]
number_of_samples = 50

In [36]:
# Map each sample size to the output of sample_lfs for the CtD index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    ctd_lf_indices = list(ctd_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        ctd_lf_indices,
        len(ctd_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [37]:
# Run the sampler helper for the CtD samples on the full-text label matrix and
# append the returned records to data_columns (empty marginals file name).
ctd_full_text_records = run_generative_label_function_sampler(
    filtered_L_full_text,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="CtD",
    data_source="full_text",
)
data_columns.extend(ctd_full_text_records)

100%|██████████| 50/50 [02:02<00:00,  2.45s/it]
100%|██████████| 50/50 [02:40<00:00,  3.21s/it]
100%|██████████| 50/50 [03:44<00:00,  4.49s/it]
100%|██████████| 50/50 [04:34<00:00,  5.49s/it]
100%|██████████| 50/50 [05:37<00:00,  6.75s/it]


## CbG

In [38]:
# Index range [cbg_start, cbg_end) sampled in this section.
cbg_start, cbg_end = 54, 74
cbg_lf_range = range(cbg_start, cbg_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(cbg_lf_range)]
number_of_samples = 50

In [39]:
# Map each sample size to the output of sample_lfs for the CbG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    cbg_lf_indices = list(cbg_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        cbg_lf_indices,
        len(cbg_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [40]:
# Run the sampler helper for the CbG samples on the full-text label matrix and
# append the returned records to data_columns (empty marginals file name).
cbg_full_text_records = run_generative_label_function_sampler(
    filtered_L_full_text,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="CbG",
    data_source="full_text",
)
data_columns.extend(cbg_full_text_records)

100%|██████████| 50/50 [02:01<00:00,  2.43s/it]
100%|██████████| 50/50 [02:37<00:00,  3.15s/it]
100%|██████████| 50/50 [03:39<00:00,  4.40s/it]
100%|██████████| 50/50 [04:28<00:00,  5.37s/it]
100%|██████████| 50/50 [05:10<00:00,  6.21s/it]


## GiG

In [41]:
# Index range [gig_start, gig_end) sampled in this section.
gig_start, gig_end = 74, 102
gig_lf_range = range(gig_start, gig_end)

# Spaced-out sample sizes, finishing with the size of the whole range.
size_of_samples = [1, 6, 11, 16, len(gig_lf_range)]
number_of_samples = 50

In [42]:
# Map each sample size to the output of sample_lfs for the GiG index range,
# always using random_state=100.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    gig_lf_indices = list(gig_lf_range)
    sampled_lfs_dict[sample_size] = sample_lfs(
        gig_lf_indices,
        len(gig_lf_indices),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [43]:
# Run the sampler helper for the GiG samples on the full-text label matrix and
# append the returned records to data_columns (empty marginals file name).
gig_full_text_records = run_generative_label_function_sampler(
    filtered_L_full_text,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_ctd",
    entity_label="GiG",
    data_source="full_text",
)
data_columns.extend(gig_full_text_records)

100%|██████████| 50/50 [02:01<00:00,  2.43s/it]
100%|██████████| 50/50 [02:37<00:00,  3.16s/it]
100%|██████████| 50/50 [03:40<00:00,  4.41s/it]
100%|██████████| 50/50 [04:29<00:00,  5.39s/it]
100%|██████████| 50/50 [06:40<00:00,  8.02s/it]


# Write Performance to File

In [44]:
# Assemble every record accumulated in data_columns into a single table,
# then display it as the cell output.
performance_df = pd.DataFrame.from_records(data_columns)
performance_df

Unnamed: 0,lf_num,auroc,aupr,bce_loss,sampled_lf_name,label_source,data_source,model,epochs,l2_param,lr_param
0,3,0.674569,0.475489,0.544531,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",CtD_baseline,abstract,tune,250,0.01,0.001
1,3,0.538901,0.299052,0.563438,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",CtD_baseline,abstract,test,250,0.01,0.001
2,1,0.677612,0.476133,0.543084,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",DaG,abstract,tune,250,0.01,0.001
3,1,0.538211,0.298225,0.564514,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",DaG,abstract,test,250,0.01,0.001
4,1,0.675459,0.464090,0.544241,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",DaG,abstract,tune,250,0.01,0.001
...,...,...,...,...,...,...,...,...,...,...,...
3997,28,0.586490,0.282289,1.255202,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",GiG,full_text,test,250,0.01,0.001
3998,28,0.678914,0.382293,0.877900,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",GiG,full_text,tune,250,0.01,0.001
3999,28,0.586490,0.282289,1.255202,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",GiG,full_text,test,250,0.01,0.001
4000,28,0.678914,0.382293,0.877900,"LF_HETNET_PHARMACOTHERAPYDB,LF_HETNET_CD_ABSEN...",GiG,full_text,tune,250,0.01,0.001


In [45]:
# Write the performance table to a tab-separated file without the index.
# The target directory is created first so the write cannot fail on a missing
# folder after the long sampling runs have completed.
performance_file = Path("output/performance") / Path("CtD_performance.tsv")
performance_file.parent.mkdir(parents=True, exist_ok=True)
(
    performance_df
    >> ply.call(
        "to_csv",
        str(performance_file),
        index=False,
        sep="\t",
    )
)