# Does the resampling experiment help with predicting CbG sentences?

In [1]:
from itertools import product
from pathlib import Path
import warnings

import pandas as pd
import plydata as ply
from sqlalchemy import create_engine

from snorkel.labeling.analysis import LFAnalysis
from snorkeling_helper.generative_model_helper import (
    sample_lfs,
    train_generative_label_function_sampler,
)

# Suppress all warnings raised by the cells below.
warnings.filterwarnings("ignore")

In [2]:
# Connection settings for the PostgreSQL database queried further down.
# NOTE(review): the credentials are hard-coded in the notebook — consider
# loading them from the environment or a config file.
username, password, dbname = "danich1", "snorkel", "pubmed_central_db"

database_str = (
    f"postgresql+psycopg2://{username}:{password}@/{dbname}?host=/var/run/postgresql"
)

# NOTE(review): the query cell below passes `database_str` (not `conn`) to
# pd.read_sql, so this engine is not used by the visible cells.
conn = create_engine(database_str)

## Load the data

In [3]:
# Directory holding the label-candidate TSV files read below
label_candidates_dir = Path("..") / "label_candidates" / "output"

# Directory for the dataset mapper that is read and the marginals files whose
# paths are passed to the training helper.
# NOTE(review): the folder is named "CtD" although this notebook targets CbG —
# confirm this is intentional.
notebook_output_dir = Path("output") / "CtD"

In [4]:
# Label matrix for the abstract training candidates (tab-separated file)
abstract_candidates_path = (
    label_candidates_dir / "cg_abstract_train_candidates_resampling.tsv"
)
L_abstracts = pd.read_csv(str(abstract_candidates_path), sep="\t")

print(L_abstracts.shape)
L_abstracts.head().transpose()

(1292772, 109)


Unnamed: 0,0,1,2,3,4
LF_HETNET_DRUGBANK,-1,-1,-1,-1,-1
LF_HETNET_DRUGCENTRAL,-1,-1,-1,-1,-1
LF_HETNET_ChEMBL,-1,-1,-1,-1,-1
LF_HETNET_BINDINGDB,-1,-1,-1,-1,-1
LF_HETNET_PDSP_KI,-1,-1,-1,-1,-1
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,0,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,1,-1,-1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,0,-1,-1,-1,-1


In [4]:
# Label matrix for the full-text training candidates (tab-separated file)
full_text_candidates_path = (
    label_candidates_dir / "cg_full_text_train_candidates_resampling.tsv"
)
L_full_text = pd.read_csv(str(full_text_candidates_path), sep="\t")

print(L_full_text.shape)
L_full_text.head().transpose()

(2121209, 109)


Unnamed: 0,0,1,2,3,4
LF_HETNET_DRUGBANK,-1,-1,-1,-1,-1
LF_HETNET_DRUGCENTRAL,-1,-1,-1,-1,-1
LF_HETNET_ChEMBL,-1,-1,-1,-1,-1
LF_HETNET_BINDINGDB,-1,-1,-1,-1,-1
LF_HETNET_PDSP_KI,-1,-1,-1,-1,-1
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,0,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,-1,0,-1,-1


In [5]:
# Load the dev/test label matrix and keep only the rows whose split equals 7
dev_test_candidates_path = (
    label_candidates_dir / "cg_dev_test_candidates_resampling.tsv"
)
L_dev = pd.read_csv(str(dev_test_candidates_path), sep="\t") >> ply.query(
    "split==7"
)
print(L_dev.shape)
L_dev.head().transpose()

(500, 111)


Unnamed: 0,0,4,9,10,11
LF_HETNET_DRUGBANK,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_DRUGCENTRAL,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_ChEMBL,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_BINDINGDB,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_PDSP_KI,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,0.0,-1.0,-1.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,0.0,-1.0,-1.0
split,7.0,7.0,7.0,7.0,7.0
document_id,209829.0,178683.0,132454.0,19485.0,358873.0


## Resort Based on the Candidate Abstracts

In [6]:
# Grab the document ids for resampling: one row per compound-gene candidate,
# joined to the sentence table to obtain sentence_id and document_id.
sql = """
select cg_candidates.sentence_id, document_id, cg_candidates.candidate_id from sentence
inner join (
  select candidate.candidate_id, compound_gene.sentence_id from compound_gene
  inner join candidate on candidate.candidate_id=compound_gene.candidate_id
  ) as cg_candidates
on sentence.sentence_id = cg_candidates.sentence_id
"""
candidate_doc_df = pd.read_sql(sql=sql, con=database_str)
candidate_doc_df.head()

Unnamed: 0,sentence_id,document_id,candidate_id
0,293853012,23111736,17520
1,361497023,24837142,35137
2,395950432,24746171,35735
3,481487831,30853913,42182
4,579412852,21454644,58965


In [7]:
# Distinct document ids present in L_dev
dev_test_ids = (
    L_dev >> ply.select("document_id") >> ply.distinct() >> ply.pull("document_id")
)

# Candidate ids whose document appears in L_dev.
# .tolist() converts the pulled values to built-in Python numbers before they
# are formatted into the query string. list(...) would keep NumPy scalars,
# whose repr under NumPy >= 2 is "np.float64(...)", which is not a valid
# literal inside the query expression.
# NOTE(review): filtered_candidate_id is not referenced by the later cells
# visible in this notebook.
filtered_candidate_id = (
    candidate_doc_df
    >> ply.query(f"document_id in {dev_test_ids.tolist()}")
    >> ply.pull("candidate_id")
)

In [8]:
# Mapping from document_id to the dataset it was assigned to
dataset_mapper_path = notebook_output_dir / "cbg_dataset_mapper.tsv"
sorted_train_df = pd.read_csv(str(dataset_mapper_path), sep="\t")
sorted_train_df.head()

Unnamed: 0,document_id,dataset
0,23111736,train
1,24837142,train
2,24746171,train
3,30853913,tune
4,21454644,train


In [9]:
# Attach every candidate to its document's dataset assignment and keep only
# the rows assigned to the 'train' dataset.
train_candidate_rows = (
    sorted_train_df
    >> ply.inner_join(candidate_doc_df, on="document_id")
    >> ply.query("dataset=='train'")
)

# Despite its name, this holds candidate ids (of the training documents)
trained_documents = train_candidate_rows >> ply.pull("candidate_id")

In [10]:
# Keep only the abstract candidates whose id is in trained_documents.
# .tolist() turns the pulled ids into built-in Python ints before they are
# formatted into the query string; list(...) would keep NumPy scalars, whose
# repr under NumPy >= 2 is "np.int64(...)" and is not a valid literal inside
# the query expression.
filtered_L_abstracts = L_abstracts >> ply.query(
    f"candidate_id in {trained_documents.tolist()}"
)
print(filtered_L_abstracts.shape)
filtered_L_abstracts.head()

(904644, 109)


Unnamed: 0,LF_HETNET_DRUGBANK,LF_HETNET_DRUGCENTRAL,LF_HETNET_ChEMBL,LF_HETNET_BINDINGDB,LF_HETNET_PDSP_KI,LF_HETNET_US_PATENT,LF_HETNET_PUBCHEM,LF_HETNET_CG_ABSENT,LF_CG_CHECK_GENE_TAG,LF_DG_IS_BIOMARKER,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,191
7,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,449
8,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,450
10,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,897
11,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,898


In [11]:
# Keep only the full-text candidates whose id is in trained_documents.
# .tolist() turns the pulled ids into built-in Python ints before they are
# formatted into the query string; list(...) would keep NumPy scalars, whose
# repr under NumPy >= 2 is "np.int64(...)" and is not a valid literal inside
# the query expression.
filtered_L_full_text = L_full_text >> ply.query(
    f"candidate_id in {trained_documents.tolist()}"
)
print(filtered_L_full_text.shape)
filtered_L_full_text.head()

(1483352, 109)


Unnamed: 0,LF_HETNET_DRUGBANK,LF_HETNET_DRUGCENTRAL,LF_HETNET_ChEMBL,LF_HETNET_BINDINGDB,LF_HETNET_PDSP_KI,LF_HETNET_US_PATENT,LF_HETNET_PUBCHEM,LF_HETNET_CG_ABSENT,LF_CG_CHECK_GENE_TAG,LF_DG_IS_BIOMARKER,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
0,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,170
1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,171
8,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,360
9,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,361
10,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,362


## Construct the Grid Search

In [12]:
# Global Grid: every (epochs, l2_param, lr) combination; currently a single
# point. Passed to the training helper as `search_grid`.
epochs_grid, l2_param_grid, lr_grid = [100], [0.75], [1e-3]
grid = [
    (epochs, l2_param, lr)
    for epochs in epochs_grid
    for l2_param in l2_param_grid
    for lr in lr_grid
]

# Abstracts

In [13]:
# Label matrix without the candidate_id column
abstract_label_matrix = filtered_L_abstracts >> ply.select("candidate_id", drop=True)

# Summarise the label functions and index the summary by LF column name
analysis_module = LFAnalysis(abstract_label_matrix)
abstract_lf_summary = analysis_module.lf_summary()
abstract_lf_summary.index = list(abstract_label_matrix.columns)

abstract_lf_summary

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
LF_HETNET_DRUGBANK,[1],0.079681,0.079681,0.079681
LF_HETNET_DRUGCENTRAL,[1],0.026545,0.026545,0.026545
LF_HETNET_ChEMBL,[1],0.029977,0.029977,0.029977
LF_HETNET_BINDINGDB,[1],0.004335,0.004335,0.004335
LF_HETNET_PDSP_KI,[1],0.003047,0.003047,0.003047
...,...,...,...,...
LF_GG_BICLUSTER_AFFECTS_EXPRESSION,[1],0.022028,0.022028,0.022028
LF_GG_BICLUSTER_INCREASES_EXPRESSION,[0],0.039219,0.039219,0.039219
LF_GG_BICLUSTER_SIGNALING,[1],0.045618,0.045618,0.045618
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,[0],0.027567,0.027567,0.027567


# Set up fields for resampling

In [14]:
# Names of the first nine label-matrix columns; passed to every training run
# as lf_columns_base.
lf_columns_base = L_abstracts.columns[:9].tolist()
# Name of the last column, kept as a one-element list
candidate_id_field = L_abstracts.columns[-1:].tolist()
# Non-LF columns of the dev set handed to the training helper
dev_column_base = ["split", "curated_cbg", "document_id"]
# Accumulates whatever each training run below returns
data_columns = list()

# Abstracts

## Baseline

In [15]:
# Baseline: use all of the LF columns at positions [0, 9) in a single sample
number_of_samples = 1
cbg_start, cbg_end = 0, 9

cbg_lf_range = range(cbg_start, cbg_end)
size_of_samples = [cbg_end - cbg_start]

In [16]:
# For each requested sample size, call sample_lfs on the baseline LF positions
# with a fixed random_state.
baseline_lf_count = len(cbg_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(cbg_lf_range),
        baseline_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [17]:
# Run the training helper on the abstract label matrix with the baseline
# samples and append what it returns to data_columns.
baseline_marginals_file = notebook_output_dir / "cbg_training_marginals_baseline.tsv"
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(baseline_marginals_file),
        curated_label="curated_cbg",
        entity_label="CbG",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:00<00:00,  1.82it/s]


## DaG

In [18]:
# Column positions [9, 38) are the ones sampled in the DaG section
dag_start, dag_end = 9, 38
dag_lf_range = range(dag_start, dag_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [dag_end - dag_start]

In [19]:
# For each requested sample size, call sample_lfs on the DaG LF positions with
# a fixed random_state.
dag_lf_count = len(dag_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(dag_lf_range),
        dag_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [20]:
# Run the training helper on the abstract label matrix with the DaG samples
# and append what it returns to data_columns.
dag_marginals_file = notebook_output_dir / "dag_predicts_cbg_training_marginals.tsv"
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(dag_marginals_file),
        curated_label="curated_cbg",
        entity_label="DaG",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:00<00:00,  3.21it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.51it/s]
100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,

## CtD

In [21]:
# Column positions [38, 60) are the ones sampled in the CtD section
ctd_start, ctd_end = 38, 60
ctd_lf_range = range(ctd_start, ctd_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [ctd_end - ctd_start]

In [22]:
# For each requested sample size, call sample_lfs on the CtD LF positions with
# a fixed random_state.
ctd_lf_count = len(ctd_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(ctd_lf_range),
        ctd_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [23]:
# Run the training helper on the abstract label matrix with the CtD samples
# and append what it returns to data_columns.
ctd_marginals_file = notebook_output_dir / "ctd_predicts_cbg_training_marginals.tsv"
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(ctd_marginals_file),
        curated_label="curated_cbg",
        entity_label="CtD",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:00<00:00,  3.19it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
100%|██████████| 1/1 [00:00<00:00,  3.47it/s]
100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
100%|██████████| 1/1 [00:00<00:00,

## CbG

In [24]:
# Column positions [60, 80) are the ones sampled in the CbG section
cbg_start, cbg_end = 60, 80
cbg_lf_range = range(cbg_start, cbg_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [cbg_end - cbg_start]

In [25]:
# For each requested sample size, call sample_lfs on the CbG LF positions with
# a fixed random_state.
cbg_lf_count = len(cbg_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(cbg_lf_range),
        cbg_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [26]:
# Run the training helper on the abstract label matrix with the CbG samples
# and append what it returns to data_columns.
cbg_marginals_file = notebook_output_dir / "cbg_predicts_cbg_training_marginals.tsv"
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(cbg_marginals_file),
        curated_label="curated_cbg",
        entity_label="CbG",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,

## GiG

In [27]:
# Column positions [80, 108) are the ones sampled in the GiG section
gig_start, gig_end = 80, 108
gig_lf_range = range(gig_start, gig_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [gig_end - gig_start]

In [28]:
# For each requested sample size, call sample_lfs on the GiG LF positions with
# a fixed random_state.
gig_lf_count = len(gig_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(gig_lf_range),
        gig_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [29]:
# Run the training helper on the abstract label matrix with the GiG samples
# and append what it returns to data_columns.
gig_marginals_file = notebook_output_dir / "gig_predicts_cbg_training_marginals.tsv"
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_abstracts,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(gig_marginals_file),
        curated_label="curated_cbg",
        entity_label="GiG",
        data_source="abstract",
    )
)

100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  3.51it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,

# Full Text

## DaG

In [30]:
# Column positions [9, 38) are the ones sampled in the full-text DaG section
dag_start, dag_end = 9, 38
dag_lf_range = range(dag_start, dag_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [dag_end - dag_start]

In [31]:
# For each requested sample size, call sample_lfs on the DaG LF positions with
# a fixed random_state.
dag_lf_count = len(dag_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(dag_lf_range),
        dag_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [32]:
# Run the training helper on the full-text label matrix with the DaG samples
# and append what it returns to data_columns.
dag_full_text_marginals_file = (
    notebook_output_dir / "dag_predicts_cbg_training_marginals_full_text.tsv"
)
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_full_text,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(dag_full_text_marginals_file),
        curated_label="curated_cbg",
        entity_label="DaG",
        data_source="full_text",
    )
)

100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  2.42it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,

## CtD

In [33]:
# Column positions [38, 60) are the ones sampled in the full-text CtD section
ctd_start, ctd_end = 38, 60
ctd_lf_range = range(ctd_start, ctd_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [ctd_end - ctd_start]

In [34]:
# For each requested sample size, call sample_lfs on the CtD LF positions with
# a fixed random_state.
ctd_lf_count = len(ctd_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(ctd_lf_range),
        ctd_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [35]:
# Run the training helper on the full-text label matrix with the CtD samples
# and append what it returns to data_columns.
ctd_full_text_marginals_file = (
    notebook_output_dir / "ctd_predicts_cbg_training_marginals_full_text.tsv"
)
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_full_text,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(ctd_full_text_marginals_file),
        curated_label="curated_cbg",
        entity_label="CtD",
        data_source="full_text",
    )
)

100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  2.34it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,

## CbG

In [36]:
# Column positions [60, 80) are the ones sampled in the full-text CbG section
cbg_start, cbg_end = 60, 80
cbg_lf_range = range(cbg_start, cbg_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [cbg_end - cbg_start]

In [37]:
# For each requested sample size, call sample_lfs on the CbG LF positions with
# a fixed random_state.
cbg_lf_count = len(cbg_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(cbg_lf_range),
        cbg_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [38]:
# Run the training helper on the full-text label matrix with the CbG samples
# and append what it returns to data_columns.
cbg_full_text_marginals_file = (
    notebook_output_dir / "cbg_predicts_cbg_training_marginals_full_text.tsv"
)
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_full_text,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(cbg_full_text_marginals_file),
        curated_label="curated_cbg",
        entity_label="CbG",
        data_source="full_text",
    )
)

100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  2.34it/s]
100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,

## GiG

In [39]:
# Column positions [80, 108) are the ones sampled in the full-text GiG section
gig_start, gig_end = 80, 108
gig_lf_range = range(gig_start, gig_end)

# Evenly spaced sample sizes, plus the full size of the range
number_of_samples = 50
size_of_samples = list(range(1, 17, 5)) + [gig_end - gig_start]

In [40]:
# For each requested sample size, call sample_lfs on the GiG LF positions with
# a fixed random_state.
gig_lf_count = len(gig_lf_range)
sampled_lfs_dict = {
    size: sample_lfs(
        list(gig_lf_range),
        gig_lf_count,
        size,
        number_of_samples,
        random_state=100,
    )
    for size in size_of_samples
}

In [41]:
# Run the training helper on the full-text label matrix with the GiG samples
# and append what it returns to data_columns.
gig_full_text_marginals_file = (
    notebook_output_dir / "gig_predicts_cbg_training_marginals_full_text.tsv"
)
data_columns.extend(
    train_generative_label_function_sampler(
        filtered_L_full_text,
        L_dev,
        sampled_lfs_dict,
        lf_columns_base=lf_columns_base,
        candidate_id_field=candidate_id_field,
        dev_column_base=dev_column_base,
        search_grid=grid,
        marginals_df_file=str(gig_full_text_marginals_file),
        curated_label="curated_cbg",
        entity_label="GiG",
        data_source="full_text",
    )
)

100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
100%|██████████| 1/1 [00:00<00:00,  2.42it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
100%|██████████| 1/1 [00:00<00:00,  2.42it/s]
100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.34it/s]
100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
100%|██████████| 1/1 [00:00<00:00,

# Write Performance to File

In [42]:
# Collect every record accumulated in data_columns into one table
performance_df = pd.DataFrame.from_records(data=data_columns)
performance_df

Unnamed: 0,lf_sample,aupr,auroc,bce_loss,sampled_lf_name,entity_label,data_source
0,9,0.285701,0.555309,0.385774,"LF_HETNET_DRUGBANK,LF_HETNET_DRUGCENTRAL,LF_HE...",CbG,abstract
1,1,0.285913,0.558374,0.451068,LF_DG_DISTANCE_LONG,DaG,abstract
2,1,0.069079,0.553091,0.460584,LF_DG_BICLUSTER_BIOMARKERS,DaG,abstract
3,1,0.090265,0.542000,0.477596,LF_DG_BICLUSTER_PATHOGENESIS,DaG,abstract
4,1,0.285535,0.554696,0.453491,LF_DaG_CELLULAR_ACTIVITY,DaG,abstract
...,...,...,...,...,...,...,...
1996,28,0.065156,0.480532,0.649011,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1997,28,0.065156,0.480532,0.649011,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1998,28,0.065156,0.480532,0.649011,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1999,28,0.065156,0.480532,0.649011,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text


In [43]:
# Write the collected performance records as a tab-separated file.
# to_csv does not create missing parent directories, so make sure the
# destination folder exists first (no-op when it is already there).
performance_file = Path("output/performance") / Path("CbG_performance.tsv")
performance_file.parent.mkdir(parents=True, exist_ok=True)

(
    performance_df
    >> ply.call(
        "to_csv",
        str(performance_file),
        index=False,
        sep="\t",
    )
)