# Does resampling experiment help with predicting DaG sentences?

In [1]:
from itertools import product
from pathlib import Path
import re
import warnings

import numpy as np
import pandas as pd
import plotnine as p9
import scipy.stats
from sqlalchemy import create_engine
from snorkel.labeling.analysis import LFAnalysis
from snorkel.labeling.model import LabelModel
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import torch
import torch.nn.functional as F
import tqdm

from snorkeling_helper.generative_model_helper import (
    sample_lfs,
    train_generative_label_function_sampler,
)

warnings.filterwarnings("ignore")

In [2]:
username = "danich1"
password = "snorkel"
dbname = "pubmed_central_db"
database_str = (
    f"postgresql+psycopg2://{username}:{password}@/{dbname}?host=/var/run/postgresql"
)
conn = create_engine(database_str)

## Load the data

In [3]:
L_abstracts = pd.read_csv(
    "../label_candidates/output/dg_abstract_train_candidates_resampling.tsv", sep="\t"
)

print(L_abstracts.shape)
L_abstracts.head().T

(1539670, 105)


Unnamed: 0,0,1,2,3,4
LF_HETNET_DISEASES,-1,-1,-1,-1,-1
LF_HETNET_DOAF,-1,-1,-1,-1,-1
LF_HETNET_DisGeNET,-1,-1,-1,1,-1
LF_HETNET_GWAS,-1,-1,-1,-1,-1
LF_HETNET_DaG_ABSENT,0,0,0,-1,0
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,-1,-1,-1,-1


In [4]:
L_full_text = pd.read_csv(
    "../label_candidates/output/dg_full_text_train_candidates_resampling.tsv", sep="\t"
)

print(L_full_text.shape)
L_full_text.head().T

(1956984, 105)


Unnamed: 0,0,1,2,3,4
LF_HETNET_DISEASES,-1,-1,-1,-1,-1
LF_HETNET_DOAF,-1,-1,-1,-1,-1
LF_HETNET_DisGeNET,-1,-1,-1,-1,-1
LF_HETNET_GWAS,-1,-1,-1,-1,-1
LF_HETNET_DaG_ABSENT,0,0,0,0,0
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,0,-1,-1,-1


In [5]:
L_dev = pd.read_csv(
    "../label_candidates/output/dg_dev_test_candidates_resampling.tsv", sep="\t"
).query("split==1")
print(L_dev.shape)
L_dev.head().T

(975, 107)


Unnamed: 0,0,8,25,33,50
LF_HETNET_DISEASES,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_DOAF,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_DisGeNET,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_GWAS,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_DaG_ABSENT,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,-1.0,-1.0,-1.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,-1.0,-1.0,-1.0
split,1.0,1.0,1.0,1.0,1.0
document_id,23520.0,629602.0,993337.0,1434797.0,1350353.0


## Resort the Candidates Based on Abstract

In [6]:
# Grab the document ids for resampling
sql = """
select dg_candidates.sentence_id, document_id, dg_candidates.candidate_id from sentence
inner join (
  select candidate.candidate_id, disease_gene.sentence_id from disease_gene
  inner join candidate on candidate.candidate_id=disease_gene.candidate_id
  ) as dg_candidates
on sentence.sentence_id = dg_candidates.sentence_id
"""
candidate_doc_df = pd.read_sql(sql, database_str)
candidate_doc_df.head()

Unnamed: 0,sentence_id,document_id,candidate_id
0,577814033,8168034,3623
1,592544979,12960042,11812
2,298670465,26635731,33796
3,409575187,23452434,51810
4,588478777,18324346,57316


In [7]:
filtered_candidate_id = candidate_doc_df.query(
    f"document_id in {list(L_dev.document_id.astype(int).unique())}"
).candidate_id.tolist()

In [8]:
if not Path("output/dag_dataset_mapper.tsv").exists():
    np.random.seed(100)
    sorted_train_df = (
        candidate_doc_df.query(
            f"document_id not in {list(L_dev.document_id.astype(int).unique())}"
        )[["document_id"]]
        .drop_duplicates()
        .assign(
            dataset=lambda x: np.random.choice(
                ["train", "tune", "test"], x.shape[0], p=[0.7, 0.2, 0.1]
            )
        )
    )
    sorted_train_df.to_csv("output/dag_dataset_mapper.tsv", sep="\t", index=False)
else:
    sorted_train_df = pd.read_csv("output/dag_dataset_mapper.tsv", sep="\t")
sorted_train_df.head()

Unnamed: 0,document_id,dataset
0,8168034,train
1,12960042,train
2,26635731,train
3,23452434,tune
4,18324346,train


In [9]:
trained_documents = (
    sorted_train_df.merge(candidate_doc_df, on="document_id")
    .query("dataset=='train'")
    .candidate_id.tolist()
)

In [10]:
filtered_L_abstracts = L_abstracts.query(f"candidate_id in {trained_documents}")
print(filtered_L_abstracts.shape)
filtered_L_abstracts.head()

(1076965, 105)


Unnamed: 0,LF_HETNET_DISEASES,LF_HETNET_DOAF,LF_HETNET_DisGeNET,LF_HETNET_GWAS,LF_HETNET_DaG_ABSENT,LF_DG_IS_BIOMARKER,LF_DaG_ASSOCIATION,LF_DaG_WEAK_ASSOCIATION,LF_DaG_NO_ASSOCIATION,LF_DaG_CELLULAR_ACTIVITY,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
0,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,121
1,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,122
2,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,1,-1,-1,-1,-1,1,-1,-1,124
3,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,148
4,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,190


In [11]:
filtered_L_full_text = L_full_text.query(f"candidate_id in {trained_documents}")
print(filtered_L_full_text.shape)
filtered_L_full_text.head()

(1372364, 105)


Unnamed: 0,LF_HETNET_DISEASES,LF_HETNET_DOAF,LF_HETNET_DisGeNET,LF_HETNET_GWAS,LF_HETNET_DaG_ABSENT,LF_DG_IS_BIOMARKER,LF_DaG_ASSOCIATION,LF_DaG_WEAK_ASSOCIATION,LF_DaG_NO_ASSOCIATION,LF_DaG_CELLULAR_ACTIVITY,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
0,-1,-1,-1,-1,0,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,97
1,-1,-1,-1,-1,0,-1,1,-1,-1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,0,98
2,-1,-1,-1,-1,0,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,100
3,-1,-1,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,101
4,-1,-1,-1,-1,0,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,102


## Construct the Grid Search

In [12]:
# Global Grid
epochs_grid = [100]
l2_param_grid = [0.75]
lr_grid = [1e-3]
grid = list(product(epochs_grid, l2_param_grid, lr_grid))

# Abstracts

In [13]:
analysis_module = LFAnalysis(filtered_L_abstracts.drop(["candidate_id"], axis=1))

abstract_lf_summary = analysis_module.lf_summary()
abstract_lf_summary.index = filtered_L_abstracts.drop(
    ["candidate_id"], axis=1
).columns.tolist()
abstract_lf_summary

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
LF_HETNET_DISEASES,[1],0.325606,0.325606,0.325606
LF_HETNET_DOAF,[1],0.145826,0.145826,0.145826
LF_HETNET_DisGeNET,[1],0.299600,0.299600,0.299600
LF_HETNET_GWAS,[1],0.036476,0.036476,0.036476
LF_HETNET_DaG_ABSENT,[0],0.574512,0.574512,0.574512
...,...,...,...,...
LF_GG_BICLUSTER_AFFECTS_EXPRESSION,[1],0.020184,0.020184,0.020184
LF_GG_BICLUSTER_INCREASES_EXPRESSION,[0],0.042169,0.042169,0.042169
LF_GG_BICLUSTER_SIGNALING,[1],0.046558,0.046558,0.046558
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,[0],0.023896,0.023896,0.023896


# Set up For Resampling

In [14]:
lf_columns_base = list(L_abstracts.columns[0:5])
candidate_id_field = list(L_abstracts.columns[-1:])
dev_column_base = ["split", "curated_dsh", "document_id"]
data_columns = []

# Abstracts

## Baseline

In [15]:
dag_start = 0
dag_end = 5
number_of_samples = 1

dag_lf_range = range(dag_start, dag_end)
size_of_samples = [len(dag_lf_range)]

In [16]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(dag_lf_range),
            len(list(dag_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [17]:
data_columns += train_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/dag_training_marginals_baseline.tsv",
    curated_label="curated_dsh",
    entity_label="DaG",
    data_source="abstract",
)

100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


## DaG

In [18]:
dag_start = 5
dag_end = 34

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, dag_end - dag_start]
number_of_samples = 50
dag_lf_range = range(dag_start, dag_end)

In [19]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(dag_lf_range),
            len(list(dag_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [20]:
data_columns += train_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/dag_predicts_dag_training_marginals.tsv",
    curated_label="curated_dsh",
    entity_label="DaG",
    data_source="abstract",
)

100%|██████████| 1/1 [00:00<00:00,  3.63it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.90it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.65it/s]
100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00,

## CtD

In [21]:
ctd_start = 34
ctd_end = 56

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, ctd_end - ctd_start]
number_of_samples = 50
ctd_lf_range = range(ctd_start, ctd_end)

In [22]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(ctd_lf_range),
            len(list(ctd_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [23]:
data_columns += train_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/ctd_predicts_dag_training_marginals.tsv",
    curated_label="curated_dsh",
    entity_label="CtD",
    data_source="abstract",
)

100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
100%|██████████| 1/1 [00:00<00:00,  4.08it/s]
100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
100%|██████████| 1/1 [00:00<00:00,  4.06it/s]
100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,

## CbG

In [24]:
cbg_start = 56
cbg_end = 76

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, cbg_end - cbg_start]
number_of_samples = 50
cbg_lf_range = range(cbg_start, cbg_end)

In [25]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(cbg_lf_range),
            len(list(cbg_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [26]:
data_columns += train_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/cbg_predicts_dag_training_marginals.tsv",
    curated_label="curated_dsh",
    entity_label="CbG",
    data_source="abstract",
)

100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.90it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
100%|██████████| 1/1 [00:00<00:00,

## GiG

In [27]:
gig_start = 76
gig_end = 104

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, gig_end - gig_start]
number_of_samples = 50
gig_lf_range = range(gig_start, gig_end)

In [28]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(gig_lf_range),
            len(list(gig_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [29]:
data_columns += train_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/gig_predicts_dag_training_marginals.tsv",
    curated_label="curated_dsh",
    entity_label="GiG",
    data_source="abstract",
)

100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.20it/s]
100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
100%|██████████| 1/1 [00:00<00:00,  3.09it/s]
100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.89it/s]
100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.70it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.73it/s]
100%|██████████| 1/1 [00:00<00:00,

# Full Text

## DaG

In [30]:
dag_start = 5
dag_end = 24

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, dag_end - dag_start]
number_of_samples = 50
dag_lf_range = range(dag_start, dag_end)

In [31]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(dag_lf_range),
            len(list(dag_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [32]:
data_columns += train_generative_label_function_sampler(
    L_full_text,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/dag_predicts_dag_training_marginals_full_text.tsv",
    curated_label="curated_dsh",
    entity_label="DaG",
    data_source="full_text",
)

100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
100%|██████████| 1/1 [00:00<00:00,  2.57it/s]
100%|██████████| 1/1 [00:00<00:00,

## CtD

In [33]:
ctd_start = 34
ctd_end = 56

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, ctd_end - ctd_start]
number_of_samples = 50
ctd_lf_range = range(ctd_start, ctd_end)

In [34]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(ctd_lf_range),
            len(list(ctd_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [35]:
data_columns += train_generative_label_function_sampler(
    L_full_text,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/ctd_predicts_dag_training_marginals_full_text.tsv",
    curated_label="curated_dsh",
    entity_label="CtD",
    data_source="full_text",
)

100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.88it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  2.90it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
100%|██████████| 1/1 [00:00<00:00,  2.92it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:00<00:00,

## CbG

In [36]:
cbg_start = 56
cbg_end = 76

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, cbg_end - cbg_start]
number_of_samples = 50
cbg_lf_range = range(cbg_start, cbg_end)

In [37]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(cbg_lf_range),
            len(list(cbg_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [38]:
data_columns += train_generative_label_function_sampler(
    L_full_text,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/cbg_predicts_dag_training_marginals_full_text.tsv",
    curated_label="curated_dsh",
    entity_label="CbG",
    data_source="full_text",
)

100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:00<00:00,  2.88it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  2.59it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.90it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.90it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
100%|██████████| 1/1 [00:00<00:00,  2.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.90it/s]
100%|██████████| 1/1 [00:00<00:00,

## GiG

In [39]:
gig_start = 76
gig_end = 104

# Spaced out number of sampels including total
size_of_samples = [1, 6, 11, 16, gig_end - gig_start]
number_of_samples = 50
gig_lf_range = range(gig_start, gig_end)

In [40]:
sampled_lfs_dict = {
    sample_size: (
        sample_lfs(
            list(gig_lf_range),
            len(list(gig_lf_range)),
            sample_size,
            number_of_samples,
            random_state=100,
        )
    )
    for sample_size in size_of_samples
}

In [41]:
data_columns += train_generative_label_function_sampler(
    L_full_text,
    L_dev,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    candidate_id_field=candidate_id_field,
    dev_column_base=dev_column_base,
    search_grid=grid,
    marginals_df_file="output/DaG/gig_predicts_dag_training_marginals_full_text.tsv",
    curated_label="curated_dsh",
    entity_label="GiG",
    data_source="full_text",
)

100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:00<00:00,  2.75it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:00<00:00,  2.65it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:00<00:00,  2.70it/s]
100%|██████████| 1/1 [00:00<00:00,  2.69it/s]
100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
100%|██████████| 1/1 [00:00<00:00,  2.69it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:00<00:00,

# Write Performance to File

In [42]:
performance_df = pd.DataFrame.from_records(data_columns)
performance_df

Unnamed: 0,lf_sample,aupr,auroc,bce_loss,sampled_lf_name,entity_label,data_source
0,5,0.692821,0.500000,1.956008,"LF_HETNET_DISEASES,LF_HETNET_DOAF,LF_HETNET_Di...",DaG,abstract
1,1,0.691416,0.504358,0.849345,LF_DG_DISTANCE_LONG,DaG,abstract
2,1,0.462427,0.514668,0.827043,LF_DG_BICLUSTER_BIOMARKERS,DaG,abstract
3,1,0.464732,0.515456,0.804404,LF_DG_BICLUSTER_PATHOGENESIS,DaG,abstract
4,1,0.633416,0.530401,0.827718,LF_DaG_CELLULAR_ACTIVITY,DaG,abstract
...,...,...,...,...,...,...,...
1996,28,0.460373,0.568421,3.535775,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1997,28,0.460373,0.568421,3.535775,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1998,28,0.460373,0.568421,3.535775,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text
1999,28,0.460373,0.568421,3.535775,"LF_GiG_BINDING_IDENTIFICATIONS,LF_GiG_CELL_IDE...",GiG,full_text


In [43]:
performance_df.to_csv("output/performance/DaG_performance.tsv", index=False, sep="\t")