# Does the resampling experiment help with predicting GiG sentences?

In [1]:
import itertools
import os
from pathlib import Path
from urllib.parse import quote_plus
import warnings

import numpy as np
import pandas as pd
import plydata as ply
from sqlalchemy import create_engine

from snorkel.labeling.analysis import LFAnalysis
from snorkeling_helper.generative_model_helper import (
    sample_lfs,
    run_generative_label_function_sampler,
)

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that is acceptable.
warnings.filterwarnings("ignore")

In [2]:
# Database connection settings.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (or committed with) the notebook.
# The fallbacks reproduce the original hard-coded values.
username = os.environ.get("PUBMED_DB_USER", "danich1")
password = os.environ.get("PUBMED_DB_PASSWORD", "snorkel")
dbname = os.environ.get("PUBMED_DB_NAME", "pubmed_central_db")

# User name and password are URL-escaped so that characters such as "@" or
# "/" in an overridden value cannot corrupt the connection URL. For the
# default values the escaped text is identical to the raw text.
database_str = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@/{dbname}?host=/var/run/postgresql"
)
conn = create_engine(database_str)

## Load the data

In [3]:
# Output folder used by this notebook.
notebook_output_dir = Path("output/GiG")
# Folder holding the label-candidate TSV files read below.
label_candidates_dir = Path("../label_candidates/output")

In [4]:
# Label-function output for the abstract training candidates (tab separated).
abstract_train_file = (
    label_candidates_dir / "gg_abstract_train_candidates_resampling.tsv"
)
L_abstracts = pd.read_csv(str(abstract_train_file), sep="\t")

print(L_abstracts.shape)
L_abstracts.head().T

(5420798, 109)


Unnamed: 0,0,1,2,3,4
LF_HETNET_HI_I_05,-1,-1,-1,-1,-1
LF_HETNET_VENKATESAN_09,-1,-1,-1,-1,-1
LF_HETNET_YU_11,-1,-1,-1,-1,-1
LF_HETNET_HI_II_14,-1,-1,-1,-1,-1
LF_HETNET_LIT_BM_13,-1,-1,-1,-1,-1
...,...,...,...,...,...
LF_GG_BICLUSTER_INCREASES_EXPRESSION,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_SIGNALING,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1,-1,-1,-1,-1
LF_GG_BICLUSTER_CELL_PRODUCTION,-1,-1,-1,-1,-1


In [5]:
# Dev set: rows of the shared dev/test candidate file whose split is 4.
dev_test_file = label_candidates_dir / "gg_dev_test_candidates_resampling.tsv"
dev_test_candidates = pd.read_csv(str(dev_test_file), sep="\t")
L_dev = dev_test_candidates >> ply.query("split==4")
print(L_dev.shape)
L_dev.head().T

(500, 111)


Unnamed: 0,0,1,2,6,7
LF_HETNET_HI_I_05,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_VENKATESAN_09,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_YU_11,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_HI_II_14,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_LIT_BM_13,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,-1.0,-1.0,0.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,-1.0,-1.0,0.0
split,4.0,4.0,4.0,4.0,4.0
document_id,304472.0,221544.0,225351.0,404727.0,38292.0


In [6]:
# Test set: rows of the shared dev/test candidate file whose split is 5.
dev_test_file = label_candidates_dir / "gg_dev_test_candidates_resampling.tsv"
dev_test_candidates = pd.read_csv(str(dev_test_file), sep="\t")
L_test = dev_test_candidates >> ply.query("split==5")
print(L_test.shape)
L_test.head().T

(500, 111)


Unnamed: 0,3,4,5,13,14
LF_HETNET_HI_I_05,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_VENKATESAN_09,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_YU_11,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_HI_II_14,-1.0,-1.0,-1.0,-1.0,-1.0
LF_HETNET_LIT_BM_13,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,-1.0,-1.0,-1.0,-1.0,-1.0
LF_GG_BICLUSTER_CELL_PRODUCTION,-1.0,-1.0,-1.0,-1.0,-1.0
split,5.0,5.0,5.0,5.0,5.0
document_id,35240.0,185144.0,140314.0,495305.0,447650.0


## Re-sort Based on the Candidate Abstracts

In [7]:
# Grab the document ids for resampling
# The query links each gene-gene candidate to its sentence and, through the
# sentence table, to the document that sentence belongs to.
sql = """
select gg_candidates.sentence_id, document_id, gg_candidates.candidate_id from sentence
inner join (
  select candidate.candidate_id, gene_gene.sentence_id from gene_gene
  inner join candidate on candidate.candidate_id=gene_gene.candidate_id
  ) as gg_candidates
on sentence.sentence_id = gg_candidates.sentence_id
"""
# Reuse the SQLAlchemy engine created earlier rather than passing the raw
# connection URL, which would make pandas build a second engine from it.
candidate_doc_df = pd.read_sql(sql, conn)
candidate_doc_df.head()

Unnamed: 0,sentence_id,document_id,candidate_id
0,243,18728748,38967952
1,284,18728748,38967951
2,290,18728748,38967950
3,295,18728748,38967949
4,299,18728748,38967948


In [8]:
# Distinct document ids found in the dev split.
# NOTE(review): despite the name, only L_dev is consulted here (L_test is
# not) — confirm that this is intended.
dev_test_ids = (
    L_dev >> ply.select("document_id") >> ply.distinct() >> ply.pull("document_id")
)

# The ids are embedded in the query expression as text. Converting through
# ``.tolist()`` yields plain Python numbers; iterating a NumPy array directly
# yields NumPy scalars, whose repr under NumPy >= 2 is ``np.float64(...)`` and
# is not valid inside the query expression.
filtered_candidate_id = (
    candidate_doc_df
    >> ply.query(f"document_id in {np.asarray(dev_test_ids).tolist()}")
    >> ply.pull("candidate_id")
)

In [9]:
# Table assigning each document id to a dataset label (e.g. train / tune).
dataset_mapper_file = notebook_output_dir / "gig_dataset_mapper.tsv"
sorted_train_df = pd.read_csv(str(dataset_mapper_file), sep="\t")
sorted_train_df.head()

Unnamed: 0,document_id,dataset
0,18728748,train
1,24496597,train
2,29615870,train
3,26338040,tune
4,27224915,train


## Load full text after document sorting

In [10]:
# Attach candidates to their document's dataset label, keep the rows marked
# as train, then extract the candidate ids of those rows.
train_candidate_rows = (
    sorted_train_df
    >> ply.inner_join(candidate_doc_df, on="document_id")
    >> ply.query("dataset=='train'")
)
trained_documents = train_candidate_rows >> ply.pull("candidate_id")

## Update the data based on sorting

In [11]:
# Keep only the abstract rows whose candidate belongs to a training document.
# The ids are written into the query expression as text, so they are first
# converted to plain Python integers with ``.tolist()``: iterating a NumPy
# array directly yields NumPy scalars, whose repr under NumPy >= 2 is
# ``np.int64(...)`` and is not valid inside the query expression.
filtered_L_abstracts = L_abstracts >> ply.query(
    f"candidate_id in {np.asarray(trained_documents).tolist()}"
)
print(filtered_L_abstracts.shape)
filtered_L_abstracts.head()

(3792568, 109)


Unnamed: 0,LF_HETNET_HI_I_05,LF_HETNET_VENKATESAN_09,LF_HETNET_YU_11,LF_HETNET_HI_II_14,LF_HETNET_LIT_BM_13,LF_HETNET_II_BINARY,LF_HETNET_II_LITERATURE,LF_HETNET_HETIO_DAG,LF_HETNET_GiG_ABSENT,LF_DG_IS_BIOMARKER,...,LF_GG_NO_VERB,LF_GG_BICLUSTER_BINDING,LF_GG_BICLUSTER_ENHANCES,LF_GG_BICLUSTER_ACTIVATES,LF_GG_BICLUSTER_AFFECTS_EXPRESSION,LF_GG_BICLUSTER_INCREASES_EXPRESSION,LF_GG_BICLUSTER_SIGNALING,LF_GG_BICLUSTER_IDENTICAL_PROTEIN,LF_GG_BICLUSTER_CELL_PRODUCTION,candidate_id
0,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,273
1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,274
2,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,275
3,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,276
4,-1,-1,-1,-1,-1,-1,-1,-1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,278


## Construct the Grid Search

In [12]:
# Global Grid
# Candidate values for each hyper-parameter.
epochs_grid = [250]
l2_param_grid = np.linspace(0.01, 5, num=5)
lr_grid = [1e-3]

# Every (epochs, l2, learning-rate) combination, in that tuple order.
# A Cartesian product is used so the grid stays correct when more than one
# epoch or learning-rate value is listed; with the single values above it
# produces exactly the tuples the earlier zip-based construction produced.
grid = list(itertools.product(epochs_grid, l2_param_grid, lr_grid))

# Abstracts

In [13]:
# Every column except candidate_id.
abstract_lf_matrix = filtered_L_abstracts >> ply.select("candidate_id", drop=True)

analysis_module = LFAnalysis(abstract_lf_matrix)
abstract_lf_summary = analysis_module.lf_summary()
# Label each summary row with the corresponding column name.
abstract_lf_summary.index = list(abstract_lf_matrix.columns)

abstract_lf_summary

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
LF_HETNET_HI_I_05,[1],0.002701,0.002701,0.002701
LF_HETNET_VENKATESAN_09,[1],0.000101,0.000101,0.000101
LF_HETNET_YU_11,[1],0.000617,0.000617,0.000617
LF_HETNET_HI_II_14,[1],0.008483,0.008483,0.008483
LF_HETNET_LIT_BM_13,[1],0.140026,0.140026,0.140026
...,...,...,...,...
LF_GG_BICLUSTER_AFFECTS_EXPRESSION,[1],0.026010,0.026010,0.026010
LF_GG_BICLUSTER_INCREASES_EXPRESSION,[0],0.043327,0.043327,0.043327
LF_GG_BICLUSTER_SIGNALING,[1],0.054802,0.054802,0.054802
LF_GG_BICLUSTER_IDENTICAL_PROTEIN,[0],0.031499,0.031499,0.031499


# Set up fields for resampling

In [14]:
abstract_columns = L_abstracts.columns

# Names of the first nine columns of the abstract label table.
lf_columns_base = abstract_columns[:9].tolist()
# Name of the last column, kept as a one-element list.
candidate_id_field = abstract_columns[-1:].tolist()
dev_column_base = ["split", "curated_gig", "document_id"]
# Filled by the sampler runs in the sections below.
data_columns = []

# Abstracts

## Baseline

In [15]:
# Baseline: index range [0, 9), used as a single sample containing all of it.
gig_start, gig_end = 0, 9
gig_lf_range = range(gig_start, gig_end)

number_of_samples = 1
size_of_samples = [gig_end - gig_start]

In [16]:
# One sample_lfs call per requested sample size, keyed by that size.
# random_state is pinned to 100 for every call.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    sampled_lfs_dict[sample_size] = sample_lfs(
        list(gig_lf_range),
        len(gig_lf_range),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [17]:
# Run the sampler for the baseline and append its records to data_columns.
baseline_marginals_file = notebook_output_dir / "gig_training_marginals_baseline.tsv"
baseline_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file=str(baseline_marginals_file),
    curated_label="curated_gig",
    entity_label="GiG_baseline",
    data_source="abstract",
)
data_columns.extend(baseline_records)

100%|██████████| 1/1 [00:32<00:00, 32.49s/it]


## DaG

In [18]:
# DaG section: index range [9, 38).
dag_start, dag_end = 9, 38
dag_lf_range = range(dag_start, dag_end)

# Spaced-out sample sizes, ending with the size of the whole range
size_of_samples = [1, 6, 11, 16, len(dag_lf_range)]
number_of_samples = 50

In [19]:
# One sample_lfs call per requested sample size, keyed by that size.
# random_state is pinned to 100 for every call.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    sampled_lfs_dict[sample_size] = sample_lfs(
        list(dag_lf_range),
        len(dag_lf_range),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [20]:
# Run the sampler for the DaG range and append its records to data_columns.
# No marginals file name is supplied for this section (empty string).
dag_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_gig",
    entity_label="DaG",
    data_source="abstract",
)
data_columns.extend(dag_records)

100%|██████████| 50/50 [10:38<00:00, 12.77s/it]
100%|██████████| 50/50 [18:38<00:00, 22.36s/it]
100%|██████████| 50/50 [22:18<00:00, 26.77s/it]
100%|██████████| 50/50 [32:02<00:00, 38.44s/it]


## CtD

In [21]:
# CtD section: index range [38, 60).
ctd_start, ctd_end = 38, 60
ctd_lf_range = range(ctd_start, ctd_end)

# Spaced-out sample sizes, ending with the size of the whole range
size_of_samples = [1, 6, 11, 16, len(ctd_lf_range)]
number_of_samples = 50

In [22]:
# One sample_lfs call per requested sample size, keyed by that size.
# random_state is pinned to 100 for every call.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    sampled_lfs_dict[sample_size] = sample_lfs(
        list(ctd_lf_range),
        len(ctd_lf_range),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [23]:
# Run the sampler for the CtD range and append its records to data_columns.
# No marginals file name is supplied for this section (empty string).
ctd_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_gig",
    entity_label="CtD",
    data_source="abstract",
)
data_columns.extend(ctd_records)

100%|██████████| 50/50 [10:34<00:00, 12.69s/it]
100%|██████████| 50/50 [15:23<00:00, 18.48s/it]
100%|██████████| 50/50 [18:47<00:00, 22.55s/it]
100%|██████████| 50/50 [22:40<00:00, 27.21s/it]
100%|██████████| 50/50 [27:25<00:00, 32.92s/it]


## CbG

In [24]:
# CbG section: index range [60, 80).
cbg_start, cbg_end = 60, 80
cbg_lf_range = range(cbg_start, cbg_end)

# Spaced-out sample sizes, ending with the size of the whole range
size_of_samples = [1, 6, 11, 16, len(cbg_lf_range)]
number_of_samples = 50

In [25]:
# One sample_lfs call per requested sample size, keyed by that size.
# random_state is pinned to 100 for every call.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    sampled_lfs_dict[sample_size] = sample_lfs(
        list(cbg_lf_range),
        len(cbg_lf_range),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [26]:
# Run the sampler for the CbG range and append its records to data_columns.
# No marginals file name is supplied for this section (empty string).
cbg_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file="",
    curated_label="curated_gig",
    entity_label="CbG",
    data_source="abstract",
)
data_columns.extend(cbg_records)

100%|██████████| 50/50 [10:32<00:00, 12.66s/it]
100%|██████████| 50/50 [15:23<00:00, 18.48s/it]
100%|██████████| 50/50 [22:28<00:00, 26.97s/it]
100%|██████████| 50/50 [25:31<00:00, 30.64s/it]


## GiG

In [27]:
# GiG section: index range [80, 108).
gig_start, gig_end = 80, 108
gig_lf_range = range(gig_start, gig_end)

# Spaced-out sample sizes, ending with the size of the whole range
size_of_samples = [1, 6, 11, 16, len(gig_lf_range)]
number_of_samples = 50

In [28]:
# One sample_lfs call per requested sample size, keyed by that size.
# random_state is pinned to 100 for every call.
sampled_lfs_dict = {}
for sample_size in size_of_samples:
    sampled_lfs_dict[sample_size] = sample_lfs(
        list(gig_lf_range),
        len(gig_lf_range),
        sample_size,
        number_of_samples,
        random_state=100,
    )

In [None]:
# Run the sampler for the GiG range and append its records to data_columns.
gig_marginals_file = notebook_output_dir / "gig_predicts_gig_training_marginals.tsv"
gig_records = run_generative_label_function_sampler(
    filtered_L_abstracts,
    L_dev,
    L_test,
    sampled_lfs_dict,
    lf_columns_base=lf_columns_base,
    grid_param=grid,
    marginals_df_file=str(gig_marginals_file),
    curated_label="curated_gig",
    entity_label="GiG",
    data_source="abstract",
)
data_columns.extend(gig_records)

100%|██████████| 50/50 [23:48<00:00, 28.57s/it]
100%|██████████| 50/50 [28:50<00:00, 34.61s/it]
100%|██████████| 50/50 [32:21<00:00, 38.83s/it]
100%|██████████| 50/50 [36:09<00:00, 43.38s/it]
 98%|█████████▊| 49/50 [44:52<00:54, 54.75s/it]

# Full Text

The full text data cannot be loaded into memory on my work desktop machine (64 GB of RAM).
It would have to be run on a cluster with more memory; however, given that full text hasn't improved performance, I'm electing to skip this section and work with abstracts alone.
Check [06_plot_labels_sampling_performance.ipynb](06_plot_labels_sampling_performance.ipynb) for full text and abstract analysis results.

# Write Performance to File

In [None]:
# Build one table from the records accumulated in data_columns by the
# sampler runs above, and display it.
performance_df = pd.DataFrame.from_records(data_columns)
performance_df

In [None]:
# Create the destination folder first so the write cannot fail on a missing
# directory after the long-running sampling cells have finished.
performance_file = Path("output/performance/GiG_performance.tsv")
performance_file.parent.mkdir(parents=True, exist_ok=True)
performance_df.to_csv(str(performance_file), index=False, sep="\t")