In [1]:
import os 
import numpy as np
import pandas as pd
import pickle
import sys

In [2]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (such as the project's utils package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from utils import Sample_Size_Extractor
from utils import performance

In [4]:
import timeit
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [14]:
# load dataframe: 400 abstracts from PubMed, Covid-Set(300) + General-Set(100)
# The columns used by the rest of the notebook are 'pmid', 'tt_sample_size' and 'abstract'.
df_all = pd.read_csv('data/df_ab_w_tt_ss_400.csv')

In [15]:
# Preview the first rows to sanity-check the loaded columns.
df_all.head()

Unnamed: 0,pmid,tt_sample_size,abstract
0,33607104,50.0,TITLE: No clinical benefit of high dose cortic...
1,33106170,400.0,TITLE: BCG revaccination of health workers in ...
2,32673060,423.0,TITLE: Hydroxychloroquine in Nonhospitalized A...
3,33165621,479.0,TITLE: Effect of Hydroxychloroquine on Clinica...
4,33619178,1000.0,TITLE: Influence of a COVID-19 vaccine's effec...


In [9]:
# Shared configuration for the experiments below.
# max_features is handed to the Preprocessor (presumably a vocabulary-size cap -- confirm in utils);
# epochs and batch_size are the training settings for model.fit.
max_features, epochs, batch_size = 10000, 10, 32

# Word vectors are loaded from the binary file "PubMed-w2v.bin".
wvs = Sample_Size_Extractor.load_trained_w2v_model("PubMed-w2v.bin")

# The preprocessor is built from every abstract in df_all, i.e. from the texts
# of all cross-validation folds at once.
abstract_texts = df_all["abstract"].values
preprocessor = Sample_Size_Extractor.Preprocessor(max_features, wvs, abstract_texts)



# Model initialized with randomized weights

In [10]:
# Five shuffled folds with a fixed seed; the same kf object is re-split in later
# cells and is expected to yield the same partition each time.
kf = KFold(shuffle=True, random_state=0, n_splits=5)

In [22]:
# Generate 5-fold cross-validation predictions on these 400 abstracts.
# Every fold trains a freshly constructed model (no pretrained weights are loaded
# in this variant) on the training split and predicts on the held-out split.
result_path = 'data/result/pred_no_pretraining_400_5fcv.csv'
# Create the output directory up front so a missing folder cannot discard
# several minutes of training at the very end.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

fold_pred_frames = []
for train_index, test_index in kf.split(df_all):
    start = timeit.default_timer()
    # KFold.split yields positional indices, so rows are selected with .iloc.
    df_train = df_all.iloc[train_index, :]
    df_test = df_all.iloc[test_index, :]

    X_tr, y_tr = Sample_Size_Extractor.generate_X_y(df_train)
    nn_ = Sample_Size_Extractor.SampleSizeClassifier(preprocessor)
    # presumably builds nn_.model (it is used immediately afterwards) -- confirm in utils
    nn_.fit_MLP_model()

    X_tr_fvs = nn_.featurize_for_input(X_tr)
    # Hyper-parameters come from the configuration cell rather than repeated literals.
    nn_.model.fit(X_tr_fvs, y_tr,
                  epochs=epochs, batch_size=batch_size, verbose=0)

    # The third argument (0.2) is forwarded to get_test_result; presumably a
    # confidence threshold -- confirm in utils.performance.
    pmid_list_, ss_pred_list_, conf_pred_list_ = performance.get_test_result(df_test, nn_, 0.2)
    df_test_pred = pd.DataFrame({'pmid': pmid_list_, 'pred_ss': ss_pred_list_, 'conf': conf_pred_list_})
    # Collect per-fold frames and concatenate once after the loop;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    fold_pred_frames.append(df_test_pred)
    del nn_
    stop = timeit.default_timer()
    print('finish one fold, spend '+ str(stop-start)+'s.')

# Per-fold row indices are kept as-is (no ignore_index), matching the previous CSV layout.
df_pred_result_all = pd.concat(fold_pred_frames)
df_pred_result_all.to_csv(result_path)

finish one fold, spend 135.89152275200013s.
finish one fold, spend 128.24783697800103s.
finish one fold, spend 133.26289692200044s.
finish one fold, spend 128.8037666830005s.
finish one fold, spend 126.7202151299989s.


In [23]:
# Train on all 400 abstracts (no pretrained initialization) and save the weights.
result_path = 'data/result/pred_no_pretraining_400_nocv.csv'
weights_path = 'data/pretrained_weights/SSE_no_pretraining_with_ebmnlp.h5'
# Ensure both output directories exist before training starts.
for out_path in (result_path, weights_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

X_tr, y_tr = Sample_Size_Extractor.generate_X_y(df_all)
nn_ = Sample_Size_Extractor.SampleSizeClassifier(preprocessor)
# presumably builds nn_.model (it is used immediately afterwards) -- confirm in utils
nn_.fit_MLP_model()
X_tr_fvs = nn_.featurize_for_input(X_tr)
# Hyper-parameters come from the configuration cell rather than repeated literals.
nn_.model.fit(X_tr_fvs, y_tr, epochs=epochs, batch_size=batch_size, verbose=0)

# NOTE: these predictions are made on df_all, the same rows the model was just
# trained on ("nocv"), so they are not a held-out estimate.
pmid_list_, ss_pred_list_, conf_pred_list_ = performance.get_test_result(df_all, nn_, 0.2)
df_test_pred = pd.DataFrame({'pmid': pmid_list_, 'pred_ss': ss_pred_list_, 'conf': conf_pred_list_})
df_test_pred.to_csv(result_path)
nn_.model.save_weights(weights_path)

In [24]:
# Generate the gold standard (true total sample size) tagged with the
# cross-validation fold in which each abstract is held out, for evaluation.
# kf.split(df_all) is called again here; kf was created with shuffle=True and a
# fixed random_state, so it is expected to reproduce the folds used for prediction.
train_index_dict = {}
test_index_dict = {}
for fold_id, (train_index, test_index) in enumerate(kf.split(df_all)):
    train_index_dict[fold_id] = train_index
    test_index_dict[fold_id] = test_index

pmid_ls = []
tt_ss_ls = []
fold_id_ls = []
for fold_id, cur_test_index in test_index_dict.items():
    # KFold.split yields positional indices, so rows are selected with .iloc.
    cur_df = df_all.iloc[cur_test_index]
    # extend() grows the lists in place instead of rebuilding them with `+` each fold.
    pmid_ls.extend(cur_df['pmid'])
    tt_ss_ls.extend(cur_df['tt_sample_size'])
    fold_id_ls.extend([fold_id] * len(cur_df))

gold_w_foldid_df = pd.DataFrame({'pmid': pmid_ls, 'tt_sample_size': tt_ss_ls, 'fold_id': fold_id_ls})
gold_w_foldid_df.to_csv('data/result/gold_w_foldid.csv')

# Model initialized with pretrained weights

In [25]:
# Generate 5-fold cross-validation predictions, this time starting every fold
# from the pretrained weights stored at pre_w_path before fine-tuning.
pre_w_path = 'data/pretrained_weights/p2_wo_es_32.h5'
result_path = 'data/result/pred_pretraining_400_5fcv.csv'
# Create the output directory up front so a missing folder cannot discard
# several minutes of training at the very end.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

fold_pred_frames = []
for train_index, test_index in kf.split(df_all):
    start = timeit.default_timer()
    # KFold.split yields positional indices, so rows are selected with .iloc.
    df_train = df_all.iloc[train_index, :]
    df_test = df_all.iloc[test_index, :]

    X_tr, y_tr = Sample_Size_Extractor.generate_X_y(df_train)
    nn_ = Sample_Size_Extractor.SampleSizeClassifier(preprocessor)
    # presumably builds nn_.model (weights are loaded into it right after) -- confirm in utils
    nn_.fit_MLP_model()
    nn_.model.load_weights(pre_w_path)

    X_tr_fvs = nn_.featurize_for_input(X_tr)
    # Hyper-parameters come from the configuration cell rather than repeated literals.
    nn_.model.fit(X_tr_fvs, y_tr,
                  epochs=epochs, batch_size=batch_size, verbose=0)

    # The third argument (0.2) is forwarded to get_test_result; presumably a
    # confidence threshold -- confirm in utils.performance.
    pmid_list_, ss_pred_list_, conf_pred_list_ = performance.get_test_result(df_test, nn_, 0.2)
    df_test_pred = pd.DataFrame({'pmid': pmid_list_, 'pred_ss': ss_pred_list_, 'conf': conf_pred_list_})
    # Collect per-fold frames and concatenate once after the loop;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    fold_pred_frames.append(df_test_pred)
    del nn_
    stop = timeit.default_timer()
    print('finish one fold, spend '+ str(stop-start)+'s.')

# Per-fold row indices are kept as-is (no ignore_index), matching the previous CSV layout.
df_pred_result_all = pd.concat(fold_pred_frames)
df_pred_result_all.to_csv(result_path)

finish one fold, spend 123.02742466000018s.
finish one fold, spend 120.21527635699931s.
finish one fold, spend 121.16634548199909s.
finish one fold, spend 120.16461057500055s.
finish one fold, spend 124.96583413400003s.


In [26]:
# Train on all 400 abstracts, starting from the pretrained weights at pre_w_path,
# and save the fine-tuned weights.
result_path = 'data/result/pred_pretraining_400_nocv.csv'
weights_path = 'data/pretrained_weights/SSE_pretraining_with_ebmnlp.h5'
# Ensure both output directories exist before training starts.
for out_path in (result_path, weights_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

X_tr, y_tr = Sample_Size_Extractor.generate_X_y(df_all)
nn_ = Sample_Size_Extractor.SampleSizeClassifier(preprocessor)
# presumably builds nn_.model (weights are loaded into it right after) -- confirm in utils
nn_.fit_MLP_model()
nn_.model.load_weights(pre_w_path)
X_tr_fvs = nn_.featurize_for_input(X_tr)
# Hyper-parameters come from the configuration cell rather than repeated literals.
nn_.model.fit(X_tr_fvs, y_tr, epochs=epochs, batch_size=batch_size, verbose=0)

# NOTE: these predictions are made on df_all, the same rows the model was just
# trained on ("nocv"), so they are not a held-out estimate.
pmid_list_, ss_pred_list_, conf_pred_list_ = performance.get_test_result(df_all, nn_, 0.2)
df_test_pred = pd.DataFrame({'pmid': pmid_list_, 'pred_ss': ss_pred_list_, 'conf': conf_pred_list_})
df_test_pred.to_csv(result_path)
nn_.model.save_weights(weights_path)

In [39]:
# Pickling the preprocessor object itself is left disabled:
# with open('data/preprocessor.pickle', 'wb') as handle:
#    pickle.dump(preprocessor, handle)
# Instead, store the raw abstract texts so a preprocessor can be rebuilt from them later.
abstract_texts = df_all["abstract"].values
with open('data/400_abstract_text.pickle', 'wb') as fh:
    pickle.dump(abstract_texts, fh)

# Extract total sample size for any RCT abstracts

In [29]:
# Input for the extractor: a dataframe with at least the columns 'pmid' and 'abstract'.
# As a demo, the first 20 rows of df_all are used and also written out as a sample input file.
df_to_extract = df_all.loc[:, ['pmid', 'abstract']].head(20)
df_to_extract.to_csv('data/sample_input.csv', index=False)
# Alternative: load the input from a CSV file instead.
#df_to_extract = pd.read_csv('data/sample_input.csv')

In [34]:
# Preview the extractor input (only 'pmid' and 'abstract' are kept).
df_to_extract.head()

Unnamed: 0,pmid,abstract
0,33607104,TITLE: No clinical benefit of high dose cortic...
1,33106170,TITLE: BCG revaccination of health workers in ...
2,32673060,TITLE: Hydroxychloroquine in Nonhospitalized A...
3,33165621,TITLE: Effect of Hydroxychloroquine on Clinica...
4,33619178,TITLE: Influence of a COVID-19 vaccine's effec...


In [41]:
# Read back the abstract texts used during training; they are needed to
# initialize the preprocessor. The file is the one this notebook writes with
# pickle.dump -- only unpickle files from a trusted source.
with open('data/400_abstract_text.pickle', 'rb') as fh:
    prev_abstract_text = pickle.load(fh)

In [47]:
# Run the trained sample size extractor on df_to_extract.
# NOTE: this cell rebinds pre_w_path and preprocessor, which earlier cells also
# use; re-running those cells afterwards would pick up the values set here.
pre_w_path = 'data/pretrained_weights/SSE_pretraining_with_ebmnlp.h5'
max_features, epochs, batch_size = 10000, 10, 32

# wvs is reused from the configuration cell; uncomment to load it when starting here.
#wvs = Sample_Size_Extractor.load_trained_w2v_model("PubMed-w2v.bin")

# Rebuild the preprocessor from the saved training abstracts followed by the new abstracts.
p_all_text = [*prev_abstract_text, *df_to_extract["abstract"].values]
preprocessor = Sample_Size_Extractor.Preprocessor(max_features, wvs, p_all_text)

# Construct the classifier and load our trained weights into it.
nn_ = Sample_Size_Extractor.SampleSizeClassifier(preprocessor)
nn_.fit_MLP_model()
nn_.model.load_weights(pre_w_path)

# Predict and write one row per abstract: pmid, predicted sample size, confidence.
pmid_list_, ss_pred_list_, conf_pred_list_ = performance.get_test_result(df_to_extract, nn_, 0.2)
df_test_pred = pd.DataFrame(dict(pmid=pmid_list_, pred_ss=ss_pred_list_, conf=conf_pred_list_))
df_test_pred.to_csv('data/sample_output.csv', index=False)

In [53]:
# Show the first ten predictions (pmid, predicted sample size, confidence).
df_test_pred.head(10)

Unnamed: 0,pmid,pred_ss,conf
0,33607104,50.0,0.602764
1,33106170,400.0,0.584205
2,32673060,10.0,0.757549
3,33165621,102.0,0.884804
4,33619178,3.0,0.746485
5,33246499,,
6,33568628,100.0,0.75669
7,33596857,4099.0,0.491054
8,33306283,1033.0,0.682457
9,32627205,1584.0,0.970726


# Appendix: pretraining with EBM-NLP

In [None]:
#df_ebm: a dataframe generated with sample size information from EBM-NLP corpus
#df_ebm = pd.read_csv('ebm_df_ab_w_tt_ss.csv', index_col=0)
#p2 = Sample_Size_Extractor.Preprocessor(max_features, wvs, df_ebm["abstract"].values)

In [None]:
#df_to_train = df_ebm.copy()
#X_tr, y_tr = Sample_Size_Extractor.generate_X_y(df_to_train)
#nn_ = Sample_Size_Extractor.SampleSizeClassifier(p2)
#nn_.fit_MLP_model()
#X_tr_fvs = nn_.featurize_for_input(X_tr)
#nn_.model.fit(X_tr_fvs, y_tr,
#                     epochs=10, batch_size=64, validation_split=0.1)
#nn_.model.save_weights('data/pretrained_weights/pretraining_weights_ebmnlp.h5')