# Setting

In [1]:
import os

import numpy as np
import pandas as pd

from bionlp.util import fs, io, func


# Argument type names of every event type, grouped by year string.
EVNT_ARG_TYPE = {
    '2016': {
        'Lives_In': ['Bacteria', 'Location'],
    },
    '2013': {
        'Localization': ['Bacterium', 'Localization'],
        'PartOf': ['Host', 'Part'],
    },
    '2011': {
        'RegulonDependence': ['Regulon', 'Target'],
        'BindTo': ['Agent', 'Target'],
        'TranscriptionFrom': ['Transcription', 'Site'],
        'RegulonMember': ['Regulon', 'Member'],
        'SiteOf': ['Site', 'Entity'],
        'TranscriptionBy': ['Transcription', 'Agent'],
        'PromoterOf': ['Promoter', 'Gene'],
        'PromoterDependence': ['Promoter', 'Protein'],
        'ActionTarget': ['Action', 'Target'],
        'Interaction': ['Agent', 'Target'],
    },
}
# Second naming of the same event arguments; not referenced by the cells shown here.
# NOTE(review): presumably the corrected role names for the 2011 events -- confirm.
FIX_EVNT_ARG_TYPE = {
    '2016': {
        'Lives_In': ['Bacteria', 'Location'],
    },
    '2013': {
        'Localization': ['Bacterium', 'Localization'],
        'PartOf': ['Host', 'Part'],
    },
    '2011': {
        'RegulonDependence': ['Agent', 'Target'],
        'BindTo': ['Agent', 'Target'],
        'TranscriptionFrom': ['Process', 'Promoter'],
        'RegulonMember': ['Regulon', 'Member'],
        'SiteOf': ['Target', 'Site'],
        'TranscriptionBy': ['Transcription', 'Polymerase'],
        'PromoterOf': ['Agent', 'Target'],
        'PromoterDependence': ['Agent', 'Target'],
        'ActionTarget': ['Action', 'Target'],
        'Interaction': ['Agent', 'Target'],
    },
}

# Year and task select both the data directory and the workspace/demo directories.
YEAR = '2011'
TASK = 'bgi'
DATA_PATH = '../../data/bioevent/bnlpst/%s/%s' % (YEAR, TASK)
WS_PATH = '../../data/bioevent/bnlpst/%s' % (TASK + YEAR)
DEMO_PATH = '../../data/bioevent/bnlpst/%s-demo' % (TASK + YEAR)
TEST_DIR = os.path.join(DATA_PATH, 'dev')
PRED_DIR = os.path.join(DATA_PATH, 'pred')

# Unique argument type names (order is whatever set() yields) and event type names for YEAR.
ENTITIES = list(set(func.flatten_list(EVNT_ARG_TYPE[YEAR].values())))
EVENTS = EVNT_ARG_TYPE[YEAR].keys()

# Load Data

In [2]:
def get_data(ret_field='entity', iterator=False, batch_size=32, dataset='dev', fmt='h5'):
    """Load the feature matrices and the labels of one data split from ``dataset.h5``.

    All frames are read from ``<DATA_PATH>/dataset.h5`` under keys of the form
    ``cbow/<dataset>_X<i>`` / ``cbow/<dataset>_Y`` for events and
    ``cbow/<dataset>_ent_X<i>`` / ``cbow/<dataset>_ent_Y`` for entities.

    Parameters
    ----------
    ret_field : str
        ``'entity'`` (two X matrices) or ``'event'`` (four X matrices).
    iterator : bool
        Forwarded to ``pd.read_hdf``; when true, ``batch_size`` is used as ``chunksize``.
    batch_size : int
        Chunk size used only when ``iterator`` is true.
    dataset : str
        Name of the split used as key prefix (e.g. ``'train'``, ``'dev'``, ``'test'``).
    fmt : str
        File extension appended to the intermediate file names; it is stripped again
        with ``os.path.splitext`` before the HDF key is built, so a plain extension
        such as ``'h5'`` does not influence the key.

    Returns
    -------
    tuple
        ``(Xs, Y)`` where ``Xs`` is the list of objects returned by ``pd.read_hdf``
        for the X keys. ``Y`` is the result of ``pd.read_hdf`` for the label key,
        except for ``dataset == 'test'`` where it is the object read from
        ``<DATA_PATH>/test_rawdata.pkl`` by ``io.read_obj``.

    Raises
    ------
    ValueError
        If ``ret_field`` is neither ``'entity'`` nor ``'event'``.
    """
    # ret_field -> (X file-name template, number of X matrices, Y file-name template)
    layouts = {
        'event': ('%s_X%i.%s', 4, '%s_Y.%s'),
        'entity': ('%s_ent_X%i.%s', 2, '%s_ent_Y.%s'),
    }
    if ret_field not in layouts:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError('Unsupported ret_field %r; expected one of %s' % (ret_field, sorted(layouts)))
    x_tmpl, num_x, y_tmpl = layouts[ret_field]
    store_path = os.path.join(DATA_PATH, 'dataset.h5')

    def _key(fname):
        # HDF key = 'cbow/' + file name without its (last) extension
        return 'cbow/%s' % os.path.splitext(fname)[0]

    def _read(key):
        # chunksize is only meaningful when an iterator is requested
        return pd.read_hdf(store_path, key=key, iterator=iterator, chunksize=batch_size if iterator else None)

    Xs = [_read(_key(x_tmpl % (dataset, i, fmt))) for i in range(num_x)]
    if dataset == 'test':
        # The test split has no label frame; the raw data object is returned instead.
        # NOTE(review): io.read_obj presumably unpickles the file -- only load trusted data.
        return Xs, io.read_obj(os.path.join(DATA_PATH, 'test_rawdata.pkl'))
    return Xs, _read(_key(y_tmpl % (dataset, fmt)))

# Feature matrices (Xs) and labels (Y) of the train and dev splits, for entities and events
train_ent_Xs, train_ent_Y = get_data(dataset='train', ret_field='entity')
train_evnt_Xs, train_evnt_Y = get_data(dataset='train', ret_field='event')
dev_ent_Xs, dev_ent_Y = get_data(dataset='dev', ret_field='entity')
dev_evnt_Xs, dev_evnt_Y = get_data(dataset='dev', ret_field='event')
# Row labels of the dev event frame, used later to turn event ids into row positions
evnt_idx = dev_evnt_Y.index.tolist()

# Argument embeddings
all_evnt_args = train_ent_Y.columns.tolist()
# The first event label column has the form '<event>:<left argument>:<right argument>'
evnt_type, lent_type, rent_type = train_evnt_Y.columns[0].split(':')
evnt_args = [lent_type, rent_type]
evnt_arg_idx = [all_evnt_args.index(arg) for arg in evnt_args]
# Both argument orders: (left, right) and (right, left)
evnt_arg_idcs = [evnt_arg_idx, evnt_arg_idx[::-1]]
# One frame per argument order; each concatenates the per-argument frames column-wise
argvecs = [
    pd.concat(
        [
            pd.read_hdf(os.path.join(DATA_PATH, 'dataset.h5'), key='cbow/dev_argvec%i_X%i' % (arg_idx, i))
            for arg_idx in arg_idcs
        ],
        axis=1
    )
    for i, arg_idcs in enumerate(evnt_arg_idcs)
]

# Predictions
# (entity predictions are disabled)
# ent_preds = [io.read_npz(os.path.join(DEMO_PATH, 'clf_pred_vecentnet_%i.npz' % i)) for i in range(len(ENTITIES))]
# ent_pred_lbs = np.column_stack([x['pred_lb'] for x in ent_preds])
# One prediction file per event type; the 'pred_lb' arrays are stacked as columns
evnt_preds = [
    io.read_npz(os.path.join(DEMO_PATH, 'clf_pred_vecomnet_%i.npz' % i))
    for i in range(len(EVENTS))
]
evnt_pred_lbs = np.column_stack([pred['pred_lb'] for pred in evnt_preds])

# Select Cases

In [8]:
# Events and entities of the selected sentence
evnt_ids = [
    'PMID-10629188-S5|T1|T2',
    'PMID-10629188-S5|T4|T2',
    'PMID-10629188-S5|T3|T2',
]
ent_ids = [
    'PMID-10629188-S5|T1',
    'PMID-10629188-S5|T2',
    'PMID-10629188-S5|T3',
    'PMID-10629188-S5|T4',
]
# Candidate pairs: the events above followed by further entity pairs of the same sentence
cand_evnt_ids = evnt_ids + [
    'PMID-10629188-S5|T2|T1',
    'PMID-10629188-S5|T2|T4',
    'PMID-10629188-S5|T2|T3',
    'PMID-10629188-S5|T1|T3',
    'PMID-10629188-S5|T1|T4',
    'PMID-10629188-S5|T3|T4',
    'PMID-10629188-S5|T4|T3',
]
# Row positions of the candidates within the dev event frame
cand_evnt_idx = [evnt_idx.index(evnt_id) for evnt_id in cand_evnt_ids]

# Restrict the dev frames to the selected rows; the last column of every X frame is dropped
case_ent_Xs = [X.loc[ent_ids].iloc[:, :-1] for X in dev_ent_Xs]
case_ent_Y = dev_ent_Y.loc[ent_ids]
case_evnt_Xs = [X.loc[cand_evnt_ids].iloc[:, :-1] for X in dev_evnt_Xs]
case_evnt_Y = dev_evnt_Y.loc[cand_evnt_ids]
case_argvecs = [vec.loc[cand_evnt_ids] for vec in argvecs]

In [9]:
# Display the first argument-embedding frame, restricted to the candidate event ids
case_argvecs[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
PMID-10629188-S5|T1|T2,0.021462,0.002148,0.219799,-0.198471,0.043353,0.231267,0.079087,-0.197723,-0.101788,0.026542,...,-0.215574,-0.181692,-0.138249,0.260174,0.063956,0.192489,-0.051192,-0.155358,-0.171592,0.150099
PMID-10629188-S5|T4|T2,0.077294,0.130082,-0.065601,0.068243,-0.116693,-0.308104,0.161389,0.178533,0.376327,-0.132733,...,-0.028199,0.033322,-0.072644,-0.007327,0.023465,-0.035529,-0.036928,-0.031568,0.018851,0.06637
PMID-10629188-S5|T3|T2,0.097857,0.130305,-0.046709,0.078384,-0.118899,-0.289273,0.152811,0.165581,0.361983,-0.100643,...,-0.025805,0.054395,-0.052587,-0.051897,0.01222,-0.040572,0.019874,-0.032239,0.037888,0.063138
PMID-10629188-S5|T2|T1,0.081511,0.143499,0.037592,0.034386,-0.057944,-0.223832,0.077743,0.071256,0.174321,-0.097735,...,0.034602,0.081207,0.071162,-0.017889,-0.048623,0.025016,-0.00907,-0.057166,0.137263,0.081832
PMID-10629188-S5|T2|T4,0.081511,0.143499,0.037592,0.034386,-0.057944,-0.223832,0.077743,0.071256,0.174321,-0.097735,...,0.034602,0.081207,0.071162,-0.017889,-0.048623,0.025016,-0.00907,-0.057166,0.137263,0.081832
PMID-10629188-S5|T2|T3,0.081511,0.143499,0.037592,0.034386,-0.057944,-0.223832,0.077743,0.071256,0.174321,-0.097735,...,0.034602,0.081207,0.071162,-0.017889,-0.048623,0.025016,-0.00907,-0.057166,0.137263,0.081832
PMID-10629188-S5|T1|T3,0.021462,0.002148,0.219799,-0.198471,0.043353,0.231267,0.079087,-0.197723,-0.101788,0.026542,...,-0.215574,-0.181692,-0.138249,0.260174,0.063956,0.192489,-0.051192,-0.155358,-0.171592,0.150099
PMID-10629188-S5|T1|T4,0.021462,0.002148,0.219799,-0.198471,0.043353,0.231267,0.079087,-0.197723,-0.101788,0.026542,...,-0.215574,-0.181692,-0.138249,0.260174,0.063956,0.192489,-0.051192,-0.155358,-0.171592,0.150099
PMID-10629188-S5|T3|T4,0.097857,0.130305,-0.046709,0.078384,-0.118899,-0.289273,0.152811,0.165581,0.361983,-0.100643,...,-0.025805,0.054395,-0.052587,-0.051897,0.01222,-0.040572,0.019874,-0.032239,0.037888,0.063138
PMID-10629188-S5|T4|T3,0.077294,0.130082,-0.065601,0.068243,-0.116693,-0.308104,0.161389,0.178533,0.376327,-0.132733,...,-0.028199,0.033322,-0.072644,-0.007327,0.023465,-0.035529,-0.036928,-0.031568,0.018851,0.06637


In [10]:
# Display the dev-set event labels of the candidate event ids
case_evnt_Y

Unnamed: 0,ActionTarget:Action:Target,Interaction:Agent:Target,TranscriptionBy:Transcription:Agent
PMID-10629188-S5|T1|T2,1,0,0
PMID-10629188-S5|T4|T2,0,1,0
PMID-10629188-S5|T3|T2,0,1,0
PMID-10629188-S5|T2|T1,-1,0,0
PMID-10629188-S5|T2|T4,0,-1,0
PMID-10629188-S5|T2|T3,0,-1,0
PMID-10629188-S5|T1|T3,0,0,0
PMID-10629188-S5|T1|T4,0,0,0
PMID-10629188-S5|T3|T4,0,0,0
PMID-10629188-S5|T4|T3,0,0,0


# Predictions for selected cases

In [11]:
# Column labels: every event column of the training labels followed by a 'Dir' column
# NOTE(review): 'Dir' presumably holds the predicted argument direction -- confirm.
pred_columns = func.flatten_list([[col, 'Dir'] for col in train_evnt_Y.columns])
# Predicted labels of the candidate events, indexed by their event ids
pred_df = pd.DataFrame(
    evnt_pred_lbs[cand_evnt_idx],
    index=cand_evnt_ids,
    columns=pred_columns
)
pred_df

Unnamed: 0,ActionTarget:Action:Target,Dir,BindTo:Agent:Target,Dir.1,Interaction:Agent:Target,Dir.2,PromoterDependence:Promoter:Protein,Dir.3,PromoterOf:Promoter:Gene,Dir.4,RegulonDependence:Regulon:Target,Dir.5,RegulonMember:Regulon:Member,Dir.6,SiteOf:Site:Entity,Dir.7,TranscriptionBy:Transcription:Agent,Dir.8,TranscriptionFrom:Transcription:Site,Dir.9
PMID-10629188-S5|T1|T2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T4|T2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T3|T2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T2|T1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T2|T4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T2|T3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T1|T3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T1|T4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T3|T4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PMID-10629188-S5|T4|T3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# NOTE(review): this whole cell is commented out; it appears to be kept as a reference for
# loading the per-entity models and predicting on case_ent_Xs. Before re-enabling it, note
# that `entities` (lowercase) is not defined in the visible cells -- the settings cell
# defines ENTITIES.
# from bionlp.model import kerasext, vecomnet
# fnames = ['vecentnet_clf_%i.pkl' % i for i in range(len(entities))]
# ent_models = [io.read_obj(os.path.join(WS_PATH, 'ent-pred', 'ent%i' % i, fname)) for i, fname in enumerate(fnames)]
# preds = []
# for i, mdl, fname in zip(range(len(ent_models)), ent_models, fnames):
#     custom_objects = {}
#     custom_objects = func.update_dict(func.update_dict(custom_objects, kerasext.CUSTOM_METRIC), vecomnet.CUSTOM_LOSS)
#     mdl.load(os.path.join(WS_PATH, 'bgi2011-ent-pred-mdl', 'ent%i' % i, os.path.splitext(fname)[0]), custom_objects=custom_objects)
#     for layer in mdl.model.layers:
#         layer.name = '%s%i-%s' % ('Obj', i, layer.name)
#     preds.append([mdl.predict(case_ent_Xs), mdl.predict_proba(case_ent_Xs)])
#     del mdl