In [19]:
import torch
import os
import pandas as pd
import matchzoo as mz
import numpy as np

In [4]:
TYPE = 'classification'

classification_task = mz.tasks.Classification(num_classes=3)
classification_task.metrics = ['acc']
print(classification_task.num_classes)
print(classification_task.output_shape)
print(classification_task.output_dtype)
print(classification_task)

3
(3,)
<class 'int'>
Classification Task with 3 classes


# Data Pack Sample

In [8]:
# pack
df = pd.DataFrame(data={'text_left': list('ABBCD'),
                       'text_right': list('aacbd'),
                       'label': [-1, 1, 0, 0, 1]})
mz.pack(df, task=TYPE).frame()

Unnamed: 0,id_left,text_left,id_right,text_right,label
0,L-0,A,R-0,a,-1
1,L-1,B,R-0,a,1
2,L-1,B,R-1,c,0
3,L-2,C,R-2,b,0
4,L-3,D,R-3,d,1


In [16]:
# data_pack
left = [
    ['artid1', 'A1'],
    ['artid2', 'A2'],
    ['artid3', 'A3']
]
right = [
    ['hypoid1', 'prompt1'],
    ['hypoid2', 'prompt2'],
    ['hypoid3', 'prompt3']
]
relation = [
    ['artid1', 'hypoid1', -1],
    ['artid1', 'hypoid3', 1],
    ['artid2', 'hypoid2', 0],
    ['artid3', 'hypoid3', 1]
]

relation_df = pd.DataFrame(relation)
# relation_df
left = pd.DataFrame(left)
right = pd.DataFrame(right)
dp = mz.DataPack(
    relation=relation_df,
    left=left,
    right=right
)
print(len(dp))
dp

4


<matchzoo.data_pack.data_pack.DataPack at 0x1fc9e3fad68>

In [18]:
data_pack = mz.datasets.toy.load_data(stage='train')
type(data_pack)

matchzoo.data_pack.data_pack.DataPack

# Annotations

In [22]:
annot = pd.read_csv('./annotations/annotations_merged.csv')
annot.sort_values('PMCID').head(10)

Unnamed: 0,UserID,PromptID,PMCID,Valid Label,Valid Reasoning,Label,Annotations,Label Code,In Abstract,Evidence Start,Evidence End
13994,0,7902,60007,True,True,significantly increased,Mivacurium 250 μg/kg produced a maximal T bloc...,1,True,1280,1395
14000,3,7904,60007,True,True,no significant difference,Heart rate was similar between doses,0,True,1452,1488
12483,0,7905,60007,True,True,significantly decreased,while both AUC-SBP and AUC-DBP were significan...,-1,False,16737,16836
12484,1,7905,60007,True,True,significantly decreased,"In relation to the cardiovascular response, th...",-1,False,16641,16836
13999,0,7904,60007,True,True,no significant difference,"In relation to the cardiovascular response, th...",0,False,16641,16736
13998,3,7903,60007,True,True,no significant difference,Spontaneous recovery times were similar in bot...,0,True,1396,1450
13997,0,7903,60007,True,True,no significant difference,The times to OA and to spontaneous recovery of...,0,False,14755,15079
13995,3,7902,60007,True,True,significantly increased,Mivacurium 250 μg/kg produced a maximal T bloc...,1,True,1280,1395
2855,0,1806,111193,True,True,no significant difference,The amount of blood transfusion was identical ...,0,True,1418,1578
2334,0,1808,111193,True,True,no significant difference,There was no difference in time spent in hospi...,0,False,14701,14770


In [38]:
# Process txt file for Articles input

TXT_PATH = './annotations/txt_files/'
TAR_PATH = './annotations/processed_txt_files/'

if not os.path.exists(TAR_PATH):
    os.mkdir(TAR_PATH)
# else: os.removedirs(TAR_PATH)
for file in os.listdir(TXT_PATH):
    # print(file)
    fname = file[3:]
    with open(TXT_PATH+file, 'r', encoding='utf-8') as f, open(TAR_PATH+fname, 'w', encoding='utf-8') as t:
        for line in f.readlines():
            t.write(line.strip())

In [70]:
# Articles Map

left_articles = []
for file in os.listdir(TAR_PATH):
    with open(TAR_PATH+file, 'r', encoding='utf-8') as f:
        left_articles.append([int(file[:-4]), f.readlines()[0]])
print(left_articles[0:5])

[[1090584, 'TITLE: Who wants to join preventive trials? – Experience from the Estonian Postmenopausal Hormone Therapy Trial [ISRCTN35338757]ABSTRACT.BACKGROUND:The interest of patients in participating in randomized clinical trials involving treatments has been widely studied, but there has been much less research on interest in preventive trials. The objective of this study was to find out how many women would be interested in a trial involving postmenopausal hormone therapy (PHT) and how the women\'s background characteristics and opinions correlated to their interest.ABSTRACT.METHODS:The data come from recruitment questionnaires (n = 2000) sent to women in Estonia in 1998. A random sample of women aged 45 to 64 was drawn from the Population Registry. The trial is a two-group randomized trial comparing estrogen-progestogen therapy with placebo or no drugs. A brief description of the study was attached to the questionnaires. Women were not told at this stage of the recruitment which g

In [54]:
# Prompts Map

right_prompts = [list(pair) for pair in zip(annot['PromptID'].values, annot['Annotations'].values)]
print(right_prompts[0:5])

[[213, 'IL-6r (ng/ml)\t\t\t\t\t\t\t Group A\t43.6 (1.7–125.0)\t\t47.4 (0.7–109.5)\t56.2 (25.2–226.3)\t\t\t0.949† Group B\t40.7 (15.6–94.6)\t\t42.4 (22.2–100.5)\t50.2 (13.2–104.9)\t\t\t0.861†\tp = 0.607*\t\tp = 0.914*\tp = 0.304*'], [213, 'There was no significant difference in IL 6, IL-6r and C-reactive protein values between groups.'], [213, 'There was no significant difference in IL 6, IL-6r and C-reactive protein values between groups'], [213, 'There was no significant difference in IL 6, IL-6r and C-reactive protein values between groups'], [98, 'After two weeks of treatment, the reduction in ulcer area was doubled in the HBOT group (P = 0.037)']]


In [75]:
# Articles <-> Prompts Map

article_prompt_relations = [list(triplet) for triplet in zip(annot['PMCID'].values, 
                                                             annot['PromptID'].values, 
                                                             annot['Label Code'].values)]
print(article_prompt_relations[0:5])

[[2206488, 213, 0], [2206488, 213, 0], [2206488, 213, 0], [2206488, 213, 0], [2858204, 98, 1]]


[1, 3, 5]
[2, 4, 6]
