In [42]:
import torch
import os
import pandas as pd
import matchzoo as mz
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
TYPE = 'classification'

classification_task = mz.tasks.Classification(num_classes=3)
classification_task.metrics = ['acc']
print(classification_task.num_classes)
print(classification_task.output_shape)
print(classification_task.output_dtype)
print(classification_task)

3
(3,)
<class 'int'>
Classification Task with 3 classes


# Data Pack Sample

In [3]:
# pack
df = pd.DataFrame(data={'text_left': list('ABBCD'),
                       'text_right': list('aacbd'),
                       'label': [-1, 1, 0, 0, 1]})
mz.pack(df, task=TYPE).frame()

Unnamed: 0,id_left,text_left,id_right,text_right,label
0,L-0,A,R-0,a,-1
1,L-1,B,R-0,a,1
2,L-1,B,R-1,c,0
3,L-2,C,R-2,b,0
4,L-3,D,R-3,d,1


In [4]:
# data_pack
left = [
    ['artid1', 'A1'],
    ['artid2', 'A2'],
    ['artid3', 'A3']
]
right = [
    ['hypoid1', 'prompt1'],
    ['hypoid2', 'prompt2'],
    ['hypoid3', 'prompt3']
]
relation = [
    ['artid1', 'hypoid1', -1],
    ['artid1', 'hypoid3', 1],
    ['artid2', 'hypoid2', 0],
    ['artid3', 'hypoid3', 1]
]

relation_df = pd.DataFrame(relation)
# relation_df
left = pd.DataFrame(left)
right = pd.DataFrame(right)
dp = mz.DataPack(
    relation=relation_df,
    left=left,
    right=right
)
# print(len(dp))
# print(type(dp.frame))
# frame_slice = dp.frame[0:5]
# type(frame_slice)
# list(frame_slice.columns)
# full_frame = dp.frame()

In [5]:
data_pack = mz.datasets.toy.load_data(stage='train')
type(data_pack)

matchzoo.data_pack.data_pack.DataPack

# Prepare input data

In [6]:
annot = pd.read_csv('./annotations/annotations_merged.csv')
print(annot.dtypes)
annot.sort_values('PMCID').head(10)

UserID              int64
PromptID            int64
PMCID               int64
Valid Label          bool
Valid Reasoning      bool
Label              object
Annotations        object
Label Code          int64
In Abstract          bool
Evidence Start      int64
Evidence End        int64
dtype: object


Unnamed: 0,UserID,PromptID,PMCID,Valid Label,Valid Reasoning,Label,Annotations,Label Code,In Abstract,Evidence Start,Evidence End
13994,0,7902,60007,True,True,significantly increased,Mivacurium 250 μg/kg produced a maximal T bloc...,1,True,1280,1395
14000,3,7904,60007,True,True,no significant difference,Heart rate was similar between doses,0,True,1452,1488
12483,0,7905,60007,True,True,significantly decreased,while both AUC-SBP and AUC-DBP were significan...,-1,False,16737,16836
12484,1,7905,60007,True,True,significantly decreased,"In relation to the cardiovascular response, th...",-1,False,16641,16836
13999,0,7904,60007,True,True,no significant difference,"In relation to the cardiovascular response, th...",0,False,16641,16736
13998,3,7903,60007,True,True,no significant difference,Spontaneous recovery times were similar in bot...,0,True,1396,1450
13997,0,7903,60007,True,True,no significant difference,The times to OA and to spontaneous recovery of...,0,False,14755,15079
13995,3,7902,60007,True,True,significantly increased,Mivacurium 250 μg/kg produced a maximal T bloc...,1,True,1280,1395
2855,0,1806,111193,True,True,no significant difference,The amount of blood transfusion was identical ...,0,True,1418,1578
2334,0,1808,111193,True,True,no significant difference,There was no difference in time spent in hospi...,0,False,14701,14770


In [7]:
"""Process txt file for Articles input"""
TXT_PATH = './annotations/txt_files/'
TAR_PATH = './annotations/processed_txt_files/'
look_up = {}

if not os.path.exists(TAR_PATH):
    os.mkdir(TAR_PATH)
# else: os.removedirs(TAR_PATH)
for file in os.listdir(TXT_PATH):
    fname = file[3:]
    look_up[int(file[3:-4])] = fname
    with open(TXT_PATH+file, 'r', encoding='utf-8') as f, open(TAR_PATH+fname, 'w', encoding='utf-8') as t:
        for line in f.readlines():
            t.write(line.strip())

In [8]:
"""Articles Map"""
left_articles = []
for file in os.listdir(TAR_PATH):
    with open(TAR_PATH+file, 'r', encoding='utf-8') as f:
        left_articles.append([int(file[:-4]), f.readlines()[0]])
# print(left_articles[0:5])

"""Prompts Map"""
right_prompts = [list(pair) for pair in zip(annot['PromptID'].values, annot['Annotations'].values)]
print(right_prompts[0:5])

"""Articles <-> Prompts Map"""
article_prompt_relations = [list(triplet) for triplet in zip(annot['PMCID'].values, 
                                                             annot['PromptID'].values, 
                                                             annot['Label Code'].values)]
print(article_prompt_relations[0:5])

"""Create Data-pack"""
left = pd.DataFrame(left_articles, columns=['id_left', 'text_left'])
right = pd.DataFrame(right_prompts, columns=['id_right', 'text_right'])
relation = pd.DataFrame(article_prompt_relations, columns=['id_left', 'id_right', 'label'])
dp = mz.DataPack(
    relation = relation,
    left = left,
    right = right
)

# print(left)
# print(len(dp))
# print(type(dp.frame))
# frame = dp.frame
# print(list(frame().columns))
# # frame_slice = dp.frame[0:5]
# # type(frame_slice)
# # list(frame_slice.columns)
# full_frame = dp.frame()

In [26]:
""" Read article by name """
def read_article(id) -> str:
    with open(TAR_PATH+look_up[id], 'r', encoding='utf-8') as f:
        return f.readlines()[0]

text_left = [read_article(id) for id in annot['PMCID']]
# print(text_left[0:5])

["TITLE: Influence of dextran-70 on systemic inflammatory response and myocardial ischaemia-reperfusion following cardiac operationsABSTRACT.INTRODUCTION:Experimental studies have demonstrated that dextran-70 reduces the leukocyte–endothelium interaction, but clinical evidence is still lacking. Our objective was to justify the anti-inflammatory effect of dextran-70 following cardiac operations.ABSTRACT.METHODS:Forty patients undergoing coronary bypass surgery (n = 32) or aortic valve replacement (n = 8) were enrolled in this prospective, randomized, double-blind study. Two groups were formed. In group A (n = 20), dextran-70 infusion was administered at a dose of 7.5 ml/kg before the initiation of cardiopulmonary bypass and at a dose of 12.5 ml/kg after the cessation of cardiopulmonary bypass. Group B served as a control with identical amounts of gelatin infusion (n = 20). The plasma concentration of procalcitonin, C-reactive protein, IL 6, IL 6r, IL 8, IL 10, soluble endothelial leukoc

In [44]:
df = pd.DataFrame(data={
    'id_left': annot['PMCID'],
    'text_left': text_left,
    'id_right': annot['PromptID'],
    'text_right': annot['Annotations'],
    'label': annot['Label Code']
})

""" Split data pack into train/valid """
train, valid = train_test_split(df, test_size=0.2)
train_pack = mz.pack(train, task=TYPE)
valid_pack = mz.pack(valid, task=TYPE)
train_pack.frame().head(10) # DataFrame

Unnamed: 0,id_left,text_left,id_right,text_right,label
0,4868208,TITLE: Effects of Kinesio taping and Mcconnell...,9144,Statistically significant between-group\r\r\r\...,1
1,5384512,TITLE: Bremelanotide for Female Sexual Dysfunc...,6597,"For 1.25/1.75-mg pooled versus placebo, mean c...",1
2,5729670,TITLE: Comparison of Two Protocols in the Mana...,10698,The parameters for glycemic variability (SD of...,-1
3,3479017,TITLE: The effect of a comprehensive lifestyle...,7291,"<td align=""left"" valign=""bottom"">Average veget...",0
4,5168897,TITLE: Comparative randomised study of GlideSc...,6272,We detected three cases of sore throat in each...,0
5,1868022,TITLE: Papanicolaou smears and cervical inflam...,6424,"CVL levels of IL-8, IL-1β and IL-6 remained un...",0
6,5673730,TITLE: Feto-maternal outcomes and Glycemic con...,1338,here was more number of cesarean sections due ...,-1
7,5571493,TITLE: Glycemic effects of quinine infusion in...,10309,There were no signs of hypoglycemia or signifi...,0
8,3529678,TITLE: CanPrevent: a telephone-delivered inter...,3749,"<td align=""left"" valign=""bottom""> Vegetables, ...",1
9,4318496,TITLE: Video game training and the reward syst...,10041,"Interestingly, when using uncorrected signific...",1


In [46]:
dp = mz.pack(df, task=TYPE)
print(type(dp.frame))
frame_slice = dp.frame[0:5]
print(list(frame_slice.columns))
full_frame = dp.frame()
len(full_frame) == len(dp)

<class 'matchzoo.data_pack.data_pack.DataPack.FrameView'>
['id_left', 'text_left', 'id_right', 'text_right', 'label']


True

# Train 