In [ ]:
import torch
import os
import pandas as pd
import matchzoo as mz
import numpy as np
from sklearn.model_selection import train_test_split

In [ ]:
# Report which MatchZoo version this notebook is running against.
print(mz.__version__)
# NOTE(review): presumably set so CUDA errors surface at the failing call
# rather than asynchronously later -- confirm it is still needed outside of
# debugging, since it is expected to slow GPU execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import nltk
# Download the 'punkt' resource (presumably required by the tokenizer used
# during preprocessing -- confirm).
nltk.download('punkt')
# nltk.set_proxy('SYSTEM PROXY')

In [ ]:
# Task identifier passed to mz.pack() by the cells below.
TYPE = 'classification'

# Three-class classification task, scored with accuracy.
classification_task = mz.tasks.Classification(num_classes=3)
classification_task.metrics = ['acc']

# Show the task's class count, output shape, output dtype and the task
# itself, one value per line.
print(
    classification_task.num_classes,
    classification_task.output_shape,
    classification_task.output_dtype,
    classification_task,
    sep='\n',
)

# Data Pack Sample

In [ ]:
# pack
# Toy table: five left/right text pairs, each with a label.
df = pd.DataFrame({
    'text_left': ['A', 'B', 'B', 'C', 'D'],
    'text_right': ['a', 'a', 'c', 'b', 'd'],
    'label': [-1, 1, 0, 0, 1],
})
# Pack the table for the configured task and display the joined frame.
mz.pack(df, task=TYPE).frame()

In [ ]:
# data_pack
# Build a DataPack by hand from three tables: left texts, right texts and
# the [left id, right id, label] rows that relate them.
relation = [
    ['artid1', 'hypoid1', -1],
    ['artid1', 'hypoid3', 1],
    ['artid2', 'hypoid2', 0],
    ['artid3', 'hypoid3', 1],
]
relation_df = pd.DataFrame(relation)
# relation_df

# Three [id, text] rows per side.
left = pd.DataFrame([[f'artid{n}', f'A{n}'] for n in (1, 2, 3)])
right = pd.DataFrame([[f'hypoid{n}', f'prompt{n}'] for n in (1, 2, 3)])

dp = mz.DataPack(relation=relation_df, left=left, right=right)
# print(len(dp))
# print(type(dp.frame))
# frame_slice = dp.frame[0:5]
# type(frame_slice)
# list(frame_slice.columns)
# full_frame = dp.frame()

In [ ]:
# Load the 'train' stage of MatchZoo's bundled toy dataset and display the
# type of the returned object.
data_pack = mz.datasets.toy.load_data(stage='train')
type(data_pack)

# Prepare input data

In [ ]:
# Read the merged annotation table, print its column dtypes, and preview the
# first ten rows after sorting by PMCID.
annot = pd.read_csv('../annotations/annotations_merged.csv')
print(annot.dtypes)
annot.sort_values(by='PMCID').iloc[:10]

In [ ]:
"""Process txt file for Articles input"""
# Flatten every raw article in TXT_PATH into a single-line file in TAR_PATH
# and remember which file belongs to which numeric article id.
TXT_PATH = '../annotations/txt_files/'
TAR_PATH = '../annotations/processed_txt_files/'
look_up = {}  # numeric article id -> processed file name inside TAR_PATH

# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the notebook is re-run.
os.makedirs(TAR_PATH, exist_ok=True)
# else: os.removedirs(TAR_PATH)
for file in os.listdir(TXT_PATH):
    # Drop the 3-character file-name prefix (presumably "PMC" -- confirm
    # against the directory contents); the rest is "<numeric id>.<ext>" with
    # a 3-character extension.
    fname = file[3:]
    look_up[int(file[3:-4])] = fname
    with open(TXT_PATH+file, 'r', encoding='utf-8') as f, open(TAR_PATH+fname, 'w', encoding='utf-8') as t:
        # Join the stripped, non-empty lines with a single space. Writing
        # them back-to-back glued the last word of one line to the first
        # word of the next.
        stripped_lines = (line.strip() for line in f)
        t.write(' '.join(text for text in stripped_lines if text))

In [ ]:
"""Articles Map"""
# One [article id, article text] row per processed file.
left_articles = []
for file in os.listdir(TAR_PATH):
    with open(TAR_PATH+file, 'r', encoding='utf-8') as f:
        # readline() yields '' for an empty file, where readlines()[0]
        # raised IndexError; for a non-empty file the result is the same.
        left_articles.append([int(file[:-4]), f.readline()])
# print(left_articles[0:5])

"""Prompts Map"""
right_prompts = [list(pair) for pair in zip(annot['PromptID'].values, annot['Annotations'].values)]
print(right_prompts[0:5])

"""Articles <-> Prompts Map"""
article_prompt_relations = [
    list(triplet)
    for triplet in zip(annot['PMCID'].values,
                       annot['PromptID'].values,
                       annot['Label Code'].values)
]
print(article_prompt_relations[0:2])

"""Create Data-pack"""
# The DataPack frame view looks texts up by id (left.loc[id_left] /
# right.loc[id_right]), so both tables must be indexed by their id column and
# each id must appear once. This mirrors what mz.pack() builds internally.
left = (
    pd.DataFrame(left_articles, columns=['id_left', 'text_left'])
    .drop_duplicates('id_left')
    .set_index('id_left')
)
right = (
    pd.DataFrame(right_prompts, columns=['id_right', 'text_right'])
    .drop_duplicates('id_right')
    .set_index('id_right')
)
# NOTE(review): labels are taken from 'Label Code' unshifted here, whereas
# the DataFrame-based pack later in this notebook adds 1 -- confirm which
# encoding the 3-class task expects before using this pack for training.
relation = pd.DataFrame(article_prompt_relations, columns=['id_left', 'id_right', 'label'])
dp = mz.DataPack(
    relation=relation,
    left=left,
    right=right
)

# print(left)
# print(len(dp))
# print(type(dp.frame))
# frame = dp.frame
# print(list(frame().columns))
# # frame_slice = dp.frame[0:5]
# # type(frame_slice)
# # list(frame_slice.columns)
# full_frame = dp.frame()

In [ ]:
""" Read article by name """
def read_article(id: int) -> str:
    """Return the text of the processed article registered under ``id``.

    The parameter keeps its original name (which shadows the ``id`` builtin
    inside this function) so existing callers are unaffected.

    Args:
        id: Key into the module-level ``look_up`` table, which maps an
            article id to its file name inside ``TAR_PATH``.

    Returns:
        The first line of the article file, or ``''`` when the file is
        empty.

    Raises:
        KeyError: If ``id`` is not present in ``look_up``.
    """
    with open(TAR_PATH+look_up[id], 'r', encoding='utf-8') as f:
        # readline() returns '' on an empty file instead of the IndexError
        # raised by readlines()[0], and does not load every line into a list.
        return f.readline()

# Article text for every annotation row, in the order of annot['PMCID'].
text_left = list(map(read_article, annot['PMCID']))
# print(text_left[0:5])

In [ ]:
# One row per annotation: ids are cast to str, and the label code is shifted
# by +1 (presumably mapping -1/0/1 onto the class indices 0/1/2 -- confirm).
df = pd.DataFrame(data={
    'id_left': annot['PMCID'].astype(str),
    'text_left': text_left,
    'id_right': annot['PromptID'].astype(str),
    'text_right': annot['Annotations'],
    'label': annot['Label Code']+1
})
print(df.dtypes)

""" Split data pack into train/valid """
# A fixed seed keeps the 80/20 split -- and therefore every metric reported
# below -- reproducible across notebook runs.
SPLIT_SEED = 42
train, valid = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train_pack = mz.pack(train, task=TYPE)
valid_pack = mz.pack(valid, task=TYPE)
train_pack.frame().head(10) # DataFrame

In [ ]:
# Pack the complete table and inspect the DataPack's frame view: its type,
# the columns of a five-row slice, and that the full frame has one row per
# packed relation.
dp = mz.pack(df, task=TYPE)
print(type(dp.frame))
frame_slice = dp.frame[0:5]
print(frame_slice.columns.tolist())
full_frame = dp.frame()
len(dp) == len(full_frame)

# Model and Train 

In [ ]:
# Use DIIN's default preprocessor: fit it on the training pack only, then
# apply the already-fitted preprocessor to the validation pack.
preprocessor = mz.models.DIIN.get_default_preprocessor()
train_processed = preprocessor.fit_transform(train_pack)
valid_processed = preprocessor.transform(valid_pack)

# The context is read again when the model is configured
# ('embedding_input_dim').
print(preprocessor.context)

In [ ]:
# N-gram callback built from the fitted preprocessor, shared by both datasets.
ngram_callback = mz.dataloader.callbacks.Ngram(preprocessor, mode='index')

# Classification is trained point-wise: every (left, right, label) row is one
# sample. The previous mode='pair' with num_neg=4 is the ranking set-up, which
# regroups rows into positive/negative pairs per left id; for a 3-class task
# that distorts the label distribution and can drop rows that have no
# lower-labelled partner.
trainset = mz.dataloader.Dataset(
    data_pack=train_processed,
    mode='point',
    callbacks=[ngram_callback]
)
validset = mz.dataloader.Dataset(
    data_pack=valid_processed,
    mode='point',
    callbacks=[ngram_callback]
)

In [ ]:
# Both loaders share DIIN's default padding callback and differ only in the
# dataset they wrap and the stage they are tagged with.
padding_callback = mz.models.DIIN.get_default_padding_callback()

trainloader, validloader = (
    mz.dataloader.DataLoader(
        dataset=split_dataset,
        stage=split_stage,
        callback=padding_callback,
    )
    for split_dataset, split_stage in ((trainset, 'train'), (validset, 'dev'))
)

In [ ]:
# Instantiate DIIN, set the explicitly chosen hyper-parameters, let MatchZoo
# fill in the remaining ones, then build the network.
model = mz.models.DIIN()
for param_name, param_value in (
    ('task', classification_task),
    ('embedding_output_dim', 100),
    ('embedding_input_dim', preprocessor.context['embedding_input_dim']),
    ('dropout_rate', 0.2),
):
    model.params[param_name] = param_value
model.guess_and_fill_missing_params()
model.build()

print(model)
# Count the elements of every parameter tensor that requires gradients.
trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
print('Trainable params: ', sum(trainable_sizes))

In [ ]:
# Adam over all model parameters, using torch's default hyper-parameters.
optimizer = torch.optim.Adam(model.parameters())

# Hand the model, optimizer and both data loaders to a MatchZoo trainer
# configured for 10 epochs.
trainer = mz.trainers.Trainer(
    model=model,
    optimizer=optimizer,
    trainloader=trainloader,
    validloader=validloader,
    epochs=10
)

# Start the training run.
trainer.run()