In [1]:
import os
import pickle
import pandas as pd
from tokenizer import NLTKTokenizer

CWD = os.getcwd()
TRAIN_DATA_PATH = os.path.join(CWD, 'data', 'trainset.csv')
VALID_DATA_PATH = os.path.join(CWD, 'data', 'validset.csv')
TEST_DATA_PATH = os.path.join(CWD, 'data', 'testset.csv')
DICT_PATH = os.path.join(CWD, 'data', 'dictionary.pkl')
WORKERS = os.cpu_count() // 2
Tokenizer = NLTKTokenizer()

In [2]:
import numpy as np

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [4]:
def SplitSent(doc):
    return doc.split('$$$')


def GenDict(train, valid):
    global Tokenizer
    if os.path.exists(DICT_PATH):
        Tokenizer = NLTKTokenizer.load_from_file(DICT_PATH)
    else:
        for item in train['Abstract']:
            Tokenizer.build_dict(item)

        for item in valid['Abstract']:
            Tokenizer.build_dict(item)

        Tokenizer.save_to_file(DICT_PATH)

In [5]:
train = pd.read_csv(TRAIN_DATA_PATH)
valid = pd.read_csv(VALID_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

train['Abstract'] = train['Abstract'].apply(func=SplitSent)
valid['Abstract'] = valid['Abstract'].apply(func=SplitSent)
GenDict(train, valid)

In [6]:
test['Abstract'] = test['Abstract'].apply(func=SplitSent)

In [7]:
def labels_to_onehot(labels):
    '''
    Convert labels to one-hot encoding

    Args : 
        labels:( DataFrame column item ) 
    Return :
        one_hot_labels: ( DataFrame column item )
    '''
    one_hot_labels = []
    label_list = labels.split(' ')
    label_dict = {'BACKGROUND': 0, 'OBJECTIVES':1, 'METHODS':2, 'RESULTS':3, 'CONCLUSIONS':4, 'OTHERS':5}
    for label in label_list:
        onehot = [0,0,0,0,0,0]
        for l in label.split('/'):
            onehot[label_dict[l]] = 1
        one_hot_labels.append(onehot)
    
    return one_hot_labels


In [8]:
def encode_data(dataset):
    '''
    encode 'Abstract' and convert label to one_hot


    Args:
        dataset(pd.DataFrame)
    '''
    global Tokenizer
    dataset['Abstract'] = dataset['Abstract'].apply(func=Tokenizer.encode)
    if 'Task 1' in dataset.columns:
        dataset['Task 1'] = dataset['Task 1'].apply(func=labels_to_onehot)

In [9]:
encode_data(train)
encode_data(valid)
encode_data(test)

In [10]:
train.iloc[0]

Id                                                     D05945
Abstract    [[31341, 37103, 33906, 21235, 37219, 37059, 37...
Task 1      [[1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 1...
Name: 0, dtype: object

In [14]:
class Abstract(Dataset):
    def __init__(self, data, pad_idx, eos_token):
        self.data = data
        self.pad_idx = pad_idx
        self.eos_token = eos_token
    
    def __len__(self):
        return len(self.data.index)
    
    def __getitem__(self, index):
        return self.data.iloc[index]
    
    def collate_fn(self, datas):
        abstracts = [torch.as_tensor(abstract, dtype=torch.long)for data in datas for abstract in data['Abstract']]
        batch_abstracts = pad_sequence(
            abstracts, batch_first=True, padding_value=self.pad_idx)
        
        b, s = batch_abstracts.size() # b: batch, s:sequence length
        batch_eos = batch_abstracts == 3
        eos_index_matrix = batch_eos.nonzero()
        eos_index_list = list()
        prev = 0
        for row in eos_index_matrix:
            eos_index_list.append(row[1].item()+prev)
            prev = prev + s 
            
        batch_labels = None
        labels = [
            label for data in datas if 'Task 1' in data for label in data['Task 1']]
        if len(labels) != 0:
            batch_labels = torch.as_tensor(labels, dtype=torch.float)
            batch_labels = batch_labels.view(-1, 6)

        return batch_abstracts, batch_labels, torch.as_tensor(eos_index_list)



In [15]:
testset = Abstract(data=test, pad_idx=0, eos_token=3)
testset[0]

Id                                                     T00001
Abstract    [[37010, 36576, 37189, 36992, 36376, 37138, 37...
Name: 0, dtype: object

In [16]:
test_loader = DataLoader(dataset=testset, batch_size = 2, collate_fn=testset.collate_fn)

In [18]:
i = 0
for x, y, z in test_loader:
    print(x)
    print(y)
    print(z)
    print(torch.index_select(x, 0, z))
    i += 1
    if i == 1:
        break

tensor([[37010, 36576, 37189, 36992, 36376, 37138, 37180, 36781, 36687, 37177,
         36959, 37010, 37087, 37000, 36948, 36709, 37108, 37188,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [37139, 37093, 36994, 37010, 36576, 37197, 37180, 36575, 30989, 37010,
         37132, 37138, 36366, 37180, 37119, 37180, 35771, 37138, 12805, 37180,
         22502, 36768, 37226, 37138, 37177, 37180, 36565, 36510, 37188,     3],
        [37125, 37180, 37203, 37056, 36848, 25172,     4, 37132, 25448, 37138,
         37180, 29488, 36877,     4, 37132, 37184, 37138, 36069, 37001, 37123,
         25448, 37006, 37188,     3,     0,     0,     0,     0,     0,     0],
        [37227, 37228, 37180, 37178, 37230,     4, 36552, 37194,     4, 37196,
         37214, 37010, 36576, 37197, 37180, 37164,     4, 37181, 35861,     4,
         37074, 33042, 37132, 37226, 37177, 37188,     3,     0,     0,     0],
        [27399, 37063, 37157, 37180, 37230, 3715

RuntimeError: index out of range: Tried to access index 18 out of table with 11 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418

In [13]:
dataset[0]

Abstract    [[42485, 35732, 42367, 42450, 38674, 10, 42508...
Task 1      [[1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 1...
Name: 0, dtype: object

In [41]:
a = torch.randn(3,5)
a

tensor([[ 1.8134, -0.9588,  0.0026,  0.2593,  0.3925],
        [-0.4434,  1.2400, -1.2114,  0.6910,  0.0485],
        [ 0.0859, -0.4183,  0.7825, -0.2081, -0.1877]])

In [44]:
a.size()

torch.Size([3, 5])

In [45]:
m, n = a.size()
print(m)
print(n)

3
5


In [34]:
Tokenizer.vocab_size()

52021

In [35]:
trainset[0]['Abstract'][0]

[42255,
 41744,
 42473,
 42225,
 41498,
 42402,
 42451,
 41975,
 41872,
 42453,
 42195,
 42255,
 42396,
 42237,
 42170,
 41899,
 42372,
 42465,
 3]

In [42]:
labels = [ torch.as_tensor(abstract, dtype=torch.float) for abstract in dataset[0]['Task 1'] ]

In [43]:
labels

[tensor([1., 0., 0., 0., 0., 0.]),
 tensor([1., 0., 0., 0., 0., 0.]),
 tensor([1., 1., 0., 0., 0., 0.]),
 tensor([0., 0., 1., 0., 0., 0.]),
 tensor([0., 0., 1., 0., 0., 0.]),
 tensor([0., 0., 1., 1., 0., 0.])]

In [45]:
batch_labels = pad_sequence( labels, batch_first=True )
batch_labels

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0.]])

In [33]:
sequence = [ torch.as_tensor(vec) for  vec in dataset[0]['Abstract'] ]

In [34]:
sequence

[tensor([42485, 35732, 42367, 42450, 38674,    10, 42508, 42313, 42472, 38387,
         42474, 42459, 42455, 42471, 42228, 42500, 42356, 42451, 42463, 42441,
         42463, 42416, 42306, 42468, 42437, 42396, 42445, 42526, 42450, 42105,
         42115, 42451, 42550, 42366, 42465,     3]),
 tensor([42007, 42500, 42497, 42204, 42502, 42035, 42438, 42453, 42451, 42497,
         42514, 42502, 42531, 41078, 42556, 42515, 40045, 42459, 42452, 42497,
         40188, 41599, 42465,     3]),
 tensor([41532, 42488, 42453, 42351, 42451, 42455, 42404, 42502, 42541, 42555,
         42445, 42515, 40045, 42507, 42432, 42495, 42500, 42435, 42416, 42463,
         42516, 42468, 42531, 42541, 42499, 41271, 42497, 42484, 42481, 42455,
         42360, 42476, 42465,     3]),
 tensor([42516, 42453, 42518, 42451, 42519, 42396, 42455, 42517, 42481, 42424,
         42515, 40045, 42451, 42466,    80, 42451, 42469, 42459, 42407, 42445,
         42396, 42452, 42455, 41566, 42299, 42502, 42436, 42451, 42474, 42424,


In [35]:
sequence[0].size()

torch.Size([36])

In [36]:
pad = pad_sequence(sequence, batch_first=True)
pad

tensor([[42485, 35732, 42367, 42450, 38674,    10, 42508, 42313, 42472, 38387,
         42474, 42459, 42455, 42471, 42228, 42500, 42356, 42451, 42463, 42441,
         42463, 42416, 42306, 42468, 42437, 42396, 42445, 42526, 42450, 42105,
         42115, 42451, 42550, 42366, 42465,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [42007, 42500, 42497, 42204, 42502, 42035, 42438, 42453, 42451, 42497,
         42514, 42502, 42531, 41078, 42556, 42515, 40045, 42459, 42452, 42497,
         40188, 41599, 42465,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [41532, 42488, 42453, 42351, 42451, 42455, 42404, 42502, 42541, 42555,
         42445, 42515, 40045, 42507, 42432, 42495, 42500, 42435, 4241

In [37]:
pad.size()

torch.Size([6, 53])

In [38]:
loader = DataLoader(dataset = dataset, batch_size = 2, collate_fn=dataset.collate_fn)