In [1]:
import os
import pickle
import pandas as pd
from tokenizer import NLTKTokenizer

# All data files are resolved relative to the directory the notebook runs from.
CWD = os.getcwd()
TRAIN_DATA_PATH = os.path.join(CWD, 'data', 'trainset.csv')
VALID_DATA_PATH = os.path.join(CWD, 'data', 'validset.csv')
TEST_DATA_PATH = os.path.join(CWD, 'data', 'testset.csv')
DICT_PATH = os.path.join(CWD, 'data', 'dictionary.pkl')
# Half of the logical CPU count. os.cpu_count() returns None when the count
# cannot be determined, so fall back to 1 before dividing instead of raising
# a TypeError on `None // 2`.
WORKERS = (os.cpu_count() or 1) // 2
# Module-level tokenizer instance: GenDict may rebind it, encode_data reads it.
Tokenizer = NLTKTokenizer()

In [2]:
import numpy as np

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [16]:
from tqdm import tqdm

In [4]:
def GenDict(train, valid):
    '''
    Prepare the global Tokenizer's dictionary.

    If a file already exists at DICT_PATH, the global Tokenizer is replaced by
    the result of NLTKTokenizer.load_from_file(DICT_PATH). Otherwise every
    'Abstract' entry of `train` and then of `valid` is passed to
    Tokenizer.build_dict (one entry per call), and the tokenizer is saved to
    DICT_PATH.

    Args:
        train: object indexable by 'Abstract' (a DataFrame in this notebook)
        valid: object indexable by 'Abstract' (a DataFrame in this notebook)
    '''
    global Tokenizer

    # A saved dictionary takes precedence over rebuilding from the data.
    if os.path.exists(DICT_PATH):
        Tokenizer = NLTKTokenizer.load_from_file(DICT_PATH)
        return

    for frame, caption in ((train, 'Train set'), (valid, 'Valid set')):
        for abstract in tqdm(frame['Abstract'], desc=caption):
            Tokenizer.build_dict([abstract])
    Tokenizer.save_to_file(DICT_PATH)

In [5]:

def labels_to_onehot(labels):
    '''
    Turn a label string into a list of 6-element 0/1 vectors.

    The string is split on single spaces; each piece may combine several
    class names with '/', and every named class sets its position to 1.

    Args :
        labels: ( str ) e.g. 'BACKGROUND METHODS/RESULTS'
    Return :
        one vector (list of six ints) per space-separated piece
    Raises :
        KeyError: when a class name is not one of the six known names
    '''
    class_names = (
        'BACKGROUND',
        'OBJECTIVES',
        'METHODS',
        'RESULTS',
        'CONCLUSIONS',
        'OTHERS',
    )
    class_index = {name: position for position, name in enumerate(class_names)}

    encoded = []
    for piece in labels.split(' '):
        vector = [0] * len(class_names)
        for name in piece.split('/'):
            vector[class_index[name]] = 1
        encoded.append(vector)
    return encoded

In [14]:
def encode_data(dataset):
    '''
    Encode a dataset in place.

    The 'Abstract' column is replaced by Tokenizer.encode applied to each
    entry. When a 'Task 1' column exists, it is replaced by labels_to_onehot
    applied to each entry. Nothing is returned; `dataset` itself is modified.

    Args:
        dataset(pd.DataFrame)
    '''
    global Tokenizer
    # Enables the progress_apply method used below.
    tqdm.pandas()

    encoded_abstracts = dataset['Abstract'].progress_apply(func=Tokenizer.encode)
    dataset['Abstract'] = encoded_abstracts

    has_labels = 'Task 1' in dataset.columns
    if has_labels:
        encoded_labels = dataset['Task 1'].progress_apply(func=labels_to_onehot)
        dataset['Task 1'] = encoded_labels

In [6]:
# Read the three CSV splits from the paths configured in the first cell.
train = pd.read_csv(TRAIN_DATA_PATH)
valid = pd.read_csv(VALID_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)
print('Generate relative dictionary')
# Load the saved dictionary if present, otherwise build it from the train and
# valid abstracts only (the test split is not passed in).
GenDict(train, valid)

Generate relative dictionary


In [17]:
# Encode `train` in place: 'Abstract' entries go through Tokenizer.encode and
# 'Task 1' entries through labels_to_onehot.
# NOTE(review): the columns are overwritten, so re-running this cell feeds
# already-encoded values back in — reload `train` first.
encode_data(train)

100%|██████████| 42180/42180 [00:05<00:00, 7182.60it/s]
100%|██████████| 42180/42180 [00:00<00:00, 257688.07it/s]


In [18]:
# Display the training frame after encoding.
train

Unnamed: 0,Abstract,Task 1
0,"[[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...","[[0, 1, 0, 0, 0, 0]]"
1,"[[23, 13, 24, 25, 26, 27, 28, 20, 29, 30, 31, ...","[[1, 0, 0, 0, 0, 0]]"
2,"[[34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 6, 1...","[[0, 0, 0, 0, 0, 1]]"
3,"[[50, 51, 11, 52, 53, 19, 20, 54, 55, 6, 11, 5...","[[0, 0, 0, 0, 0, 1]]"
4,"[[68, 69, 25, 70, 71, 72, 56, 73, 74, 75, 20, ...","[[0, 0, 0, 1, 0, 0]]"
...,...,...
42175,"[[40322, 6100, 391, 18299, 6, 15515, 6, 15939,...","[[1, 0, 0, 0, 0, 0]]"
42176,"[[68, 470, 547, 548, 1728, 2806, 6539, 19246, ...","[[0, 0, 1, 1, 0, 0]]"
42177,"[[90, 209, 5436, 25, 1725, 3482, 1456, 131, 79...","[[0, 0, 0, 0, 1, 0]]"
42178,"[[198, 865, 6, 96, 4261, 11, 3804, 6, 56, 2706...","[[1, 0, 0, 0, 0, 0]]"


In [29]:
class Abstract(Dataset):
    '''
    Dataset that serves the rows of an encoded abstracts DataFrame.

    Each item is one DataFrame row. In collate_fn the row's 'Abstract' entry is
    iterated as a list of token-id sequences, and its 'Task 1' entry (when the
    row has one) as a list of 6-element label vectors.

    Args:
        data (pd.DataFrame): frame whose rows are returned by position
        pad_idx (int): value used to pad sequences to a common length
        eos_id (int): token id that is searched for in the padded batch
    '''
    def __init__(self, data, pad_idx, eos_id):
        self.data = data
        self.pad_idx = pad_idx
        self.eos_token = eos_id

    def __len__(self):
        return len(self.data.index)

    def __getitem__(self, index):
        return self.data.iloc[index]

    def collate_fn(self, datas):
        '''
        Merge a list of rows into padded batch tensors.

        Args:
            datas : a list of dataframe row(pd.Series)
        Return:
            batch_abstracts: long tensor (n_sentences, max_len); every
                sequence of every row, padded with pad_idx
            batch_labels: float tensor (n_labels, 6) built from the 'Task 1'
                entries, or None when no row carries 'Task 1'
            eos_indices: 1-D long tensor with the position of every eos token
                inside the flattened (n_sentences * max_len) batch, in
                row-major order
        '''
        abstracts = [
            torch.as_tensor(sent, dtype=torch.long)
            for data in datas
            for sent in data['Abstract']
        ]
        batch_abstracts = pad_sequence(abstracts, batch_first=True, padding_value=self.pad_idx)

        _, seq_len = batch_abstracts.size()  # seq_len: padded sequence length
        # Each row of eos_positions is (sentence_row, column) of one eos token.
        eos_positions = (batch_abstracts == self.eos_token).nonzero()
        # Use the hit's own row index for the flattened offset. A running
        # "one eos per row" counter drifts as soon as a sequence contains no
        # eos token or more than one.
        eos_indices = eos_positions[:, 0] * seq_len + eos_positions[:, 1]

        batch_labels = None
        labels = [ label for data in datas if 'Task 1' in data for label in data['Task 1'] ]
        if len(labels) != 0:
            batch_labels = torch.as_tensor(labels, dtype=torch.float)
            batch_labels = batch_labels.view(-1, 6)

        return batch_abstracts, batch_labels, eos_indices

In [30]:
# Keyword arguments spell out what the two integers mean.
trainset = Abstract(data=train, pad_idx=0, eos_id=3)

In [31]:
# Fetch the first row through Abstract.__getitem__ as a quick check.
trainset[0]

Abstract    [[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
Task 1                                   [[0, 1, 0, 0, 0, 0]]
Name: 0, dtype: object

In [32]:
# Batches are assembled by the dataset's own collate_fn (padding + eos offsets).
trainloader = DataLoader(
    trainset,
    batch_size=2,
    shuffle=True,
    collate_fn=trainset.collate_fn,
)

In [34]:
# Pull a single batch to inspect what collate_fn returns, then stop.
# x: padded abstracts, y: label tensor (or None), z: eos offsets.
for x, y, z in trainloader:
    print(x)
    print(y)
    print(z)
    break

tensor([[22622,    42,   470,   108,    16, 22623,    20,   730,    19,    20,
            76,    22,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ 5927,   122,   192,  1743,    25,  6120, 17705,    20,  1003,   764,
           191,    96,   712,  1569,    96,   501,  2128,    96,   825,  1309,
           305,  1258,    22,     3]])
tensor([[0., 0., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.]])
tensor([12, 47])
