In [2]:
import os
import pickle
import pandas as pd
from tokenizer import NLTKTokenizer

# All data files are resolved relative to the kernel's current working directory.
CWD = os.getcwd()
TRAIN_DATA_PATH = os.path.join(CWD, 'data', 'trainset.csv')
VALID_DATA_PATH = os.path.join(CWD, 'data', 'validset.csv')
TEST_DATA_PATH = os.path.join(CWD, 'data', 'testset.csv')
# Tokenizer dictionary file: GenDict() loads it when present, otherwise writes it.
DICT_PATH = os.path.join(CWD, 'data', 'dictionary.pkl')
# os.cpu_count() returns None when the core count cannot be determined;
# fall back to 1 so the floor division cannot raise TypeError.
WORKERS = (os.cpu_count() or 1) // 2
# Shared tokenizer instance; GenDict() rebinds this global when a saved
# dictionary already exists at DICT_PATH.
Tokenizer = NLTKTokenizer()

In [3]:
import numpy as np

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [5]:
def GenDict(train, valid):
    """Load the tokenizer dictionary from DICT_PATH, or build and save it.

    When a file exists at DICT_PATH, the module-level ``Tokenizer`` is rebound
    to ``NLTKTokenizer.load_from_file(DICT_PATH)``. Otherwise every entry of
    the 'Abstract' column of ``train`` and then of ``valid`` is passed to
    ``Tokenizer.build_dict`` and the tokenizer is written to DICT_PATH.

    Args:
        train: Training table; ``train['Abstract']`` must be iterable.
        valid: Validation table; ``valid['Abstract']`` must be iterable.

    Returns:
        None. The function works through the global ``Tokenizer`` and the
        file at DICT_PATH.
    """
    global Tokenizer
    if os.path.exists(DICT_PATH):
        Tokenizer = NLTKTokenizer.load_from_file(DICT_PATH)
        return

    # Import tqdm here so the build path does not depend on an import made in
    # some other cell; without it this branch fails with NameError. If tqdm is
    # not installed, fall back to plain iteration without a progress bar.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    # NOTE(review): the vocabulary printed by Tokenizer.get_token_to_id() in
    # this notebook is almost entirely single characters, which suggests
    # build_dict() iterates a raw string character by character. Confirm
    # whether it expects a list of sentences instead of a single string.
    for split, label in ((train, 'Train set'), (valid, 'Valid set')):
        for item in tqdm(split['Abstract'], desc=label):
            Tokenizer.build_dict(item)
    Tokenizer.save_to_file(DICT_PATH)

In [6]:
# Read the three dataset splits, in train / valid / test order.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, VALID_DATA_PATH, TEST_DATA_PATH)
)
print('Generate relative dictionary')
# Only the train and valid splits are handed to the dictionary step.
GenDict(train, valid)

Generate relative dictionary


In [8]:
# Display the training frame; the recorded output is a truncated view of its rows.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index that
# was saved into the CSV — confirm, and consider index_col=0 when reading.
train

Unnamed: 0,Abstract,Task 1
0,"However, efficiently understanding underlying ...",OBJECTIVES
1,It is assumed that each partition employs the ...,BACKGROUND
2,The conditions under which this decoupling pro...,OTHERS
3,Given initial and final configurations of the ...,OTHERS
4,We establish that classical sketch has a simil...,RESULTS
...,...,...
42175,"Popular gates like AND, OR, XOR, processing tw...",BACKGROUND
42176,We also discuss how applying additional modifi...,METHODS/RESULTS
42177,Our analysis reveals that specific linguistic ...,CONCLUSIONS
42178,"In contrast, in AI and robotics, a robot's bod...",BACKGROUND


In [9]:
# Sanity check: encode the first training abstract.
# NOTE(review): the recorded output holds one short list per character, each
# ending in id 3 ('[EOS]' in the vocabulary), so encode() appears to treat a raw
# string as a sequence of one-character sentences — confirm the input type it expects.
Tokenizer.encode(train['Abstract'][0])

[[5, 3],
 [6, 3],
 [7, 3],
 [8, 3],
 [9, 3],
 [8, 3],
 [10, 3],
 [11, 3],
 [3],
 [8, 3],
 [12, 3],
 [12, 3],
 [13, 3],
 [14, 3],
 [13, 3],
 [8, 3],
 [15, 3],
 [16, 3],
 [17, 3],
 [18, 3],
 [3],
 [19, 3],
 [15, 3],
 [20, 3],
 [8, 3],
 [10, 3],
 [21, 3],
 [16, 3],
 [22, 3],
 [15, 3],
 [20, 3],
 [13, 3],
 [15, 3],
 [23, 3],
 [3],
 [19, 3],
 [15, 3],
 [20, 3],
 [8, 3],
 [10, 3],
 [17, 3],
 [18, 3],
 [13, 3],
 [15, 3],
 [23, 3],
 [3],
 [16, 3],
 [10, 3],
 [8, 3],
 [15, 3],
 [20, 3],
 [21, 3],
 [3],
 [22, 3],
 [15, 3],
 [20, 3],
 [3],
 [24, 3],
 [22, 3],
 [16, 3],
 [16, 3],
 [8, 3],
 [10, 3],
 [15, 3],
 [21, 3],
 [3],
 [13, 3],
 [21, 3],
 [3],
 [25, 3],
 [22, 3],
 [10, 3],
 [20, 3],
 [3],
 [20, 3],
 [19, 3],
 [8, 3],
 [3],
 [16, 3],
 [6, 3],
 [3],
 [17, 3],
 [22, 3],
 [10, 3],
 [23, 3],
 [8, 3],
 [3],
 [21, 3],
 [13, 3],
 [26, 3],
 [8, 3],
 [3],
 [6, 3],
 [12, 3],
 [3],
 [16, 3],
 [25, 3],
 [8, 3],
 [3],
 [23, 3],
 [10, 3],
 [22, 3],
 [24, 3],
 [25, 3],
 [27, 3]]

In [10]:
# Word-level tokenization of the same abstract, for comparison with the
# per-character result that encode() produced for it.
Tokenizer.tokenize(train['Abstract'][0])

['However',
 ',',
 'efficiently',
 'understanding',
 'underlying',
 'trends',
 'and',
 'patterns',
 'is',
 'hard',
 'due',
 'to',
 'large',
 'size',
 'of',
 'the',
 'graph',
 '.']

In [7]:
# Inspect the token -> id mapping held by the tokenizer.
# NOTE(review): apart from the five special tokens, nearly every entry in the
# recorded output is a single character, so the saved dictionary looks
# character-level rather than word-level — confirm this is intended.
Tokenizer.get_token_to_id()

{'[PAD]': 0,
 '[CLS]': 1,
 '[SEP]': 2,
 '[EOS]': 3,
 '[UNK]': 4,
 'H': 5,
 'o': 6,
 'w': 7,
 'e': 8,
 'v': 9,
 'r': 10,
 ',': 11,
 'f': 12,
 'i': 13,
 'c': 14,
 'n': 15,
 't': 16,
 'l': 17,
 'y': 18,
 'u': 19,
 'd': 20,
 's': 21,
 'a': 22,
 'g': 23,
 'p': 24,
 'h': 25,
 'z': 26,
 '.': 27,
 'I': 28,
 'm': 29,
 'L': 30,
 'R': 31,
 'U': 32,
 'T': 33,
 'k': 34,
 'x': 35,
 'G': 36,
 'D': 37,
 'b': 38,
 'j': 39,
 'W': 40,
 'M': 41,
 'S': 42,
 ':': 43,
 'O': 44,
 '-': 45,
 'q': 46,
 '1': 47,
 '2': 48,
 '0': 49,
 'F': 50,
 'A': 51,
 '5': 52,
 '+': 53,
 '(': 54,
 ')': 55,
 'E': 56,
 'B': 57,
 'K': 58,
 '3': 59,
 'P': 60,
 'V': 61,
 '6': 62,
 '?': 63,
 'Q': 64,
 'N': 65,
 'C': 66,
 '4': 67,
 "'": 68,
 '/': 69,
 '%': 70,
 ';': 71,
 '8': 72,
 '7': 73,
 '9': 74,
 '*': 75,
 '^': 76,
 '``': 77,
 'J': 78,
 '&': 79,
 'X': 80,
 '<': 81,
 'Y': 82,
 '=': 83,
 '@': 84,
 '`': 85,
 '_': 86,
 '#': 87,
 'Z': 88,
 '!': 89,
 '>': 90,
 '[': 91,
 '~': 92,
 '|': 93}