In [1]:
import numpy as np
import pandas as pd 
import os 
import pickle
from tokenizer import NLTKTokenizer
from tqdm import tqdm
import seaborn as sns
%matplotlib inline

In [2]:
# All data artefacts are resolved under ./data relative to the directory the
# notebook process was started from.
CWD = os.getcwd()
TRAIN_DATA_PATH, VALID_DATA_PATH, TEST_DATA_PATH, DICT_PATH = (
    os.path.join(CWD, 'data', filename)
    for filename in ('trainset.csv', 'validset.csv', 'testset.csv', 'dictionary.pkl')
)
# CPU count as reported by the OS (os.cpu_count() can return None).
WORKERS = os.cpu_count()

In [3]:
# Shared tokenizer instance. GenDict() and encode_data() reach it through this
# module-level name, and GenDict() may rebind it to an instance loaded from DICT_PATH.
Tokenizer = NLTKTokenizer()

In [4]:
# Read the training and validation splits into DataFrames.
train = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
valid = pd.read_csv(filepath_or_buffer=VALID_DATA_PATH)


In [5]:
# Register the progress_apply accessor on pandas objects.
tqdm.pandas()


def _split_abstract(doc):
    '''
    Split one raw 'Abstract' cell on the '$$$' separator.

    Values that are already lists are returned unchanged so this cell can be
    re-run on frames it has already processed; without that guard a second run
    fails with "'list' object has no attribute 'split'". Any other non-string
    value still raises, exactly as before.
    '''
    return doc if isinstance(doc, list) else doc.split('$$$')


train['Abstract'] = train['Abstract'].progress_apply(func=_split_abstract)
valid['Abstract'] = valid['Abstract'].progress_apply(func=_split_abstract)

100%|██████████| 6300/6300 [00:00<00:00, 371480.00it/s]
100%|██████████| 700/700 [00:00<00:00, 252973.70it/s]


In [7]:
def GenDict(train, valid):
    '''
    Prepare the dictionary of the module-level ``Tokenizer``.

    If a file already exists at DICT_PATH, the global ``Tokenizer`` is rebound
    to ``NLTKTokenizer.load_from_file(DICT_PATH)`` and nothing else happens.
    Otherwise every entry of the 'Abstract' column of ``train`` and then of
    ``valid`` is passed to ``Tokenizer.build_dict`` and the result is written
    to DICT_PATH with ``Tokenizer.save_to_file``.

    Args:
        train: object whose ['Abstract'] is iterable (the training frame)
        valid: object whose ['Abstract'] is iterable (the validation frame)
    '''
    global Tokenizer

    # Cached dictionary on disk: reuse it instead of rebuilding.
    if os.path.exists(DICT_PATH):
        Tokenizer = NLTKTokenizer.load_from_file(DICT_PATH)
        return

    # No cache yet: feed both splits (train first) and persist the result.
    for dataset in (train, valid):
        for abstract in dataset['Abstract']:
            Tokenizer.build_dict(abstract)
    Tokenizer.save_to_file(DICT_PATH)

In [8]:
# Load the tokenizer dictionary from DICT_PATH if that file exists; otherwise
# build it from both splits and save it there.
GenDict(train, valid)

In [9]:

# Position of each known label inside a one-hot vector.
_LABEL_TO_INDEX = {
    'BACKGROUND': 0,
    'OBJECTIVES': 1,
    'METHODS': 2,
    'RESULTS': 3,
    'CONCLUSIONS': 4,
    'OTHERS': 5
}


def labels_to_onehot(labels):
    '''
    Convert a label string to a list of 0/1 vectors.

    The string is split on whitespace; every resulting token produces one
    vector. A token may name several labels joined by '/', in which case all
    of their positions are set to 1.

    Args :
        labels (str): e.g. 'BACKGROUND OBJECTIVES/METHODS'
    Return :
        one_hot_labels (list[list[int]]): one vector per token, in input
            order, each as long as the label table (6 entries). An empty or
            whitespace-only string gives an empty list.
    Raises :
        KeyError: if a '/'-separated part is not a known label name.
    '''
    one_hot_labels = []
    # split() with no argument ignores leading/trailing whitespace and
    # collapses repeated separators, so stray spaces do not create an empty
    # token that would fail the label lookup with KeyError('').
    for label in labels.split():
        onehot = [0] * len(_LABEL_TO_INDEX)
        for name in label.split('/'):
            onehot[_LABEL_TO_INDEX[name]] = 1
        one_hot_labels.append(onehot)

    return one_hot_labels


def encode_data(dataset):
    '''
    Encode a dataset in place.

    The 'Abstract' column is replaced by the result of applying
    ``Tokenizer.encode`` to each entry. If the frame has a 'Task 1' column,
    each of its entries is replaced by the output of ``labels_to_onehot``.

    Args:
        dataset(pd.DataFrame): frame to modify; nothing is returned.
    '''
    tqdm.pandas()

    abstracts = dataset['Abstract']
    dataset['Abstract'] = abstracts.progress_apply(func=Tokenizer.encode)

    # Frames without a 'Task 1' column only get their abstracts encoded.
    if 'Task 1' not in dataset.columns:
        return

    task_labels = dataset['Task 1']
    dataset['Task 1'] = task_labels.progress_apply(func=labels_to_onehot)


In [10]:
# Encode both splits. encode_data() overwrites the 'Abstract' (and, when present,
# 'Task 1') columns of the frame it is given and returns nothing.
# NOTE(review): not idempotent -- running this cell again feeds the already
# encoded columns back through Tokenizer.encode / labels_to_onehot.
encode_data(train)
encode_data(valid)

100%|██████████| 6300/6300 [00:04<00:00, 1267.70it/s]
100%|██████████| 6300/6300 [00:00<00:00, 216481.09it/s]
100%|██████████| 700/700 [00:00<00:00, 1233.99it/s]
100%|██████████| 700/700 [00:00<00:00, 91761.87it/s]


In [13]:
def GetMaxSeqLength(abstract):
    '''
    Return the length of the longest item in ``abstract``.

    Args:
        abstract: iterable of sized items (anything ``len()`` accepts)
    Return:
        int: the largest ``len(item)``, or 0 when ``abstract`` is empty
    '''
    # default=0 keeps the result defined for an empty abstract; using the
    # built-in also avoids a local variable named ``max`` hiding it.
    return max((len(sent) for sent in abstract), default=0)

In [14]:
# Store, for every training row, the length of the longest item in its 'Abstract' entry.
train['MaxSeqLength'] = train['Abstract'].progress_apply(func=GetMaxSeqLength) 

100%|██████████| 6300/6300 [00:00<00:00, 548878.63it/s]


In [16]:
# Summary statistics of the 'MaxSeqLength' column for the training split.
train['MaxSeqLength'].describe()

count    6300.000000
mean       27.181270
std         8.704375
min         9.000000
25%        22.000000
50%        26.000000
75%        31.000000
max       125.000000
Name: MaxSeqLength, dtype: float64

In [18]:
# Store, for every validation row, the length of the longest item in its 'Abstract' entry.
valid['MaxSeqLength'] = valid['Abstract'].progress_apply(func=GetMaxSeqLength)

100%|██████████| 700/700 [00:00<00:00, 307146.44it/s]


In [20]:
# Summary statistics of the 'MaxSeqLength' column for the validation split.
valid['MaxSeqLength'].describe()

count    700.000000
mean      27.008571
std        8.079359
min        8.000000
25%       22.000000
50%       25.000000
75%       31.000000
max       77.000000
Name: MaxSeqLength, dtype: float64