In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle, json, re, time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from tqdm import tqdm_notebook as tqdm
#from tqdm import tqdm
from tqdm import trange

from gensim.parsing import remove_stopwords

import os

# Paths are resolved relative to the directory the kernel was started in.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, 'data')

# Half of the logical CPU count (presumably intended as a worker-process count),
# clamped to at least 1: os.cpu_count() may return None, and a single-core
# machine would otherwise yield 0.
CPUNUM = max(1, (os.cpu_count() or 1) // 2)
print(CPUNUM)

### Hyperparameter logging and tuning
To find the best model, record the hyperparameters used for each run.<br />
The two cells below make this logging convenient.

In [6]:
import configparser

def write_config(filename, with_time=False):
    """Save the current hyperparameters to an INI file.

    Reads the notebook-level globals ``embedding_dim``, ``hidden_dim``,
    ``learning_rate``, ``max_epoch`` and ``batch_size`` and stores them in
    the ``DEFAULT`` section of ``<filename>.ini``.

    Args:
        filename (str): Output path without the ``.ini`` extension.
        with_time (bool): When true, ``_<YYYYmmdd_HHMMSS>`` is appended to
            ``filename`` so each run gets its own file.

    Returns:
        str: ``'config'`` when no timestamp is used, otherwise ``'config'``
        immediately followed by the timestamp (no separator). The returned
        tag does not depend on ``filename``.
    """
    config = configparser.ConfigParser()
    config['DEFAULT'] = {'embedding_dim': embedding_dim,
                         'hidden_dim': hidden_dim,
                         'learning_rate': learning_rate,
                         'max_epoch': max_epoch,
                         'batch_size': batch_size}

    tag = 'config'
    if with_time:
        timestr = time.strftime("%Y%m%d_%H%M%S")
        filename = filename + '_' + timestr
        tag += timestr

    # Single write path shared by both the timestamped and plain variants.
    with open("{}.ini".format(filename), 'w') as configfile:
        config.write(configfile)
    return tag

In [7]:
### Hyperparameters tuning
### Run this cell for renewing the hyperparameters.
### Every run also writes a new timestamped .ini snapshot through write_config.

embedding_dim = 100   # width of the embedding matrix; must match the GloVe file loaded later (glove.6B.100d)
hidden_dim = 512      # presumably the model's hidden size -- not used in the cells shown here
learning_rate = 1e-4
max_epoch = 10
batch_size = 16

# write_config reads the five globals above; True requests a timestamped file name.
config_fname = write_config(os.path.join(CWD,"config"), True)
# config_fname will be used when logging training scalar to tensorboard

In [14]:
# Load the raw training CSV; dtype=str keeps every column as text.
dataset = pd.read_csv( os.path.join(DATA_PATH,'task1_trainset.csv'), dtype=str)

In [9]:
dataset.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 1
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


In [15]:
### Remove (current) redundant columns.

# Drop the metadata columns that are not used below, in a single call.
dataset = dataset.drop(columns=['Title', 'Categories', 'Created Date', 'Authors'])
dataset['Abstract'] = dataset['Abstract'].str.lower()
# 'Task 1' labels are deliberately left in upper case: label_to_onehot looks
# them up with upper-case keys.

# Strip stop words from every abstract. Assigning the whole column at once
# replaces the chained-indexing write `dataset['Abstract'][i] = ...`, which
# raises SettingWithCopyWarning, is slow, and does not modify the frame when
# pandas copy-on-write is enabled.
dataset['Abstract'] = dataset['Abstract'].apply(remove_stopwords)

In [11]:
dataset.head()

Unnamed: 0,Id,Abstract,Task 1
0,D00001,rapid popularity internet things (iot) cloud c...,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,"paper, address problem computing optimal paths...",OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,high quality upsampling sparse 3d point clouds...,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,internet main source information nowadays.$$$t...,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,automated facial expression recognition (fer) ...,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


In [17]:
# Hold out 10% of the rows for validation (test_size=0.1); the fixed
# random_state keeps the split identical between runs.
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=42)

# Write both splits next to the raw data, without the index column.
for split_name, split_frame in (('trainset', trainset), ('validset', validset)):
    split_frame.to_csv(os.path.join(DATA_PATH, split_name + '.csv'), index=False)

In [19]:
# Apply the same cleaning to the public test set: drop the unused metadata
# columns, lower-case the abstracts and strip stop words, then save the result.
dataset = pd.read_csv(os.path.join(DATA_PATH, 'task1_public_testset.csv'), dtype=str)
for unused_column in ('Title', 'Categories', 'Created Date', 'Authors'):
    dataset.drop(unused_column, axis=1, inplace=True)

lowered_abstracts = dataset['Abstract'].str.lower()
dataset['Abstract'] = lowered_abstracts
dataset['Abstract'] = dataset['Abstract'].apply(func=remove_stopwords)

dataset.to_csv(os.path.join(DATA_PATH, 'testset.csv'), index=False)

### Collect words and create the vocabulary set

In [20]:
from multiprocessing import Pool
from nltk.tokenize import word_tokenize

def collect_words(data_path, n_workers=8):
    """Collect the set of distinct tokens found in the 'Abstract' column.

    Each abstract is split into sentences on the '$$$' delimiter, the
    sentences are grouped into chunks, and the chunks are tokenized in
    parallel with ``word_tokenize``.

    Args:
        data_path (str): Path to a CSV file that has an 'Abstract' column.
        n_workers (int): Number of worker processes used for tokenizing.

    Returns:
        set of str: Every distinct token produced by ``word_tokenize``.
        Empty when the file contains no abstracts.
    """
    df = pd.read_csv(data_path, dtype=str)

    # Split abstracts into sentences; rows with a missing abstract are skipped
    # instead of crashing on `.split`.
    sent_list = []
    for abstract in df['Abstract'].dropna():
        sent_list.extend(abstract.split('$$$'))

    if not sent_list:
        return set()

    # Same chunking as before, but the step is clamped to 1 so that having
    # fewer sentences than workers no longer produces range(..., step=0).
    chunk_size = max(1, len(sent_list) // n_workers)
    chunks = [
        ' '.join(sent_list[start:start + chunk_size])
        for start in range(0, len(sent_list), chunk_size)
    ]

    with Pool(n_workers) as pool:
        # word_tokenize for word-word separation
        tokenized_chunks = pool.map(word_tokenize, chunks)

    # Merge per-chunk token lists directly into a set; avoids the quadratic
    # list concatenation of sum(lists, []).
    words = set()
    for tokens in tokenized_chunks:
        words.update(tokens)
    return words

In [22]:
# Build the vocabulary from the training split only (trainset.csv).
words = set()
words |= collect_words(os.path.join(DATA_PATH, 'trainset.csv'))

In [23]:
# Indices 0 and 1 are reserved for padding and out-of-vocabulary words.
PAD_TOKEN = 0
UNK_TOKEN = 1
word_dict = {'<pad>': PAD_TOKEN, '<unk>': UNK_TOKEN}

# Iterate in sorted order: the iteration order of a set of strings changes
# between interpreter runs, which would give every word a different index
# each time the notebook is executed.
for word in sorted(words):
    # Never let a corpus token overwrite one of the reserved entries.
    if word not in word_dict:
        word_dict[word] = len(word_dict)

In [24]:
# Persist the word -> index mapping to dictionary.pkl (path is relative to the
# process working directory).
with open('dictionary.pkl', 'wb') as f:
    pickle.dump(word_dict, f)

### Download Glove pretrained word embedding from web.

Link: http://nlp.stanford.edu/data/glove.6B.zip <br />
It takes about 5 minutes for the download.


In [25]:
import requests, zipfile, io

# Key the check on the file that is actually parsed afterwards rather than on
# the directory: a failed or interrupted download used to leave an empty
# 'glove' directory behind, after which the download was skipped forever.
if not os.path.exists(os.path.join('glove', 'glove.6B.100d.txt')):
    r = requests.get('http://nlp.stanford.edu/data/glove.6B.zip')
    # Fail loudly on an HTTP error instead of passing an error page to ZipFile.
    r.raise_for_status()
    os.makedirs('glove', exist_ok=True)
    # NOTE: the whole archive is held in memory before extraction.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(path='glove')

### Parsing the GloVe word-embeddings file

Parse the unzipped file (a .txt file) to build an index that maps words (as strings) to their vector representation (as number vectors)

In [27]:
# Parse the unzipped file (a .txt file) to build an index that maps words (as strings) to their vector representation (as number vectors)
wordvector_path = 'glove/glove.6B.100d.txt'
embeddings_index = {}
# The GloVe text file is UTF-8; pass the encoding explicitly so parsing does
# not depend on the platform's default encoding.
with open(wordvector_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": first field is the token,
        # the remaining fields are its vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors')

Found 400000 word vectors


In [28]:
### Preparing the GloVe word-embeddings matrix

# One row per vocabulary entry; rows stay all-zero for words that have no
# GloVe vector (including the '<pad>' and '<unk>' entries if GloVe lacks them).
max_words = len(word_dict)
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_dict.items():
    if not i < max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [29]:
# Convert the NumPy matrix into a float32 torch tensor.
embedding_matrix = torch.as_tensor(embedding_matrix, dtype=torch.float32)

### Data Formatting
建立完字典後，我們要將 data 切成數個 batch，並且將輸入的句子轉成數字，將答案轉成 onehot vector。
- `label_to_onehot(labels)`:  
    將 dataset 中的 label string 轉成 onehot encoding vector。  
- `sentence_to_indices(sentence, word_dict)`:  
    將輸入的句子中每個 word 轉成字典中對應的 index  
    ex : 'i love ncku' -> $[1,2,3]$
- `get_dataset(data_path, word_dict, n_workers=4)`:  
    將 dataset 讀入
- `preprocess_samples(dataset, word_dict)`:  
    傳入所有 input sentences 並進行 data preprocessing  
- `preprocess_sample(data, word_dict)`:  
    主要透過這個 function 以 `$$$` 為分隔符號將 'Abstract' 切成句子（同時移除 `$$$`），並將每個句子轉成 word indices；  
    並將 'Label' 轉成 onehot encoding vector。

In [30]:
def label_to_onehot(labels):
    """Encode the label string of one sentence as a multi-hot list.

    Args:
        labels (str): One or more class names joined by '/',
            e.g. 'METHODS/RESULTS'.
    Return:
        list of int: Six 0/1 flags in the order BACKGROUND, OBJECTIVES,
        METHODS, RESULTS, CONCLUSIONS, OTHERS.
    Raises:
        KeyError: If a name is not one of the six known classes.
    """
    classes = ('BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS')
    position = {name: idx for idx, name in enumerate(classes)}
    # Resolve every name first so an unknown label raises KeyError.
    active = {position[tag] for tag in labels.split('/')}
    return [1 if idx in active else 0 for idx in range(len(classes))]
        
def sentence_to_indices(sentence, word_dict):
    """Map every token of a sentence to its vocabulary index.

    Args:
        sentence (str): One sentence; it is tokenized with ``word_tokenize``.
        word_dict (dict): Mapping from token to integer index.
    Return:
        list of int: One index per token; tokens missing from ``word_dict``
        are mapped to ``UNK_TOKEN``.
    """
    indices = []
    for token in word_tokenize(sentence):
        indices.append(word_dict.get(token, UNK_TOKEN))
    return indices
    
def get_dataset(data_path, word_dict, n_workers=4):
    """Read a CSV file and preprocess its rows in parallel.

    The rows are cut into ``n_workers`` contiguous slices of equal size (the
    last slice also takes the remainder) and each slice is handed to
    ``preprocess_samples`` in a worker process.

    Args:
        data_path (str): Path to the CSV file.
        word_dict (dict): Mapping from token to integer index.
        n_workers (int): Number of worker processes / slices.
    Returns:
        list of dict: The processed samples, in the original row order.
    """
    dataset = pd.read_csv(data_path, dtype=str)

    with Pool(processes=n_workers) as pool:
        share = len(dataset) // n_workers
        # Slice boundaries: 0, share, 2*share, ..., (n_workers-1)*share, len(dataset)
        bounds = [share * k for k in range(n_workers)] + [len(dataset)]
        pending = [
            pool.apply_async(preprocess_samples, args=(dataset[lo:hi], word_dict))
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]

        # Wait for every submitted slice before leaving the pool context.
        pool.close()
        pool.join()

    processed = []
    for job in pending:
        processed += job.get()
    return processed

def preprocess_samples(dataset, word_dict):
    """Worker function: preprocess every row of a DataFrame slice.

    Args:
        dataset (pandas.DataFrame): Rows to process; iterated with ``iterrows``.
        word_dict (dict): Mapping from token to integer index.
    Returns:
        list of dict: One processed dict per row, in row order.
    """
    return [
        preprocess_sample(row, word_dict)
        for _, row in tqdm(dataset.iterrows(), total=len(dataset))
    ]

def preprocess_sample(data, word_dict):
    """Convert one row into index sequences and, if present, one-hot labels.

    Args:
        data: One row with an 'Abstract' entry and optionally a 'Task 1'
            entry (a pandas Series when called from ``preprocess_samples``).
        word_dict (dict): Mapping from token to integer index.
    Returns:
        dict: ``'Abstract'`` holds one list of word indices per sentence;
        ``'Label'`` (only when 'Task 1' is present) holds one one-hot list
        per space-separated label.
    """
    # Sentences inside an abstract are separated by '$$$'.
    sentences = data['Abstract'].split('$$$')
    processed = {'Abstract': [sentence_to_indices(s, word_dict) for s in sentences]}

    # Labels are only available for rows that carry a 'Task 1' entry.
    if 'Task 1' in data:
        tags = data['Task 1'].split(' ')
        processed['Label'] = list(map(label_to_onehot, tags))

    return processed

In [None]:
# Convert each saved split into lists of processed samples, using 8 worker
# processes per call. All three splits are indexed with the same word_dict.
print('[INFO] Start processing trainset...')
train = get_dataset(os.path.join(DATA_PATH,'trainset.csv'), word_dict, n_workers=8)
print('[INFO] Start processing validset...')
valid = get_dataset(os.path.join(DATA_PATH,'validset.csv'), word_dict, n_workers=8)
# presumably the test CSV has no 'Task 1' column, so its samples carry no 'Label' -- verify
print('[INFO] Start processing testset...')
test = get_dataset(os.path.join(DATA_PATH,'testset.csv'), word_dict, n_workers=8)