In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle, json, re, time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from tqdm import trange

from gensim.parsing import remove_stopwords

from collections import OrderedDict

import os
# Working directory of the notebook and the folder holding the CSV files.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, 'data')
# Use half of the logical cores for multiprocessing, but never fewer than one:
# os.cpu_count() may return None, and halving a single core would give 0,
# which multiprocessing.Pool rejects.
CPUNUM = max(1, (os.cpu_count() or 1) // 2)

### Hyperparameter logging and tuning
To tune the model, you need a record of the hyperparameters used in each run.<br />
The two cells below make this logging convenient by writing them to an `.ini` file.

In [21]:
import configparser

def write_config(filename, with_time=False):
    """Dump the current hyperparameters to ``<filename>.ini``.

    The values are read from the notebook-level variables ``embedding_dim``,
    ``hidden_dim``, ``learning_rate``, ``max_epoch``, ``batch_size`` and
    ``drop_p``, so those must be defined before this function is called.

    Args:
        filename (str): Output path without the ``.ini`` extension.
        with_time (bool): If it compares equal to ``False`` the file is
            written to ``<filename>.ini``; otherwise ``_<YYYYmmdd_HHMMSS>``
            is appended to ``filename`` first.

    Returns:
        str: ``'config'``, or ``'config'`` immediately followed by the time
        stamp. This is a run tag, not the path of the written file.
    """
    settings = configparser.ConfigParser()
    settings['DEFAULT'] = {
        'embedding_dim': embedding_dim,
        'hidden_dim': hidden_dim,
        'learning_rate': learning_rate,
        'max_epoch': max_epoch,
        'batch_size': batch_size,
        'drop_p' : drop_p,
    }

    # `== False` is kept on purpose: only values equal to False (False, 0)
    # disable the time stamp; e.g. None still enables it.
    stamp = '' if with_time == False else time.strftime("%Y%m%d_%H%M%S")
    target = filename + '_' + stamp if stamp else filename

    with open("{}.ini".format(target), 'w') as handle:
        settings.write(handle)
    return 'config' + stamp

In [22]:
### Hyperparameter tuning
### Re-run this cell after editing any value below; it also writes a new
### timestamped .ini file recording the values.

embedding_dim = 100 # word embedding dimension; must match the GloVe vectors that are loaded
hidden_dim = 512  # GRU hidden size (per direction) and width of the first FC layer
learning_rate = 1e-4  # passed to the AdamW optimizer
max_epoch = 10  # number of train + validation passes
batch_size = 16  # abstracts per DataLoader batch
drop_p = 0.2  # dropout probability used in the GRU and in the FC head

config_fname = write_config(os.path.join(CWD,"config"), True)
# config_fname is used as the TensorBoard run directory name when logging training scalars

In [4]:
# Load the raw training CSV; dtype=str makes pandas read every column as text.
dataset = pd.read_csv( os.path.join(DATA_PATH,'task1_trainset.csv'), dtype=str)

In [5]:
### Remove (currently) redundant columns.
dataset = dataset.drop(columns=['Title', 'Categories', 'Created Date', 'Authors'])

# Lower-case the abstracts so tokens match the uncased GloVe vocabulary.
dataset['Abstract'] = dataset['Abstract'].str.lower()
#dataset['Task 1'] = dataset['Task 1'].str.lower()

# Strip stop words column-wise. The previous element-by-element chained
# assignment (dataset['Abstract'][i] = ...) triggers SettingWithCopyWarning
# and does not modify the frame when pandas copy-on-write is enabled.
dataset['Abstract'] = dataset['Abstract'].apply(remove_stopwords)

In [6]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
trainset, validset = train_test_split(dataset, random_state=42, test_size=0.1)

# Write both splits next to the raw data so later cells can reload them.
for split_frame, split_file in ((trainset, 'trainset.csv'), (validset, 'validset.csv')):
    split_frame.to_csv(os.path.join(DATA_PATH, split_file), index=False)

In [7]:
# Apply the same cleaning to the public test set: drop the (currently)
# unused columns, lower-case the abstracts and strip stop words.
dataset = pd.read_csv(os.path.join(DATA_PATH, 'task1_public_testset.csv'), dtype=str)
for unused_column in ('Title', 'Categories', 'Created Date', 'Authors'):
    dataset.drop(unused_column, axis=1, inplace=True)

dataset['Abstract'] = dataset['Abstract'].str.lower().apply(func=remove_stopwords)

# Save the cleaned frame as testset.csv.
dataset.to_csv(os.path.join(DATA_PATH, 'testset.csv'), index=False)

### Collect words and create the vocabulary set

In [8]:
from multiprocessing import Pool
from nltk.tokenize import word_tokenize

def collect_words(data_path, n_workers=4):
    """Collect the set of distinct tokens found in the abstracts of a CSV file.

    Every abstract is split into sentences on the ``$$$`` separator and each
    sentence is tokenised on its own with ``word_tokenize`` — the same call
    ``sentence_to_indices`` makes — so the vocabulary is built from the tokens
    that are looked up later.

    Args:
        data_path (str): Path to a CSV file with an ``Abstract`` column.
        n_workers (int): Number of worker processes. Values below 1 are
            treated as 1, and no more workers than sentences are started.

    Returns:
        set of str: All distinct tokens; empty if the file has no abstracts.
    """
    df = pd.read_csv(data_path, dtype=str)

    # Flatten all abstracts into one list of sentences; rows with a missing
    # abstract are skipped instead of crashing on NaN.
    sent_list = []
    for abstract in df['Abstract'].dropna():
        sent_list.extend(abstract.split('$$$'))
    if not sent_list:
        return set()

    n_workers = max(1, min(n_workers, len(sent_list)))
    # Ceiling division: one contiguous chunk of sentences per worker. The old
    # floor-based step was 0 (ValueError) when sentences < n_workers.
    chunksize = -(-len(sent_list) // n_workers)
    with Pool(n_workers) as pool:
        token_lists = pool.map(word_tokenize, sent_list, chunksize=chunksize)

    words = set()
    for tokens in token_lists:
        words.update(tokens)
    return words

In [9]:
# The vocabulary is taken from the training split only.
words = set(collect_words(os.path.join(DATA_PATH, 'trainset.csv'), n_workers=CPUNUM))

In [10]:
# Reserved indices: 0 pads short sentences, 1 stands for out-of-vocabulary words.
PAD_TOKEN = 0
UNK_TOKEN = 1
word_dict = {'<pad>':PAD_TOKEN,'<unk>':UNK_TOKEN}
# Iterate in sorted order: set iteration order for strings varies between
# interpreter runs (hash randomisation), which would give every word a
# different index after a kernel restart and invalidate saved weights.
for word in sorted(words):
    # setdefault keeps the reserved entries intact should a corpus token ever
    # equal '<pad>' or '<unk>'.
    word_dict.setdefault(word, len(word_dict))

In [11]:
# Persist the word -> index mapping (relative to the current working directory).
with open('dictionary.pkl', 'wb') as dict_file:
    pickle.dump(word_dict, dict_file)

### Download the GloVe pretrained word embeddings from the web.

Link: http://nlp.stanford.edu/data/glove.6B.zip <br />
It takes about 5 minutes for the download.


In [12]:
import requests, zipfile, io

# Check for the extracted vector file rather than the directory: the directory
# used to be created before the download, so a failed or interrupted download
# left an empty 'glove' folder and the archive was never fetched again.
if not os.path.exists(os.path.join('glove', 'glove.6B.100d.txt')):
    r = requests.get('http://nlp.stanford.edu/data/glove.6B.zip')
    # Fail loudly on an HTTP error instead of trying to unzip an error page.
    r.raise_for_status()
    os.makedirs('glove', exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(path='glove')

### Parsing the GloVe word-embeddings file

Parse the unzipped file (a .txt file) to build an index that maps words (as strings) to their vector representation (as number vectors)

In [13]:
# Parse the unzipped .txt file into an index that maps each word (str) to its
# vector (float32 numpy array). Each line is "<word> <v1> <v2> ...".
wordvector_path = 'glove/glove.6B.100d.txt'
embeddings_index = {}
# Read explicitly as UTF-8; without it the platform default encoding is used,
# which is not UTF-8 everywhere.
with open(wordvector_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors')

Found 400000 word vectors


In [14]:
### Build the GloVe embedding matrix: row i holds the vector of the word whose
### index in word_dict is i. Words without a GloVe vector keep an all-zero row.

max_words = len(word_dict)
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, row in word_dict.items():
    if row >= max_words:
        continue
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        continue
    embedding_matrix[row] = pretrained_vector

In [15]:
# Convert the matrix to a float32 torch tensor so it can initialise nn.Embedding.
embedding_matrix = torch.as_tensor(embedding_matrix, dtype=torch.float32)

### Data Formatting
建立完字典後，我們要將 data 切成數個 batch，並且將輸入的句子轉成數字，將答案轉成 onehot vector。
- `label_to_onehot(labels)`:  
    將 dataset 中的 label string 轉成 onehot encoding vector。  
- `sentence_to_indices(sentence, word_dict)`:  
    將輸入的句子中每個 word 轉成字典中對應的 index  
    ex : 'i love ncku' -> $[1,2,3]$
- `get_dataset(data_path, word_dict, n_workers=4)`:  
    將 dataset 讀入
- `preprocess_samples(dataset, word_dict)`:  
    傳入所有 input sentences 並進行 data preprocessing  
- `preprocess_sample(data, word_dict)`:  
    主要透過這個 function 移除存在於 'Abstract' 中的 `$` 符號  
    並將 'Label' 轉成 onehot encoding vector。

In [16]:
def label_to_onehot(labels):
    """Encode the labels of one sentence as a 6-dimensional multi-hot list.

    Args:
        labels (str): One or more label names joined by '/',
            e.g. 'METHODS/RESULTS'.
    Return:
        list of int: 1 at the position of every listed label, 0 elsewhere.
    Raises:
        KeyError: If a label name is not one of the six known classes.
    """
    class_names = ('BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS')
    position_of = {name: position for position, name in enumerate(class_names)}
    active = {position_of[name] for name in labels.split('/')}
    return [1 if position in active else 0 for position in range(len(class_names))]
        
def sentence_to_indices(sentence, word_dict):
    """Map every token of a sentence to its vocabulary index.

    Args:
        sentence (str): One sentence; it is tokenised with `word_tokenize`.
        word_dict (dict): Mapping from token to integer index.
    Return:
        indices (list of int): One index per token; tokens missing from
        `word_dict` are mapped to UNK_TOKEN.
    """
    indices = []
    for token in word_tokenize(sentence):
        indices.append(word_dict.get(token, UNK_TOKEN))
    return indices
    
def get_dataset(data_path, word_dict, n_workers=4):
    """Load a CSV split and preprocess it in parallel.

    The rows are cut into `n_workers` contiguous slices (the last slice takes
    the remainder) and each slice is handed to `preprocess_samples` in a
    worker process. The original row order is preserved in the result.

    Args:
        data_path (str): Path to the CSV file.
        word_dict (dict): Mapping from token to integer index.
        n_workers (int): Number of worker processes; values below 1 are
            treated as 1 so that a zero worker count cannot crash the pool
            or the slice-size division.
    Returns:
        list of dict: One processed sample per row.
    """
    dataset = pd.read_csv(data_path, dtype=str)

    n_workers = max(1, n_workers)
    rows_per_worker = len(dataset) // n_workers
    # Slice boundaries: n_workers equal-sized slices, the last one extended
    # to the end of the frame.
    bounds = [rows_per_worker * k for k in range(n_workers)] + [len(dataset)]
    batches = [dataset[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    with Pool(processes=n_workers) as pool:
        per_worker = pool.starmap(preprocess_samples,
                                  [(batch, word_dict) for batch in batches])

    return [sample for part in per_worker for sample in part]

def preprocess_samples(dataset, word_dict):
    """Worker function: preprocess every row of one slice of the data.

    Args:
        dataset: Object providing `iterrows()` and `len()`; `get_dataset`
            passes a pandas DataFrame slice.
        word_dict (dict): Mapping from token to integer index.
    Returns:
        list of processed dict, one per row, shown with a tqdm progress bar.
    """
    rows = tqdm(dataset.iterrows(), total=len(dataset))
    return [preprocess_sample(row, word_dict) for _, row in rows]

def preprocess_sample(data, word_dict):
    """Convert one row into index sequences and, if present, one-hot labels.

    Args:
        data: Row supporting `data['Abstract']` and `'Task 1' in data`.
        word_dict (dict): Mapping from token to integer index.
    Returns:
        dict: 'Abstract' holds one list of word indices per sentence;
        'Label' (only when the row has a 'Task 1' entry) holds one one-hot
        list per space-separated label group.
    """
    # sentences are separated by '$$$' inside the abstract
    sentences = data['Abstract'].split('$$$')
    processed = {'Abstract': [sentence_to_indices(sentence, word_dict) for sentence in sentences]}

    # label groups are separated by single spaces
    if 'Task 1' in data:
        label_groups = data['Task 1'].split(' ')
        processed['Label'] = list(map(label_to_onehot, label_groups))

    return processed

In [17]:
# Locations of the three cleaned splits written by the earlier cells.
train_csv = os.path.join(DATA_PATH,'trainset.csv')
valid_csv = os.path.join(DATA_PATH,'validset.csv')
test_csv = os.path.join(DATA_PATH,'testset.csv')

# Convert each split into lists of word indices (and one-hot labels where available).
print('[INFO] Start processing trainset...')
train = get_dataset(train_csv, word_dict, n_workers=CPUNUM)
print('[INFO] Start processing validset...')
valid = get_dataset(valid_csv, word_dict, n_workers=CPUNUM)
print('[INFO] Start processing testset...')
test = get_dataset(test_csv, word_dict, n_workers=CPUNUM)

[INFO] Start processing trainset...
100%|██████████| 787/787 [00:00<00:00, 952.67it/s]
100%|██████████| 787/787 [00:00<00:00, 906.39it/s]
100%|██████████| 787/787 [00:00<00:00, 861.71it/s]

100%|██████████| 791/791 [00:00<00:00, 868.09it/s]
100%|██████████| 787/787 [00:00<00:00, 794.86it/s]
100%|██████████| 787/787 [00:00<00:00, 842.91it/s]
100%|██████████| 787/787 [00:00<00:00, 792.73it/s]
[INFO] Start processing validset...
 91%|█████████ | 79/87 [00:00<00:00, 779.39it/s]
100%|██████████| 87/87 [00:00<00:00, 782.57it/s]
100%|██████████| 87/87 [00:00<00:00, 765.11it/s]
100%|██████████| 87/87 [00:00<00:00, 844.27it/s]
 92%|█████████▏| 80/87 [00:00<00:00, 790.73it/s]
 87%|████████▋ | 76/87 [00:00<00:00, 753.11it/s]
100%|██████████| 91/91 [00:00<00:00, 955.70it/s]
100%|██████████| 87/87 [00:00<00:00, 730.61it/s]
[INFO] Start processing testset...
 96%|█████████▌| 2388/2500 [00:02<00:00, 895.48it/s]
100%|██████████| 2500/2500 [00:02<00:00, 870.70it/s]

100%|██████████| 2500/2500 [00:02<00

### Create a dataset class for the abstract dataset
`torch.utils.data.Dataset` is an abstract class representing a dataset.<br />Your custom dataset should inherit Dataset and override the following methods:

- `__len__` so that len(dataset) returns the size of the dataset.
- `__getitem__` to support indexing, so that `dataset[i]` returns the $i$-th sample
- `collate_fn` Users may use customized collate_fn to achieve custom batching
    - Here we pad sequences of various lengths (make same length of every single sentence)

In [18]:
class AbstractDataset(Dataset):
    """Dataset of preprocessed abstracts with a padding collate function.

    Each item is a dict with 'Abstract' (list of sentences, each a list of
    word indices) and optionally 'Label' (one 6-dim one-hot list per sentence).
    """

    def __init__(self, data, pad_idx, max_len = 64):
        """
        Args:
            data (list of dict): Preprocessed samples.
            pad_idx (int): Index used to pad sentences and abstracts.
            max_len (int): Maximum number of words kept per sentence.
        """
        self.data = data
        self.pad_idx = pad_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def collate_fn(self, datas):
        """Pad a list of samples into rectangular batch tensors.

        Sentences are truncated/padded to the longest sentence in the batch
        (capped at `self.max_len`); abstracts are padded with all-pad
        sentences to the largest sentence count in the batch.

        Args:
            datas (list of dict): Samples as returned by `__getitem__`.
        Returns:
            tuple: (LongTensor of word indices shaped
            [batch, sentences, words], FloatTensor of labels shaped
            [batch, sentences, 6] — empty when no sample has labels —,
            list with the true sentence count of every abstract).
        """
        # batch-wide target sizes
        max_sent = max(len(data['Abstract']) for data in datas)
        max_len = max(min(len(sentence), self.max_len)
                      for data in datas for sentence in data['Abstract'])
        batch_abstract = []
        batch_label = []
        sent_len = []
        for data in datas:
            # truncate long sentences, right-pad short ones
            pad_abstract = [
                sentence[:max_len] + [self.pad_idx] * (max_len - len(sentence))
                for sentence in data['Abstract']
            ]
            sent_len.append(len(pad_abstract))
            # add all-pad sentences until every abstract has max_sent rows
            pad_abstract.extend([[self.pad_idx] * max_len] * (max_sent - len(pad_abstract)))
            batch_abstract.append(pad_abstract)
            # gather labels
            if 'Label' in data:
                labels = data['Label']
                # Build a NEW list: extending data['Label'] in place would
                # permanently pad the stored sample, so a later batch with a
                # smaller max_sent would receive labels of the wrong length.
                pad_label = labels + [[0] * 6] * (max_sent - len(labels))
                batch_label.append(pad_label)
        return torch.LongTensor(batch_abstract), torch.FloatTensor(batch_label), sent_len

In [19]:
# Wrap each preprocessed split; sentences are capped at 64 words when batched.
trainData, validData, testData = (
    AbstractDataset(split_samples, PAD_TOKEN, max_len = 64)
    for split_samples in (train, valid, test)
)

### TO-DO List
- [x] : Increase the number of GRU layers  
        Now : 2 layers
- [ ] : use fastText embeddings
- [ ] : use `title` column
- [x] : add dropout layer
        Now : dropout prob = `drop_p` (0.2 in the hyperparameter cell)

In [32]:
class Net(nn.Module):
    """Bidirectional GRU sentence classifier on top of pretrained embeddings.

    Reads the notebook-level `embedding_matrix`, `embedding_dim`,
    `hidden_dim` and `drop_p` when it is constructed.
    """

    def __init__(self, vocabulary_size):
        """
        Args:
            vocabulary_size (int): Number of rows expected in
                `embedding_matrix` (one per vocabulary entry).
        """
        super().__init__()
        assert tuple(embedding_matrix.shape) == (vocabulary_size, embedding_dim), \
            'embedding_matrix must have shape (vocabulary_size, embedding_dim)'
        # Public API instead of the private `_weight` keyword. The clone keeps
        # the trainable parameter from aliasing the global tensor; otherwise
        # optimizer steps on a CPU model modify `embedding_matrix` in place and
        # a re-created model no longer starts from the pretrained vectors.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.detach().clone(), freeze=False)
        self.sent_rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=2, dropout=drop_p,
                               bidirectional=True, batch_first=True)
        self.FCLayer = nn.Sequential(OrderedDict([
            ('FC1', nn.Linear(hidden_dim*2, hidden_dim)),
            ('DropOut1', nn.Dropout(drop_p)),
            ('ReLU1', nn.ReLU()),
            ('FC2', nn.Linear(hidden_dim, 6)),
            ('Sigmoid', nn.Sigmoid())
        ]))
        torch.nn.init.xavier_normal_(self.FCLayer[0].weight)

    def forward(self, x):
        """Return per-sentence class probabilities.

        Args:
            x (LongTensor): Word indices shaped [b, s, w].
        Returns:
            Tensor shaped [b, s, 6] with values in [0, 1] (sigmoid outputs).
        """
        # b: batch_size
        # s: number of sentences
        # w: number of words
        # e: embedding_dim
        x = self.embedding(x)
        b,s,w,e = x.shape
        # run the GRU over all words of the abstract as one long sequence
        x = x.view(b,s*w,e)
        x, __ = self.sent_rnn(x)
        # back to one row per sentence, then max-pool over its words
        x = x.view(b,s,w,-1)
        x = torch.max(x,dim=2)[0]
        y = self.FCLayer(x)
        return y

In [34]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [25]:
### Helper functions for scoring

class F1():
    """Running F1 score over thresholded multi-label predictions.

    Counts are accumulated over all elements passed to `update`, so the
    score is computed from the pooled counts rather than averaged per batch.
    """

    def __init__(self):
        self.threshold = 0.5  # predictions above this count as positive
        self.name = 'F1'
        self.reset()

    def reset(self):
        """Clear all accumulated counts."""
        self.n_precision = 0  # number of predicted positives
        self.n_recall = 0     # number of ground-truth positives
        self.n_corrects = 0   # number of true positives

    def update(self, predicts, groundTruth):
        """Accumulate counts from one batch.

        Args:
            predicts (Tensor): Scores, thresholded at `self.threshold`.
            groundTruth (Tensor): 0/1 targets of the same shape.
        """
        predicts = predicts > self.threshold
        self.n_precision += torch.sum(predicts).item()
        self.n_recall += torch.sum(groundTruth).item()
        self.n_corrects += torch.sum(groundTruth.type(torch.bool) & predicts).item()

    def get_score(self):
        """Return the F1 score of everything accumulated so far.

        Returns:
            float: 0.0 when nothing (or no positive) has been seen yet.
        """
        eps = 1e-20
        # Recall is guarded the same way as precision; it used to raise
        # ZeroDivisionError when no positive label had been accumulated.
        recall = self.n_corrects / (self.n_recall + eps)
        precision = self.n_corrects / (self.n_precision + eps)
        return 2 * (recall * precision) / (recall + precision + eps)

    def print_score(self):
        """Return the score formatted with five decimals."""
        score = self.get_score()
        return '{:.5f}'.format(score)

In [26]:
def _run_epoch(epoch, mode):
    """Run one pass over the training or validation split and log its metrics.

    Relies on the notebook-level objects `model`, `opt`, `writer`, `history`,
    `trainData`, `validData`, `batch_size` and `device`.

    Args:
        epoch (int): Epoch index, used as the TensorBoard step.
        mode (str): "train" optimises on `trainData`; any other value
            evaluates on `validData` without updating the weights.
    """
    is_train = mode == "train"
    # Dropout must only be active while training; previously the model was
    # left in training mode for validation as well.
    model.train(is_train)
    if is_train:
        description = 'Train'
        dataset = trainData
    else:
        description = 'Valid'
        dataset = validData
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=is_train,
                            collate_fn=dataset.collate_fn,
                            num_workers=8)

    n_batches = len(dataloader)
    progress = tqdm(enumerate(dataloader), total=n_batches, desc=description)
    loss = 0
    f1_score = F1()
    # NOTE(review): `sent_len` is not used, so padded sentence rows take part
    # in both the loss and the F1 counts — confirm this is intended.
    for i, (x, y, sent_len) in progress:
        # Log the graph once, on the very first training batch; doing it on
        # every batch of epoch 0 repeats the same logging for each step.
        if is_train and epoch == 0 and i == 0:
            writer.add_graph(model, x.to(device))

        # No autograd bookkeeping is needed while validating.
        with torch.set_grad_enabled(is_train):
            o_labels, batch_loss = _run_iter(x,y)
        if is_train:
            opt.zero_grad()
            batch_loss.backward()
            opt.step()

        loss += batch_loss.item()
        f1_score.update(o_labels.detach().cpu(), y)

        progress.set_postfix(
            loss=loss / (i + 1), f1=f1_score.print_score())

    split = 'train' if is_train else 'valid'
    mean_loss = loss / n_batches
    epoch_f1 = f1_score.get_score()
    history[split].append({'f1':epoch_f1, 'loss':mean_loss})
    writer.add_scalar('Loss/' + split, mean_loss, epoch)
    writer.add_scalar('F1_score/' + split, epoch_f1, epoch)
    progress.close()
    

def _run_iter(x,y):
    """Forward one batch through the global `model` and compute its loss.

    Args:
        x: Batch of inputs; moved to `device` before the forward pass.
        y: Batch of targets; moved to `device` before the loss is computed.
    Returns:
        tuple: (model outputs, loss returned by the global `criteria`).
    """
    predictions = model(x.to(device))
    targets = y.to(device)
    return predictions, criteria(predictions, targets)

def save(epoch):
    """Store the model weights of this epoch and the metric history so far.

    Writes `model/model.pkl.<epoch>` (state dict) and `model/history.json`
    under CWD, creating the `model` directory when it does not exist.

    Args:
        epoch (int): Epoch index appended to the weight file name.
    """
    model_dir = os.path.join(CWD,'model')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    weights_path = os.path.join( CWD,'model/model.pkl.'+str(epoch) )
    torch.save(model.state_dict(), weights_path)

    history_path = os.path.join( CWD,'model/history.json')
    with open(history_path, 'w') as history_file:
        json.dump(history, history_file, indent=4)

In [33]:
model = Net(len(word_dict))
# Move the model to its device BEFORE building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is guaranteed to
# hold the parameters that are actually trained.
model.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criteria = torch.nn.BCELoss()
# per-epoch metrics, filled by _run_epoch and written to disk by save()
history = {'train':[],'valid':[]}

## Tensorboard
## save path: test_experiment/
tf_path = os.path.join(CWD, 'test_experiment')
os.makedirs(tf_path, exist_ok=True)
# one run directory per hyperparameter configuration
writer = SummaryWriter(os.path.join(tf_path,config_fname))

for epoch in range(max_epoch):
    print('Epoch: {}'.format(epoch))
    _run_epoch(epoch, 'train')
    _run_epoch(epoch, 'valid')
    save(epoch)

# make sure all pending scalars reach the event file
writer.flush()

# Plot the training results from the history file written by save()
with open(os.path.join(CWD,'model/history.json'), 'r') as history_file:
    history = json.load(history_file)

# one value per epoch for each curve
train_loss = [record['loss'] for record in history['train']]
valid_loss = [record['loss'] for record in history['valid']]
train_f1 = [record['f1'] for record in history['train']]
valid_f1 = [record['f1'] for record in history['valid']]

# one figure per metric, each with a train and a valid curve
for figure_title, train_curve, valid_curve in (
        ('Loss', train_loss, valid_loss),
        ('F1 Score', train_f1, valid_f1)):
    plt.figure(figsize=(7,5))
    plt.title(figure_title)
    plt.plot(train_curve, label='train')
    plt.plot(valid_curve, label='valid')
    plt.legend()
    plt.show()

# best validation F1 together with the epoch index at which it occurred
print('Best F1 score ', max([record['f1'], idx] for idx, record in enumerate(history['valid'])))

Epoch: 0

Train:   0%|          | 0/394 [00:00<?, ?it/s][A
Train:   0%|          | 0/394 [00:11<?, ?it/s, f1=0.13492, loss=0.682][A
Train:   0%|          | 1/394 [00:11<1:15:33, 11.54s/it, f1=0.13492, loss=0.682][A
Train:   0%|          | 1/394 [00:18<1:15:33, 11.54s/it, f1=0.10843, loss=0.675][A
Train:   1%|          | 2/394 [00:18<1:05:55, 10.09s/it, f1=0.10843, loss=0.675][A
Train:   1%|          | 2/394 [00:26<1:05:55, 10.09s/it, f1=0.09091, loss=0.668][A
Train:   1%|          | 3/394 [00:26<1:01:42,  9.47s/it, f1=0.09091, loss=0.668][A
Train:   1%|          | 3/394 [00:36<1:01:42,  9.47s/it, f1=0.07852, loss=0.661][A
Train:   1%|          | 4/394 [00:36<1:02:14,  9.58s/it, f1=0.07852, loss=0.661][A

KeyboardInterrupt: 