In [None]:
import os

# Print only the variable names: dumping the whole mapping would store every
# value (tokens, credentials, ...) in the saved notebook output.
print(sorted(os.environ))

In [None]:
# Shell out to nvidia-smi to report the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the notebook can reach files stored there.
drive.mount('/content/gdrive')

In [None]:
os.chdir("gdrive/My Drive/Colab Notebooks/AI classify")
! ls

## Runtime Environment

In [None]:
! pip3 install torch torchvision pandas nltk numpy sklearn tqdm
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download("maxent_treebank_pos_tagger")
nltk.download('averaged_perceptron_tagger')

# Take a look at the dataset

In [None]:
import pandas as pd

# Read every column as str so pandas does not infer other dtypes for the raw fields.
dataset = pd.read_csv('./data/task2_trainset.csv', dtype=str)
dataset.head()

# Data processing

In [None]:
# Remove the metadata columns that the rest of the notebook does not read.
# A single non-in-place drop with errors='ignore' keeps the cell idempotent:
# re-running it after the columns are gone no longer raises KeyError.
dataset = dataset.drop(
    columns=['Title', 'Categories', 'Created Date', 'Authors'],
    errors='ignore',
)

In [None]:
# Preview the first rows of the frame as it stands after the column drops.
dataset.head()

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable.
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=890414)

# Persist both splits as CSV files without the index column.
trainset.to_csv('trainset.csv', index=False)
validset.to_csv('validset.csv', index=False)

In [None]:
# Load the public test set, strip the same four metadata columns that are
# removed from the training data, and write the result to 'testset.csv'.
dataset = pd.read_csv('./data/task2_public_testset.csv', dtype=str).drop(
    columns=['Title', 'Categories', 'Created Date', 'Authors']
)
dataset.to_csv('testset.csv', index=False)

In [None]:
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
print(stopwords.words('english'))

# NOTE(review): this rebinds the name `stopwords` from the nltk corpus reader to a
# plain set of words; the `from nltk.corpus import stopwords` line earlier in this
# cell restores the reader each time the cell is re-run.
stopwords = set(stopwords.words('english'))

# Lemmatizer instance shared by lemmatize_sentence().
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Args:
        treebank_tag (str): POS tag as produced by `pos_tag`.

    Returns:
        `wordnet.ADJ`, `wordnet.VERB`, `wordnet.NOUN` or `wordnet.ADV` when the
        tag starts with 'J', 'V', 'N' or 'R' respectively; otherwise None.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_tag in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_tag
    return None

def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Each token is tagged first, then lower-cased and lemmatized with the
    WordNet POS derived from its tag (NOUN when the tag has no mapping).
    Lemmas that occur as a substring of `string.punctuation` are dropped.

    Args:
        sentence (str): Raw sentence text.

    Returns:
        list of str: Lemmas in their original order.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    lemmas = [
        lemmatizer.lemmatize(token.lower(),
                             pos=get_wordnet_pos(tag) or wordnet.NOUN)
        for token, tag in tagged_tokens
    ]
    return [lemma for lemma in lemmas if lemma not in string.punctuation]

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

def collect_words(data_path, n_workers=4):
    """Collect the set of lemmas found in the 'Abstract' column of a CSV file.

    Every abstract is lower-cased and split into sentences on '$$$'; each
    sentence is then passed through `lemmatize_sentence`.

    Args:
        data_path (str): Path of the CSV file to read.
        n_workers (int): Accepted for interface compatibility; not used.

    Returns:
        set of str: All distinct lemmas.
    """
    frame = pd.read_csv(data_path, dtype=str)

    sentences = []
    for _, row in frame.iterrows():
        sentences.extend(row['Abstract'].lower().split('$$$'))

    vocabulary = set()
    for sentence in tqdm(sentences):
        vocabulary.update(lemmatize_sentence(sentence))

    return vocabulary

In [None]:
# Build the vocabulary from the full training file; the variant that reads only
# the train split ('trainset.csv') is kept commented out.
words = set()
# words |= collect_words('trainset.csv')
words |= collect_words('./data/task2_trainset.csv')

In [None]:
# Reserved indices for the padding and unknown-word entries.
PAD_TOKEN = 0
UNK_TOKEN = 1
word_dict = {'<pad>': PAD_TOKEN, '<unk>': UNK_TOKEN}
# Iterate in sorted order: a set of str has no stable iteration order across
# interpreter runs, so unsorted iteration would give every word a different
# index each time the notebook is executed.
for word in sorted(words):
    word_dict[word] = len(word_dict)

In [None]:
import pickle

# Serialize the word -> index mapping to disk.
# NOTE(review): the file name is spelled 'dicitonary.pkl'; anything that loads it
# must use the same spelling.
with open('dicitonary.pkl','wb') as f:
    pickle.dump(word_dict, f)

## Embedding class for storing pretrained embeddings

In [None]:
import re
import torch

class Embedding:
    """Vocabulary and word-vector matrix loaded from a text embedding file.

    After loading, the special tokens `<pad>` (zero vector), `<bos>`, `<eos>`
    and `<unk>` (randomly initialized) are appended if they are not present.

    Args:
        embedding_path (str): Path where embedding are loaded from (text file).
        words (None or list): If not None, only load embedding of the words in
            the list.
        oov_as_unk (bool): If argument `words` are provided, whether or not
            treat words in `words` but not in embedding file as `<unk>`. If
            true, OOV will be mapped to the index of `<unk>`. Otherwise,
            embedding of those OOV will be randomly initialize and their
            indices will be after non-OOV.
        lower (bool): Whether or not lower the words.
        rand_seed (int): Random seed for embedding initialization.
    """

    def __init__(self, embedding_path,
                 words=None, oov_as_unk=True, lower=True, rand_seed=524):
        self.word_dict = {}
        self.vectors = None
        self.lower = lower
        # Seed BEFORE any random initialization. `extend` draws random vectors
        # for OOV words when `oov_as_unk` is False, so seeding afterwards would
        # leave those vectors dependent on the prior global RNG state.
        torch.manual_seed(rand_seed)
        self.extend(embedding_path, words, oov_as_unk)

        if '<pad>' not in self.word_dict:
            # Padding is represented by the all-zero vector.
            self.add(
                '<pad>', torch.zeros(self.get_dim())
            )

        if '<bos>' not in self.word_dict:
            t_tensor = torch.rand((1, self.get_dim()), dtype=torch.float)
            torch.nn.init.orthogonal_(t_tensor)
            self.add(
                '<bos>', t_tensor
            )

        if '<eos>' not in self.word_dict:
            t_tensor = torch.rand((1, self.get_dim()), dtype=torch.float)
            torch.nn.init.orthogonal_(t_tensor)
            self.add(
                '<eos>', t_tensor
            )

        if '<unk>' not in self.word_dict:
            self.add('<unk>')

    def to_index(self, word):
        """
        Args:
            word (str)

        Return:
             index of the word. If the word is not in `words` and not in the
             embedding file, then index of `<unk>` will be returned.
        """
        if self.lower:
            word = word.lower()

        if word not in self.word_dict:
            return self.word_dict['<unk>']
        else:
            return self.word_dict[word]

    def get_dim(self):
        """Return the dimensionality of a single word vector."""
        return self.vectors.shape[1]

    def get_vocabulary_size(self):
        """Return the number of rows (words) in the vector matrix."""
        return self.vectors.shape[0]

    def get_dict(self):
        """Return the word -> index mapping."""
        return self.word_dict

    def add(self, word, vector=None):
        """Append one word and its vector to the vocabulary.

        Args:
            word (str): Word to add (lower-cased when `self.lower` is set).
            vector (torch.Tensor or None): Vector for the word; it is reshaped
                to a single row. When None, a row drawn from U(0, 1) is used.
        """
        if self.lower:
            word = word.lower()

        if vector is not None:
            vector = vector.view(1, -1)
        else:
            vector = torch.empty(1, self.get_dim())
            torch.nn.init.uniform_(vector)
        self.vectors = torch.cat([self.vectors, vector], 0)
        self.word_dict[word] = len(self.word_dict)

    def extend(self, embedding_path, words, oov_as_unk=True):
        """Load vectors from `embedding_path` and optionally add OOV words.

        Args:
            embedding_path (str): Text embedding file to read.
            words (None or list): Restrict loading to these words when given.
            oov_as_unk (bool): When False (and `words` is given), every word in
                `words` that the file did not provide gets its own randomly
                initialized vector appended after the loaded ones.
        """
        self._load_embedding(embedding_path, words)

        if words is not None and not oov_as_unk:
            # initialize word vector for OOV
            for word in words:
                if self.lower:
                    word = word.lower()

                if word not in self.word_dict:
                    self.word_dict[word] = len(self.word_dict)

            # One new row per word registered above that has no vector yet.
            oov_vectors = torch.nn.init.uniform_(
                torch.empty(len(self.word_dict) - self.vectors.shape[0],
                            self.vectors.shape[1]))

            self.vectors = torch.cat([self.vectors, oov_vectors], 0)

    def _load_embedding(self, embedding_path, words):
        """Read `word v1 v2 ...` lines and append them to the vocabulary.

        An optional first line of the form `<int> <int>` is treated as a header
        and skipped. Words already present in `self.word_dict` are not
        reloaded.

        NOTE(review): words read from the file are stored as-is, whereas
        `add`, `extend` and `to_index` lower-case when `self.lower` is set;
        confirm the embedding file is already lower-case.
        """
        if words is not None:
            words = set(words)

        vectors = []

        with open(embedding_path, encoding="utf-8") as fp:

            row1 = fp.readline()
            # if the first row is not header
            if not re.match('^[0-9]+ [0-9]+$', row1):
                # seek to 0
                fp.seek(0)
            # otherwise ignore the header

            for line in fp:
                cols = line.rstrip().split(' ')
                word = cols[0]

                # skip word not in words if words are provided
                if words is not None and word not in words:
                    continue
                elif word not in self.word_dict:
                    self.word_dict[word] = len(self.word_dict)
                    vectors.append([float(v) for v in cols[1:]])

        print(len(self.word_dict))

        vectors = torch.tensor(vectors)
        if self.vectors is not None:
            self.vectors = torch.cat([self.vectors, vectors], dim=0)
        else:
            self.vectors = vectors

In [None]:
# Load pretrained word vectors, restricted to the collected vocabulary.
print(len(words))
# embedder = Embedding('../data/glove.840B.300d.txt', words)
# NOTE(review): the active line loads 'task1.model.txt' rather than the GloVe file
# above, and reads from '../data/' while the CSV files are read from './data/' —
# confirm the intended file and directory.
embedder = Embedding('../data/task1.model.txt', words)

In [None]:
import pickle

# Persist the whole Embedding object (vocabulary and vectors) with pickle.
with open('embedding.pkl','wb') as f:
    pickle.dump(embedder, f)

In [None]:
from tqdm import tqdm as tqdm

def label_to_onehot(labels):
    """Encode a whitespace-separated label string as a multi-hot list.

    Args:
        labels (str): Label names separated by whitespace. Known names are
            THEORETICAL, ENGINEERING, EMPIRICAL and OTHERS.

    Returns:
        list of int: Four entries (in the order above), 1 where the label is
        present and 0 elsewhere.

    Raises:
        KeyError: If `labels` contains an unknown label name.
    """
    class_names = ('THEORETICAL', 'ENGINEERING', 'EMPIRICAL', 'OTHERS')
    position_of = {name: position for position, name in enumerate(class_names)}

    encoded = [0] * len(class_names)
    for name in labels.split():
        encoded[position_of[name]] = 1
    return encoded
        
def sentence_to_indices(sentence, word_dict):
    """Map a sentence to the indices of its lemmas.

    Args:
        sentence (str): One sentence.
        word_dict: Object exposing `to_index(word)`.

    Returns:
        list of int: One index per lemma returned by `lemmatize_sentence`.
    """
    indices = []
    for lemma in lemmatize_sentence(sentence):
        indices.append(word_dict.to_index(lemma))
    return indices
    
def get_dataset(data_path, word_dict, n_workers=4):
    """Read a CSV file and preprocess its rows in parallel.

    The rows are cut into `n_workers` contiguous slices of equal size (the
    last slice also takes the remainder); each slice is handed to
    `preprocess_samples` in a worker process.

    Args:
        data_path (str): Path to the data.
        word_dict: Object forwarded to `preprocess_samples`.
        n_workers (int): Number of worker processes and slices.

    Returns:
        list: Processed samples, in the original row order.
    """
    frame = pd.read_csv(data_path, dtype=str)

    pending = []
    with Pool(processes=n_workers) as pool:
        total = len(frame)
        chunk = total // n_workers
        for worker in range(n_workers):
            start = chunk * worker
            # The final worker picks up whatever the integer division left over.
            stop = total if worker == n_workers - 1 else chunk * (worker + 1)
            pending.append(
                pool.apply_async(preprocess_samples,
                                 args=(frame[start:stop], word_dict)))

        pool.close()
        pool.join()

    samples = []
    for job in pending:
        samples.extend(job.get())
    return samples

def preprocess_samples(dataset, word_dict):
    """Worker function: preprocess every row of a DataFrame slice.

    Args:
        dataset: Object providing `iterrows()` and `len()` (a DataFrame slice
            when called from `get_dataset`).
        word_dict: Object forwarded to `preprocess_sample`.

    Returns:
        list of dict: One processed sample per row.
    """
    rows = tqdm(dataset.iterrows(), total=len(dataset))
    return [preprocess_sample(row, word_dict) for _, row in rows]

def preprocess_sample(data, word_dict):
    """Convert one row into index sequences plus an optional label.

    Args:
        data: Mapping-like row with an 'Abstract' entry (sentences joined by
            '$$$') and optionally a 'Task 2' entry.
        word_dict: Object forwarded to `sentence_to_indices`.

    Returns:
        dict: 'Abstract' holds one index list per sentence; 'Label' holds the
        multi-hot label and is present only when the row has 'Task 2'.
    """
    sentences = data['Abstract'].split('$$$')
    sample = {
        'Abstract': [sentence_to_indices(sentence, word_dict)
                     for sentence in sentences],
    }
    if 'Task 2' in data:
        sample['Label'] = label_to_onehot(data['Task 2'])

    return sample

In [None]:
# Convert each CSV split into lists of index sequences (4 worker processes per split).
# The Embedding object is passed as `word_dict`; its to_index() does the lookup.
print('[INFO] Start processing trainset...')
train = get_dataset('trainset.csv', embedder, n_workers=4)
print('[INFO] Start processing validset...')
valid = get_dataset('validset.csv', embedder, n_workers=4)
print('[INFO] Start processing testset...')
test = get_dataset('testset.csv', embedder, n_workers=4)

## Data packing

In [None]:
from torch.utils.data import Dataset
import torch

class AbstractDataset(Dataset):
    """Dataset of preprocessed abstracts with a padding collate function.

    Args:
        data (list of dict): Samples with an 'Abstract' entry (list of index
            lists, one per sentence) and optionally a 'Label' entry.
        pad_idx (int): Index used to pad sentences and abstracts.
        max_len (int): Upper bound on the number of tokens kept per sentence.
    """

    def __init__(self, data, pad_idx, max_len = 500):
        self.data = data
        self.pad_idx = pad_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def collate_fn(self, datas):
        """Pad a list of samples into rectangular batch tensors.

        Returns:
            tuple: (LongTensor of shape [batch, sentences, tokens],
            FloatTensor of the gathered labels, list with the true number of
            sentences of every sample).
        """
        # Batch-wide targets: the most sentences in any abstract and the
        # longest sentence, capped at self.max_len.
        most_sentences = max(len(sample['Abstract']) for sample in datas)
        width = max(min(len(sentence), self.max_len)
                    for sample in datas
                    for sentence in sample['Abstract'])

        abstracts, labels, sentence_counts = [], [], []
        for sample in datas:
            # Truncate long sentences and right-pad short ones to `width`
            # (a negative pad count yields an empty list).
            rows = [
                sentence[:width] + [self.pad_idx] * (width - len(sentence))
                for sentence in sample['Abstract']
            ]
            sentence_counts.append(len(rows))

            # Append all-padding sentences until every abstract has the same
            # number of sentences.
            missing = most_sentences - len(rows)
            rows.extend([self.pad_idx] * width for _ in range(missing))
            abstracts.append(rows)

            if 'Label' in sample:
                labels.append(sample['Label'])

        return (torch.LongTensor(abstracts),
                torch.FloatTensor(labels),
                sentence_counts)

In [None]:
# Pad with the index the embedder actually assigned to '<pad>'. Embedding appends
# '<pad>' after the words loaded from the embedding file, so the constant
# PAD_TOKEN (0) would select a real word's vector instead of the zero vector.
pad_index = embedder.to_index('<pad>')
trainData = AbstractDataset(train, pad_index, max_len = 2048)
validData = AbstractDataset(valid, pad_index, max_len = 2048)
testData = AbstractDataset(test, pad_index, max_len = 2048)

# Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class simpleNet(nn.Module):
    """Two-stage bidirectional GRU classifier producing four sigmoid outputs.

    Token indices are embedded with frozen pretrained vectors, passed through
    a first GRU, max-pooled per sentence, passed through a second GRU over the
    sentence vectors, max-pooled again and classified by two linear layers.

    Args:
        embedder: Object providing `get_vocabulary_size()`, `get_dim()` and a
            `vectors` tensor used as the embedding weights.
    """
    def __init__(self, embedder):
        super(simpleNet, self).__init__()

        self.hidden_dim = 256

        # Embedding table initialized from the pretrained vectors and frozen.
        self.embedding = nn.Embedding(embedder.get_vocabulary_size(), embedder.get_dim())
        self.embedding.weight = torch.nn.Parameter(embedder.vectors)
        self.embedding.weight.requires_grad = False

        # Token-level encoder; bidirectional, so its output width is 2 * hidden_dim.
        self.sent_rnn1 = nn.GRU(embedder.get_dim(),
                                self.hidden_dim,
                                num_layers = 2,
                                # dropout = 0.3,
                                bidirectional=True,
                                batch_first=True)
        
        # Sentence-level encoder over the pooled outputs of sent_rnn1.
        self.sent_rnn2 = nn.GRU(self.hidden_dim * 2,
                                self.hidden_dim,
                                num_layers = 2,
                                # dropout = 0.5,
                                bidirectional=True,
                                batch_first=True)
        
        self.sent_dropout = nn.Dropout(0.2)
        # NOTE(review): self.dropout is created but never applied in forward().
        self.dropout = nn.Dropout(0.5)

        self.l1 = nn.Linear(self.hidden_dim*2, self.hidden_dim)
        self.l2 = nn.Linear(self.hidden_dim, 4)

        # Custom init is applied only to the `*_l0` parameters named below; every
        # other GRU parameter keeps the initialization given by nn.GRU.
        nn.init.orthogonal_(self.sent_rnn1.weight_ih_l0)
        nn.init.orthogonal_(self.sent_rnn1.weight_hh_l0)

        nn.init.zeros_(self.sent_rnn1.bias_ih_l0)
        nn.init.zeros_(self.sent_rnn1.bias_hh_l0)

        nn.init.orthogonal_(self.sent_rnn2.weight_ih_l0)
        nn.init.orthogonal_(self.sent_rnn2.weight_hh_l0)

        nn.init.zeros_(self.sent_rnn2.bias_ih_l0)
        nn.init.zeros_(self.sent_rnn2.bias_hh_l0)

        # Zero weights and bias make l2 output 0, i.e. sigmoid(0) = 0.5, before training.
        nn.init.zeros_(self.l2.weight)
        nn.init.zeros_(self.l2.bias)

    def forward(self, x):
        """Compute the four per-class scores for a batch.

        Args:
            x: Index tensor of rank 3; it is unpacked below as (b, s, w) after
                embedding — presumably (batch, sentences, tokens) as produced
                by AbstractDataset.collate_fn.

        Returns:
            Tensor of shape (b, 4) with values in (0, 1).
        """
        x = self.embedding(x)

        b,s,w,e = x.shape
        # Flatten sentences and tokens into one sequence of length s*w per sample,
        # so the first GRU runs across sentence boundaries (padding included).
        x = x.view(b,s*w,e)
        x = self.sent_dropout(x)
        x, __ = self.sent_rnn1(x)
        x = x.view(b,s,w,-1)

        # Max over the token axis: one vector per sentence.
        x = torch.max(x,dim=2)[0]

        x, __ = self.sent_rnn2(x)

        # Max over the sentence axis: one vector per sample.
        x = torch.max(x,dim=1)[0]

        x = torch.relu(self.l1(x))

        x = torch.sigmoid(self.l2(x))

        return x

# Training

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class F1():
    """Micro-averaged F1 score accumulated over batches.

    Counts are summed over all samples and labels: predicted positives,
    ground-truth positives and correctly predicted positives.

    Args:
        threshold (float): Scores strictly above this value count as positive
            predictions.
    """

    def __init__(self, threshold=0.3):
        self.threshold = threshold
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0
        self.name = 'F1'

    def reset(self):
        """Clear all accumulated counts."""
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0

    def update(self, predicts, groundTruth):
        """Add one batch to the running counts.

        Args:
            predicts (torch.Tensor): Predicted scores.
            groundTruth (torch.Tensor): 0/1 targets with the same shape.
        """
        predicts = predicts > self.threshold
        self.n_precision += torch.sum(predicts).data.item()
        self.n_recall += torch.sum(groundTruth).data.item()
        self.n_corrects += torch.sum(groundTruth.type(torch.uint8) * predicts).data.item()

    def get_score(self):
        """Return the F1 score of everything accumulated so far.

        Returns 0.0 instead of raising ZeroDivisionError when no positive
        label (or nothing at all) has been seen yet.
        """
        eps = 1e-20
        # The same epsilon guards all three divisions; previously only the
        # recall denominator was unguarded.
        recall = self.n_corrects / (self.n_recall + eps)
        precision = self.n_corrects / (self.n_precision + eps)
        return 2 * (recall * precision) / (recall + precision + eps)

    def print_score(self):
        """Return the score formatted with five decimals (nothing is printed)."""
        score = self.get_score()
        return '{:.5f}'.format(score)


In [None]:
import os
def _run_epoch(epoch, training):
    """Run one pass over the training or validation data.

    Relies on the module-level `model`, `opt`, `history`, `trainData` and
    `validData`. A summary ({'f1', 'loss'}) is appended to
    `history['train']` or `history['valid']`.

    Args:
        epoch (int): Epoch number; accepted but not used inside the function.
        training (bool): True to train on `trainData` (shuffled, with
            optimizer steps), False to evaluate on `validData`.

    Returns:
        float or None: Mean batch loss for a validation pass; None for a
        training pass.
    """
    model.train(training)
    if training:
        description = 'Train'
        dataset = trainData
        shuffle = True
    else:
        description = 'Valid'
        dataset = validData
        shuffle = False
    dataloader = DataLoader(dataset=dataset,
                            batch_size=64,
                            shuffle=shuffle,
                            collate_fn=dataset.collate_fn,
                            num_workers=4)

    progress = tqdm(enumerate(dataloader), total=len(dataloader), desc=description)
    loss = 0
    f1_score = F1()
    # Autograd is only needed when training; disabling it for validation
    # avoids building a graph for every batch.
    with torch.set_grad_enabled(bool(training)):
        for i, (x, y, _sent_len) in progress:
            o_labels, batch_loss = _run_iter(x, y)
            if training:
                opt.zero_grad()
                batch_loss.backward()
                opt.step()

            loss += batch_loss.item()
            f1_score.update(o_labels.cpu(), y)

            progress.set_postfix(
                loss=loss / (i + 1), f1=f1_score.print_score())

    summary = {'f1': f1_score.get_score(), 'loss': loss / len(progress)}
    if training:
        history['train'].append(summary)
    else:
        history['valid'].append(summary)
        return summary['loss']

def _run_iter(x,y):
    """Forward one batch through the global `model` and compute its loss.

    Args:
        x: Batch of inputs; moved to `device` before the forward pass.
        y: Batch of targets; moved to `device` before the loss is computed.

    Returns:
        tuple: (model outputs, loss from the global `criteria`).
    """
    inputs, targets = x.to(device), y.to(device)
    outputs = model(inputs)
    return outputs, criteria(outputs, targets)

def save(epoch):
    """Write the model weights for `epoch` and the training history to 'model/'.

    The weights go to 'model/model.pkl.<epoch>'; the global `history` is
    rewritten to 'model/history.json' on every call.
    """
    checkpoint_dir = 'model'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    weights_path = 'model/model.pkl.{}'.format(epoch)
    torch.save(model.state_dict(), weights_path)

    with open('model/history.json', 'w') as history_file:
        history_file.write(json.dumps(history, indent=4))

In [None]:
from torch.utils.data import DataLoader
from tqdm import trange
import json
model = simpleNet(embedder)
# NOTE(review): weight_decay=0.8 looks unusually large — confirm it is intended.
opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.8)
# Cosine schedule with warm restarts; the second argument (10) is the length of the
# first cycle, counted in scheduler.step() calls.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opt, 10)
# Binary cross-entropy on probabilities; the model already applies a sigmoid.
criteria = torch.nn.BCELoss()
model.to(device)
max_epoch = 60
# Per-epoch metrics appended by _run_epoch and written to disk by save().
history = {'train':[],'valid':[]}

In [None]:
# One training pass and one validation pass per epoch; the learning rate is
# stepped and a checkpoint is written after every epoch.
for epoch in range(max_epoch):
    print('Epoch: {}'.format(epoch))
    # Learning rate currently in effect for the first parameter group.
    print('%e'%opt.param_groups[0]['lr'])
    _run_epoch(epoch, True)
    _run_epoch(epoch, False)
    scheduler.step()
    save(epoch)

## Predict

In [None]:
# NOTE(review): save() writes checkpoints as 'model/model.pkl.<epoch>'; the path
# below looks like a manually renamed file — confirm that it exists.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('model/0.6997651795', map_location=device))
model.train(False)
# Sanity check of the loaded weights on the validation set (this also appends
# one entry to history['valid']).
_run_epoch(1, False)
dataloader = DataLoader(dataset=testData,
                        batch_size=128,
                        shuffle=False,
                        collate_fn=testData.collate_fn,
                        num_workers=4)
# Named `progress` so the `trange` imported from tqdm is not overwritten.
progress = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
prediction = []
# Inference only: without autograd no graph is kept alive for each batch.
with torch.no_grad():
    for i, (x, y, _l) in progress:
        o_labels = model(x.to(device))
        # Same decision threshold as the F1 metric used during training.
        o_labels = o_labels > 0.3
        prediction.append(o_labels.to('cpu'))

prediction = torch.cat(prediction).detach().numpy().astype(int)

In [None]:
def SubmitGenerator(prediction, sampleFile, public=True, filename='prediction.csv'):
    """Write predictions to a CSV file shaped like the sample submission.

    The sample file supplies the `order_id` column. When it has more rows than
    `prediction`, the missing rows are filled with zeros: after the
    predictions if `public` is true, before them otherwise.

    Args:
        prediction (numpy array): 0/1 matrix with four columns in the order
            THEORETICAL, ENGINEERING, EMPIRICAL, OTHERS.
        sampleFile (str): Path of the sample submission CSV.
        public (boolean): Whether the predictions belong at the top of the file.
        filename (str): Path of the CSV file to write.
    """
    sample = pd.read_csv(sampleFile)
    filler = [0] * (len(sample) - prediction.shape[0])

    columns = {'order_id': list(sample.order_id.values)}
    label_names = ('THEORETICAL', 'ENGINEERING', 'EMPIRICAL', 'OTHERS')
    for position, label in enumerate(label_names):
        predicted = list(prediction[:, position])
        columns[label] = predicted + filler if public else filler + predicted

    pd.DataFrame.from_dict(columns).to_csv(filename, index=False)

In [None]:
# Write the public-test predictions; public=True places them at the top of the
# file and zero-fills any remaining rows of the sample submission.
SubmitGenerator(prediction, 
                './task2_sample_submission.csv',
                True, 
                './task2_submission.csv')

# Plot

In [None]:
import json
import matplotlib.pyplot as plt
%matplotlib inline

# Reload the metrics that save() wrote during training.
with open('model/history.json', 'r') as f:
    history = json.load(f)

train_loss = [entry['loss'] for entry in history['train']]
valid_loss = [entry['loss'] for entry in history['valid']]
train_f1 = [entry['f1'] for entry in history['train']]
valid_f1 = [entry['f1'] for entry in history['valid']]


def _plot_curves(title, train_values, valid_values):
    """Draw the train and validation curves of one metric in a new figure."""
    plt.figure(figsize=(7,5))
    plt.title(title)
    plt.plot(train_values, label='train')
    plt.plot(valid_values, label='valid')
    plt.legend()
    plt.show()


_plot_curves('Loss', train_loss, valid_loss)
_plot_curves('F1 Score', train_f1, valid_f1)

# Pairs are compared by F1 first, so the maximum is [best F1, its epoch index].
print('Best F1 score ', max([score, idx] for idx, score in enumerate(valid_f1)))

In [None]:
from google.colab import files
# Hand the generated submission file to the Colab download helper.
files.download('./task2_submission.csv') 