In [18]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import nltk
from tqdm import tqdm
import pickle
import gzip
flatten = lambda l: [item for sublist in l for item in sublist]
from sklearn_crfsuite import metrics
random.seed(1994)

In [4]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [5]:
from pprint import pprint

In [6]:
print(torch.__version__)

0.4.1


In [7]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(0)

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

False


In [8]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [9]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))


def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

def prepare_tag(tag, tag2index):
    return Variable(LongTensor([tag2index[tag]]))

In [10]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

with open("Harry Potter txt/Harry Potter 1 - Sorcerer's Stone.txt",'r', encoding = 'cp1252') as content_file:
    all_text = content_file.read()
    print(len(all_text))
    doc = nlp(all_text)

442745


In [None]:
# if the pretrained list of named entity is not used, generate the list manually
data_list = []
for j in range(0, len(sentences)):
    entities = [(str(x), x.label_) for x in nlp(str(sentences[j])).ents]
    my_list = []
    for i in range(0,len(nlp(str(sentences[j])))):
        if str(nlp(str(sentences[j][i]))) not in str(nlp(str(sentences[j])).ents):
            my_list.append((str(nlp(str(sentences[j][i]))), 'O'))
        else:
            for k, v in entities:
                if str(nlp(str(sentences[j][i]))) == k:
                    my_list.append((k, v))
    data_list.append(list(zip(*my_list)))

In [19]:
# if the pretrained list of named entity is used, load the premade list

def load(file_name):
    # load the model
    stream = gzip.open(file_name, "rb")
    model = pickle.load(stream)
    stream.close()
    return model


def save(file_name, model):
    # save the model
    stream = gzip.open(file_name, "wb")
    pickle.dump(model, stream)
    stream.close()
data_list = load("data_list")

In [11]:
sentences = [x for x in doc.sents]
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent', minify=True)

In [12]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[20])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('For', 'ADP', 'for'),
 ('second', 'NOUN', 'second'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Dursley', 'PROPN', 'dursley'),
 ("n't", 'ADV', 'not'),
 ('realize', 'VERB', 'realize'),
 ('seen', 'VERB', 'see'),
 ('jerked', 'VERB', 'jerk'),
 ('head', 'NOUN', 'head'),
 ('look', 'VERB', 'look')]

In [13]:
[(str(x), x.label_) for x in nlp(str(sentences[20])).ents]

[('second', 'ORDINAL'), ('Dursley', 'PERSON')]

In [21]:
labels = [x.label_ for x in doc.ents]
pprint(Counter(labels).most_common(3))

[('PERSON', 3189), ('ORG', 950), ('GPE', 572)]


In [22]:
items = [x.text for x in doc.ents]
pprint(Counter(items).most_common(5))

[('Harry', 1278),
 ('Ron', 424),
 ('Hagrid', 239),
 ('Hermione', 202),
 ('Snape', 148)]


In [23]:
vocab = list(set(items))

In [24]:
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for word in vocab:
    if word2index.get(word) is None:
        word2index[word] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

In [25]:
WINDOW_SIZE = 2
windows = []

In [26]:
data_list = [x for x in data_list if x!= []]
[i for i, x in enumerate(data_list) if x == []]

[]

In [27]:
for sample in data_list:
    dummy = ['<DUMMY>'] * WINDOW_SIZE
    window = list(nltk.ngrams(dummy + list(sample[0]) + dummy,5))
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])

In [28]:
len(windows)

94163

In [29]:
windows[99]

[['<DUMMY>', 'Mrs.', 'Dursley', 'was', 'thin'], 'PERSON']

In [30]:
random.shuffle(windows)

train_data = windows[:int(len(windows) * 0.9)]
test_data = windows[int(len(windows) * 0.9):]

In [31]:
class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h2 = nn.Linear(hidden_size, hidden_size)
        self.o = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim = 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self,inputs, is_training = False):
        embeds = self.embed(inputs)
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2))
        h_0 = self.relu(self.h1(concated))
        if is_training:
            h_0 = self.dropout(h_0)
        h_1 = self.relu(self.h2(h_0))
        if is_training:
            h_1 = self.dropout(h_1)
        out = self.softmax(self.o(h_1))
        return out

In [32]:
_, tags = list(zip(*train_data))
tag2index = {v:k for k, v in enumerate(list(set(tags)))}
index2tag = {k:v for v, k in tag2index.items()}

In [33]:
BATCH_SIZE = 128
EMBEDDING_SIZE = 50 # x (WINDOW_SIZE*2+1) = 250
HIDDEN_SIZE = 300
EPOCH = 3
LEARNING_RATE = 0.001

In [34]:
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
if USE_CUDA:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [35]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x,y=list(zip(*batch))
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        model.zero_grad()
        preds = model(inputs, is_training=True)
    
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist())
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/3] mean_loss : 2.76
[0/3] mean_loss : 0.25
[0/3] mean_loss : 0.11
[0/3] mean_loss : 0.09
[0/3] mean_loss : 0.09
[0/3] mean_loss : 0.08
[0/3] mean_loss : 0.08
[1/3] mean_loss : 0.10
[1/3] mean_loss : 0.08
[1/3] mean_loss : 0.07
[1/3] mean_loss : 0.08
[1/3] mean_loss : 0.07
[1/3] mean_loss : 0.07
[1/3] mean_loss : 0.07
[2/3] mean_loss : 0.11
[2/3] mean_loss : 0.07
[2/3] mean_loss : 0.06
[2/3] mean_loss : 0.07
[2/3] mean_loss : 0.07
[2/3] mean_loss : 0.06
[2/3] mean_loss : 0.07


In [36]:
for_f1_score = []

In [37]:
accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)

    i = model(input_).max(1)[1]
    pred = index2tag[i.data.tolist()[0]]
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

print(accuracy/len(test_data) * 100)


97.71689497716895


High accuracy is due to large number of words with tag "O", need to look at other evaluation matrics. 

As an alternative, one can do random undersampling to reduce the number of instances of majority class 

In [38]:
y_pred, y_test = list(zip(*for_f1_score))
sorted_labels = sorted(
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)


In [39]:
y_pred = [[y] for y in y_pred] # this is because sklearn_crfsuite.metrics function flatten inputs
y_test = [[y] for y in y_test]

print(metrics.flat_classification_report(
    y_test, y_pred, labels = sorted_labels, digits=3
))

             precision    recall  f1-score   support

        FAC      0.000     0.000     0.000         1
   CARDINAL      0.773     0.872     0.819        39
       DATE      0.000     0.000     0.000        11
        LAW      0.000     0.000     0.000         1
     PERSON      0.625     0.868     0.727       219
       TIME      0.000     0.000     0.000        12
        LOC      0.769     1.000     0.870        10
       NORP      0.000     0.000     0.000         7
        GPE      0.707     0.774     0.739        53
    ORDINAL      0.923     0.923     0.923        13
        ORG      0.648     0.639     0.644       147
    PRODUCT      0.000     0.000     0.000         2

avg / total      0.620     0.740     0.671       515



  'precision', 'predicted', average, warn_for)
