In [1]:
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the dataset, relative to this notebook.
data_path = "../data/"
csv_file = data_path + "ner_dataset.csv"

# Load the token-level table; the file is decoded as latin1.
df = pd.read_csv(csv_file, encoding="latin1")

# Quick look: number of rows, then the first five rows.
print(df.shape[0])
print(df.head())

1048575
    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


# Data Preprocessing

In [3]:
# Number of distinct sentences, words and tags, printed on one line.
print(*(df[column].nunique() for column in ("Sentence #", "Word", "Tag")))

# Number of rows carrying each tag, shown as a two-column table (Tag, counts).
tag_counts = df.groupby("Tag").size()
print(tag_counts.reset_index(name="counts"))

47959 35177 17
      Tag  counts
0   B-art     402
1   B-eve     308
2   B-geo   37644
3   B-gpe   15870
4   B-nat     201
5   B-org   20143
6   B-per   16990
7   B-tim   20333
8   I-art     297
9   I-eve     253
10  I-geo    7414
11  I-gpe     198
12  I-nat      51
13  I-org   16784
14  I-per   17251
15  I-tim    6528
16      O  887908


In [4]:
# Only the first row of each sentence carries a "Sentence #" label (the rows
# after it are NaN in the head() output), so propagate the last seen value
# downwards. The fill is applied to every column, not only "Sentence #".
# DataFrame.ffill() is the replacement for fillna(method="ffill"), which is
# deprecated in pandas and emits a FutureWarning.
df = df.ffill()

# Sanity check: this prints False when no missing value is left.
print(df.isnull().values.any())
print(df.tail(10))

False
              Sentence #       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


  df = df.fillna(method="ffill") # fill NaN with previous value


In [5]:
# Lowercase every token so that case variants share one vocabulary entry.
lowered_words = df.Word.str.lower()
df["Word"] = lowered_words

# Distinct sentences / words / tags after lowercasing.
print(*(df[column].nunique() for column in ("Sentence #", "Word", "Tag")))

47959 31817 17


In [7]:
def func(temp):
    """Return the (word, tag) pairs of one sentence, in row order.

    `temp` is the slice of `df` that belongs to a single sentence; only its
    "Word" and "Tag" columns are read.
    """
    return list(zip(temp["Word"].tolist(), temp["Tag"].tolist()))


# One list of (word, tag) tuples per sentence.
# Iterating over the groups (instead of groupby.apply) avoids handing the
# grouping column to the callback, which newer pandas versions deprecate, and
# yields the groups in the same order as apply did.
# Note: groupby sorts its keys, and the keys are strings, so sentences come
# out in lexicographic order ("Sentence: 1", "Sentence: 10", ...), not in
# file order.
tagged_sentences = [func(group) for _, group in df.groupby("Sentence #")]

In [8]:
# Peek at the first entry: a list of (word, tag) pairs for one sentence.
first_tagged_sentence = tagged_sentences[0]
print(first_tagged_sentence)

[('thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('london', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('british', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [14]:
# Unzip every tagged sentence into a (words, tags) pair of tuples ...
unzipped = [tuple(zip(*pairs)) for pairs in tagged_sentences]

# ... then collect the two halves into parallel lists of lists.
sentences = [list(words) for words, _ in unzipped]
tags = [list(labels) for _, labels in unzipped]

# Show the first sentence next to its tag sequence.
print(sentences[0])
print(tags[0])

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [15]:
# Token count of every sentence, computed once and reused for both statistics.
sentence_lengths = [len(s) for s in sentences]

max_sentence_len = max(sentence_lengths)
avg_sentence_len = sum(sentence_lengths) / len(sentence_lengths)

summary = "max sentence length: {}, average sentence length: {}"
print(summary.format(max_sentence_len, avg_sentence_len))

max sentence length: 104, average sentence length: 21.863987989741236


# Tokenizing

In [21]:
# Word tokenizer: "OOV" is registered as the out-of-vocabulary token.
sen_tokenizer = Tokenizer(oov_token="OOV")
sen_tokenizer.fit_on_texts(sentences)
X_data = sen_tokenizer.texts_to_sequences(sentences)

# Tag tokenizer: lower=False keeps the tags in their original case.
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)
y_data = tag_tokenizer.texts_to_sequences(tags)

# Sizes are one larger than the number of indexed entries.
vocab_size = 1 + len(sen_tokenizer.word_index)
tag_size = 1 + len(tag_tokenizer.word_index)

# Index of the OOV token, then the two sizes, one value per line.
for value in (sen_tokenizer.word_index["OOV"], vocab_size, tag_size):
    print(value)

# The first sentence and its tags as integer sequences.
print(X_data[0])
print(y_data[0])

1
31819
18
[254, 6, 967, 16, 1795, 238, 468, 7, 523, 2, 129, 5, 61, 9, 571, 2, 833, 6, 186, 90, 22, 15, 56, 3]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1]


In [19]:
max_len = 70 # hyperparameter

# Bring every sequence to exactly max_len entries; shorter ones get trailing
# zeros (padding="post"). X and y are padded with identical settings so that
# words and tags stay aligned position by position.
# NOTE(review): the longest sentence reported earlier in the notebook is 104
# tokens, so some sentences are truncated here; pad_sequences' `truncating`
# argument is left at its default - confirm that is the intended side.
X_data_pad = pad_sequences(X_data, padding="post", maxlen=max_len)
y_data_pad = pad_sequences(y_data, padding="post", maxlen=max_len)

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data_pad, y_data_pad, test_size=0.2, random_state=0
)

# Pin the one-hot width to tag_size. Without num_classes, to_categorical
# infers the width from the largest id present in the array it is given, so a
# split that happens to lack the highest tag id would be encoded narrower
# than the other split.
y_train_encod = to_categorical(y_train, num_classes=tag_size)
y_test_encod = to_categorical(y_test, num_classes=tag_size)

print(X_train.shape, y_train_encod.shape)
print(X_test.shape, y_test_encod.shape)

(38367, 70) (38367, 70, 18)
(9592, 70) (9592, 70, 18)
