In [1]:
cd ..

/home/kevin/Documents/github/article-tagging/lib


In [2]:
import tagnews
import pandas as pd

In [3]:
# Prerequisite: download (and extract, if needed) a pre-trained GloVe vector
# file from https://github.com/stanfordnlp/GloVe and place it in tagnews/data/.
glove_path = 'tagnews/data/glove.6B.50d.txt'
glove = tagnews.load_glove(glove_path)

In [4]:
# Prerequisite: download (and extract, if needed) the entity-annotated NER
# corpus from
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data
# and place it in tagnews/data/.
ner_data_dir = 'tagnews/data/'
ner = tagnews.load_ner_data(ner_data_dir)

b'Skipping line 281837: expected 25 fields, saw 34\n'
  mask |= (ar1 == a)


In [5]:
# Look up the GloVe vector of every lower-cased token and append the vector
# components as extra columns to the right of the existing `ner` columns.
#
# `reindex` is used instead of `glove.loc[...]`: `.loc` with a list containing
# labels that are missing from the index raises KeyError on pandas >= 1.0,
# whereas `reindex` returns an all-NaN row for each out-of-vocabulary token.
#
# NOTE: not idempotent -- re-running this cell appends the columns again.
# NOTE(review): assumes `glove` has a unique index and `ner` has a default
# 0..n-1 index so the rows line up in the concat -- confirm against the
# tagnews loaders.
word_vectors = glove.reindex(ner['word'].str.lower())
ner = pd.concat([ner, pd.DataFrame(word_vectors.values)], axis='columns')

In [6]:
# Replace every missing value in the frame with 0.0 (rebinding `ner` rather
# than mutating it in place).
ner = ner.fillna(0.0)

In [7]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Flatten
from keras.utils import to_categorical
from keras_tqdm import TQDMNotebookCallback
import numpy as np

Using TensorFlow backend.
  return f(*args, **kwds)


In [8]:
# Model/data hyper-parameters:
#   data_dim    -- number of features per token fed to the network
#   timesteps   -- window length used only to build the training/validation
#                  batches; prediction can take sequences of arbitrary length
#   num_classes -- number of output classes per token
data_dim, timesteps, num_classes = 50, 25, 2

In [9]:
# Row position separating training data (first 19/20 of the rows) from
# validation data (the remaining 1/20).
train_val_split = int(len(ner) * 19 / 20.)

In [10]:
# Cut the training rows [0, train_val_split) into consecutive, non-overlapping
# windows of `timesteps` rows each.
# The `+ 1` on the stop keeps the last full window when the training portion
# is an exact multiple of `timesteps`; every window still ends at or before
# `train_val_split`, so nothing leaks into the validation rows.
ner_train_idxs = range(0, train_val_split - timesteps + 1, timesteps)

# Features: every column from position 3 onward; result has shape
# (n_windows, timesteps, n_feature_columns).
x_train = np.array([ner.iloc[start:start + timesteps, 3:].values
                    for start in ner_train_idxs])

# Labels: the column at position 2, one-hot encoded per token. `num_classes`
# is used instead of a literal so it stays in sync with the model's output.
y_train = np.array([to_categorical(ner.iloc[start:start + timesteps, 2].values,
                                   num_classes)
                    for start in ner_train_idxs])

In [11]:
# Cut the validation rows [train_val_split, len(ner)) into consecutive,
# non-overlapping windows of `timesteps` rows each.
# The `+ 1` on the stop keeps the last full window when the validation portion
# is an exact multiple of `timesteps`; no window runs past the end of `ner`.
ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps + 1, timesteps)

# Features: every column from position 3 onward; result has shape
# (n_windows, timesteps, n_feature_columns).
x_val = np.array([ner.iloc[start:start + timesteps, 3:].values
                  for start in ner_val_idxs])

# Labels: the column at position 2, one-hot encoded per token. `num_classes`
# is used instead of a literal so it stays in sync with the model's output.
y_val = np.array([to_categorical(ner.iloc[start:start + timesteps, 2].values,
                                 num_classes)
                  for start in ner_val_idxs])

In [12]:
# Sequence tagger: two stacked LSTMs that emit one output per timestep,
# followed by a per-timestep softmax over the classes.
# The time dimension of `input_shape` is None so the model accepts sequences
# of any length.
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))
model.add(LSTM(8, return_sequences=True))
# Output width follows `num_classes` rather than a hard-coded 2.
model.add(TimeDistributed(Dense(num_classes, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])
# `summary` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary(line_length=100)

____________________________________________________________________________________________________
Layer (type)                                 Output Shape                            Param #        
lstm_1 (LSTM)                                (None, None, 32)                        10624          
____________________________________________________________________________________________________
lstm_2 (LSTM)                                (None, None, 8)                         1312           
____________________________________________________________________________________________________
time_distributed_1 (TimeDistributed)         (None, None, 2)                         18             
Total params: 11,954
Trainable params: 11,954
Non-trainable params: 0
____________________________________________________________________________________________________
None


In [13]:
# Train for three epochs, evaluating on the held-out windows after each one.
# Keras' own console logging is silenced (verbose=0); progress is reported by
# the tqdm notebook callback instead.
progress_callback = TQDMNotebookCallback(leave_inner=True)
model.fit(
    x_train,
    y_train,
    epochs=3,
    validation_data=(x_val, y_val),
    verbose=0,
    callbacks=[progress_callback],
)




<keras.callbacks.History at 0x7fb087d109e8>

In [14]:
# # Alternative approach, uses generator to save on memory, but is about 35x slower.

# def train_data_gen(batch_size):
#     while True:
#         idxs = np.random.randint(train_val_split-timesteps, size=batch_size)
#         x_train = np.array([ner.iloc[i:i+timesteps, 3:].values for i in idxs])
#         y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2) for i in idxs])
#         yield (x_train, y_train)

# batch_size = 64
# model.fit_generator(train_data_gen(batch_size=batch_size),
#                     steps_per_epoch=int(train_val_split / batch_size),
#                     epochs=15,
#                     max_queue_size=50,
#                     validation_data=(x_val, y_val),
#                     verbose=0,
#                     callbacks=[TQDMNotebookCallback(leave_inner=True)])

In [15]:
# Spot-check: run the model over one slice of rows and show the first three
# columns next to the predicted probability of class 1.
idx = slice(501, 550)

preview_rows = ner.iloc[idx, :3].reset_index(drop=True)

# The model expects a leading batch axis, so wrap the slice as a batch of one.
preview_batch = np.expand_dims(ner.iloc[idx, 3:].values, 0)
# [0] drops the batch axis; [:, 1:] keeps only the class-1 column (still 2-D).
preview_probs = pd.DataFrame(model.predict(preview_batch)[0][:, 1:],
                             columns=['prob_geloc'])

pd.concat([preview_rows, preview_probs], axis='columns')

Unnamed: 0,word,all_tags,tag,prob_geloc
0,said,O,False,0.00054
1,it,O,False,4e-05
2,will,O,False,0.00011
3,go,O,False,5.4e-05
4,ahead,O,False,0.000454
5,with,O,False,3.5e-05
6,a,O,False,2e-05
7,reconciliation,O,False,0.001117
8,conference,O,False,0.000635
9,to,O,False,3e-05


In [16]:
# Prerequisite: download the validation data (validation.txt) from
# https://geo-extract-tester.herokuapp.com/ into the working directory.
# The whole file is read into the string `s`.
with open('validation.txt', encoding='utf-8') as validation_file:
    s = validation_file.read()

In [17]:
# One token per non-empty line of the validation text. Tokens are lower-cased
# before the lookup, consistent with how the training tokens were looked up
# in `glove` earlier in this notebook.
validation_words = [w.lower() for w in s.split('\n') if w]

# `reindex` rather than `glove.loc[...]`: `.loc` with a list containing labels
# missing from the index raises KeyError on pandas >= 1.0, while `reindex`
# yields NaN rows for unknown tokens, which are then zero-filled.
# NOTE(review): assumes `glove` has a unique index -- confirm against
# tagnews.load_glove.
gloved_data = glove.reindex(validation_words).fillna(0)

# Predict in chunks of `glove_time_size` tokens (the last chunk may be shorter).
# Each chunk is wrapped as a batch of one; [0] drops the batch axis and [:, 1]
# keeps the class-1 probability of every token.
glove_time_size = 100
preds_batched = [
    model.predict(
        np.expand_dims(gloved_data.iloc[start:start + glove_time_size].values,
                       axis=0)
    )[0][:, 1]
    for start in range(0, len(gloved_data), glove_time_size)
]

In [18]:
# Write the predicted probabilities to guesses.txt, one per line, in the same
# order as the chunks (and the tokens inside each chunk) were predicted.
with open('guesses.txt', 'w') as guesses_file:
    for chunk_probs in preds_batched:
        guesses_file.writelines(str(prob) + '\n' for prob in chunk_probs)

Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!

This model should achieve an AUC of about 80-85%.