In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [5]:
# Load the token-level NER table (one row per word, with its POS and entity tag).
data = pd.read_csv('../input/ner-dataset/ner_datasetreference.csv', encoding='latin1')
# Forward-fill missing cells with the last seen value in each column.
# NOTE(review): presumably the raw CSV only fills "Sentence #" on the first token
# of each sentence -- confirm against the file.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
data = data.ffill()
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [6]:
# Show the last 10 rows to check how the table ends.
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [7]:
# Unique tokens in the corpus. Sorting makes the order -- and every index later
# derived from enumerate(words) -- identical across kernel restarts; iterating a
# bare set of strings is not stable between Python processes.
words = sorted(set(data["Word"].values))
n_words = len(words); n_words

35178

In [20]:
# Unique entity tags. Sorted so that the tag -> index mapping built from
# enumerate(tags) is reproducible across kernel restarts instead of depending on
# set iteration order.
tags = sorted(set(data["Tag"].values))
n_tags = len(tags); n_tags


17

In [25]:
class SentenceGetter(object):
    """Group a token-level table into sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"Sentence #"``, ``"Word"``, ``"POS"`` and ``"Tag"``.

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by the ``"Sentence #"`` value; each element is a list of
        ``(word, pos, tag)`` tuples for that sentence.
    sentences : list
        The elements of ``grouped`` as a plain list, in the same order.
    n_sent : int
        Number of the sentence that ``get_next`` will return next (starts at 1).

    Notes
    -----
    ``groupby`` sorts its keys, and the keys are strings, so the sentences come
    out in lexicographic key order ("Sentence: 1", "Sentence: 10", ...), not in
    file order.
    """

    def __init__(self, data):
        self.data = data
        # Cursor used by get_next(); without it get_next() could never find a sentence.
        self.n_sent = 1

        def agg_fun(s):
            # One (word, pos, tag) tuple per row of the sentence's sub-frame.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Select only the columns agg_fun reads, so apply() never operates on the
        # grouping column itself.
        self.grouped = self.data.groupby('Sentence #')[["Word", "POS", "Tag"]].apply(agg_fun)
        self.sentences = [i for i in self.grouped]

    def get_next(self):
        """Return the next sentence (list of tuples), or ``None`` when none is left.

        Sentences are looked up by the key ``"Sentence: <n_sent>"``; the cursor
        only advances when the lookup succeeds.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # No sentence with that number: signal exhaustion instead of raising.
            return None
        self.n_sent += 1
        return s

In [26]:
# Build the per-sentence view of the table: each sentence is a list of
# (word, POS, tag) tuples.
getter = SentenceGetter(data)
sentences = getter.sentences

In [27]:
# Split each sentence into its tag sequence and its space-joined text.
# Both are derived from getter.sentences rather than from `sentences`, which this
# cell rebinds to strings -- so re-running the cell gives the same result instead
# of iterating over characters.
labels = [[tag for _word, _pos, tag in sent] for sent in getter.sentences]
sentences = [" ".join(word for word, _pos, _tag in sent) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [28]:
# Tag sequence of the first sentence, one tag per token.
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [31]:
from collections import Counter
import keras
from keras.preprocessing.sequence import pad_sequences

In [32]:
# Count token frequencies and keep the 5000 most common tokens as the vocabulary.
word_cnt = Counter(data["Word"].values)
vocabulary = {token for token, _count in word_cnt.most_common(5000)}

In [34]:
max_len = 50
# Ids 0 and 1 are reserved for the padding and unknown-word markers.
word2idx = {"PAD": 0, "UNK": 1}
# Number the in-vocabulary words starting after the reserved ids, so that no real
# word can share id 0 or 1 with PAD/UNK. A corpus token that is literally "PAD" or
# "UNK" is skipped, so it cannot overwrite a reserved id either.
# With the 5000-word vocabulary the largest id is at most 5001, which stays below
# the n_words-sized Embedding input used later in this notebook.
vocab_words = [w for w in words if w in vocabulary and w not in word2idx]
word2idx.update({w: i for i, w in enumerate(vocab_words, start=len(word2idx))})
tag2idx = {t: i for i, t in enumerate(tags)}

In [35]:
# Encode every sentence as a list of word ids, falling back to the "UNK" id for
# words outside word2idx. The ids are built from the original tokens in
# getter.sentences rather than by re-splitting the joined strings: str.split()
# also splits on non-ASCII whitespace (e.g. "\x85", "\xa0"), which could yield a
# different number of ids than there are labels for a sentence.
X = [[word2idx.get(word, word2idx["UNK"]) for word, _pos, _tag in sent]
     for sent in getter.sentences]

In [37]:
# Bring every id sequence to exactly max_len entries; padding is appended at the
# end using the "PAD" id.
# NOTE(review): sequences longer than max_len are cut using the Keras default
# truncation side -- confirm that matches the intent.
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

In [40]:
# Translate each sentence's tag strings into their integer ids.
y = [[tag2idx[tag] for tag in sentence_tags] for sentence_tags in labels]

In [41]:
# Pad the tag-id sequences to max_len as well; padded positions are labelled with
# the id of the "O" tag.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [42]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences for testing. shuffle=False keeps the existing
# order, so the test set is the tail of X / y.
X_tr, X_te, y_tr, y_te = train_test_split(X,y,test_size=0.1, shuffle=False)

In [43]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional

In [44]:
# Network graph: word ids -> 50-d embeddings -> spatial dropout -> BiLSTM ->
# per-timestep softmax over the n_tags classes.
word_input = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)
model = embedding_layer(word_input)
model = SpatialDropout1D(0.1)(model)
bilstm_layer = Bidirectional(
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)
)
model = bilstm_layer(model)
tag_classifier = TimeDistributed(Dense(n_tags, activation="softmax"))
out = tag_classifier(model)

In [45]:
# Wrap the graph into a trainable model. The sparse categorical loss takes integer
# tag ids as targets rather than one-hot vectors.
model = Model(word_input, out)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [46]:
# Train for 5 epochs, holding out 10% of the training data for validation.
# The reshape adds a trailing axis of size 1 to the tag-id array.
history = model.fit(X_tr, y_tr.reshape(*y_tr.shape, 1), batch_size = 32, epochs=5, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [47]:
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler

In [48]:
class NERExplainerGenerator(object):
    """Adapt a sequence-tagging model into per-token prediction functions.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method.
        NOTE(review): the result is indexed as ``[:, word_index, :]``, so it is
        assumed to be (n_texts, max_len, n_tags) -- confirm.
    word2idx : dict
        Token -> id mapping; must contain the keys ``"UNK"`` and ``"PAD"``.
    tag2idx : dict
        Tag -> id mapping; its inverse is stored as ``idx2tag``.
    max_len : int
        Length every encoded text is padded to.
    """

    def __init__(self, model, word2idx, tag2idx, max_len):
        self.model = model
        self.max_len = max_len
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        # Inverse lookup: tag id -> tag string.
        self.idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

    def _preprocess(self, texts):
        """Whitespace-tokenise ``texts``, map tokens to ids and pad to ``max_len``."""
        encoded = []
        for text in texts:
            # Tokens missing from word2idx fall back to the "UNK" id.
            encoded.append([self.word2idx.get(token, self.word2idx["UNK"])
                            for token in text.split()])
        return pad_sequences(maxlen=self.max_len, sequences=encoded,
                             padding="post", value=self.word2idx["PAD"])

    def get_predict_function(self, word_index):
        """Return a function mapping texts to the model output at ``word_index``."""
        def predict_func(texts):
            predictions = self.model.predict(self._preprocess(texts))
            # Keep only the slice belonging to the token position of interest.
            return predictions[:, word_index, :]
        return predict_func


In [49]:
# Pick one sentence to explain; show it raw, then with each token's gold tag.
index = 46781
text = sentences[index]
label = labels[index]
print(text)
print()
print(" ".join("{} ({})".format(token, tag) for token, tag in zip(text.split(), label)))

Nigeria 's President Olusegun Obasanjo expressed his condolences , noting the late pontiff promoted religious tolerance and democracy in the West African nation .

Nigeria (B-geo) 's (O) President (B-per) Olusegun (I-per) Obasanjo (I-per) expressed (O) his (O) condolences (O) , (O) noting (O) the (O) late (O) pontiff (O) promoted (O) religious (O) tolerance (O) and (O) democracy (O) in (O) the (O) West (O) African (B-gpe) nation (O) . (O)


In [51]:
# List each whitespace token with its position, to help choose a word_index.
for position, token in enumerate(text.split()):
    print("{}: {}".format(position, token))

0: Nigeria
1: 's
2: President
3: Olusegun
4: Obasanjo
5: expressed
6: his
7: condolences
8: ,
9: noting
10: the
11: late
12: pontiff
13: promoted
14: religious
15: tolerance
16: and
17: democracy
18: in
19: the
20: West
21: African
22: nation
23: .


In [52]:
# Bundle the trained model with the same vocabularies and padding length used for training.
explainer_generator = NERExplainerGenerator(model, word2idx, tag2idx, max_len)


In [53]:
# Explain the prediction for the token at position 4 of the sentence
# ("Obasanjo" in the listing printed above).
word_index = 4
predict_func = explainer_generator.get_predict_function(word_index=word_index)


In [54]:
# Sampler that perturbs the text by masking tokens. The replacement string "UNK"
# is the same key word2idx uses for unknown words, so masked tokens are encoded
# with the unknown-word id.
# NOTE(review): max_replace presumably caps the fraction of tokens masked per
# sample, and bow=False presumably masks occurrences individually -- confirm
# against the eli5 documentation.
# NOTE(review): the sample output below contains "'UNK", which suggests the
# default token pattern splits "'s" differently from the whitespace split used in
# _preprocess -- consider passing a whitespace-based token_pattern.
sampler = MaskingTextSampler(
    replacement="UNK",
    max_replace=0.7,
    token_pattern=None,
    bow=False
)


In [55]:
# Draw 4 perturbed variants of the sentence to sanity-check what the sampler produces.
samples, similarity = sampler.sample_near(text, n_samples=4)
print(samples)


("Nigeria 's President UNK UNK expressed UNK condolences , noting the UNK pontiff UNK religious UNK and UNK UNK the UNK UNK nation .", "UNK 'UNK UNK Olusegun UNK expressed his condolences , UNK the UNK UNK UNK UNK tolerance UNK democracy UNK UNK UNK African nation .", "Nigeria 'UNK UNK Olusegun Obasanjo UNK his condolences , UNK UNK UNK pontiff promoted UNK UNK UNK UNK UNK UNK UNK UNK UNK .", "Nigeria 's President UNK Obasanjo expressed his condolences , UNK UNK late pontiff promoted religious tolerance UNK democracy in the West African UNK .")


In [56]:
# LIME text explainer driven by the masking sampler; random_state is fixed so the
# explanation is repeatable.
te = TextExplainer(
    sampler=sampler,
    position_dependent=True,
    random_state=42
)

# Fit the local surrogate on perturbed versions of `text`, scored by the
# per-token prediction function.
te.fit(text, predict_func)

# idx2tag is built from tag2idx, so its values are listed in tag2idx's insertion
# order. NOTE(review): this is assumed to match the order of the model's output
# classes -- confirm.
te.explain_prediction(
    target_names=list(explainer_generator.idx2tag.values()),
    top_targets=3
)


Contribution?,Feature
3.827,Highlighted in text (sum)
-0.081,<BIAS>

Contribution?,Feature
-2.098,Highlighted in text (sum)
-2.359,<BIAS>

Contribution?,Feature
-1.307,<BIAS>
-4.047,Highlighted in text (sum)
