In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Derive the project root (the working-directory prefix up to 'FYP-19-20', plus
# that folder name) and open the CoNLL-2003 training file beneath it.
dir_path = os.getcwd().partition('FYP-19-20')[0] + 'FYP-19-20/'
data_file = open(dir_path + 'data/conll-2003/eng.train.txt', mode='r')

In [3]:
# Read the whole file, drop the '-DOCSTART-' separator records, and split on
# blank lines so each element of `data` is one sentence block (one token per
# line). The handle is closed as soon as the text is in memory so it does not
# stay open for the lifetime of the kernel; re-running this cell without
# re-opening the file now fails loudly instead of silently yielding empty data.
try:
    data = data_file.read().replace('-DOCSTART- -X- O O\n\n', '').split('\n\n')
finally:
    data_file.close()
len(data)

14042

In [4]:
# Column-oriented accumulator: one entry per token, each tagged with the index
# of the sentence block (its position in `data`) that it came from.
train_df = {'sentence': [], 'token': [], 'pos_tag': [], 'phrase': [], 'bio_tag': []}
for sent, block in enumerate(data):
    for line in block.split('\n'):
        fields = line.split(' ')
        # A usable line has exactly four space-separated fields
        # (token, POS tag, phrase tag, BIO tag). Anything else -- e.g. the
        # empty string left by a trailing newline -- is skipped, exactly as
        # before, but without a bare `except:` that would also hide unrelated
        # errors (including KeyboardInterrupt).
        if len(fields) != 4:
            continue
        token, pos, phrase, bio = fields
        train_df['sentence'].append(sent)
        train_df['token'].append(token)
        train_df['pos_tag'].append(pos)
        train_df['phrase'].append(phrase)
        train_df['bio_tag'].append(bio)

In [5]:
# Turn the column dict into a DataFrame (one row per token) and preview the
# first five rows.
train_df = pd.DataFrame(data=train_df)
train_df.head(n=5)

Unnamed: 0,sentence,token,pos_tag,phrase,bio_tag
0,0,EU,NNP,I-NP,I-ORG
1,0,rejects,VBZ,I-VP,O
2,0,German,JJ,I-NP,I-MISC
3,0,call,NN,I-NP,O
4,0,to,TO,I-VP,O


In [6]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``sentence``, ``token``, ``pos_tag`` and
        ``bio_tag``; rows sharing a ``sentence`` value form one sentence.

    Attributes
    ----------
    n_sent : int
        Cursor used by :meth:`get_next` (number of sentences already returned).
    data : pandas.DataFrame
        The frame passed to the constructor.
    empty : bool
        Initialised to ``False``; not used by this class itself.
    grouped : pandas.Series
        Result of ``groupby("sentence").apply(...)``: one list of
        ``(token, pos_tag, bio_tag)`` tuples per sentence id.
    sentences : list
        The values of ``grouped`` as a plain list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False

        def agg_func(s):
            """Zip one sentence's rows into ``(token, pos_tag, bio_tag)`` tuples."""
            return list(zip(s["token"].values.tolist(),
                            s["pos_tag"].values.tolist(),
                            s["bio_tag"].values.tolist()))

        self.grouped = self.data.groupby("sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once all have been returned.

        Sentences are served by position from ``self.sentences``. The previous
        implementation looked the cursor up as a *label* in ``self.grouped``
        inside a bare ``except``, so iteration stopped silently at the first
        missing sentence id and any unrelated error was swallowed as well.
        """
        if self.n_sent >= len(self.sentences):
            return None
        s = self.sentences[self.n_sent]
        self.n_sent += 1
        return s

In [7]:
# Length (in tokens) of the longest sentence; used further down as the padding
# length. It is taken directly from `getter.sentences` with the built-in max(),
# so the result no longer depends on walking the getter's internal cursor, and
# an empty corpus yields 0 instead of raising.
getter = SentenceGetter(train_df)
max_len = max((len(s) for s in getter.sentences), default=0)
print(max_len)

113


In [8]:
# Vocabularies and integer lookups for tokens and tags.
# 'ENDPAD' is appended as the final vocabulary entry.
words = [*train_df.token.unique(), 'ENDPAD']
tags = [*train_df.bio_tag.unique()]
n_words = len(words)
n_tags = len(tags)
# Word ids start at 1, tag ids start at 0.
word2idx = dict(zip(words, range(1, n_words + 1)))
tag2idx = dict(zip(tags, range(n_tags)))
sentences = getter.sentences

In [9]:
# Encode every sentence as the list of its tokens' vocabulary ids.
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[token] for token, _pos, _tag in s] for s in sentences]

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Pad with the id of the dedicated 'ENDPAD' token. The previous pad value,
# n_words - 1, is the id of the last *real* token (word ids start at 1, so
# 'ENDPAD' itself maps to n_words), which made padding indistinguishable from
# that word.
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["ENDPAD"])

In [None]:
# Spot-check: display the padded id sequence stored at index 454 of X.
X[454]

In [None]:
# Encode each sentence's BIO tags as tag ids, then right-pad every sequence to
# max_len using the id of the "O" tag.
y = [[tag2idx[tag] for _token, _pos, tag in s] for s in sentences]
y = pad_sequences(sequences=y, maxlen=max_len, value=tag2idx["O"], padding="post")

In [None]:
# Spot-check: display the padded tag-id sequence stored at index 454 of y.
y[454]

In [None]:
from keras.utils import to_categorical
# One-hot encode each padded tag-id sequence over the n_tags classes.
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [None]:
from sklearn.model_selection import train_test_split
# Fix the shuffle seed so the 90/10 split -- and every metric computed from
# it -- is reproducible across kernel restarts.
SPLIT_SEED = 42
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras.backend import argmax, cast
from keras.losses import categorical_crossentropy

In [None]:
# NOTE: `input` shadows the Python builtin; the name is kept because the cell
# that builds the Model refers to it.
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
# The per-token softmax over the tags is the network output. The earlier
# argmax + cast Lambda layers were removed: argmax has no gradient, its
# (batch, max_len) output cannot be compared with the one-hot targets by
# categorical_crossentropy, and pred2label already applies np.argmax to each
# token's score vector.
out = Dense(n_tags, activation="softmax")(model)

In [None]:
# Assemble the network from its input and output tensors, configure training
# (RMSprop, categorical cross-entropy, accuracy metric) and print the summary.
model = Model(input, out)
model.compile(
    loss=categorical_crossentropy,
    optimizer="rmsprop",
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for 5 epochs in batches of 32, holding out 10% of the training split
# for validation.
history = model.fit(
    x=X_tr,
    y=np.array(y_tr),
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Run the trained model over the held-out test inputs.
test_pred = model.predict(x=X_te, verbose=1)

In [None]:
# Inverse of tag2idx: tag id -> tag string.
idx2tag = {index: tag for tag, index in tag2idx.items()}

def pred2label(pred, index_to_tag=None):
    """Convert per-token score vectors into tag strings.

    Parameters
    ----------
    pred : iterable of iterables
        One entry per sentence; each sentence yields one score vector
        (e.g. a softmax output or a one-hot row) per token.
    index_to_tag : mapping, optional
        Maps a class index to its tag string. Defaults to the module-level
        ``idx2tag`` so existing single-argument calls behave as before.

    Returns
    -------
    list of list of str
        For every token, the tag of its highest-scoring class, with any
        occurrence of ``"PAD"`` in the tag replaced by ``"O"``.
    """
    if index_to_tag is None:
        # Resolved at call time so the notebook-level mapping is picked up.
        index_to_tag = idx2tag
    return [
        [index_to_tag[np.argmax(p)].replace("PAD", "O") for p in pred_i]
        for pred_i in pred
    ]
    
# Decode both the model's predictions and the one-hot labels of the test split
# into tag strings.
pred_labels, test_labels = pred2label(test_pred), pred2label(y_te)

In [None]:
from itertools import chain
from sklearn.metrics import classification_report
# sklearn's classification_report expects flat (1-D) label sequences, so the
# per-sentence label lists are concatenated into token-level lists first.
# NOTE: padded positions are part of both lists (the labels were padded with
# the "O" tag), so they contribute to the "O" row of the report.
# print() renders the report's line breaks instead of showing the escaped repr.
print(classification_report(list(chain.from_iterable(test_labels)),
                            list(chain.from_iterable(pred_labels))))

In [10]:
!pip install tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/ed/11/037887c5cbac5af3124050fb6348e67caa038734cc9673b11c31c8939072/tensorflow-1.14.0-cp37-cp37m-macosx_10_11_x86_64.whl (105.8MB)
[K     |████████████████████████████████| 105.8MB 207kB/s eta 0:00:01    |█████████████████████           | 69.6MB 726kB/s eta 0:00:50     |████████████████████████▏       | 79.8MB 833kB/s eta 0:00:32     |█████████████████████████       | 82.6MB 1.2MB/s eta 0:00:20     |███████████████████████████▉    | 92.1MB 484kB/s eta 0:00:29     |█████████████████████████████   | 96.1MB 1.5MB/s eta 0:00:07
[?25hCollecting absl-py>=0.7.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/da/3f/9b0355080b81b15ba6a9ffcf1f5ea39e307a2778b2f2dc8694724e8abd5b/absl-py-0.7.1.tar.gz (99kB)
[K     |████████████████████████████████| 102kB 1.7MB/s ta 0:00:011
[?25hCollecting termcolor>=1.1.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/8a/48/a7