In [1]:
# import
import gc
import sys

import h5py as h5py
import keras
import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import SimpleRNN, Dense
from keras.models import Sequential
from keras.utils import np_utils, plot_model
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support

Using Theano backend.


In [2]:
# Notebook-wide configuration; every value below is read by later cells.
BINARY = False  # True: collapse all entity types into a single 'NE' tag; False: keep one tag per type
timestep = 1  # size of the time axis inserted when the feature matrices are reshaped for the RNN
epochs = 10  # upper bound on training epochs passed to model.fit
en_corpus_file = "corpus-en.txt"  # corpus used to build the training/validation data
ewo_corpus_file = "corpus-ewo.txt"  # corpus the trained model is evaluated on at the end
best_model_file = "best-model-conll.hdfs"  # where ModelCheckpoint writes, and load_model reads, the best model
max_nb_of_phrases =  -1  # phrase limit for load_corpus; -1 never equals the phrase counter, so the whole file is read

In [3]:
def getTag(aString):
    """Project a tag onto the label set selected by the global ``BINARY`` flag.

    With ``BINARY`` set, every tag other than "O" becomes "NE" and "O" stays
    "O"; otherwise the tag is returned unchanged.
    """
    if not BINARY:
        return aString
    return "NE" if aString != "O" else "O"
        
def load_corpus(file, max_nb_of_phrases):
    """Read a one-token-per-line, tab-separated corpus into a DataFrame.

    The first line of the file is skipped. Every other non-blank line is split
    on tabs: field 0 is the word and field 1 the raw tag, which is normalised
    with ``ne_type``. A blank line that follows a non-blank line closes a
    phrase and is stored as a separator row (word ``"\\n"``, tag ``None``).

    Parameters
    ----------
    file : str
        Path of the corpus file.
    max_nb_of_phrases : int
        Stop reading once this many phrases have been closed. A value that the
        counter never reaches (e.g. -1) reads the whole file.

    Returns
    -------
    (pandas.DataFrame, int)
        The frame has the columns ``word`` and ``ne-tag``. The integer is the
        number of closed phrases plus one, i.e. it assumes the last phrase is
        not followed by a blank line.
    """
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        # Start as "blank" so that blank lines right after the header are
        # ignored instead of crashing on len(None).
        prev_line = ""
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break

            l = line.strip()
            if len(l) == 0:
                # Only the first blank line after some content closes a phrase;
                # further consecutive blank lines are skipped.
                if len(prev_line) != 0:
                    nb_of_phrases += 1
                    # Always store the canonical separator: later cells filter
                    # on word == "\n", which a whitespace-only line would miss.
                    dataset["word"].append("\n")
                    dataset["ne-tag"].append(None)
            else:
                fields = l.split("\t")
                # A line without a tag column is handed to ne_type as an empty tag.
                raw_tag = fields[1] if len(fields) > 1 else ""
                dataset["word"].append(fields[0])
                dataset["ne-tag"].append(ne_type(raw_tag))
            prev_line = l

    return pd.DataFrame(dataset), nb_of_phrases+1

def corpus_fingerprint(aDataframe, nb_of_biphrases):
    """Build the distributional signature of every word in a corpus.

    The signature of a word is a 0/1 vector of length ``nb_of_biphrases`` whose
    i-th entry is 1 when the word occurs in the i-th phrase. Phrases are
    delimited by rows whose ``word`` is ``"\\n"``.

    Parameters
    ----------
    aDataframe : pandas.DataFrame
        Corpus frame with a ``word`` column.
    nb_of_biphrases : int
        Length of each signature. Phrases beyond this count are ignored.

    Returns
    -------
    pandas.DataFrame
        One column per word, one row per phrase index, dtype int8.
    """
    fingerprints = {}
    current_bi_phrase_index = 0
    for word in aDataframe['word']:
        # Valid vector positions are 0 .. nb_of_biphrases - 1, so stop as soon
        # as the phrase index reaches nb_of_biphrases (the former ">" test let
        # one extra phrase through and raised IndexError on its first word).
        if current_bi_phrase_index >= nb_of_biphrases:
            break

        if word == "\n":
            current_bi_phrase_index += 1
            continue

        if word not in fingerprints:
            fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.int8)
        fingerprints[word][current_bi_phrase_index] = 1
    return pd.DataFrame(fingerprints)

def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    """Turn a corpus frame into a feature matrix and an integer label vector.

    Row i of ``X`` is the fingerprint column of the i-th word of ``aDataframe``;
    ``y[i]`` is the integer id (via the global ``tag2int``) of that word's tag
    after it has gone through ``getTag``. Both arrays are int8.
    """
    n_samples = aDataframe.shape[0]
    n_features = fingerprintsDataFrame.shape[0]
    X = np.zeros((n_samples, n_features), dtype=np.int8)
    y = np.zeros(n_samples, dtype=np.int8)
    for position, (_, record) in enumerate(aDataframe.iterrows()):
        X[position] = fingerprintsDataFrame[record['word']].values
        y[position] = tag2int[getTag(record['ne-tag'])]
    return X, y

def train_test_split(X, y, test_size = 0.33):
    """Split ``X`` and ``y`` sequentially (no shuffling) into train and test parts.

    The first ``round(len(X) * (1 - test_size))`` samples form the training
    part and the remainder the test part.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    n_samples = X.shape[0]
    cut = round(n_samples * (1 - test_size))
    X_head, X_tail = X[:cut], X[cut:]
    y_head, y_tail = y[:cut], y[cut:]
    return X_head, X_tail, y_head, y_tail
  
def ne_type(aType):
    """Normalise a raw corpus tag to one of the notebook's entity labels.

    The tag is lower-cased and matched by substring, in this order:
    'per' -> PER, 'loc' -> LOC, 'org' -> ORG, 'hour' -> HOUR. Any other
    non-empty tag different from 'o' becomes MISC; everything else is 'O'.
    When the global ``BINARY`` flag is set, every label other than 'O' is
    reported as 'NE'.
    """
    lowered = aType.lower()
    known_markers = (('per', 'PER'), ('loc', 'LOC'), ('org', 'ORG'), ('hour', 'HOUR'))
    label = 'O'
    for marker, name in known_markers:
        if marker in lowered:
            label = name
            break
    else:
        # No known marker matched: any other real tag falls into MISC.
        if lowered != 'o' and len(lowered) > 0:
            label = 'MISC'
    if label != 'O' and BINARY:
        return 'NE'
    return label

def compute_performance(y_true, y_pred, words=None, BINARY=False):
    """Compute precision, recall, F1 and accuracy, plus a side-by-side table.

    Parameters
    ----------
    y_true, y_pred : array-like of integer labels
        Gold and predicted label ids (keys of the global ``int2tag``). Column
        vectors and float-valued labels are accepted and flattened to 1-D ints.
    words : sequence, optional
        Words aligned with the labels; added as a ``word`` column when given.
    BINARY : bool
        True: scores are computed for the positive class ``tag2int['NE']``.
        False: scores are macro-averaged over the classes.
        Pass it by keyword, since the third positional parameter is ``words``.

    Returns
    -------
    (precision, recall, f1, accuracy, pandas.DataFrame)
        The frame holds the gold and predicted tag names (and the words).
    """
    # Predictions rounded from a sigmoid output arrive as an (n, 1) float array;
    # flatten and cast so they can be used as int2tag keys below.
    y_true = np.asarray(y_true).ravel().astype(int)
    y_pred = np.asarray(y_pred).ravel().astype(int)

    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        # scikit-learn expects (y_true, y_pred); with the arguments reversed the
        # reported precision and recall were swapped.
        p = precision_score(y_true, y_pred, average='macro')
        r = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')
        acc = accuracy_score(y_true, y_pred)

    true_tags = [int2tag[i] for i in y_true]
    pred_tags = [int2tag[i] for i in y_pred]
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': true_tags, 'y_pred': pred_tags})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': true_tags, 'y_pred': pred_tags})

    return p, r, f1, acc, model_output_vs

In [4]:
# Load the corpus stored at en_corpus_file; the second value is the phrase count reported by load_corpus.
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [5]:
# Label vocabulary: a fixed ['NE', 'O'] pair in binary mode, otherwise the
# distinct non-null tags of the corpus in order of first appearance.
tagSet = ['NE', 'O'] if BINARY else en_corpus["ne-tag"].dropna().unique()
# Lookup tables in both directions between tag names and integer ids.
tag2int = {tag: idx for idx, tag in enumerate(tagSet)}
int2tag = dict(enumerate(tagSet))
print(tag2int)

{'O': 0, 'LOC': 3, 'PER': 2, 'HOUR': 4, 'MISC': 1, 'ORG': 5}


In [6]:
# Phrase count returned by load_corpus (number of separator rows + 1).
en_nb_of_phrases

210

In [7]:
# Summary statistics of the two columns (separator rows have a null ne-tag).
en_corpus.describe()

Unnamed: 0,ne-tag,word
count,4753,4962
unique,6,913
top,O,","
freq,4362,343


In [8]:
# Peek at the first rows; a word of "\n" marks a phrase boundary.
en_corpus.head(10)

Unnamed: 0,ne-tag,word
0,O,The
1,O,Promise
2,O,of
3,O,the
4,MISC,Holy
5,MISC,Spirit
6,,\n
7,O,In
8,O,the
9,O,first


In [9]:
# First row only (positional slice).
en_corpus[:1]

Unnamed: 0,ne-tag,word
0,O,The


In [10]:
# Number of phrase-separator rows in the corpus.
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [57]:
# Build one 0/1 occurrence vector per word, with one position per phrase.
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [58]:
# Number of actual word rows (phrase separators excluded).
en_corpus[en_corpus.word != "\n"].shape

(4753, 2)

In [89]:
# Feature matrix (one fingerprint per word row) and integer labels, separators excluded.
X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [60]:
# (number of phrases, number of distinct words)
en_fingerprints.shape

(210, 912)

In [83]:
# Data augmentation: when two consecutive words have identical fingerprints
# (they occur in exactly the same phrases), add one extra sample carrying the
# current word's fingerprint, labelled as follows:
#   * same tag on both words       -> that tag
#   * exactly one of them is 'O'   -> the non-'O' tag
#   * two different entity tags    -> no sample (ambiguous)
prev_row = None
X2, target2 = [], []
for idx, row in en_corpus[en_corpus.word != "\n"].iterrows():
    if prev_row is not None and en_fingerprints[row.word].equals(en_fingerprints[prev_row.word]):
        tag, prev_tag = row['ne-tag'], prev_row['ne-tag']
        if tag == prev_tag or prev_tag == 'O':
            X2.append(en_fingerprints[row.word])
            target2.append(tag2int[tag])
        elif tag == 'O':
            X2.append(en_fingerprints[row.word])
            target2.append(tag2int[prev_tag])
    prev_row = row.copy()
# Force a 2-D (n_samples, n_phrases) matrix and an integer label vector even
# when nothing was collected, so the arrays can always be concatenated with the
# base data.
X2 = np.array(X2, dtype=np.int8).reshape(-1, en_fingerprints.shape[0])
target2 = np.array(target2, dtype=int)

In [90]:
# Append the augmented samples to the base data.
# The base data is one row per word row of the corpus; slicing back to that
# length first keeps this cell idempotent, so re-running it does not append
# X2 a second time.
n_base = en_corpus[en_corpus.word != "\n"].shape[0]
print(X.shape, target.shape)
X = np.concatenate((X[:n_base], X2))
target = np.concatenate((target[:n_base], target2))
print(X.shape, target.shape)

(4753, 210) (4753,)
(4804, 210) (4804,)


In [91]:
# Sanity check: features and labels must have the same number of rows.
print(" X.shape =", X.shape, " target.shape=", target.shape)

 X.shape = (4804, 210)  target.shape= (4804,)


In [92]:
# Work on a copy so that `target` keeps the raw integer labels.
y = target.copy()
# Show the first 100 label ids.
y[0:100]

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0])

In [93]:
# Sequential (unshuffled) split, then insert the time axis expected by the RNN:
# (samples, features) -> (samples, timestep, features).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], timestep, X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], timestep, X_val.shape[1])
for caption, array in (
    ("X_train.shape =", X_train),
    ("y_train.shape =", y_train),
    ("X_val.shape =", X_val),
    ("y_val.shape =", y_val),
):
    print(caption, array.shape)
# Share of every tag in each part, in percent.
for tag in tagSet:
    tag_id = tag2int[tag]
    train_share = np.round(np.count_nonzero(y_train == tag_id) * 100 / y_train.shape[0], 2)
    val_share = np.round(np.count_nonzero(y_val == tag_id) * 100 / y_val.shape[0], 2)
    print("{0} % in training data = {1} %".format(tag, train_share))
    print("{0} % in validation data = {1} %".format(tag, val_share))

X_train.shape = (3219, 1, 210)
y_train.shape = (3219,)
X_val.shape = (1585, 1, 210)
y_val.shape = (1585,)
O % in training data = 93.26 %
O % in validation data = 88.45 %
MISC % in training data = 2.14 %
MISC % in validation data = 2.4 %
PER % in training data = 3.51 %
PER % in validation data = 8.45 %
LOC % in training data = 0.99 %
LOC % in validation data = 0.69 %
HOUR % in training data = 0.06 %
HOUR % in validation data = 0.0 %
ORG % in training data = 0.03 %
ORG % in validation data = 0.0 %


In [94]:
# One-hot encode the integer labels for the softmax output (multi-class only).
# The ndim guards make the cell safe to re-run: labels that are already a 2-D
# one-hot matrix are left alone instead of being encoded a second time.
if not BINARY:
    if y_train.ndim == 1:
        y_train = np_utils.to_categorical(y_train, len(tagSet))
    if y_val.ndim == 1:
        y_val = np_utils.to_categorical(y_val, len(tagSet))

In [95]:
# Network: SimpleRNN(640) -> Dense(160) -> output layer.
model = Sequential()
# The feature width comes from the reshaped training data; the time axis is left unspecified (None).
model.add(SimpleRNN(640, input_shape=(None, X_train.shape[2]), activation='sigmoid'))
model.add(Dense(160, activation='sigmoid'))
if BINARY:
    # Binary mode: a single sigmoid unit trained with binary cross-entropy.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
else:
    # Multi-class mode: one softmax unit per tag, trained on one-hot labels.
    model.add(Dense(len(tagSet), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (None, 640)               544640    
_________________________________________________________________
dense_3 (Dense)              (None, 160)               102560    
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 966       
Total params: 648,166
Trainable params: 648,166
Non-trainable params: 0
_________________________________________________________________


In [None]:
early_stop = EarlyStopping(patience=2, verbose=2) # stop training once the monitored quantity (Keras default: val_loss) has not improved for 2 epochs
best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=2) # write the model to best_model_file only when it is the best seen so far
# Train with the validation split used both for early stopping and for checkpointing.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, verbose=1, callbacks=[early_stop, best_model_cp])

Train on 3219 samples, validate on 1585 samples
Epoch 1/10
  32/3219 [..............................] - ETA: 3:18:25 - loss: 1.7714 - acc: 0.0000e+00

In [None]:
best_model = keras.models.load_model(best_model_file) # reload the checkpoint written by ModelCheckpoint during training

In [None]:
# Predictions on the training data, converted to label ids.
if not BINARY:
    # Softmax output / one-hot labels: the label id is the index of the largest entry.
    predictions = best_model.predict(X_train)
    y_pred = list(map(np.argmax, predictions))
    y_true = list(map(np.argmax, y_train))
else:
    # Sigmoid output: round the probability to 0 or 1.
    y_pred = np.round(best_model.predict(X_train))
    y_true = y_train

In [None]:
# Aggregate scores on the training data.
# BINARY must be passed by keyword: the third positional parameter of
# compute_performance is `words`, so a positional BINARY was silently ignored.
p, r, f1, acc, _ = compute_performance(y_true, y_pred, BINARY=BINARY)
print("precision =", p, "recall =", r, "f1-score =", f1, "accuracy =", acc)

# Per-class scores. scikit-learn reports one entry per label present in y_true
# or y_pred, in sorted order, so the row index is built from those same labels
# and every row stays named even when some tags are absent.
precision, recall, f1_mesure, s = precision_recall_fscore_support(y_true, y_pred)
present_labels = np.unique(np.concatenate((np.ravel(y_true), np.ravel(y_pred)))).astype(int)
indices = [int2tag[i] for i in present_labels]
scores = pd.DataFrame({'Precision': precision, 'Recall': recall, 'F1-mesure': f1_mesure, 'Support': s}, index=indices)
scores

In [None]:
# Predictions on the validation data, converted to label ids.
if not BINARY:
    # Softmax output / one-hot labels: the label id is the index of the largest entry.
    predictions = best_model.predict(X_val)
    y_pred_val = list(map(np.argmax, predictions))
    y_true_val = list(map(np.argmax, y_val))
else:
    # Sigmoid output: round the probability to 0 or 1.
    y_pred_val = np.round(best_model.predict(X_val))
    y_true_val = y_val

In [None]:
# Aggregate scores on the validation data.
# BINARY must be passed by keyword: the third positional parameter of
# compute_performance is `words`, so a positional BINARY was silently ignored.
p, r, f1, acc, _ = compute_performance(y_true_val, y_pred_val, BINARY=BINARY)
print("precision =", p, "recall =", r, "f1-score =", f1, "accuracy =", acc)

# Per-class scores. scikit-learn reports one entry per label present in the
# gold or predicted labels, in sorted order, so the row index is built from
# those same labels and every row stays named even when some tags are absent.
val_precision, val_recall, val_f1_mesure, val_s = precision_recall_fscore_support(y_true_val, y_pred_val)
present_labels = np.unique(np.concatenate((np.ravel(y_true_val), np.ravel(y_pred_val)))).astype(int)
indices = [int2tag[i] for i in present_labels]
val_scores = pd.DataFrame({'Precision': val_precision, 'Recall': val_recall, 'F1-mesure': val_f1_mesure, 'Support': val_s}, index=indices)
val_scores

In [None]:
# Load the second corpus (ewo_corpus_file) the same way as the first one.
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [None]:
# Phrase count returned by load_corpus for the second corpus.
ewo_nb_of_phrases

In [None]:
# Summary statistics of the second corpus.
ewo_corpus.describe()

In [None]:
# Peek at the first rows of the second corpus.
ewo_corpus.head()

In [None]:
# Fingerprints are sized with en_nb_of_phrases rather than ewo_nb_of_phrases.
# NOTE(review): presumably so the vectors match the input width the model was trained on - confirm.
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [None]:
# Run a garbage-collection pass before allocating the feature matrix of the
# second corpus (gc is imported with the other modules in the first cell).
gc.collect()
X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [None]:
# (number of word rows, fingerprint length)
X_ewo.shape

In [None]:
# Work on a copy so that `ewo_target` keeps the raw integer labels.
y_ewo = ewo_target.copy()
# Show the first 20 label ids.
y_ewo[:20]

In [None]:
# Compare the phrase counts of both corpora and check that the feature matrix
# has one row per word row of the second corpus.
print("ewo", ewo_nb_of_phrases)
print("en", en_nb_of_phrases)
print("ewo.word.shape", ewo_corpus[ewo_corpus.word != "\n"].word.shape)
print("X_ewo.shape ", X_ewo.shape)

In [None]:
# Insert the time axis, run the trained model on the second corpus and convert
# its output to label ids.
X_ewo = X_ewo.reshape((X_ewo.shape[0], timestep, en_nb_of_phrases))
ewo_predictions = best_model.predict(X_ewo)

# The gold labels are already integer ids in both modes.
y_true_ewo = y_ewo
if not BINARY:
    # Softmax output: the label id is the index of the largest entry.
    predictions = ewo_predictions
    y_pred_ewo = list(map(np.argmax, predictions))
else:
    # Sigmoid output: round the probability to 0 or 1.
    y_pred_ewo = np.round(ewo_predictions)

In [None]:
# Aggregate scores on the second corpus; `output` lines up every word with its
# gold and predicted tag.
p, r, f1, acc, output = compute_performance(y_true_ewo, y_pred_ewo, ewo_corpus[ewo_corpus.word != '\n'].word, BINARY)
print("precision =", p, "recall =", r, "f1-score =", f1, "accuracy =", acc)

# Per-class scores. scikit-learn reports one entry per label present in the
# gold or predicted labels, in sorted order, so the row index is built from
# those same labels and every row stays named even when some tags are absent.
ewo_precision, ewo_recall, ewo_f1_mesure, ewo_s = precision_recall_fscore_support(y_true_ewo, y_pred_ewo)
present_labels = np.unique(np.concatenate((np.ravel(y_true_ewo), np.ravel(y_pred_ewo)))).astype(int)
indices = [int2tag[i] for i in present_labels]
ewo_scores = pd.DataFrame({'Precision': ewo_precision, 'Recall': ewo_recall, 'F1-mesure': ewo_f1_mesure, 'Support': ewo_s}, index=indices)
ewo_scores

In [None]:
# Inspect the words whose gold tag is MISC together with what the model predicted.
output.query('y_true == "MISC"')