In [1]:
# import
import keras
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.utils import np_utils, plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn import model_selection
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import h5py as h5py

Using TensorFlow backend.


In [2]:
BINARY = False
timestep = 1
epochs = 10
en_corpus_file = "corpus-en.txt"
ewo_corpus_file = "corpus-ewo.txt"
best_model_file = "best-model-conll.hdfs"
max_nb_of_phrases =  -1
duplication = 1
max_depth = 0
is_only_vocab = True
shuffle = is_only_vocab

In [3]:
def getTag(aString):
    tag = "O"
    if BINARY:
        if aString != "O":
            return "NE"
    else:
        tag = aString
    return tag
     

In [4]:
def load_corpus(file, max_nb_of_phrases):
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        prev_line = None
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break;

            l = line.strip()
            if len(l) == 0 and len(prev_line) != 0:
                nb_of_phrases += 1
                dataset["word"].append(line)
                dataset["ne-tag"].append(None)
            else:
                l = l.split("\t")
                dataset["word"].append(l[0])
                dataset["ne-tag"].append(ne_type(l[1]))
            prev_line = line.strip()
        
    return pd.DataFrame(dataset), nb_of_phrases+1

In [5]:
def corpus_fingerprint(aDataframe, nb_of_biphrases):
    # create distributionnal signature
    fingerprints = {}
    current_bi_phrase_index = 0
    nb_word_in_corpus = aDataframe[aDataframe.word != "\n"].word.size
    words_in_current_phrase = []
    for index, row in aDataframe.iterrows():
        if current_bi_phrase_index > nb_of_biphrases:
            break
            
        word = row['word']
        
        if word != "\n":
            words_in_current_phrase.append(word)
            if word not in fingerprints:
                fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.float32)
            fingerprints[word][current_bi_phrase_index] += 1
        else:
            nb_word_in_current_phrase = len(words_in_current_phrase)
#             for w in words_in_current_phrase:
#                 fingerprints[w][current_bi_phrase_index] = nb_word_in_corpus / fingerprints[w][current_bi_phrase_index]                
            current_bi_phrase_index += 1
            words_in_current_phrase = []
    for word in fingerprints:
        for i in range(nb_of_biphrases):
            if fingerprints[word][i] != 0:
                fingerprints[word][i] = nb_word_in_corpus / fingerprints[word][i]
#         fingerprints[word][nb_of_biphrases] = nb_word_in_corpus / aDataframe[aDataframe.word == word].word.size
        
    return pd.DataFrame(fingerprints)

In [6]:
def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    X = np.zeros((aDataframe.shape[0], fingerprintsDataFrame.shape[0]), dtype=np.int8)
    y = np.zeros(aDataframe.shape[0], dtype=np.int8)
    i = 0
    for row in aDataframe.iterrows():
        X[i] = fingerprintsDataFrame[row[1]['word']].values
        y[i] = tag2int[getTag(row[1]['ne-tag'])]
        i += 1
    return X, y

In [7]:
def train_test_split(X, y, test_size = 0.33):
    total = X.shape[0]
    train_length = round(total * (1 - test_size)) 
    return X[:train_length], X[train_length:], y[:train_length], y[train_length:]

In [8]:
  
def ne_type(aType):
    aType = aType.lower()
    if 'per' in aType:
        t =  'NE' if BINARY else 'PER' 
    elif 'loc' in aType:
        t =  'NE' if BINARY else 'LOC'
    elif 'org' in aType:
        t =  'NE' if BINARY else 'ORG'
    elif 'hour' in aType:
        t =  'NE' if BINARY else 'MISC'
    elif aType != 'o' and len(aType) > 0 :
        t =  'NE' if BINARY else 'MISC'
    else:
        t = 'O'
    return t

In [9]:
def compute_performance(y_true, y_pred, words=None, BINARY=False):
    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        p = precision_score(y_pred, y_true, average='macro')
        r = recall_score(y_pred, y_true, average='macro')
        f1 = f1_score(y_pred, y_true, average='macro')
        acc = accuracy_score(y_pred, y_true)
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})

    return p, r, f1, acc, model_output_vs

In [10]:
def P_R_F1(y_pred, y_true, neg_class):
    same = y_pred[y_true==y_pred]
    tp = same[same != neg_class].size
    nb_of_pos_exple = y_true[y_true != neg_class].size
    nb_of_pos_pred = y_pred[y_pred != neg_class].size
    p = r = f1 = 0
    try:
        p = np.round(tp*100/nb_of_pos_pred, 2)
    except ZeroDivisionError:
        print("number of correct positive predictions is 0")
        
    try:
        r = np.round(tp*100/nb_of_pos_exple, 2)
    except ZeroDivisionError:
        print("number of position exple is 0")
        
    try:
        f1 = np.round(2*r*p/(r+p), 2)
    except ZeroDivisionError:
        print("Recall and precision are 0")

    return p, r, f1

In [11]:
def is_mergeable(aListOfConsecutiveTokens, corpus, fingerprints):
    n = len(aListOfConsecutiveTokens)
    if n <= 1:
        return False
    if n == 2:
        w1, w2 = aListOfConsecutiveTokens[0], aListOfConsecutiveTokens[1]
        rep1, rep2 = fingerprints[aListOfConsecutiveTokens[0]], fingerprints[aListOfConsecutiveTokens[1]]
        tag1, tag2 = corpus[corpus.word==w1].iloc[0]['ne-tag'], corpus[corpus.word==w2].iloc[0]['ne-tag']
        if (tag1 == tag2) and (tag1 == "O"): # O + O => False
            return False
        if (tag1 != tag2) and (tag1 != "O") and (tag2 != "O"): # X + Y => False
            return False
        return rep1.equals(rep2)
    else:
        half = int(n / 2)
        return is_mergeable(aListOfConsecutiveTokens[0:half+1], corpus, fingerprints) and is_mergeable(aListOfConsecutiveTokens[half:n], corpus, fingerprints)

In [12]:
def merge(depth, corpus, fingerprint):
    wordDf = corpus[corpus.word != "\n"].word
    nbOfWord = wordDf.shape[0]
    text = list(wordDf)
    X2, target2, tokens = [], [], []
    level, newToken = 1, True
    while level <= depth and newToken:
        i, newToken = 0, False
        limit = nbOfWord - level
        while i < limit:
            if is_mergeable(text[i:i+level+1], corpus, fingerprint):
                tokens.append(" ".join(text[i:i+level+1]))
                newToken = True
            i += 1
        print("level ", level, ":", set(tokens))
        level += 1
    
    X2, target2 = np.array(X2), np.array(target2)
    
    return X2, target2, set(tokens)

In [13]:
def shuffle(X, y):
    indices = [i for i in  range(X.shape[0])]
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [14]:
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [15]:
tagSet = en_corpus["ne-tag"].dropna().unique()
if BINARY:
    tagSet = ['NE', 'O']
tag2int = {j: i for i, j in enumerate(tagSet)}
int2tag = {i: j for i, j in enumerate(tagSet)}
print(tag2int)

{'O': 0, 'MISC': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}


In [16]:
en_nb_of_phrases

210

In [17]:
en_corpus.describe()

Unnamed: 0,word,ne-tag
count,4962,4753
unique,913,5
top,",",O
freq,343,4362


In [18]:
en_corpus.head(10)

Unnamed: 0,word,ne-tag
0,The,O
1,Promise,O
2,of,O
3,the,O
4,Holy,MISC
5,Spirit,MISC
6,\n,
7,In,O
8,the,O
9,first,O


In [19]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 87.91 %
MISC % = 2.12 %
PER % = 4.94 %
LOC % = 0.81 %
ORG % = 0.02 %


In [20]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.27 %
MISC % = 1.86 %
PER % = 8.87 %
LOC % = 1.97 %
ORG % = 0.11 %


In [21]:
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [22]:
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [23]:
en_fingerprints.head(10)

Unnamed: 0,The,Promise,of,the,Holy,Spirit,In,first,book,",",...,considered,dream,She,save,fulfill,Immanuel,us),woke,sleep,knew
0,4753.0,4753.0,4753.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,4753.0,0.0,0.0,4753.0,4753.0,4753.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4753.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4753.0,2376.5,0.0,0.0,0.0,0.0,0.0,1188.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4753.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,4753.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,2376.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
en_fingerprints['you'].values.shape

(210,)

In [25]:
en_corpus[en_corpus.word != "\n"].shape

(4753, 2)

In [26]:
X2, target2, tokens = merge(max_depth, en_corpus, en_fingerprints)

In [27]:
if is_only_vocab:
    text = list(en_corpus[en_corpus.word != "\n"].word.unique())
else:
    text = list(en_corpus[en_corpus.word != "\n"].word)
en_vocab = pd.DataFrame({'text': text + list(tokens)})
en_vocab.describe()

Unnamed: 0,text
count,912
unique,912
top,greeted
freq,1


In [28]:
if is_only_vocab:
    X = np.zeros((en_vocab.shape[0] * duplication, en_nb_of_phrases))
    target = np.zeros((en_vocab.shape[0] * duplication))
    p=0
    for i, row in en_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X[p] = en_fingerprints[c.split(" ")[0]]
            target[p] = tag2int[getTag(en_corpus[en_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X, target = shuffle(X, target)
    print(X.shape, en_fingerprints.shape, target.shape)

(912, 210) (210, 912) (912,)


In [29]:
en_vocab[-20:]

Unnamed: 0,text
892,Eliud
893,Eleazar
894,Matthan
895,husband
896,fourteen
897,unwilling
898,shame
899,resolved
900,divorce
901,quietly


In [30]:
if not is_only_vocab:
    X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [31]:
print(X.shape, target.shape)
if len(X.shape) == len(X2.shape):
    X = np.concatenate((X, X2))
    target = np.concatenate((target, target2))
    if shuffle:
        X, target = shuffle(X, target)
print(X.shape, target.shape)

(912, 210) (912,)
(912, 210) (912,)


In [32]:
y = target.copy()
y[0:100]
if not BINARY:
    y = np_utils.to_categorical(y, len(tagSet))
y.shape

(912, 5)

In [33]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])
print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_val.shape =", X_val.shape)
print("y_val.shape =", y_val.shape)

tTarget = np.array([np.argmax(yy) for yy in y_train])
vTarget = np.array([np.argmax(yy) for yy in y_val])

for tag in tagSet:
    print("{0} % in training data = {1} %".format(tag, np.round(tTarget[tTarget==tag2int[tag]].size * 100 / tTarget.shape[0], 2)))
    print("{0} % in validation data = {1} %".format(tag, np.round(vTarget[vTarget==tag2int[tag]].size * 100 / vTarget.shape[0], 2)))

X_train.shape = (611, 210)
y_train.shape = (611, 5)
X_val.shape = (301, 210)
y_val.shape = (301, 5)
O % in training data = 88.05 %
O % in validation data = 89.04 %
MISC % in training data = 1.15 %
MISC % in validation data = 1.33 %
PER % in training data = 8.67 %
PER % in validation data = 7.97 %
LOC % in training data = 1.96 %
LOC % in validation data = 1.66 %
ORG % in training data = 0.16 %
ORG % in validation data = 0.0 %


In [34]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(640, input_dim=input_dim, activation='sigmoid', name="hidden1"))
    model.add(Dense(160, activation='sigmoid', name="hidden2"))
    if BINARY:
        model.add(Dense(1, activation='sigmoid', name="outputlayer"))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
    else:
        model.add(Dense(output_dim, activation='softmax', name="outputlayer"))
        model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
    plot_model(model, show_shapes=True, show_layer_names=True)
    model.summary()
    return model

In [35]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=epochs):
    early_stop = EarlyStopping(patience=2, verbose=2) # stop learning if the error is the same between two consecutive epochs
    best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=1) # saved best model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, verbose=0, shuffle=shuffle, callbacks=[best_model_cp, early_stop])
    best_model = keras.models.load_model(best_model_file) #loading the best model
    return best_model

In [36]:
def predict(model, X, y, binary=BINARY):
    if BINARY:
        y_pred = np.round(model.predict(X))
        y_true = y
    else:
        predictions = model.predict(X)
        y_pred = np.array([np.argmax(p) for p in predictions])
        y_true = np.array([np.argmax(t) for t in y ])
    return y_true, y_pred

In [37]:
def model_performance(y_true, y_pred):
    return P_R_F1(y_pred, y_true, tag2int['O']) #precision, recall, f1-score

In [38]:
def model_performace_by_tag(y_true, y_pred, tag):
    p, r, f1 = 0, 0, 0
    
    eq = y_pred[y_pred==y_true]
    correctly_pred = eq[eq==tag].size
    try:
        p = np.round(100 * correctly_pred / y_pred[y_pred==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        r = np.round(100 * correctly_pred / y_true[y_true==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        f1 = np.round(2 * r * p / (r + p), 2)
    except ZeroDivisionError:
        pass
    
    return p, r, f1

In [39]:
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [40]:
ewo_nb_of_phrases

210

In [41]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 86.37 %
MISC % = 2.18 %
PER % = 5.76 %
LOC % = 0.89 %
ORG % = 0.05 %


In [42]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 90.0 %
MISC % = 1.17 %
PER % = 8.25 %
LOC % = 1.84 %
ORG % = 0.19 %


In [43]:
ewo_corpus.describe()

Unnamed: 0,word,ne-tag
count,4394,4185
unique,1030,5
top,",",O
freq,413,3795


In [44]:
ewo_corpus.head()

Unnamed: 0,word,ne-tag
0,Mfufub,MISC
1,Nsisim,MISC
2,ayi,O
3,sò,O
4,\n,


In [45]:
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [46]:
X2, target2, tokens = merge(max_depth, ewo_corpus, ewo_fingerprints)

In [47]:
if is_only_vocab:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word.unique())
else:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word)
ewo_vocab = pd.DataFrame({"text":text + list(tokens)})

In [48]:
if is_only_vocab:
    X_ewo = np.zeros((ewo_vocab.shape[0] * duplication, en_nb_of_phrases))
    ewo_target = np.zeros((ewo_vocab.shape[0] * duplication))
    p=0
    for i, row in ewo_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X_ewo[p] = ewo_fingerprints[c.split(" ")[0]]
            ewo_target[p] = tag2int[getTag(ewo_corpus[ewo_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)

In [49]:
ewo_vocab[-20:]

Unnamed: 0,text
1009,nlo
1010,obë
1011,mbara
1012,yabyali
1013,dzili
1014,yasò
1015,oyolëge
1016,kode
1017,dili
1018,atoban


In [50]:
if not is_only_vocab:
    X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [51]:
print(X_ewo.shape, ewo_target.shape)
if len(X_ewo.shape) == len(X2.shape):
    X_ewo = np.concatenate((X_ewo, X2))
    ewo_target = np.concatenate((ewo_target, target2))
    if shuffle:
        X_ewo, ewo_target = shuffle(X_ewo, ewo_target)
print(X_ewo.shape, ewo_target.shape)

(1029, 210) (1029,)
(1029, 210) (1029,)


In [52]:
y_ewo = ewo_target.copy()
print(y_ewo.shape, len(ewo_vocab))

(1029,) 1029


In [53]:
X_ewo.shape

(1029, 210)

In [54]:
y_ewo = ewo_target.copy()
y_ewo[:20]
if not BINARY:
    y_ewo = np_utils.to_categorical(y_ewo)

In [55]:
X_ewo = X_ewo.reshape((X_ewo.shape[0], en_nb_of_phrases))

In [56]:
def algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, epochs=epochs, model=None):
    test_precision, train_precision, ewo_precision = [], [], []
    test_recall, train_recall, ewo_recall = [], [], []
    test_fscore, train_fscore, ewo_fscore = [], [], []
    
    test_result_by_tag = {}
    train_result_by_tag = {}
    ewo_result_by_tag = {}
    for t in tagSet:
        f1_key = "F1-"+t
        p_key = "P-"+t
        r_key = "R-"+t
        train_result_by_tag[f1_key], train_result_by_tag[p_key], train_result_by_tag[r_key] = [], [], []
        test_result_by_tag[f1_key], test_result_by_tag[p_key], test_result_by_tag[r_key] = [], [], []
        ewo_result_by_tag[f1_key], ewo_result_by_tag[p_key], ewo_result_by_tag[r_key] = [], [], []

    m = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs)
        
    y_true, y_pred = predict(m, X_train, y_train)
    p_train, r_train, f1_train = model_performance(y_true, y_pred)
        
    y_true_val, y_pred_val = predict(m, X_val, y_val)
    p_val, r_val, f1_val = model_performance(y_true_val, y_pred_val)
        
    y_true_ewo, y_pred_ewo = predict(m, X_ewo, y_ewo) 
    p_ewo, r_ewo, f1_ewo = model_performance(y_true_ewo, y_pred_ewo)
        
    for t in range(len(int2tag)):
        f1_key = "F1-" + int2tag[t]
        p_key = "P-" + int2tag[t]
        r_key = "R-" + int2tag[t]
            
        p, r, f1 = model_performace_by_tag(y_true, y_pred, t)
        train_result_by_tag[p_key].append(p)
        train_result_by_tag[r_key].append(r)
        train_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_val, y_pred_val, t)
        test_result_by_tag[p_key].append(p)
        test_result_by_tag[r_key].append(r)
        test_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_ewo, y_pred_ewo, t)
        ewo_result_by_tag[p_key].append(p)
        ewo_result_by_tag[r_key].append(r)
        ewo_result_by_tag[f1_key].append(f1)
                
    test_precision.append(p_val)
    train_precision.append(p_train)
    ewo_precision.append(p_ewo)
        
    test_recall.append(r_val)
    train_recall.append(r_train)
    ewo_recall.append(r_ewo)
        
    test_fscore.append(f1_val)
    train_fscore.append(f1_train)
    ewo_fscore.append(f1_ewo)
    return pd.DataFrame({
        'P_test': test_precision, 
        'P_train': train_precision, 
        'P_ewo': ewo_precision, 'R_test': test_recall, 'R_train': train_recall, 
        'R_ewo': ewo_recall, 'F1-test': test_fscore, 'F1-train': train_fscore, 'F1-ewo': ewo_fscore}), pd.DataFrame(train_result_by_tag), pd.DataFrame(test_result_by_tag), pd.DataFrame(ewo_result_by_tag)

In [57]:
# model = create_model(X.shape[1], len(tagSet))
# resultEval, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)

In [58]:
# resultEval

In [59]:
# train_by_tag

In [60]:
# test_by_tag

In [61]:
# ewo_by_tag

In [62]:
# resultEval.mean()

In [63]:
# resultEval.std()

In [64]:
def algoCrossVal(X, y, X_ewo, y_ewo, k = 10, repeat=1): 
    block_size = int(X.shape[0] / k)   
    output = None
    model = None
    train_by_tags, test_by_tags, ewo_by_tags = None, None, None
    for it in range(repeat):
        print("AlgoCrossValIter -", it+1)
        model = create_model(X.shape[1], len(tagSet))
        results = None
        train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = None, None, None
        for i in range(k):
            X_val, y_val = X[i*block_size:i*block_size+block_size], y[i*block_size:i*block_size+block_size]
            X_train = np.concatenate((X[0:i*block_size], X[i*block_size+block_size:]))
            y_train = np.concatenate((y[0:i*block_size], y[i*block_size+block_size:]))

            X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
            X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])

            result, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)
            if results is None:
                results = result.copy()
                train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = train_by_tag.copy(), test_by_tag.copy(), ewo_by_tag.copy()
            else:
                results = pd.concat([results, result], ignore_index=True)
                train_by_tagsTmp = pd.concat([train_by_tagsTmp, train_by_tag], ignore_index=True)
                test_by_tagsTmp = pd.concat([test_by_tagsTmp, test_by_tag], ignore_index=True)
                ewo_by_tagsTmp = pd.concat([ewo_by_tagsTmp, ewo_by_tag], ignore_index=True)
        
        if output is None:
            output = results.mean(axis=0).to_frame()
            train_by_tags = train_by_tagsTmp.mean(axis=0).to_frame()
            test_by_tags = test_by_tagsTmp.mean(axis=0).to_frame()
            ewo_by_tags = ewo_by_tagsTmp.mean(axis=0).to_frame()
        else:
            output = pd.concat([output, results.mean(axis=0).to_frame()], axis=1)
            train_by_tags = pd.concat([train_by_tags, train_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            test_by_tags = pd.concat([test_by_tags, test_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            ewo_by_tags = pd.concat([ewo_by_tags, ewo_by_tagsTmp.mean(axis=0).to_frame()], axis=1)

    return output, train_by_tags, test_by_tags, ewo_by_tags, model

In [65]:
resultCrossVal, trainByTagResult, testByTagResult, ewoByTagResult, model = algoCrossVal(X, y, X_ewo, y_ewo, repeat=10)

AlgoCrossValIter - 1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 640)               135040    
_________________________________________________________________
hidden2 (Dense)              (None, 160)               102560    
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 805       
Total params: 238,405
Trainable params: 238,405
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.51957, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.51957

Epoch 00003: val_loss improved from 0.51957 to 0.42881, saving model to best-model-conll.hdfs

Epoch 00004: val_loss improved from 0.42881 to 0.38687, saving model to best-model-conll.hdfs

Epoch 00005: val_loss did not improve from 0.




Epoch 00001: val_loss improved from inf to 0.11202, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.11202

Epoch 00003: val_loss improved from 0.11202 to 0.11067, saving model to best-model-conll.hdfs

Epoch 00004: val_loss did not improve from 0.11067

Epoch 00005: val_loss improved from 0.11067 to 0.10716, saving model to best-model-conll.hdfs

Epoch 00006: val_loss did not improve from 0.10716

Epoch 00007: val_loss did not improve from 0.10716
Epoch 00007: early stopping

Epoch 00001: val_loss improved from inf to 0.17482, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.17482 to 0.08321, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.08321

Epoch 00004: val_loss did not improve from 0.08321
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.11092, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.11092

Epoch 00003: val_loss




Epoch 00003: val_loss did not improve from 0.11712
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.12027, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.12027

Epoch 00003: val_loss did not improve from 0.12027
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.14327, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.14327

Epoch 00003: val_loss did not improve from 0.14327
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.11472, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.11472

Epoch 00003: val_loss did not improve from 0.11472
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.05453, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.05453 to 0.05098, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.05098

Epoch 00004:




Epoch 00001: val_loss improved from inf to 0.10651, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10651

Epoch 00003: val_loss did not improve from 0.10651
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10568, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.10568 to 0.08344, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.08344

Epoch 00004: val_loss did not improve from 0.08344
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.13697, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13697

Epoch 00003: val_loss did not improve from 0.13697
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.08709, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.08709

Epoch 00003: val_loss did not improve from 0.08709
Epoch 00003: early stopping

Epoch 00001:




Epoch 00001: val_loss improved from inf to 0.10168, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10168

Epoch 00003: val_loss did not improve from 0.10168
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10173, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10173

Epoch 00003: val_loss improved from 0.10173 to 0.09416, saving model to best-model-conll.hdfs

Epoch 00004: val_loss improved from 0.09416 to 0.07750, saving model to best-model-conll.hdfs

Epoch 00005: val_loss did not improve from 0.07750

Epoch 00006: val_loss did not improve from 0.07750
Epoch 00006: early stopping

Epoch 00001: val_loss improved from inf to 0.12017, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.12017

Epoch 00003: val_loss did not improve from 0.12017
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.11424, saving model to best-model-conll.hdf




Epoch 00002: val_loss did not improve from 0.17026

Epoch 00003: val_loss improved from 0.17026 to 0.13313, saving model to best-model-conll.hdfs

Epoch 00004: val_loss improved from 0.13313 to 0.12944, saving model to best-model-conll.hdfs

Epoch 00005: val_loss improved from 0.12944 to 0.11763, saving model to best-model-conll.hdfs

Epoch 00006: val_loss did not improve from 0.11763

Epoch 00007: val_loss did not improve from 0.11763
Epoch 00007: early stopping

Epoch 00001: val_loss improved from inf to 0.09040, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.09040 to 0.07113, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.07113

Epoch 00004: val_loss did not improve from 0.07113
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.13932, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13932

Epoch 00003: val_loss did not improve from 0.13932
Epoch 00003: early




Epoch 00001: val_loss improved from inf to 0.10925, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.10925 to 0.10836, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.10836

Epoch 00004: val_loss did not improve from 0.10836
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.08677, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.08677

Epoch 00003: val_loss did not improve from 0.08677
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.11262, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.11262

Epoch 00003: val_loss did not improve from 0.11262
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.12415, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.12415 to 0.11964, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.1196




Epoch 00001: val_loss improved from inf to 0.13758, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.13758 to 0.09733, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.09733

Epoch 00004: val_loss did not improve from 0.09733
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.10550, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10550

Epoch 00003: val_loss did not improve from 0.10550
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.13395, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13395

Epoch 00003: val_loss did not improve from 0.13395
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10924, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10924

Epoch 00003: val_loss did not improve from 0.10924
Epoch 00003: early stopping

Epoch 00001:




Epoch 00001: val_loss improved from inf to 0.12407, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.12407 to 0.11897, saving model to best-model-conll.hdfs

Epoch 00003: val_loss improved from 0.11897 to 0.11716, saving model to best-model-conll.hdfs

Epoch 00004: val_loss did not improve from 0.11716

Epoch 00005: val_loss did not improve from 0.11716
Epoch 00005: early stopping

Epoch 00001: val_loss improved from inf to 0.08236, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.08236

Epoch 00003: val_loss did not improve from 0.08236
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.11954, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.11954 to 0.11684, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.11684

Epoch 00004: val_loss did not improve from 0.11684
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.




Epoch 00001: val_loss improved from inf to 0.10193, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10193

Epoch 00003: val_loss did not improve from 0.10193
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.09330, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.09330 to 0.08844, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.08844

Epoch 00004: val_loss did not improve from 0.08844
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.14393, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.14393 to 0.13442, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.13442

Epoch 00004: val_loss did not improve from 0.13442
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.12502, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.1250




Epoch 00001: val_loss improved from inf to 0.11728, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.11728 to 0.10182, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.10182

Epoch 00004: val_loss did not improve from 0.10182
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.09118, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09118

Epoch 00003: val_loss did not improve from 0.09118
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10895, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.10895

Epoch 00003: val_loss did not improve from 0.10895
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.14725, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.14725 to 0.13737, saving model to best-model-conll.hdfs

Epoch 00003: val_loss improved from 0.13737 to 0.

In [66]:
resultCrossVal.to_csv("results/merge-{0}.csv".format(max_depth))
resultCrossVal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
P_test,89.144,75.112,87.339,86.77,80.465,84.984,83.963,81.951,87.098,83.604
P_train,90.876,86.225,88.108,88.398,86.197,86.516,86.571,84.639,87.153,87.837
P_ewo,80.869,77.149,79.115,77.312,75.959,76.989,77.876,73.028,77.583,77.166
R_test,74.328,71.87,74.722,76.409,68.962,76.642,71.877,79.204,71.912,74.739
R_train,78.424,76.284,79.281,81.462,73.971,83.659,79.182,85.703,81.677,81.151
R_ewo,61.758,61.112,63.056,64.353,58.241,65.556,64.814,69.073,65.555,64.999
F1-test,79.699,81.08,78.411,80.125,82.167778,78.3,74.803,79.329,75.924,77.824
F1-train,83.859,79.771,82.875,84.31,77.248,84.84,81.939,84.906,83.607,83.97
F1-ewo,69.532,66.947,69.674,69.541,64.257,70.488,69.733,70.463,70.137,69.896


In [67]:
resultCrossVal.mean(axis=1).to_frame()

Unnamed: 0,0
P_test,84.043
P_train,87.252
P_ewo,77.3046
R_test,74.0665
R_train,80.0794
R_ewo,63.8517
F1-test,78.766278
F1-train,82.7325
F1-ewo,69.0668


In [68]:
resultCrossVal.std(axis=1).to_frame()

Unnamed: 0,0
P_test,4.105271
P_train,1.684321
P_ewo,2.017411
R_test,2.977341
R_train,3.43109
R_ewo,2.97329
F1-test,2.23542
F1-train,2.465754
F1-ewo,1.963093


In [69]:
trainByTagResult.to_csv("results/train-by-tag-merge-{0}.csv".format(max_depth))
trainByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,98.06,97.696,97.956,98.073,97.652,98.08,97.854,98.094,98.008,98.013
P-O,97.233,97.036,97.359,97.629,96.769,97.896,97.351,98.226,97.665,97.561
R-O,98.912,98.403,98.58,98.539,98.607,98.276,98.387,97.975,98.374,98.484
F1-MISC,75.375,73.316667,73.236,73.694,74.68,71.476,74.65,77.77,73.181,72.016
P-MISC,97.46,81.844,97.778,94.028,84.773,91.526,80.884,90.988,94.253,95.182
R-MISC,62.979,58.543,59.958,63.776,57.634,63.473,59.837,68.838,62.362,61.453
F1-PER,87.719,85.05,87.463,88.402,81.357,89.364,86.698,88.848,87.4,88.576
P-PER,91.056,86.681,88.509,88.949,86.145,88.032,88.755,86.167,88.887,88.494
R-PER,85.372,84.599,86.93,88.607,80.904,90.81,85.64,92.163,87.571,89.101
F1-LOC,71.027,65.014444,72.408889,70.998,68.812222,71.452,73.911111,72.42,73.184,73.07


In [70]:
trainByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,97.9486
P-O,97.4725
R-O,98.4537
F1-MISC,73.939467
P-MISC,90.8716
R-MISC,61.8853
F1-PER,87.0877
P-PER,88.1675
R-PER,87.1697
F1-LOC,71.229767


In [71]:
trainByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.161503
P-O,0.419913
R-O,0.243079
F1-MISC,1.797333
P-MISC,6.235793
R-MISC,3.218012
F1-PER,2.35713
P-PER,1.504098
R-PER,3.259828
F1-LOC,2.620879


In [72]:
testByTagResult.to_csv("results/test-by-tag-merge-{0}.csv".format(max_depth))
testByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,97.595,97.042,97.542,97.516,97.248,97.528,97.23,97.382,97.155,97.265
P-O,96.624,96.344,96.65,96.848,95.886,97.002,96.547,97.316,96.297,96.713
R-O,98.631,97.864,98.509,98.238,98.74,98.127,97.999,97.49,98.113,97.866
F1-MISC,51.6675,42.223333,42.223333,45.926667,42.223333,45.926667,42.223333,59.048571,42.223333,45.926667
P-MISC,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
R-MISC,36.667,31.667,31.667,36.667,31.667,36.667,31.667,36.667,31.667,36.667
F1-PER,83.014,86.758889,82.421,85.524,89.718889,84.413,78.927,83.191,80.793,82.598
P-PER,87.448,75.686,86.381,86.729,80.699,86.563,86.714,85.047,87.507,86.007
R-PER,82.108,81.452,83.611,86.275,81.225,87.02,80.429,84.747,80.707,82.588
F1-LOC,62.38125,53.672857,74.15,56.054286,53.33375,66.508333,61.43,60.715,54.338889,60.34


In [73]:
testByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,97.3503
P-O,96.6227
R-O,98.1577
F1-MISC,45.961274
P-MISC,50.0
R-MISC,34.167
F1-PER,83.735878
P-PER,84.8781
R-PER,83.0162
F1-LOC,60.292437


In [74]:
testByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.189171
P-O,0.39671
R-O,0.385608
F1-MISC,5.514657
P-MISC,0.0
R-MISC,2.635231
F1-PER,3.059019
P-PER,3.781122
R-PER,2.325916
F1-LOC,6.516272


In [75]:
ewoByTagResult.to_csv("results/ewo-by-tag-merge-{0}.csv".format(max_depth))

In [76]:
ewoByTagResult = pd.read_csv("results/ewo-by-tag-merge-{0}.csv".format(2), index_col=0)
ewoByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,17.38,,,,17.38
F1-MISC,46.15,46.15,46.15,46.15,43.6825,43.6825,46.15,46.15,46.15,43.591111
F1-O,94.431,95.075,94.737,94.9,94.964,95.227,94.918,94.868,95.157,95.299
F1-ORG,,,,,,,,,,
F1-PER,8.57625,47.702857,28.391429,32.1575,34.5975,39.691111,34.62875,31.67875,42.05,41.984444
P-LOC,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,60.0
P-MISC,70.0,60.0,60.0,70.0,65.0,65.0,60.0,60.0,80.0,72.5
P-O,89.54,91.146,90.138,90.5,90.709,91.122,90.538,90.384,91.105,91.426
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,53.638,51.554,57.193,63.381,66.738,77.824,66.391,68.142,66.391,75.073


In [77]:
ewoByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,17.38
F1-MISC,45.400611
F1-O,94.9576
F1-ORG,
F1-PER,34.145859
P-LOC,12.0
P-MISC,66.25
P-O,90.6608
P-ORG,0.0
P-PER,64.6325


In [78]:
ewoByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,1.206887
F1-O,0.254209
F1-ORG,
F1-PER,10.728242
P-LOC,25.298221
P-MISC,6.795628
P-O,0.565721
P-ORG,0.0
P-PER,8.529639


In [79]:
columns = en_fingerprints.columns

print("Pred", "Real", "Freq", "Word", sep="\t")
for c in columns:
    prediction = model.predict(en_fingerprints[c].values.reshape((1, 1, 210)))
    pred_tag = int2tag[np.argmax(prediction)]
    real_tag = en_corpus[en_corpus.word == c].iloc[0]['ne-tag']
    
    if pred_tag != real_tag:
        print(pred_tag, real_tag, en_fingerprints[c].max(), c, sep="\t")

Pred	Real	Freq	Word


ValueError: Error when checking input: expected hidden1_input to have 2 dimensions, but got array with shape (1, 1, 210)

In [None]:
en_corpus[en_corpus.word != "\n"].shape