In [293]:
# import
import keras
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.utils import np_utils, plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn import model_selection
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import h5py as h5py

In [294]:
BINARY = False
timestep = 1
epochs = 10
en_corpus_file = "corpus-en.txt"
ewo_corpus_file = "corpus-ewo.txt"
best_model_file = "best-model-conll.hdfs"
max_nb_of_phrases =  -1
duplication = 1
max_depth = 0
is_only_vocab = True
shuffle = is_only_vocab
h1_size = 320
h2_size = 160

In [295]:
def getTag(aString):
    tag = "O"
    if BINARY:
        if aString != "O":
            return "NE"
    else:
        tag = aString
    return tag
     

In [296]:
def load_corpus(file, max_nb_of_phrases):
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        prev_line = None
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break;

            l = line.strip()
            if len(l) == 0 and len(prev_line) != 0:
                nb_of_phrases += 1
                dataset["word"].append(line)
                dataset["ne-tag"].append(None)
            else:
                l = l.split("\t")
                dataset["word"].append(l[0])
                dataset["ne-tag"].append(ne_type(l[1]))
            prev_line = line.strip()
        
    return pd.DataFrame(dataset), nb_of_phrases+1

In [297]:
def corpus_fingerprint(aDataframe, nb_of_biphrases):
    # create distributionnal signature
    fingerprints = {}
    current_bi_phrase_index = 0
    nb_word_in_corpus = aDataframe[aDataframe.word != "\n"].word.size
    words_in_current_phrase = []
    for index, row in aDataframe.iterrows():
        if current_bi_phrase_index > nb_of_biphrases:
            break
            
        word = row['word']
        
        if word != "\n":
            words_in_current_phrase.append(word)
            if word not in fingerprints:
                fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.float32)
            fingerprints[word][current_bi_phrase_index] += 1
        else:
            nb_word_in_current_phrase = len(words_in_current_phrase)
#             for w in words_in_current_phrase:
#                 fingerprints[w][current_bi_phrase_index] = nb_word_in_corpus / fingerprints[w][current_bi_phrase_index]                
            current_bi_phrase_index += 1
            words_in_current_phrase = []
    for word in fingerprints:
        for i in range(nb_of_biphrases):
            if fingerprints[word][i] != 0:
                fingerprints[word][i] = nb_word_in_corpus / fingerprints[word][i]
#         fingerprints[word][nb_of_biphrases] = nb_word_in_corpus / aDataframe[aDataframe.word == word].word.size
        
    return pd.DataFrame(fingerprints)

In [298]:
def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    X = np.zeros((aDataframe.shape[0], fingerprintsDataFrame.shape[0]), dtype=np.int8)
    y = np.zeros(aDataframe.shape[0], dtype=np.int8)
    i = 0
    for row in aDataframe.iterrows():
        X[i] = fingerprintsDataFrame[row[1]['word']].values
        y[i] = tag2int[getTag(row[1]['ne-tag'])]
        i += 1
    return X, y

In [299]:
def train_test_split(X, y, test_size = 0.33):
    total = X.shape[0]
    train_length = round(total * (1 - test_size)) 
    return X[:train_length], X[train_length:], y[:train_length], y[train_length:]

In [300]:
  
def ne_type(aType):
    aType = aType.lower()
    if 'per' in aType:
        t =  'NE' if BINARY else 'PER' 
    elif 'loc' in aType:
        t =  'NE' if BINARY else 'LOC'
    elif 'org' in aType:
        t =  'NE' if BINARY else 'ORG'
    elif 'hour' in aType:
        t =  'NE' if BINARY else 'MISC'
    elif aType != 'o' and len(aType) > 0 :
        t =  'NE' if BINARY else 'MISC'
    else:
        t = 'O'
    return t

In [301]:
def compute_performance(y_true, y_pred, words=None, BINARY=False):
    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        p = precision_score(y_pred, y_true, average='macro')
        r = recall_score(y_pred, y_true, average='macro')
        f1 = f1_score(y_pred, y_true, average='macro')
        acc = accuracy_score(y_pred, y_true)
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})

    return p, r, f1, acc, model_output_vs

In [302]:
def P_R_F1(y_pred, y_true, neg_class):
    same = y_pred[y_true==y_pred]
    tp = same[same != neg_class].size
    nb_of_pos_exple = y_true[y_true != neg_class].size
    nb_of_pos_pred = y_pred[y_pred != neg_class].size
    p = r = f1 = 0
    try:
        p = np.round(tp*100/nb_of_pos_pred, 2)
    except ZeroDivisionError:
        print("number of correct positive predictions is 0")
        
    try:
        r = np.round(tp*100/nb_of_pos_exple, 2)
    except ZeroDivisionError:
        print("number of position exple is 0")
        
    try:
        f1 = np.round(2*r*p/(r+p), 2)
    except ZeroDivisionError:
        print("Recall and precision are 0")

    return p, r, f1

In [303]:
def is_mergeable(aListOfConsecutiveTokens, corpus, fingerprints):
    n = len(aListOfConsecutiveTokens)
    if n <= 1:
        return False
    if n == 2:
        w1, w2 = aListOfConsecutiveTokens[0], aListOfConsecutiveTokens[1]
        rep1, rep2 = fingerprints[aListOfConsecutiveTokens[0]], fingerprints[aListOfConsecutiveTokens[1]]
        tag1, tag2 = corpus[corpus.word==w1].iloc[0]['ne-tag'], corpus[corpus.word==w2].iloc[0]['ne-tag']
        if (tag1 == tag2) and (tag1 == "O"): # O + O => False
            return False
        if (tag1 != tag2) and (tag1 != "O") and (tag2 != "O"): # X + Y => False
            return False
        return rep1.equals(rep2)
    else:
        half = int(n / 2)
        return is_mergeable(aListOfConsecutiveTokens[0:half+1], corpus, fingerprints) and is_mergeable(aListOfConsecutiveTokens[half:n], corpus, fingerprints)

In [304]:
def merge(depth, corpus, fingerprint):
    wordDf = corpus[corpus.word != "\n"].word
    nbOfWord = wordDf.shape[0]
    text = list(wordDf)
    X2, target2, tokens = [], [], []
    level, newToken = 1, True
    while level <= depth and newToken:
        i, newToken = 0, False
        limit = nbOfWord - level
        while i < limit:
            if is_mergeable(text[i:i+level+1], corpus, fingerprint):
                tokens.append(" ".join(text[i:i+level+1]))
                newToken = True
            i += 1
        print("level ", level, ":", set(tokens))
        level += 1
    
    X2, target2 = np.array(X2), np.array(target2)
    
    return X2, target2, set(tokens)

In [305]:
def shuffle(X, y):
    indices = [i for i in  range(X.shape[0])]
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [306]:
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [307]:
tagSet = en_corpus["ne-tag"].dropna().unique()
if BINARY:
    tagSet = ['NE', 'O']
tag2int = {j: i for i, j in enumerate(tagSet)}
int2tag = {i: j for i, j in enumerate(tagSet)}
print(tag2int)

{'O': 0, 'MISC': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}


In [308]:
en_nb_of_phrases

210

In [309]:
en_corpus.describe()

Unnamed: 0,word,ne-tag
count,4962,4753
unique,913,5
top,",",O
freq,343,4362


In [310]:
en_corpus.head(10)

Unnamed: 0,word,ne-tag
0,The,O
1,Promise,O
2,of,O
3,the,O
4,Holy,MISC
5,Spirit,MISC
6,\n,
7,In,O
8,the,O
9,first,O


In [311]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 87.91 %
MISC % = 2.12 %
PER % = 4.94 %
LOC % = 0.81 %
ORG % = 0.02 %


In [312]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.27 %
MISC % = 1.86 %
PER % = 8.87 %
LOC % = 1.97 %
ORG % = 0.11 %


In [313]:
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [314]:
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [315]:
en_fingerprints.head(10)

Unnamed: 0,The,Promise,of,the,Holy,Spirit,In,first,book,",",...,considered,dream,She,save,fulfill,Immanuel,us),woke,sleep,knew
0,4753.0,4753.0,4753.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,4753.0,0.0,0.0,4753.0,4753.0,4753.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4753.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4753.0,2376.5,0.0,0.0,0.0,0.0,0.0,1188.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4753.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,4753.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,2376.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [316]:
en_fingerprints['you'].values.shape

(210,)

In [317]:
en_corpus[en_corpus.word != "\n"].shape

(4753, 2)

In [318]:
X2, target2, tokens = merge(max_depth, en_corpus, en_fingerprints)

In [319]:
if is_only_vocab:
    text = list(en_corpus[en_corpus.word != "\n"].word.unique())
else:
    text = list(en_corpus[en_corpus.word != "\n"].word)
en_vocab = pd.DataFrame({'text': text + list(tokens)})
en_vocab.describe()

Unnamed: 0,text
count,912
unique,912
top,robes
freq,1


In [320]:
if is_only_vocab:
    X = np.zeros((en_vocab.shape[0] * duplication, en_nb_of_phrases))
    target = np.zeros((en_vocab.shape[0] * duplication))
    p=0
    for i, row in en_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X[p] = en_fingerprints[c.split(" ")[0]]
            target[p] = tag2int[getTag(en_corpus[en_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X, target = shuffle(X, target)
    print(X.shape, en_fingerprints.shape, target.shape)

(912, 210) (210, 912) (912,)


In [321]:
en_vocab[-20:]

Unnamed: 0,text
892,Eliud
893,Eleazar
894,Matthan
895,husband
896,fourteen
897,unwilling
898,shame
899,resolved
900,divorce
901,quietly


In [322]:
if not is_only_vocab:
    X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [323]:
print(X.shape, target.shape)
if len(X.shape) == len(X2.shape):
    X = np.concatenate((X, X2))
    target = np.concatenate((target, target2))
    if shuffle:
        X, target = shuffle(X, target)
print(X.shape, target.shape)

(912, 210) (912,)
(912, 210) (912,)


In [324]:
y = target.copy()
y[0:100]
if not BINARY:
    y = np_utils.to_categorical(y, len(tagSet))
y.shape

(912, 5)

In [325]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])
print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_val.shape =", X_val.shape)
print("y_val.shape =", y_val.shape)

tTarget = np.array([np.argmax(yy) for yy in y_train])
vTarget = np.array([np.argmax(yy) for yy in y_val])

for tag in tagSet:
    print("{0} % in training data = {1} %".format(tag, np.round(tTarget[tTarget==tag2int[tag]].size * 100 / tTarget.shape[0], 2)))
    print("{0} % in validation data = {1} %".format(tag, np.round(vTarget[vTarget==tag2int[tag]].size * 100 / vTarget.shape[0], 2)))

X_train.shape = (611, 210)
y_train.shape = (611, 5)
X_val.shape = (301, 210)
y_val.shape = (301, 5)
O % in training data = 88.05 %
O % in validation data = 89.04 %
MISC % in training data = 1.15 %
MISC % in validation data = 1.33 %
PER % in training data = 9.0 %
PER % in validation data = 7.31 %
LOC % in training data = 1.64 %
LOC % in validation data = 2.33 %
ORG % in training data = 0.16 %
ORG % in validation data = 0.0 %


In [326]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(h1_size, input_dim=input_dim, activation='sigmoid', name="hidden1"))
    model.add(Dense(h2_size, activation='sigmoid', name="hidden2"))
    if BINARY:
        model.add(Dense(1, activation='sigmoid', name="outputlayer"))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
    else:
        model.add(Dense(output_dim, activation='softmax', name="outputlayer"))
        model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
    model.summary()
    return model

In [327]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=epochs):
    early_stop = EarlyStopping(patience=2, verbose=2) # stop learning if the error is the same between two consecutive epochs
    best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=1) # saved best model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, verbose=0, shuffle=shuffle, callbacks=[best_model_cp, early_stop])
    best_model = keras.models.load_model(best_model_file) #loading the best model
    return best_model

In [328]:
def predict(model, X, y, binary=BINARY):
    if BINARY:
        y_pred = np.round(model.predict(X))
        y_true = y
    else:
        predictions = model.predict(X)
        y_pred = np.array([np.argmax(p) for p in predictions])
        y_true = np.array([np.argmax(t) for t in y ])
    return y_true, y_pred

In [329]:
def model_performance(y_true, y_pred):
    return P_R_F1(y_pred, y_true, tag2int['O']) #precision, recall, f1-score

In [330]:
def model_performace_by_tag(y_true, y_pred, tag):
    p, r, f1 = 0, 0, 0
    
    eq = y_pred[y_pred==y_true]
    correctly_pred = eq[eq==tag].size
    try:
        p = np.round(100 * correctly_pred / y_pred[y_pred==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        r = np.round(100 * correctly_pred / y_true[y_true==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        f1 = np.round(2 * r * p / (r + p), 2)
    except ZeroDivisionError:
        pass
    
    return p, r, f1

In [331]:
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [332]:
ewo_nb_of_phrases

210

In [333]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 86.37 %
MISC % = 2.18 %
PER % = 5.76 %
LOC % = 0.89 %
ORG % = 0.05 %


In [334]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 90.0 %
MISC % = 1.17 %
PER % = 8.25 %
LOC % = 1.84 %
ORG % = 0.19 %


In [335]:
ewo_corpus.describe()

Unnamed: 0,word,ne-tag
count,4394,4185
unique,1030,5
top,",",O
freq,413,3795


In [336]:
ewo_corpus.head()

Unnamed: 0,word,ne-tag
0,Mfufub,MISC
1,Nsisim,MISC
2,ayi,O
3,sò,O
4,\n,


In [337]:
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [338]:
X2, target2, tokens = merge(max_depth, ewo_corpus, ewo_fingerprints)

In [339]:
if is_only_vocab:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word.unique())
else:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word)
ewo_vocab = pd.DataFrame({"text":text + list(tokens)})

In [340]:
if is_only_vocab:
    X_ewo = np.zeros((ewo_vocab.shape[0] * duplication, en_nb_of_phrases))
    ewo_target = np.zeros((ewo_vocab.shape[0] * duplication))
    p=0
    for i, row in ewo_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X_ewo[p] = ewo_fingerprints[c.split(" ")[0]]
            ewo_target[p] = tag2int[getTag(ewo_corpus[ewo_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)

In [341]:
ewo_vocab[-20:]

Unnamed: 0,text
1009,nlo
1010,obë
1011,mbara
1012,yabyali
1013,dzili
1014,yasò
1015,oyolëge
1016,kode
1017,dili
1018,atoban


In [342]:
if not is_only_vocab:
    X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [343]:
print(X_ewo.shape, ewo_target.shape)
if len(X_ewo.shape) == len(X2.shape):
    X_ewo = np.concatenate((X_ewo, X2))
    ewo_target = np.concatenate((ewo_target, target2))
    if shuffle:
        X_ewo, ewo_target = shuffle(X_ewo, ewo_target)
print(X_ewo.shape, ewo_target.shape)

(1029, 210) (1029,)
(1029, 210) (1029,)


In [344]:
y_ewo = ewo_target.copy()
print(y_ewo.shape, len(ewo_vocab))

(1029,) 1029


In [345]:
X_ewo.shape

(1029, 210)

In [346]:
y_ewo = ewo_target.copy()
y_ewo[:20]
if not BINARY:
    y_ewo = np_utils.to_categorical(y_ewo)

In [347]:
X_ewo = X_ewo.reshape((X_ewo.shape[0], en_nb_of_phrases))

In [348]:
def algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, epochs=epochs, model=None):
    test_precision, train_precision, ewo_precision = [], [], []
    test_recall, train_recall, ewo_recall = [], [], []
    test_fscore, train_fscore, ewo_fscore = [], [], []
    
    test_result_by_tag = {}
    train_result_by_tag = {}
    ewo_result_by_tag = {}
    for t in tagSet:
        f1_key = "F1-"+t
        p_key = "P-"+t
        r_key = "R-"+t
        train_result_by_tag[f1_key], train_result_by_tag[p_key], train_result_by_tag[r_key] = [], [], []
        test_result_by_tag[f1_key], test_result_by_tag[p_key], test_result_by_tag[r_key] = [], [], []
        ewo_result_by_tag[f1_key], ewo_result_by_tag[p_key], ewo_result_by_tag[r_key] = [], [], []

    m = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs)
        
    y_true, y_pred = predict(m, X_train, y_train)
    p_train, r_train, f1_train = model_performance(y_true, y_pred)
        
    y_true_val, y_pred_val = predict(m, X_val, y_val)
    p_val, r_val, f1_val = model_performance(y_true_val, y_pred_val)
        
    y_true_ewo, y_pred_ewo = predict(m, X_ewo, y_ewo) 
    p_ewo, r_ewo, f1_ewo = model_performance(y_true_ewo, y_pred_ewo)
        
    for t in range(len(int2tag)):
        f1_key = "F1-" + int2tag[t]
        p_key = "P-" + int2tag[t]
        r_key = "R-" + int2tag[t]
            
        p, r, f1 = model_performace_by_tag(y_true, y_pred, t)
        train_result_by_tag[p_key].append(p)
        train_result_by_tag[r_key].append(r)
        train_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_val, y_pred_val, t)
        test_result_by_tag[p_key].append(p)
        test_result_by_tag[r_key].append(r)
        test_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_ewo, y_pred_ewo, t)
        ewo_result_by_tag[p_key].append(p)
        ewo_result_by_tag[r_key].append(r)
        ewo_result_by_tag[f1_key].append(f1)
                
    test_precision.append(p_val)
    train_precision.append(p_train)
    ewo_precision.append(p_ewo)
        
    test_recall.append(r_val)
    train_recall.append(r_train)
    ewo_recall.append(r_ewo)
        
    test_fscore.append(f1_val)
    train_fscore.append(f1_train)
    ewo_fscore.append(f1_ewo)
    return pd.DataFrame({
        'P_test': test_precision, 
        'P_train': train_precision, 
        'P_ewo': ewo_precision, 'R_test': test_recall, 'R_train': train_recall, 
        'R_ewo': ewo_recall, 'F1-test': test_fscore, 'F1-train': train_fscore, 'F1-ewo': ewo_fscore}), pd.DataFrame(train_result_by_tag), pd.DataFrame(test_result_by_tag), pd.DataFrame(ewo_result_by_tag)

In [349]:
# model = create_model(X.shape[1], len(tagSet))
# resultEval, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)

In [350]:
# resultEval

In [351]:
# train_by_tag

In [352]:
# test_by_tag

In [353]:
# ewo_by_tag

In [354]:
# resultEval.mean()

In [355]:
# resultEval.std()

In [356]:
def algoCrossVal(X, y, X_ewo, y_ewo, k = 10, repeat=1): 
    block_size = int(X.shape[0] / k)   
    output = None
    model = None
    train_by_tags, test_by_tags, ewo_by_tags = None, None, None
    for it in range(repeat):
        print("AlgoCrossValIter -", it+1)
        model = create_model(X.shape[1], len(tagSet))
        results = None
        train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = None, None, None
        for i in range(k):
            X_val, y_val = X[i*block_size:i*block_size+block_size], y[i*block_size:i*block_size+block_size]
            X_train = np.concatenate((X[0:i*block_size], X[i*block_size+block_size:]))
            y_train = np.concatenate((y[0:i*block_size], y[i*block_size+block_size:]))

            X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
            X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])

            result, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)
            if results is None:
                results = result.copy()
                train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = train_by_tag.copy(), test_by_tag.copy(), ewo_by_tag.copy()
            else:
                results = pd.concat([results, result], ignore_index=True)
                train_by_tagsTmp = pd.concat([train_by_tagsTmp, train_by_tag], ignore_index=True)
                test_by_tagsTmp = pd.concat([test_by_tagsTmp, test_by_tag], ignore_index=True)
                ewo_by_tagsTmp = pd.concat([ewo_by_tagsTmp, ewo_by_tag], ignore_index=True)
        
        if output is None:
            output = results.mean(axis=0).to_frame()
            train_by_tags = train_by_tagsTmp.mean(axis=0).to_frame()
            test_by_tags = test_by_tagsTmp.mean(axis=0).to_frame()
            ewo_by_tags = ewo_by_tagsTmp.mean(axis=0).to_frame()
        else:
            output = pd.concat([output, results.mean(axis=0).to_frame()], axis=1)
            train_by_tags = pd.concat([train_by_tags, train_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            test_by_tags = pd.concat([test_by_tags, test_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            ewo_by_tags = pd.concat([ewo_by_tags, ewo_by_tagsTmp.mean(axis=0).to_frame()], axis=1)

    return output, train_by_tags, test_by_tags, ewo_by_tags, model

In [357]:
resultCrossVal, trainByTagResult, testByTagResult, ewoByTagResult, model = algoCrossVal(X, y, X_ewo, y_ewo, repeat=10)

AlgoCrossValIter - 1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 320)               67520     
_________________________________________________________________
hidden2 (Dense)              (None, 160)               51360     
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 805       
Total params: 119,685
Trainable params: 119,685
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.57132, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.57132 to 0.55648, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.55648

Epoch 00004: val_loss improved from 0.55648 to 0.52631, saving model to best-model-conll.hdfs

Epoch 00005: val_loss improved from 0.52631 t




Epoch 00001: val_loss improved from inf to 0.22229, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.22229 to 0.19053, saving model to best-model-conll.hdfs

Epoch 00003: val_loss improved from 0.19053 to 0.16683, saving model to best-model-conll.hdfs

Epoch 00004: val_loss did not improve from 0.16683

Epoch 00005: val_loss did not improve from 0.16683
Epoch 00005: early stopping

Epoch 00001: val_loss improved from inf to 0.15732, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.15732

Epoch 00003: val_loss did not improve from 0.15732
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.17291, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.17291

Epoch 00003: val_loss did not improve from 0.17291
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.08691, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.0869




Epoch 00001: val_loss improved from inf to 0.23671, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.23671 to 0.19688, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.19688

Epoch 00004: val_loss did not improve from 0.19688
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.18174, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.18174

Epoch 00003: val_loss did not improve from 0.18174
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.20154, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.20154

Epoch 00003: val_loss did not improve from 0.20154
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.09319, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09319

Epoch 00003: val_loss did not improve from 0.09319
Epoch 00003: early stopping

Epoch 00001:


Epoch 00001: val_loss improved from inf to 0.03661, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.03661

Epoch 00003: val_loss did not improve from 0.03661
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.13582, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13582

Epoch 00003: val_loss did not improve from 0.13582
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.04867, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.04867

Epoch 00003: val_loss did not improve from 0.04867
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.09879, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09879

Epoch 00003: val_loss did not improve from 0.09879
Epoch 00003: early stopping
AlgoCrossValIter - 6
_________________________________________________________________
Layer (type)         


Epoch 00003: val_loss did not improve from 0.10318
Epoch 00003: early stopping
AlgoCrossValIter - 8
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 320)               67520     
_________________________________________________________________
hidden2 (Dense)              (None, 160)               51360     
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 805       
Total params: 119,685
Trainable params: 119,685
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.56775, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.56775

Epoch 00003: val_loss improved from 0.56775 to 0.56257, saving model to best-model-conll.hdfs

Epoch 00004: val_loss improved from 0.56257 to 0.56117, savi


Epoch 00001: val_loss improved from inf to 0.60921, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.60921

Epoch 00003: val_loss improved from 0.60921 to 0.59616, saving model to best-model-conll.hdfs

Epoch 00004: val_loss improved from 0.59616 to 0.54240, saving model to best-model-conll.hdfs

Epoch 00005: val_loss did not improve from 0.54240

Epoch 00006: val_loss improved from 0.54240 to 0.47925, saving model to best-model-conll.hdfs

Epoch 00007: val_loss did not improve from 0.47925

Epoch 00008: val_loss did not improve from 0.47925
Epoch 00008: early stopping

Epoch 00001: val_loss improved from inf to 0.16464, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.16464 to 0.14288, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.14288

Epoch 00004: val_loss did not improve from 0.14288
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.15712, saving model to b

In [358]:
resultCrossVal.to_csv("results/merge-{0}.csv".format(max_depth))
resultCrossVal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
P_test,82.284,84.177,73.203,73.984,84.21,83.671,77.319,82.155,70.108,81.818
P_train,87.694,89.827,86.166,79.699,86.829,86.804,85.376,89.758,87.122,88.597
P_ewo,75.466,85.041,76.699,71.19,78.624,77.296,74.03,79.414,76.643,76.212
R_test,67.495,66.135,64.091,56.254,65.787,71.883,62.322,69.508,62.495,73.071
R_train,73.581,69.436,67.86,62.262,69.856,74.079,65.645,73.758,62.719,74.576
R_ewo,58.798,55.463,53.798,50.648,56.666,60.278,53.24,58.889,51.945,60.186
F1-test,70.617,70.841,74.772222,70.22,71.05,74.953,72.832222,73.8,82.09625,75.084
F1-train,79.355,77.23,73.879,76.5,76.294,79.152,71.751,80.137,69.885,79.898
F1-ewo,65.061,66.066,61.319,64.375556,64.453,66.897,59.854,66.624,59.18,65.707


In [359]:
resultCrossVal.mean(axis=1).to_frame()

Unnamed: 0,0
P_test,79.2929
P_train,86.7872
P_ewo,77.0615
R_test,65.9041
R_train,69.3772
R_ewo,55.9911
F1-test,73.626569
F1-train,76.4081
F1-ewo,63.953656


In [360]:
resultCrossVal.std(axis=1).to_frame()

Unnamed: 0,0
P_test,5.209441
P_train,2.884783
P_ewo,3.635342
R_test,4.984007
R_train,4.683534
R_ewo,3.500531
F1-test,3.537936
F1-train,3.553247
F1-ewo,2.818986


In [361]:
trainByTagResult.to_csv("results/train-by-tag-merge-{0}.csv".format(max_depth))
trainByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,97.631,97.465,97.307,97.067,97.351,97.583,97.191,97.734,97.03,97.706
P-O,96.668,96.125,96.006,95.377,96.211,96.705,95.739,96.684,95.397,96.82
R-O,98.638,98.871,98.69,98.885,98.553,98.5,98.746,98.829,98.79,98.638
F1-MISC,69.001,68.97,67.847778,62.895556,65.251111,61.713,73.39625,71.902,66.8475,71.756
P-MISC,95.778,88.889,90.0,87.5,87.5,93.671,77.5,98.889,75.139,93.778
R-MISC,56.411,50.684,47.775,45.663,46.684,51.8,49.482,57.34,44.093,60.745
F1-PER,83.994,83.077,79.422,82.67,81.563,85.207,76.932,84.372,76.386,83.984
P-PER,86.985,90.445,87.469,80.725,86.897,86.622,85.265,90.121,87.569,89.144
R-PER,82.214,78.533,75.316,70.218,78.459,84.726,73.958,81.153,71.893,81.532
F1-LOC,59.363,68.687143,66.8075,60.864286,62.154444,57.903333,57.072222,63.151,58.682857,68.965556


In [362]:
trainByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,97.4065
P-O,96.1732
R-O,98.714
F1-MISC,67.958019
P-MISC,88.8644
R-MISC,51.0677
F1-PER,81.7607
P-PER,87.1242
R-PER,77.8002
F1-LOC,62.365134


In [363]:
trainByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.257847
P-O,0.544014
R-O,0.132397
F1-MISC,3.871082
P-MISC,7.587174
R-MISC,5.504105
F1-PER,3.142865
P-PER,2.771571
R-PER,4.794736
F1-LOC,4.430664


In [364]:
testByTagResult.to_csv("results/test-by-tag-merge-{0}.csv".format(max_depth))
testByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,97.065,97.067,96.838,96.545,97.05,97.292,96.833,96.913,96.963,97.165
P-O,95.891,95.648,95.246,94.435,95.626,96.306,95.257,95.684,95.259,96.324
R-O,98.399,98.649,98.646,98.887,98.63,98.399,98.631,98.261,98.884,98.144
F1-MISC,52.381429,50.34,50.34,39.048571,52.381429,50.34,36.508333,55.101429,44.445,55.101429
P-MISC,35.0,35.0,40.0,35.0,40.0,35.0,25.0,40.0,25.0,40.0
R-MISC,40.0,37.5,32.5,27.5,35.0,37.5,22.5,37.5,30.0,37.5
F1-PER,80.342222,80.508889,85.865,79.0975,78.561111,79.93,84.03125,75.415,85.9725,78.539
P-PER,79.667,83.369,70.528,75.071,79.488,82.674,67.766,81.071,72.889,81.662
R-PER,71.516,68.536,68.309,57.678,67.753,84.369,67.69,76.857,65.746,82.412
F1-LOC,55.556667,53.142,63.095,53.142,48.73,58.73,51.111667,59.048571,49.334,58.33375


In [365]:
testByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,96.9731
P-O,95.5676
R-O,98.553
F1-MISC,48.598762
P-MISC,35.0
R-MISC,33.75
F1-PER,80.826247
P-PER,77.4185
R-PER,71.0866
F1-LOC,55.022365


In [366]:
testByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.207247
P-O,0.56009
R-O,0.247425
F1-MISC,6.471481
P-MISC,5.773503
R-MISC,5.559027
F1-PER,3.430513
P-PER,5.481203
R-PER,8.050892
F1-LOC,4.722188


In [367]:
ewoByTagResult.to_csv("results/ewo-by-tag-merge-{0}.csv".format(max_depth))

In [368]:
ewoByTagResult = pd.read_csv("results/ewo-by-tag-merge-{0}.csv".format(2), index_col=0)
ewoByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,17.38,,,,17.38
F1-MISC,46.15,46.15,46.15,46.15,43.6825,43.6825,46.15,46.15,46.15,43.591111
F1-O,94.431,95.075,94.737,94.9,94.964,95.227,94.918,94.868,95.157,95.299
F1-ORG,,,,,,,,,,
F1-PER,8.57625,47.702857,28.391429,32.1575,34.5975,39.691111,34.62875,31.67875,42.05,41.984444
P-LOC,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,60.0
P-MISC,70.0,60.0,60.0,70.0,65.0,65.0,60.0,60.0,80.0,72.5
P-O,89.54,91.146,90.138,90.5,90.709,91.122,90.538,90.384,91.105,91.426
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,53.638,51.554,57.193,63.381,66.738,77.824,66.391,68.142,66.391,75.073


In [369]:
ewoByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,17.38
F1-MISC,45.400611
F1-O,94.9576
F1-ORG,
F1-PER,34.145859
P-LOC,12.0
P-MISC,66.25
P-O,90.6608
P-ORG,0.0
P-PER,64.6325


In [370]:
ewoByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,1.206887
F1-O,0.254209
F1-ORG,
F1-PER,10.728242
P-LOC,25.298221
P-MISC,6.795628
P-O,0.565721
P-ORG,0.0
P-PER,8.529639


In [371]:
columns = en_fingerprints.columns

print("Pred", "Real", "Freq", "Word", sep="\t")
for c in columns:
    prediction = model.predict(en_fingerprints[c].values.reshape((1, 1, 210)))
    pred_tag = int2tag[np.argmax(prediction)]
    real_tag = en_corpus[en_corpus.word == c].iloc[0]['ne-tag']
    
    if pred_tag != real_tag:
        print(pred_tag, real_tag, en_fingerprints[c].max(), c, sep="\t")

Pred	Real	Freq	Word


ValueError: Error when checking input: expected hidden1_input to have 2 dimensions, but got array with shape (1, 1, 210)

In [None]:
en_corpus[en_corpus.word != "\n"].shape