In [159]:
# import
import keras
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.utils import np_utils, plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn import model_selection
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import h5py as h5py

In [160]:
BINARY = False
timestep = 1
epochs = 50
en_corpus_file = "corpus-en.txt"
ewo_corpus_file = "corpus-ewo.txt"
best_model_file = "best-model-conll.hdfs"
max_nb_of_phrases =  -1
duplication = 1
max_depth = 0
is_only_vocab = True
shuffle = is_only_vocab
h1_size = 160
h2_size = 160

In [161]:
def getTag(aString):
    tag = "O"
    if BINARY:
        if aString != "O":
            return "NE"
    else:
        tag = aString
    return tag
     

In [162]:
def load_corpus(file, max_nb_of_phrases):
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        prev_line = None
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break;

            l = line.strip()
            if len(l) == 0 and len(prev_line) != 0:
                nb_of_phrases += 1
                dataset["word"].append(line)
                dataset["ne-tag"].append(None)
            else:
                l = l.split("\t")
                dataset["word"].append(l[0])
                dataset["ne-tag"].append(ne_type(l[1]))
            prev_line = line.strip()
        
    return pd.DataFrame(dataset), nb_of_phrases+1

In [163]:
def corpus_fingerprint(aDataframe, nb_of_biphrases):
    # create distributionnal signature
    fingerprints = {}
    current_bi_phrase_index = 0
    nb_word_in_corpus = aDataframe[aDataframe.word != "\n"].word.size
    words_in_current_phrase = []
    for index, row in aDataframe.iterrows():
        if current_bi_phrase_index > nb_of_biphrases:
            break
            
        word = row['word']
        
        if word != "\n":
            words_in_current_phrase.append(word)
            if word not in fingerprints:
                fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.float32)
            fingerprints[word][current_bi_phrase_index] += 1
        else:
            nb_word_in_current_phrase = len(words_in_current_phrase)
#             for w in words_in_current_phrase:
#                 fingerprints[w][current_bi_phrase_index] = nb_word_in_corpus / fingerprints[w][current_bi_phrase_index]                
            current_bi_phrase_index += 1
            words_in_current_phrase = []
    for word in fingerprints:
        for i in range(nb_of_biphrases):
            if fingerprints[word][i] != 0:
                fingerprints[word][i] = nb_word_in_corpus / fingerprints[word][i]
#         fingerprints[word][nb_of_biphrases] = nb_word_in_corpus / aDataframe[aDataframe.word == word].word.size
        
    return pd.DataFrame(fingerprints)

In [164]:
def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    X = np.zeros((aDataframe.shape[0], fingerprintsDataFrame.shape[0]), dtype=np.int8)
    y = np.zeros(aDataframe.shape[0], dtype=np.int8)
    i = 0
    for row in aDataframe.iterrows():
        X[i] = fingerprintsDataFrame[row[1]['word']].values
        y[i] = tag2int[getTag(row[1]['ne-tag'])]
        i += 1
    return X, y

In [165]:
def train_test_split(X, y, test_size = 0.33):
    total = X.shape[0]
    train_length = round(total * (1 - test_size)) 
    return X[:train_length], X[train_length:], y[:train_length], y[train_length:]

In [166]:
  
def ne_type(aType):
    aType = aType.lower()
    if 'per' in aType:
        t =  'NE' if BINARY else 'PER' 
    elif 'loc' in aType:
        t =  'NE' if BINARY else 'LOC'
    elif 'org' in aType:
        t =  'NE' if BINARY else 'ORG'
    elif 'hour' in aType:
        t =  'NE' if BINARY else 'MISC'
    elif aType != 'o' and len(aType) > 0 :
        t =  'NE' if BINARY else 'MISC'
    else:
        t = 'O'
    return t

In [167]:
def compute_performance(y_true, y_pred, words=None, BINARY=False):
    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        p = precision_score(y_pred, y_true, average='macro')
        r = recall_score(y_pred, y_true, average='macro')
        f1 = f1_score(y_pred, y_true, average='macro')
        acc = accuracy_score(y_pred, y_true)
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})

    return p, r, f1, acc, model_output_vs

In [168]:
def P_R_F1(y_pred, y_true, neg_class):
    same = y_pred[y_true==y_pred]
    tp = same[same != neg_class].size
    nb_of_pos_exple = y_true[y_true != neg_class].size
    nb_of_pos_pred = y_pred[y_pred != neg_class].size
    p = r = f1 = 0
    try:
        p = np.round(tp*100/nb_of_pos_pred, 2)
    except ZeroDivisionError:
        print("number of correct positive predictions is 0")
        
    try:
        r = np.round(tp*100/nb_of_pos_exple, 2)
    except ZeroDivisionError:
        print("number of position exple is 0")
        
    try:
        f1 = np.round(2*r*p/(r+p), 2)
    except ZeroDivisionError:
        print("Recall and precision are 0")

    return p, r, f1

In [169]:
def is_mergeable(aListOfConsecutiveTokens, corpus, fingerprints):
    n = len(aListOfConsecutiveTokens)
    if n <= 1:
        return False
    if n == 2:
        w1, w2 = aListOfConsecutiveTokens[0], aListOfConsecutiveTokens[1]
        rep1, rep2 = fingerprints[aListOfConsecutiveTokens[0]], fingerprints[aListOfConsecutiveTokens[1]]
        tag1, tag2 = corpus[corpus.word==w1].iloc[0]['ne-tag'], corpus[corpus.word==w2].iloc[0]['ne-tag']
        if (tag1 == tag2) and (tag1 == "O"): # O + O => False
            return False
        if (tag1 != tag2) and (tag1 != "O") and (tag2 != "O"): # X + Y => False
            return False
        return rep1.equals(rep2)
    else:
        half = int(n / 2)
        return is_mergeable(aListOfConsecutiveTokens[0:half+1], corpus, fingerprints) and is_mergeable(aListOfConsecutiveTokens[half:n], corpus, fingerprints)

In [170]:
def merge(depth, corpus, fingerprint):
    wordDf = corpus[corpus.word != "\n"].word
    nbOfWord = wordDf.shape[0]
    text = list(wordDf)
    X2, target2, tokens = [], [], []
    level, newToken = 1, True
    while level <= depth and newToken:
        i, newToken = 0, False
        limit = nbOfWord - level
        while i < limit:
            if is_mergeable(text[i:i+level+1], corpus, fingerprint):
                tokens.append(" ".join(text[i:i+level+1]))
                newToken = True
            i += 1
        print("level ", level, ":", set(tokens))
        level += 1
    
    X2, target2 = np.array(X2), np.array(target2)
    
    return X2, target2, set(tokens)

In [171]:
def shuffle(X, y):
    indices = [i for i in  range(X.shape[0])]
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [172]:
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [173]:
tagSet = en_corpus["ne-tag"].dropna().unique()
if BINARY:
    tagSet = ['NE', 'O']
tag2int = {j: i for i, j in enumerate(tagSet)}
int2tag = {i: j for i, j in enumerate(tagSet)}
print(tag2int)

{'O': 0, 'MISC': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}


In [174]:
en_nb_of_phrases

210

In [175]:
en_corpus.describe()

Unnamed: 0,word,ne-tag
count,4962,4753
unique,913,5
top,",",O
freq,343,4362


In [176]:
en_corpus.head(10)

Unnamed: 0,word,ne-tag
0,The,O
1,Promise,O
2,of,O
3,the,O
4,Holy,MISC
5,Spirit,MISC
6,\n,
7,In,O
8,the,O
9,first,O


In [177]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 87.91 %
MISC % = 2.12 %
PER % = 4.94 %
LOC % = 0.81 %
ORG % = 0.02 %


In [178]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.27 %
MISC % = 1.86 %
PER % = 8.87 %
LOC % = 1.97 %
ORG % = 0.11 %


In [179]:
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [180]:
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [181]:
en_fingerprints.head(10)

Unnamed: 0,The,Promise,of,the,Holy,Spirit,In,first,book,",",...,considered,dream,She,save,fulfill,Immanuel,us),woke,sleep,knew
0,4753.0,4753.0,4753.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,4753.0,0.0,0.0,4753.0,4753.0,4753.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4753.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4753.0,2376.5,0.0,0.0,0.0,0.0,0.0,1188.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,4753.0,4753.0,4753.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4753.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,1584.333374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,4753.0,0.0,0.0,0.0,0.0,0.0,4753.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,4753.0,1584.333374,4753.0,4753.0,0.0,0.0,0.0,2376.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
en_fingerprints['you'].values.shape

(210,)

In [183]:
en_corpus[en_corpus.word != "\n"].shape

(4753, 2)

In [184]:
X2, target2, tokens = merge(max_depth, en_corpus, en_fingerprints)

In [185]:
if is_only_vocab:
    text = list(en_corpus[en_corpus.word != "\n"].word.unique())
else:
    text = list(en_corpus[en_corpus.word != "\n"].word)
en_vocab = pd.DataFrame({'text': text + list(tokens)})
en_vocab.describe()

Unnamed: 0,text
count,912
unique,912
top,On
freq,1


In [186]:
if is_only_vocab:
    X = np.zeros((en_vocab.shape[0] * duplication, en_nb_of_phrases))
    target = np.zeros((en_vocab.shape[0] * duplication))
    p=0
    for i, row in en_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X[p] = en_fingerprints[c.split(" ")[0]]
            target[p] = tag2int[getTag(en_corpus[en_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X, target = shuffle(X, target)
    print(X.shape, en_fingerprints.shape, target.shape)

(912, 210) (210, 912) (912,)


In [187]:
en_vocab[-20:]

Unnamed: 0,text
892,Eliud
893,Eleazar
894,Matthan
895,husband
896,fourteen
897,unwilling
898,shame
899,resolved
900,divorce
901,quietly


In [188]:
if not is_only_vocab:
    X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [189]:
print(X.shape, target.shape)
if len(X.shape) == len(X2.shape):
    X = np.concatenate((X, X2))
    target = np.concatenate((target, target2))
    if shuffle:
        X, target = shuffle(X, target)
print(X.shape, target.shape)

(912, 210) (912,)
(912, 210) (912,)


In [190]:
y = target.copy()
y[0:100]
if not BINARY:
    y = np_utils.to_categorical(y, len(tagSet))
y.shape

(912, 5)

In [191]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])
print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_val.shape =", X_val.shape)
print("y_val.shape =", y_val.shape)

tTarget = np.array([np.argmax(yy) for yy in y_train])
vTarget = np.array([np.argmax(yy) for yy in y_val])

for tag in tagSet:
    print("{0} % in training data = {1} %".format(tag, np.round(tTarget[tTarget==tag2int[tag]].size * 100 / tTarget.shape[0], 2)))
    print("{0} % in validation data = {1} %".format(tag, np.round(vTarget[vTarget==tag2int[tag]].size * 100 / vTarget.shape[0], 2)))

X_train.shape = (611, 210)
y_train.shape = (611, 5)
X_val.shape = (301, 210)
y_val.shape = (301, 5)
O % in training data = 87.89 %
O % in validation data = 89.37 %
MISC % in training data = 1.31 %
MISC % in validation data = 1.0 %
PER % in training data = 8.67 %
PER % in validation data = 7.97 %
LOC % in training data = 1.96 %
LOC % in validation data = 1.66 %
ORG % in training data = 0.16 %
ORG % in validation data = 0.0 %


In [192]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(h1_size, input_dim=input_dim, activation='sigmoid', name="hidden1"))
    model.add(Dense(h2_size, activation='sigmoid', name="hidden2"))
    if BINARY:
        model.add(Dense(1, activation='sigmoid', name="outputlayer"))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
    else:
        model.add(Dense(output_dim, activation='softmax', name="outputlayer"))
        model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
    model.summary()
    return model

In [193]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=epochs):
    early_stop = EarlyStopping(patience=2, verbose=2) # stop learning if the error is the same between two consecutive epochs
    best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=1) # saved best model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, verbose=0, shuffle=shuffle, callbacks=[best_model_cp, early_stop])
    best_model = keras.models.load_model(best_model_file) #loading the best model
    return best_model

In [194]:
def predict(model, X, y, binary=BINARY):
    if BINARY:
        y_pred = np.round(model.predict(X))
        y_true = y
    else:
        predictions = model.predict(X)
        y_pred = np.array([np.argmax(p) for p in predictions])
        y_true = np.array([np.argmax(t) for t in y ])
    return y_true, y_pred

In [195]:
def model_performance(y_true, y_pred):
    return P_R_F1(y_pred, y_true, tag2int['O']) #precision, recall, f1-score

In [196]:
def model_performace_by_tag(y_true, y_pred, tag):
    p, r, f1 = 0, 0, 0
    
    eq = y_pred[y_pred==y_true]
    correctly_pred = eq[eq==tag].size
    try:
        p = np.round(100 * correctly_pred / y_pred[y_pred==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        r = np.round(100 * correctly_pred / y_true[y_true==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        f1 = np.round(2 * r * p / (r + p), 2)
    except ZeroDivisionError:
        pass
    
    return p, r, f1

In [197]:
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [198]:
ewo_nb_of_phrases

210

In [199]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 86.37 %
MISC % = 2.18 %
PER % = 5.76 %
LOC % = 0.89 %
ORG % = 0.05 %


In [200]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 90.0 %
MISC % = 1.17 %
PER % = 8.25 %
LOC % = 1.84 %
ORG % = 0.19 %


In [201]:
ewo_corpus.describe()

Unnamed: 0,word,ne-tag
count,4394,4185
unique,1030,5
top,",",O
freq,413,3795


In [202]:
ewo_corpus.head()

Unnamed: 0,word,ne-tag
0,Mfufub,MISC
1,Nsisim,MISC
2,ayi,O
3,sò,O
4,\n,


In [203]:
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [204]:
X2, target2, tokens = merge(max_depth, ewo_corpus, ewo_fingerprints)

In [205]:
if is_only_vocab:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word.unique())
else:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word)
ewo_vocab = pd.DataFrame({"text":text + list(tokens)})

In [206]:
if is_only_vocab:
    X_ewo = np.zeros((ewo_vocab.shape[0] * duplication, en_nb_of_phrases))
    ewo_target = np.zeros((ewo_vocab.shape[0] * duplication))
    p=0
    for i, row in ewo_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X_ewo[p] = ewo_fingerprints[c.split(" ")[0]]
            ewo_target[p] = tag2int[getTag(ewo_corpus[ewo_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)

In [207]:
ewo_vocab[-20:]

Unnamed: 0,text
1009,nlo
1010,obë
1011,mbara
1012,yabyali
1013,dzili
1014,yasò
1015,oyolëge
1016,kode
1017,dili
1018,atoban


In [208]:
if not is_only_vocab:
    X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [209]:
print(X_ewo.shape, ewo_target.shape)
if len(X_ewo.shape) == len(X2.shape):
    X_ewo = np.concatenate((X_ewo, X2))
    ewo_target = np.concatenate((ewo_target, target2))
    if shuffle:
        X_ewo, ewo_target = shuffle(X_ewo, ewo_target)
print(X_ewo.shape, ewo_target.shape)

(1029, 210) (1029,)
(1029, 210) (1029,)


In [210]:
y_ewo = ewo_target.copy()
print(y_ewo.shape, len(ewo_vocab))

(1029,) 1029


In [211]:
X_ewo.shape

(1029, 210)

In [212]:
y_ewo = ewo_target.copy()
y_ewo[:20]
if not BINARY:
    y_ewo = np_utils.to_categorical(y_ewo)

In [213]:
X_ewo = X_ewo.reshape((X_ewo.shape[0], en_nb_of_phrases))

In [214]:
def algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, epochs=epochs, model=None):
    test_precision, train_precision, ewo_precision = [], [], []
    test_recall, train_recall, ewo_recall = [], [], []
    test_fscore, train_fscore, ewo_fscore = [], [], []
    
    test_result_by_tag = {}
    train_result_by_tag = {}
    ewo_result_by_tag = {}
    for t in tagSet:
        f1_key = "F1-"+t
        p_key = "P-"+t
        r_key = "R-"+t
        train_result_by_tag[f1_key], train_result_by_tag[p_key], train_result_by_tag[r_key] = [], [], []
        test_result_by_tag[f1_key], test_result_by_tag[p_key], test_result_by_tag[r_key] = [], [], []
        ewo_result_by_tag[f1_key], ewo_result_by_tag[p_key], ewo_result_by_tag[r_key] = [], [], []

    m = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs)
        
    y_true, y_pred = predict(m, X_train, y_train)
    p_train, r_train, f1_train = model_performance(y_true, y_pred)
        
    y_true_val, y_pred_val = predict(m, X_val, y_val)
    p_val, r_val, f1_val = model_performance(y_true_val, y_pred_val)
        
    y_true_ewo, y_pred_ewo = predict(m, X_ewo, y_ewo) 
    p_ewo, r_ewo, f1_ewo = model_performance(y_true_ewo, y_pred_ewo)
        
    for t in range(len(int2tag)):
        f1_key = "F1-" + int2tag[t]
        p_key = "P-" + int2tag[t]
        r_key = "R-" + int2tag[t]
            
        p, r, f1 = model_performace_by_tag(y_true, y_pred, t)
        train_result_by_tag[p_key].append(p)
        train_result_by_tag[r_key].append(r)
        train_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_val, y_pred_val, t)
        test_result_by_tag[p_key].append(p)
        test_result_by_tag[r_key].append(r)
        test_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_ewo, y_pred_ewo, t)
        ewo_result_by_tag[p_key].append(p)
        ewo_result_by_tag[r_key].append(r)
        ewo_result_by_tag[f1_key].append(f1)
                
    test_precision.append(p_val)
    train_precision.append(p_train)
    ewo_precision.append(p_ewo)
        
    test_recall.append(r_val)
    train_recall.append(r_train)
    ewo_recall.append(r_ewo)
        
    test_fscore.append(f1_val)
    train_fscore.append(f1_train)
    ewo_fscore.append(f1_ewo)
    return pd.DataFrame({
        'P_test': test_precision, 
        'P_train': train_precision, 
        'P_ewo': ewo_precision, 'R_test': test_recall, 'R_train': train_recall, 
        'R_ewo': ewo_recall, 'F1-test': test_fscore, 'F1-train': train_fscore, 'F1-ewo': ewo_fscore}), pd.DataFrame(train_result_by_tag), pd.DataFrame(test_result_by_tag), pd.DataFrame(ewo_result_by_tag)

In [215]:
# model = create_model(X.shape[1], len(tagSet))
# resultEval, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)

In [216]:
# resultEval

In [217]:
# train_by_tag

In [218]:
# test_by_tag

In [219]:
# ewo_by_tag

In [220]:
# resultEval.mean()

In [221]:
# resultEval.std()

In [222]:
def algoCrossVal(X, y, X_ewo, y_ewo, k = 10, repeat=1): 
    block_size = int(X.shape[0] / k)   
    output = None
    model = None
    train_by_tags, test_by_tags, ewo_by_tags = None, None, None
    for it in range(repeat):
        print("AlgoCrossValIter -", it+1)
        model = create_model(X.shape[1], len(tagSet))
        results = None
        train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = None, None, None
        for i in range(k):
            X_val, y_val = X[i*block_size:i*block_size+block_size], y[i*block_size:i*block_size+block_size]
            X_train = np.concatenate((X[0:i*block_size], X[i*block_size+block_size:]))
            y_train = np.concatenate((y[0:i*block_size], y[i*block_size+block_size:]))

            X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
            X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])

            result, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)
            if results is None:
                results = result.copy()
                train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = train_by_tag.copy(), test_by_tag.copy(), ewo_by_tag.copy()
            else:
                results = pd.concat([results, result], ignore_index=True)
                train_by_tagsTmp = pd.concat([train_by_tagsTmp, train_by_tag], ignore_index=True)
                test_by_tagsTmp = pd.concat([test_by_tagsTmp, test_by_tag], ignore_index=True)
                ewo_by_tagsTmp = pd.concat([ewo_by_tagsTmp, ewo_by_tag], ignore_index=True)
        
        if output is None:
            output = results.mean(axis=0).to_frame()
            train_by_tags = train_by_tagsTmp.mean(axis=0).to_frame()
            test_by_tags = test_by_tagsTmp.mean(axis=0).to_frame()
            ewo_by_tags = ewo_by_tagsTmp.mean(axis=0).to_frame()
        else:
            output = pd.concat([output, results.mean(axis=0).to_frame()], axis=1)
            train_by_tags = pd.concat([train_by_tags, train_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            test_by_tags = pd.concat([test_by_tags, test_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            ewo_by_tags = pd.concat([ewo_by_tags, ewo_by_tagsTmp.mean(axis=0).to_frame()], axis=1)

    return output, train_by_tags, test_by_tags, ewo_by_tags, model

In [223]:
resultCrossVal, trainByTagResult, testByTagResult, ewoByTagResult, model = algoCrossVal(X, y, X_ewo, y_ewo, repeat=10)

AlgoCrossValIter - 1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 160)               33760     
_________________________________________________________________
hidden2 (Dense)              (None, 160)               25760     
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 805       
Total params: 60,325
Trainable params: 60,325
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.50790, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.50790 to 0.50294, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.50294

Epoch 00004: val_loss did not improve from 0.50294
Epoch 00004: early stopping
number of correct positive predictions is 0
number of correct p




Epoch 00001: val_loss improved from inf to 0.38880, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.38880 to 0.37030, saving model to best-model-conll.hdfs

Epoch 00003: val_loss improved from 0.37030 to 0.36008, saving model to best-model-conll.hdfs

Epoch 00004: val_loss did not improve from 0.36008

Epoch 00005: val_loss improved from 0.36008 to 0.35451, saving model to best-model-conll.hdfs

Epoch 00006: val_loss improved from 0.35451 to 0.34547, saving model to best-model-conll.hdfs

Epoch 00007: val_loss did not improve from 0.34547

Epoch 00008: val_loss did not improve from 0.34547
Epoch 00008: early stopping

Epoch 00001: val_loss improved from inf to 0.18346, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.18346 to 0.18335, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.18335

Epoch 00004: val_loss did not improve from 0.18335
Epoch 00004: early stopping

Epoch 00001: val_loss impr


Epoch 00003: val_loss improved from 0.22146 to 0.21627, saving model to best-model-conll.hdfs

Epoch 00004: val_loss did not improve from 0.21627

Epoch 00005: val_loss did not improve from 0.21627
Epoch 00005: early stopping

Epoch 00001: val_loss improved from inf to 0.28636, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.28636

Epoch 00003: val_loss did not improve from 0.28636
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.17655, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.17655

Epoch 00003: val_loss did not improve from 0.17655
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.16810, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.16810 to 0.16660, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.16660

Epoch 00004: val_loss did not improve from 0.16660
Epoch 00004: early stopping

Epoch 00


Epoch 00003: val_loss did not improve from 0.18607
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.21320, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.21320

Epoch 00003: val_loss did not improve from 0.21320
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.13320, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13320

Epoch 00003: val_loss did not improve from 0.13320
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.15737, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.15737

Epoch 00003: val_loss did not improve from 0.15737
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.13343, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.13343 to 0.08340, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.08340

Epoch 00004:


Epoch 00005: val_loss did not improve from 0.31047

Epoch 00006: val_loss did not improve from 0.31047
Epoch 00006: early stopping

Epoch 00001: val_loss improved from inf to 0.18012, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.18012

Epoch 00003: val_loss did not improve from 0.18012
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.15341, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.15341

Epoch 00003: val_loss did not improve from 0.15341
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.13160, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.13160 to 0.11153, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.11153

Epoch 00004: val_loss did not improve from 0.11153
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.07007, saving model to best-model-conll.hdfs

Epoch 00002:


Epoch 00003: val_loss did not improve from 0.24952

Epoch 00004: val_loss did not improve from 0.24952
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.13361, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13361

Epoch 00003: val_loss did not improve from 0.13361
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.15976, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.15976

Epoch 00003: val_loss did not improve from 0.15976
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10535, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.10535 to 0.09961, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.09961

Epoch 00004: val_loss did not improve from 0.09961
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.05083, saving model to best-model-conll.hdfs

Epoch 00002:

In [224]:
resultCrossVal.to_csv("results/merge-{0}.csv".format(max_depth))
resultCrossVal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
P_test,78.615,74.305,80.112,74.015,77.75,76.786,79.073,81.755,79.722,83.083
P_train,77.321,76.469,75.749,77.706,79.134,76.74,86.941,77.839,76.793,78.524
P_ewo,68.745,66.871,62.206,68.51,67.707,61.757,74.126,66.925,63.661,66.993
R_test,54.849,54.718,52.624,47.076,56.155,55.509,54.537,59.686,55.161,56.021
R_train,52.484,52.739,52.558,47.18,54.604,56.447,54.541,56.907,55.879,51.162
R_ewo,42.222,44.445,42.778,40.001,46.757,46.295,44.076,47.871,44.352,41.574
F1-test,70.14,68.232222,66.125556,68.59,70.871111,70.093333,70.132222,74.653333,69.376667,72.23
F1-train,68.885556,67.994444,66.471111,62.838889,70.822222,71.963333,64.643,72.475556,70.626667,68.094444
F1-ewo,57.512222,58.162222,54.398889,52.757778,60.721111,58.172222,53.923,61.476667,57.015556,56.408889


In [225]:
resultCrossVal.mean(axis=1).to_frame()

Unnamed: 0,0
P_test,78.5216
P_train,78.3216
P_ewo,66.7501
R_test,54.6336
R_train,53.4501
R_ewo,44.0371
F1-test,70.044444
F1-train,68.481522
F1-ewo,57.054856


In [226]:
resultCrossVal.std(axis=1).to_frame()

Unnamed: 0,0
P_test,2.929565
P_train,3.188461
P_ewo,3.615139
R_test,3.197732
R_train,2.908552
R_ewo,2.465286
F1-test,2.307579
F1-train,3.150334
F1-ewo,2.817094


In [227]:
trainByTagResult.to_csv("results/train-by-tag-merge-{0}.csv".format(max_depth))
trainByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,96.443,96.421,96.425,96.223,96.636,96.575,96.597,96.688,96.6,96.42
P-O,94.143,94.23,94.212,93.565,94.421,94.587,94.374,94.68,94.569,93.986
R-O,98.897,98.775,98.815,99.091,99.008,98.691,98.965,98.829,98.773,99.022
F1-MISC,64.065,64.081111,64.065,57.372857,62.53,68.413333,61.81875,61.937778,62.375,61.43125
P-MISC,80.0,88.75,80.0,70.0,90.0,88.75,78.75,87.5,80.0,80.0
R-MISC,38.242,44.061,38.242,28.576,41.354,47.487,37.041,42.061,36.444,36.132
F1-PER,74.954444,73.555556,71.955556,69.9,76.171111,76.974444,71.051,79.104444,76.863333,74.494444
P-PER,78.192,75.099,76.575,76.587,78.411,78.082,87.232,77.481,75.766,77.871
R-PER,59.923,61.033,59.642,56.65,62.337,62.696,63.132,66.521,65.287,59.619
F1-LOC,48.67125,53.291667,54.604286,45.708333,57.795714,49.677778,49.11375,50.412857,49.96125,48.231429


In [228]:
trainByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,96.5028
P-O,94.2767
R-O,98.8866
F1-MISC,62.809008
P-MISC,82.375
R-MISC,38.964
F1-PER,74.502433
P-PER,78.1296
R-PER,61.684
F1-LOC,50.746831


In [229]:
trainByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.140254
P-O,0.331651
R-O,0.130456
F1-MISC,2.781637
P-MISC,6.276333
R-MISC,5.174146
F1-PER,2.912235
P-PER,3.382867
R-PER,2.929268
F1-LOC,3.521413


In [230]:
testByTagResult.to_csv("results/test-by-tag-merge-{0}.csv".format(max_depth))
testByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,96.416,96.48,96.375,96.22,96.477,96.539,96.359,96.91,96.494,96.663
P-O,94.164,94.518,93.995,93.369,94.396,94.389,94.144,94.883,94.207,94.295
R-O,98.865,98.628,98.997,99.374,98.746,98.868,98.748,99.127,99.011,99.234
F1-MISC,61.111667,61.111667,61.111667,53.334,61.111667,61.111667,61.111667,61.111667,61.111667,61.111667
P-MISC,40.0,40.0,40.0,30.0,40.0,40.0,40.0,40.0,40.0,40.0
R-MISC,35.0,35.0,35.0,25.0,35.0,35.0,35.0,35.0,35.0,35.0
F1-PER,77.322222,75.281111,72.244444,73.7675,78.635556,75.546667,78.3,79.201111,75.668889,76.14
P-PER,81.253,74.46,81.293,73.432,79.808,82.083,80.499,80.979,78.654,82.166
R-PER,62.174,63.508,58.94,53.035,64.62,60.479,63.746,65.953,62.73,60.735
F1-LOC,44.445,39.334,44.445,52.223333,43.89,45.782857,42.668,54.762857,53.334,61.111667


In [231]:
testByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,96.4933
P-O,94.236
R-O,98.9598
F1-MISC,60.3339
P-MISC,39.0
R-MISC,34.0
F1-PER,76.21075
P-PER,79.4627
R-PER,61.592
F1-LOC,48.199671


In [232]:
testByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.187874
P-O,0.390851
R-O,0.234943
F1-MISC,2.459514
P-MISC,3.162278
R-MISC,3.162278
F1-PER,2.204218
P-PER,3.096084
R-PER,3.65696
F1-LOC,6.7836


In [233]:
ewoByTagResult.to_csv("results/ewo-by-tag-merge-{0}.csv".format(max_depth))

In [234]:
ewoByTagResult = pd.read_csv("results/ewo-by-tag-merge-{0}.csv".format(2), index_col=0)
ewoByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,17.38,,,,17.38
F1-MISC,46.15,46.15,46.15,46.15,43.6825,43.6825,46.15,46.15,46.15,43.591111
F1-O,94.431,95.075,94.737,94.9,94.964,95.227,94.918,94.868,95.157,95.299
F1-ORG,,,,,,,,,,
F1-PER,8.57625,47.702857,28.391429,32.1575,34.5975,39.691111,34.62875,31.67875,42.05,41.984444
P-LOC,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,60.0
P-MISC,70.0,60.0,60.0,70.0,65.0,65.0,60.0,60.0,80.0,72.5
P-O,89.54,91.146,90.138,90.5,90.709,91.122,90.538,90.384,91.105,91.426
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,53.638,51.554,57.193,63.381,66.738,77.824,66.391,68.142,66.391,75.073


In [235]:
ewoByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,17.38
F1-MISC,45.400611
F1-O,94.9576
F1-ORG,
F1-PER,34.145859
P-LOC,12.0
P-MISC,66.25
P-O,90.6608
P-ORG,0.0
P-PER,64.6325


In [236]:
ewoByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,1.206887
F1-O,0.254209
F1-ORG,
F1-PER,10.728242
P-LOC,25.298221
P-MISC,6.795628
P-O,0.565721
P-ORG,0.0
P-PER,8.529639


In [237]:
columns = en_fingerprints.columns

print("Pred", "Real", "Freq", "Word", sep="\t")
for c in columns:
    prediction = model.predict(en_fingerprints[c].values.reshape((1, 1, 210)))
    pred_tag = int2tag[np.argmax(prediction)]
    real_tag = en_corpus[en_corpus.word == c].iloc[0]['ne-tag']
    
    if pred_tag != real_tag:
        print(pred_tag, real_tag, en_fingerprints[c].max(), c, sep="\t")

Pred	Real	Freq	Word


ValueError: Error when checking input: expected hidden1_input to have 2 dimensions, but got array with shape (1, 1, 210)

In [None]:
en_corpus[en_corpus.word != "\n"].shape