In [1]:
# import
import keras
import sys
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.utils import np_utils, plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn import model_selection
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import h5py as h5py

Using TensorFlow backend.


In [2]:
BINARY = False
timestep = 1
epochs = 50
en_corpus_file = "corpus-en.txt"
ewo_corpus_file = "corpus-ewo.txt"
best_model_file = "best-model-conll.hdfs"
max_nb_of_phrases =  -1
duplication = 1
max_depth = 0
is_only_vocab = True
shuffle = is_only_vocab
h1_size = 2000
h2_size = 160
h3_size = 40

In [3]:
def getTag(aString):
    tag = "O"
    if BINARY:
        if aString != "O":
            return "NE"
    else:
        tag = aString
    return tag
     

In [4]:
def load_corpus(file, max_nb_of_phrases):
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        prev_line = None
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break;

            l = line.strip()
            if len(l) == 0 and len(prev_line) != 0:
                nb_of_phrases += 1
                dataset["word"].append(line)
                dataset["ne-tag"].append(None)
            else:
                l = l.split("\t")
                if l[0] not in string.punctuation:
                    dataset["word"].append(l[0])
                    dataset["ne-tag"].append(ne_type(l[1]))
            prev_line = line.strip()
        
    return pd.DataFrame(dataset), nb_of_phrases+1

In [5]:
def corpus_fingerprint(aDataframe, nb_of_biphrases):
    # create distributionnal signature
    fingerprints = {}
    current_bi_phrase_index = 0
    nb_word_in_corpus = aDataframe[aDataframe.word != "\n"].word.size
    words_in_current_phrase = []
    for index, row in aDataframe.iterrows():
        if current_bi_phrase_index > nb_of_biphrases:
            break
            
        word = row['word']
        
        if word != "\n":
            words_in_current_phrase.append(word)
            if word not in fingerprints:
                fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.float32)
            fingerprints[word][current_bi_phrase_index] += 1
        else:
            nb_word_in_current_phrase = len(words_in_current_phrase)
#             for w in words_in_current_phrase:
#                 fingerprints[w][current_bi_phrase_index] = nb_word_in_corpus / fingerprints[w][current_bi_phrase_index]                
            current_bi_phrase_index += 1
            words_in_current_phrase = []
    for word in fingerprints:
        for i in range(nb_of_biphrases):
            if fingerprints[word][i] != 0:
                fingerprints[word][i] = nb_word_in_corpus / fingerprints[word][i]
#         fingerprints[word][nb_of_biphrases] = nb_word_in_corpus / aDataframe[aDataframe.word == word].word.size
        
    return pd.DataFrame(fingerprints)

In [6]:
def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    X = np.zeros((aDataframe.shape[0], fingerprintsDataFrame.shape[0]), dtype=np.int8)
    y = np.zeros(aDataframe.shape[0], dtype=np.int8)
    i = 0
    for row in aDataframe.iterrows():
        X[i] = fingerprintsDataFrame[row[1]['word']].values
        y[i] = tag2int[getTag(row[1]['ne-tag'])]
        i += 1
    return X, y

In [7]:
def train_test_split(X, y, test_size = 0.33):
    total = X.shape[0]
    train_length = round(total * (1 - test_size)) 
    return X[:train_length], X[train_length:], y[:train_length], y[train_length:]

In [8]:
  
def ne_type(aType):
    aType = aType.lower()
    if 'per' in aType:
        t =  'NE' if BINARY else 'PER' 
    elif 'loc' in aType:
        t =  'NE' if BINARY else 'LOC'
    elif 'org' in aType:
        t =  'NE' if BINARY else 'ORG'
    elif 'hour' in aType:
        t =  'NE' if BINARY else 'MISC'
    elif aType != 'o' and len(aType) > 0 :
        t =  'NE' if BINARY else 'MISC'
    else:
        t = 'O'
    return t

In [9]:
def compute_performance(y_true, y_pred, words=None, BINARY=False):
    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        p = precision_score(y_pred, y_true, average='macro')
        r = recall_score(y_pred, y_true, average='macro')
        f1 = f1_score(y_pred, y_true, average='macro')
        acc = accuracy_score(y_pred, y_true)
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})

    return p, r, f1, acc, model_output_vs

In [10]:
def P_R_F1(y_pred, y_true, neg_class):
    same = y_pred[y_true==y_pred]
    tp = same[same != neg_class].size
    nb_of_pos_exple = y_true[y_true != neg_class].size
    nb_of_pos_pred = y_pred[y_pred != neg_class].size
    p = r = f1 = 0
    try:
        p = np.round(tp*100/nb_of_pos_pred, 2)
    except ZeroDivisionError:
        print("number of correct positive predictions is 0")
        
    try:
        r = np.round(tp*100/nb_of_pos_exple, 2)
    except ZeroDivisionError:
        print("number of position exple is 0")
        
    try:
        f1 = np.round(2*r*p/(r+p), 2)
    except ZeroDivisionError:
        print("Recall and precision are 0")

    return p, r, f1

In [11]:
def is_mergeable(aListOfConsecutiveTokens, corpus, fingerprints):
    n = len(aListOfConsecutiveTokens)
    if n <= 1:
        return False
    if n == 2:
        w1, w2 = aListOfConsecutiveTokens[0], aListOfConsecutiveTokens[1]
        rep1, rep2 = fingerprints[aListOfConsecutiveTokens[0]], fingerprints[aListOfConsecutiveTokens[1]]
        tag1, tag2 = corpus[corpus.word==w1].iloc[0]['ne-tag'], corpus[corpus.word==w2].iloc[0]['ne-tag']
        if (tag1 == tag2) and (tag1 == "O"): # O + O => False
            return False
        if (tag1 != tag2) and (tag1 != "O") and (tag2 != "O"): # X + Y => False
            return False
        return rep1.equals(rep2)
    else:
        half = int(n / 2)
        return is_mergeable(aListOfConsecutiveTokens[0:half+1], corpus, fingerprints) and is_mergeable(aListOfConsecutiveTokens[half:n], corpus, fingerprints)

In [12]:
def merge(depth, corpus, fingerprint):
    wordDf = corpus[corpus.word != "\n"].word
    nbOfWord = wordDf.shape[0]
    text = list(wordDf)
    X2, target2, tokens = [], [], []
    level, newToken = 1, True
    while level <= depth and newToken:
        i, newToken = 0, False
        limit = nbOfWord - level
        while i < limit:
            if is_mergeable(text[i:i+level+1], corpus, fingerprint):
                tokens.append(" ".join(text[i:i+level+1]))
                newToken = True
            i += 1
        print("level ", level, ":", set(tokens))
        level += 1
    
    X2, target2 = np.array(X2), np.array(target2)
    
    return X2, target2, set(tokens)

In [13]:
def shuffle(X, y):
    indices = [i for i in  range(X.shape[0])]
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [14]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(h1_size, input_dim=input_dim, activation='sigmoid', name="hidden1"))
#     model.add(Dense(h2_size, activation='sigmoid', name="hidden2"))
#     model.add(Dense(h3_size, activation='sigmoid', name="hidden3"))
    if BINARY:
        model.add(Dense(1, activation='sigmoid', name="outputlayer"))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
    else:
        model.add(Dense(output_dim, activation='softmax', name="outputlayer"))
        model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
    model.summary()
    return model

In [15]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=epochs):
    early_stop = EarlyStopping(patience=2, verbose=2) # stop learning if the error is the same between two consecutive epochs
    best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=1) # saved best model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, verbose=0, shuffle=shuffle, callbacks=[best_model_cp, early_stop])
    best_model = keras.models.load_model(best_model_file) #loading the best model
    return best_model

In [16]:
def predict(model, X, y, binary=BINARY):
    if BINARY:
        y_pred = np.round(model.predict(X))
        y_true = y
    else:
        predictions = model.predict(X)
        y_pred = np.array([np.argmax(p) for p in predictions])
        y_true = np.array([np.argmax(t) for t in y ])
    return y_true, y_pred

In [17]:
def model_performance(y_true, y_pred):
    return P_R_F1(y_pred, y_true, tag2int['O']) #precision, recall, f1-score

In [18]:
def model_performace_by_tag(y_true, y_pred, tag):
    p, r, f1 = 0, 0, 0
    
    eq = y_pred[y_pred==y_true]
    correctly_pred = eq[eq==tag].size
    try:
        p = np.round(100 * correctly_pred / y_pred[y_pred==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        r = np.round(100 * correctly_pred / y_true[y_true==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        f1 = np.round(2 * r * p / (r + p), 2)
    except ZeroDivisionError:
        pass
    
    return p, r, f1

In [19]:
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [20]:
tagSet = en_corpus["ne-tag"].dropna().unique()
if BINARY:
    tagSet = ['NE', 'O']
tag2int = {j: i for i, j in enumerate(tagSet)}
int2tag = {i: j for i, j in enumerate(tagSet)}
print(tag2int)

{'O': 0, 'MISC': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}


In [21]:
en_nb_of_phrases

210

In [22]:
en_corpus.describe()

Unnamed: 0,word,ne-tag
count,4379,4170
unique,904,5
top,the,O
freq,313,3779


In [23]:
en_corpus.head(10)

Unnamed: 0,word,ne-tag
0,The,O
1,Promise,O
2,of,O
3,the,O
4,Holy,MISC
5,Spirit,MISC
6,\n,
7,In,O
8,the,O
9,first,O


In [24]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 86.3 %
MISC % = 2.4 %
PER % = 5.59 %
LOC % = 0.91 %
ORG % = 0.02 %


In [25]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.16 %
MISC % = 1.88 %
PER % = 8.96 %
LOC % = 1.99 %
ORG % = 0.11 %


In [26]:
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [27]:
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [28]:
en_fingerprints.head(10)

Unnamed: 0,The,Promise,of,the,Holy,Spirit,In,first,book,O,...,considered,dream,She,save,fulfill,Immanuel,us),woke,sleep,knew
0,4170.0,4170.0,4170.0,4170.0,4170.0,4170.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,4170.0,0.0,0.0,4170.0,4170.0,4170.0,4170.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1390.0,4170.0,4170.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4170.0,4170.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4170.0,2085.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,4170.0,4170.0,4170.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,4170.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,4170.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,4170.0,1390.0,4170.0,4170.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
en_fingerprints['you'].values.shape

(210,)

In [30]:
en_corpus[en_corpus.word != "\n"].shape

(4170, 2)

In [31]:
X2, target2, tokens = merge(max_depth, en_corpus, en_fingerprints)

In [32]:
if is_only_vocab:
    text = list(en_corpus[en_corpus.word != "\n"].word.unique())
else:
    text = list(en_corpus[en_corpus.word != "\n"].word)
en_vocab = pd.DataFrame({'text': text + list(tokens)})
en_vocab.describe()

Unnamed: 0,text
count,903
unique,903
top,hearts
freq,1


In [33]:
if is_only_vocab:
    X = np.zeros((en_vocab.shape[0] * duplication, en_nb_of_phrases))
    target = np.zeros((en_vocab.shape[0] * duplication))
    p=0
    for i, row in en_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X[p] = en_fingerprints[c.split(" ")[0]]
            target[p] = tag2int[getTag(en_corpus[en_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X, target = shuffle(X, target)
    print(X.shape, en_fingerprints.shape, target.shape)

(903, 210) (210, 903) (903,)


In [34]:
en_vocab[-20:]

Unnamed: 0,text
883,Eliud
884,Eleazar
885,Matthan
886,husband
887,fourteen
888,unwilling
889,shame
890,resolved
891,divorce
892,quietly


In [35]:
if not is_only_vocab:
    X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [36]:
print(X.shape, target.shape)
if len(X.shape) == len(X2.shape):
    X = np.concatenate((X, X2))
    target = np.concatenate((target, target2))
    if shuffle:
        X, target = shuffle(X, target)
print(X.shape, target.shape)

(903, 210) (903,)
(903, 210) (903,)


In [37]:
y = target.copy()
y[0:100]
if not BINARY:
    y = np_utils.to_categorical(y, len(tagSet))
y.shape

(903, 5)

In [38]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])
print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_val.shape =", X_val.shape)
print("y_val.shape =", y_val.shape)

tTarget = np.array([np.argmax(yy) for yy in y_train])
vTarget = np.array([np.argmax(yy) for yy in y_val])

for tag in tagSet:
    print("{0} % in training data = {1} %".format(tag, np.round(tTarget[tTarget==tag2int[tag]].size * 100 / tTarget.shape[0], 2)))
    print("{0} % in validation data = {1} %".format(tag, np.round(vTarget[vTarget==tag2int[tag]].size * 100 / vTarget.shape[0], 2)))

X_train.shape = (605, 210)
y_train.shape = (605, 5)
X_val.shape = (298, 210)
y_val.shape = (298, 5)
O % in training data = 87.93 %
O % in validation data = 88.93 %
MISC % in training data = 1.49 %
MISC % in validation data = 0.67 %
PER % in training data = 8.43 %
PER % in validation data = 8.72 %
LOC % in training data = 1.98 %
LOC % in validation data = 1.68 %
ORG % in training data = 0.17 %
ORG % in validation data = 0.0 %


In [39]:
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [40]:
ewo_nb_of_phrases

210

In [41]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 84.15 %
MISC % = 2.54 %
PER % = 6.69 %
LOC % = 1.03 %
ORG % = 0.05 %


In [42]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.94 %
MISC % = 1.17 %
PER % = 8.3 %
LOC % = 1.86 %
ORG % = 0.2 %


In [43]:
ewo_corpus.describe()

Unnamed: 0,word,ne-tag
count,3779,3570
unique,1024,5
top,\n,O
freq,209,3180


In [44]:
ewo_corpus.head()

Unnamed: 0,word,ne-tag
0,Mfufub,MISC
1,Nsisim,MISC
2,ayi,O
3,sò,O
4,\n,


In [45]:
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [46]:
X2, target2, tokens = merge(max_depth, ewo_corpus, ewo_fingerprints)

In [47]:
if is_only_vocab:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word.unique())
else:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word)
ewo_vocab = pd.DataFrame({"text":text + list(tokens)})

In [48]:
if is_only_vocab:
    X_ewo = np.zeros((ewo_vocab.shape[0] * duplication, en_nb_of_phrases))
    ewo_target = np.zeros((ewo_vocab.shape[0] * duplication))
    p=0
    for i, row in ewo_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X_ewo[p] = ewo_fingerprints[c.split(" ")[0]]
            ewo_target[p] = tag2int[getTag(ewo_corpus[ewo_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)

In [49]:
ewo_vocab[-20:]

Unnamed: 0,text
1003,nlo
1004,obë
1005,mbara
1006,yabyali
1007,dzili
1008,yasò
1009,oyolëge
1010,kode
1011,dili
1012,atoban


In [50]:
if not is_only_vocab:
    X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [51]:
print(X_ewo.shape, ewo_target.shape)
if len(X_ewo.shape) == len(X2.shape):
    X_ewo = np.concatenate((X_ewo, X2))
    ewo_target = np.concatenate((ewo_target, target2))
    if shuffle:
        X_ewo, ewo_target = shuffle(X_ewo, ewo_target)
print(X_ewo.shape, ewo_target.shape)

(1023, 210) (1023,)
(1023, 210) (1023,)


In [52]:
y_ewo = ewo_target.copy()
print(y_ewo.shape, len(ewo_vocab))

(1023,) 1023


In [53]:
X_ewo.shape

(1023, 210)

In [54]:
y_ewo = ewo_target.copy()
y_ewo[:20]
if not BINARY:
    y_ewo = np_utils.to_categorical(y_ewo)

In [55]:
X_ewo = X_ewo.reshape((X_ewo.shape[0], en_nb_of_phrases))

In [56]:
def algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, epochs=epochs, model=None):
    test_precision, train_precision, ewo_precision = [], [], []
    test_recall, train_recall, ewo_recall = [], [], []
    test_fscore, train_fscore, ewo_fscore = [], [], []
    
    test_result_by_tag = {}
    train_result_by_tag = {}
    ewo_result_by_tag = {}
    for t in tagSet:
        f1_key = "F1-"+t
        p_key = "P-"+t
        r_key = "R-"+t
        train_result_by_tag[f1_key], train_result_by_tag[p_key], train_result_by_tag[r_key] = [], [], []
        test_result_by_tag[f1_key], test_result_by_tag[p_key], test_result_by_tag[r_key] = [], [], []
        ewo_result_by_tag[f1_key], ewo_result_by_tag[p_key], ewo_result_by_tag[r_key] = [], [], []

    m = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs)
        
    y_true, y_pred = predict(m, X_train, y_train)
    p_train, r_train, f1_train = model_performance(y_true, y_pred)
        
    y_true_val, y_pred_val = predict(m, X_val, y_val)
    p_val, r_val, f1_val = model_performance(y_true_val, y_pred_val)
        
    y_true_ewo, y_pred_ewo = predict(m, X_ewo, y_ewo) 
    p_ewo, r_ewo, f1_ewo = model_performance(y_true_ewo, y_pred_ewo)
        
    for t in range(len(int2tag)):
        f1_key = "F1-" + int2tag[t]
        p_key = "P-" + int2tag[t]
        r_key = "R-" + int2tag[t]
            
        p, r, f1 = model_performace_by_tag(y_true, y_pred, t)
        train_result_by_tag[p_key].append(p)
        train_result_by_tag[r_key].append(r)
        train_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_val, y_pred_val, t)
        test_result_by_tag[p_key].append(p)
        test_result_by_tag[r_key].append(r)
        test_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_ewo, y_pred_ewo, t)
        ewo_result_by_tag[p_key].append(p)
        ewo_result_by_tag[r_key].append(r)
        ewo_result_by_tag[f1_key].append(f1)
                
    test_precision.append(p_val)
    train_precision.append(p_train)
    ewo_precision.append(p_ewo)
        
    test_recall.append(r_val)
    train_recall.append(r_train)
    ewo_recall.append(r_ewo)
        
    test_fscore.append(f1_val)
    train_fscore.append(f1_train)
    ewo_fscore.append(f1_ewo)
    return pd.DataFrame({
        'P_test': test_precision, 
        'P_train': train_precision, 
        'P_ewo': ewo_precision, 'R_test': test_recall, 'R_train': train_recall, 
        'R_ewo': ewo_recall, 'F1-test': test_fscore, 'F1-train': train_fscore, 'F1-ewo': ewo_fscore}), pd.DataFrame(train_result_by_tag), pd.DataFrame(test_result_by_tag), pd.DataFrame(ewo_result_by_tag)

In [57]:
# model = create_model(X.shape[1], len(tagSet))
# resultEval, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)

In [58]:
# resultEval

In [59]:
# train_by_tag

In [60]:
# test_by_tag

In [61]:
# ewo_by_tag

In [62]:
# resultEval.mean()

In [63]:
# resultEval.std()

In [64]:
def algoCrossVal(X, y, X_ewo, y_ewo, k = 10, repeat=1): 
    block_size = int(X.shape[0] / k)   
    output = None
    model = None
    train_by_tags, test_by_tags, ewo_by_tags = None, None, None
    for it in range(repeat):
        print("AlgoCrossValIter -", it+1)
        model = create_model(X.shape[1], len(tagSet))
        results = None
        train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = None, None, None
        for i in range(k):
            X_val, y_val = X[i*block_size:i*block_size+block_size], y[i*block_size:i*block_size+block_size]
            X_train = np.concatenate((X[0:i*block_size], X[i*block_size+block_size:]))
            y_train = np.concatenate((y[0:i*block_size], y[i*block_size+block_size:]))

            X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
            X_val = X_val.reshape(X_val.shape[0], X_val.shape[1])

            result, train_by_tag, test_by_tag, ewo_by_tag = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)
            if results is None:
                results = result.copy()
                train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = train_by_tag.copy(), test_by_tag.copy(), ewo_by_tag.copy()
            else:
                results = pd.concat([results, result], ignore_index=True)
                train_by_tagsTmp = pd.concat([train_by_tagsTmp, train_by_tag], ignore_index=True)
                test_by_tagsTmp = pd.concat([test_by_tagsTmp, test_by_tag], ignore_index=True)
                ewo_by_tagsTmp = pd.concat([ewo_by_tagsTmp, ewo_by_tag], ignore_index=True)
        
        if output is None:
            output = results.mean(axis=0).to_frame()
            train_by_tags = train_by_tagsTmp.mean(axis=0).to_frame()
            test_by_tags = test_by_tagsTmp.mean(axis=0).to_frame()
            ewo_by_tags = ewo_by_tagsTmp.mean(axis=0).to_frame()
        else:
            output = pd.concat([output, results.mean(axis=0).to_frame()], axis=1)
            train_by_tags = pd.concat([train_by_tags, train_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            test_by_tags = pd.concat([test_by_tags, test_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            ewo_by_tags = pd.concat([ewo_by_tags, ewo_by_tagsTmp.mean(axis=0).to_frame()], axis=1)

    return output, train_by_tags, test_by_tags, ewo_by_tags, model

In [65]:
resultCrossVal, trainByTagResult, testByTagResult, ewoByTagResult, model = algoCrossVal(X, y, X_ewo, y_ewo, repeat=10)

AlgoCrossValIter - 1
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 2000)              422000    
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 10005     
Total params: 432,005
Trainable params: 432,005
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.19233, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.19233 to 0.18268, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.18268

Epoch 00004: val_loss did not improve from 0.18268
Epoch 00004: early stopping





Epoch 00001: val_loss improved from inf to 0.15163, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.15163

Epoch 00003: val_loss did not improve from 0.15163
Epoch 00003: early stopping





Epoch 00001: val_loss improved from inf to 0.13334, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13334

Epoch 00003: val_loss did not improve from 0.13334
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.09095, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09095

Epoch 00003: val_loss did not improve from 0.09095
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.08375, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.08375

Epoch 00003: val_loss did not improve from 0.08375
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.19500, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.19500 to 0.12837, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.12837

Epoch 00004: val_loss did not improve from 0.12837
Epoch 00004: early stopping

Epoch 00001:


Epoch 00001: val_loss improved from inf to 0.05958, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.05958 to 0.04081, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.04081

Epoch 00004: val_loss did not improve from 0.04081
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.09233, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09233

Epoch 00003: val_loss did not improve from 0.09233
Epoch 00003: early stopping
AlgoCrossValIter - 4
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 2000)              422000    
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 10005     
Total params: 432,005
Trainable params: 432,005
Non-trainable params: 0
____________________________________


Epoch 00001: val_loss improved from inf to 0.13883, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.13883

Epoch 00003: val_loss did not improve from 0.13883
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.11854, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.11854 to 0.11792, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.11792

Epoch 00004: val_loss did not improve from 0.11792
Epoch 00004: early stopping

Epoch 00001: val_loss improved from inf to 0.09075, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.09075

Epoch 00003: val_loss did not improve from 0.09075
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.06918, saving model to best-model-conll.hdfs

Epoch 00002: val_loss did not improve from 0.06918

Epoch 00003: val_loss did not improve from 0.06918
Epoch 00003: early stopping

Epoch 00001:


Epoch 00003: val_loss did not improve from 0.02092
Epoch 00003: early stopping

Epoch 00001: val_loss improved from inf to 0.10902, saving model to best-model-conll.hdfs

Epoch 00002: val_loss improved from 0.10902 to 0.10528, saving model to best-model-conll.hdfs

Epoch 00003: val_loss did not improve from 0.10528

Epoch 00004: val_loss did not improve from 0.10528
Epoch 00004: early stopping
AlgoCrossValIter - 9
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden1 (Dense)              (None, 2000)              422000    
_________________________________________________________________
outputlayer (Dense)          (None, 5)                 10005     
Total params: 432,005
Trainable params: 432,005
Non-trainable params: 0
_________________________________________________________________

Epoch 00001: val_loss improved from inf to 0.33239, saving model to best-model-conll.hdfs

Epoch 00002: val_loss

In [66]:
resultCrossVal.to_csv("results/merge-{0}.csv".format(max_depth))
resultCrossVal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
P_test,78.958,86.115,79.292,82.784,81.459,82.014,81.474,79.185,79.741,86.355
P_train,88.356,90.424,87.34,88.582,87.356,87.135,87.65,87.949,85.028,88.171
P_ewo,77.601,80.524,75.42,80.313,73.014,78.148,77.306,79.368,74.377,79.978
R_test,65.888,70.151,71.13,73.85,75.553,75.463,70.867,69.904,77.35,69.504
R_train,80.593,79.066,80.103,79.841,82.741,83.451,80.987,79.664,87.019,79.663
R_ewo,63.15,61.759,63.888,62.593,65.927,65.278,63.611,64.074,68.61,62.963
F1-test,75.997778,75.894,82.024444,76.215,74.545,77.87,74.697,81.103333,77.609,72.011
F1-train,83.629,83.52,83.092,83.027,84.291,85.063,83.829,82.796,85.697,82.669
F1-ewo,68.851,68.618,68.67,69.173,68.326,70.849,68.849,69.739,70.565,68.987


In [67]:
resultCrossVal.mean(axis=1).to_frame()

Unnamed: 0,0
P_test,81.7377
P_train,87.7991
P_ewo,77.6049
R_test,71.966
R_train,81.3128
R_ewo,64.1853
F1-test,76.796656
F1-train,83.7613
F1-ewo,69.2627


In [68]:
resultCrossVal.std(axis=1).to_frame()

Unnamed: 0,0
P_test,2.708213
P_train,1.354963
P_ewo,2.608722
R_test,3.49744
R_train,2.44756
R_ewo,1.978335
F1-test,3.015912
F1-train,0.995229
F1-ewo,0.850375


In [69]:
trainByTagResult.to_csv("results/train-by-tag-merge-{0}.csv".format(max_depth))
trainByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,97.971,98.011,97.945,97.982,98.057,98.116,98.009,97.961,98.141,97.869
P-O,97.46,97.282,97.429,97.436,97.785,97.857,97.582,97.416,98.345,97.34
R-O,98.511,98.778,98.483,98.565,98.357,98.385,98.455,98.539,97.956,98.443
F1-MISC,76.652,75.492,77.461111,72.115,75.247,76.681,76.907,75.026667,77.983,75.231
P-MISC,81.883,90.586,87.46,93.056,95.238,94.306,96.889,79.778,96.389,87.778
R-MISC,75.926,68.951,58.482,63.181,64.34,65.908,64.888,61.982,66.525,71.15
F1-PER,87.321,86.763,87.422,87.147,87.954,88.666,86.764,87.028,88.951,85.866
P-PER,89.798,92.016,89.289,90.547,87.027,89.528,89.51,89.339,87.423,89.836
R-PER,86.029,83.474,86.354,85.008,89.852,88.379,85.439,86.025,91.159,83.674
F1-LOC,71.852,75.148,70.149,77.351111,72.621,75.547,76.232,78.068889,77.835,74.838


In [70]:
trainByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,98.0062
P-O,97.5932
R-O,98.4472
F1-MISC,75.879578
P-MISC,90.3363
R-MISC,66.1333
F1-PER,87.3882
P-PER,89.4313
R-PER,86.5393
F1-LOC,74.9642


In [71]:
trainByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.081155
P-O,0.322055
R-O,0.208285
F1-MISC,1.669686
P-MISC,6.005288
R-MISC,4.926869
F1-PER,0.924146
P-PER,1.417429
R-PER,2.525886
F1-LOC,2.665357


In [72]:
testByTagResult.to_csv("results/test-by-tag-merge-{0}.csv".format(max_depth))
testByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-O,97.45,97.625,97.622,97.558,97.483,97.816,97.298,97.568,97.924,97.431
P-O,96.849,96.835,97.064,97.173,97.549,97.546,96.651,96.83,98.024,96.948
R-O,98.1,98.461,98.215,97.978,97.482,98.102,97.989,98.351,97.835,97.953
F1-MISC,54.445,42.5,45.0,45.0,45.0,45.0,40.83375,42.5,51.428571,42.5
P-MISC,31.667,33.334,36.667,36.667,36.667,36.667,31.667,33.334,36.667,33.334
R-MISC,36.667,36.667,36.667,36.667,36.667,36.667,36.667,36.667,36.667,36.667
F1-PER,80.036667,79.392,85.981111,80.007,78.583,81.552,77.74,86.816667,82.48,76.137
P-PER,80.47,87.768,80.083,86.528,82.832,83.291,84.0,81.855,85.768,88.611
R-PER,70.234,75.075,76.003,78.114,81.753,80.662,74.796,76.734,81.75,73.932
F1-LOC,71.853333,68.63125,72.778889,74.322222,68.413333,73.704444,71.0425,66.25125,75.001111,70.715


In [73]:
testByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-O,97.5775
P-O,97.1469
R-O,98.0466
F1-MISC,45.420732
P-MISC,34.6671
R-MISC,36.667
F1-PER,80.872544
P-PER,84.1206
R-PER,76.9053
F1-LOC,71.271333


In [74]:
testByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-O,0.1845175
P-O,0.4307297
R-O,0.2746712
F1-MISC,4.280084
P-MISC,2.194201
R-MISC,7.489778e-15
F1-PER,3.425326
P-PER,2.964644
R-PER,3.719834
F1-LOC,2.83729


In [75]:
ewoByTagResult.to_csv("results/ewo-by-tag-merge-{0}.csv".format(max_depth))

In [76]:
ewoByTagResult = pd.read_csv("results/ewo-by-tag-merge-{0}.csv".format(2), index_col=0)
ewoByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,17.38,,,,17.38
F1-MISC,46.15,46.15,46.15,46.15,43.6825,43.6825,46.15,46.15,46.15,43.591111
F1-O,94.431,95.075,94.737,94.9,94.964,95.227,94.918,94.868,95.157,95.299
F1-ORG,,,,,,,,,,
F1-PER,8.57625,47.702857,28.391429,32.1575,34.5975,39.691111,34.62875,31.67875,42.05,41.984444
P-LOC,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,60.0
P-MISC,70.0,60.0,60.0,70.0,65.0,65.0,60.0,60.0,80.0,72.5
P-O,89.54,91.146,90.138,90.5,90.709,91.122,90.538,90.384,91.105,91.426
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,53.638,51.554,57.193,63.381,66.738,77.824,66.391,68.142,66.391,75.073


In [77]:
ewoByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,17.38
F1-MISC,45.400611
F1-O,94.9576
F1-ORG,
F1-PER,34.145859
P-LOC,12.0
P-MISC,66.25
P-O,90.6608
P-ORG,0.0
P-PER,64.6325


In [78]:
ewoByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,1.206887
F1-O,0.254209
F1-ORG,
F1-PER,10.728242
P-LOC,25.298221
P-MISC,6.795628
P-O,0.565721
P-ORG,0.0
P-PER,8.529639


In [79]:
columns = en_fingerprints.columns

print("Pred", "Real", "Freq", "Word", sep="\t")
for c in columns:
    prediction = model.predict(en_fingerprints[c].values.reshape((1, 1, 210)))
    pred_tag = int2tag[np.argmax(prediction)]
    real_tag = en_corpus[en_corpus.word == c].iloc[0]['ne-tag']
    
    if pred_tag != real_tag:
        print(pred_tag, real_tag, en_fingerprints[c].max(), c, sep="\t")

Pred	Real	Freq	Word


ValueError: Error when checking input: expected hidden1_input to have 2 dimensions, but got array with shape (1, 1, 210)

In [None]:
en_corpus[en_corpus.word != "\n"].shape