In [1]:
# import
import keras
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import SimpleRNN, Dense
from keras.utils import np_utils, plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn import model_selection
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
import h5py as h5py

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [2]:
BINARY = False
timestep = 1
epochs = 10
en_corpus_file = "corpus-en.txt"
ewo_corpus_file = "corpus-ewo.txt"
best_model_file = "best-model-conll.hdfs"
max_nb_of_phrases =  -1
duplication = 1
max_depth = 0
is_only_vocab = True
time_history_default = None
time_history_autoencoder = None

In [3]:
with_encoding = False

In [4]:
def getTag(aString):
    tag = "O"
    if BINARY:
        if aString != "O":
            return "NE"
    else:
        tag = aString
    return tag
     

In [5]:
def load_corpus(file, max_nb_of_phrases):
    nb_of_phrases = 0
    dataset = {"word": [], "ne-tag": []}
    with open(file) as f:
        prev_line = None
        for cpt, line in enumerate(f):
            if cpt == 0:
                continue
            if nb_of_phrases == max_nb_of_phrases:
                break;

            l = line.strip()
            if len(l) == 0 and len(prev_line) != 0:
                nb_of_phrases += 1
                dataset["word"].append(line)
                dataset["ne-tag"].append(None)
            else:
                l = l.split("\t")
                dataset["word"].append(l[0])
                dataset["ne-tag"].append(ne_type(l[1]))
            prev_line = line.strip()
        
    return pd.DataFrame(dataset), nb_of_phrases+1

In [6]:
def corpus_fingerprint(aDataframe, nb_of_biphrases):
    # create distributionnal signature
    fingerprints = {}
    current_bi_phrase_index = 0
    for index, row in aDataframe.iterrows():
        if current_bi_phrase_index > nb_of_biphrases:
            break
            
        word = row['word']
        
        if word != "\n":
            if word not in fingerprints:
                fingerprints[word] = np.zeros(nb_of_biphrases, dtype=np.int8)
            fingerprints[word][current_bi_phrase_index] = 1
        else:
            current_bi_phrase_index += 1
    return pd.DataFrame(fingerprints)

In [7]:
def corpus2trainingdata(aDataframe, fingerprintsDataFrame):
    X = np.zeros((aDataframe.shape[0], fingerprintsDataFrame.shape[0]), dtype=np.int8)
    y = np.zeros(aDataframe.shape[0], dtype=np.int8)
    i = 0
    for row in aDataframe.iterrows():
        X[i] = fingerprintsDataFrame[row[1]['word']].values
        y[i] = tag2int[getTag(row[1]['ne-tag'])]
        i += 1
    return X, y

In [8]:
def train_test_split(X, y, test_size = 0.33):
    total = X.shape[0]
    train_length = round(total * (1 - test_size)) 
    return X[:train_length], X[train_length:], y[:train_length], y[train_length:]

In [9]:
  
def ne_type(aType):
    aType = aType.lower()
    if 'per' in aType:
        t =  'NE' if BINARY else 'PER' 
    elif 'loc' in aType:
        t =  'NE' if BINARY else 'LOC'
    elif 'org' in aType:
        t =  'NE' if BINARY else 'ORG'
    elif 'hour' in aType:
        t =  'NE' if BINARY else 'MISC'
    elif aType != 'o' and len(aType) > 0 :
        t =  'NE' if BINARY else 'MISC'
    else:
        t = 'O'
    return t

In [10]:
def compute_performance(y_true, y_pred, words=None, BINARY=False):
    if BINARY:
        p = precision_score(y_true, y_pred, pos_label=tag2int['NE'])
        r = recall_score(y_true, y_pred, pos_label=tag2int['NE'])
        f1 = f1_score(y_true, y_pred, pos_label=tag2int['NE'])
        acc = accuracy_score(y_true, y_pred)
    else:
        p = precision_score(y_pred, y_true, average='macro')
        r = recall_score(y_pred, y_true, average='macro')
        f1 = f1_score(y_pred, y_true, average='macro')
        acc = accuracy_score(y_pred, y_true)
    if words is None:
        model_output_vs = pd.DataFrame({'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})
    else:
        model_output_vs = pd.DataFrame({'word': words, 'y_true': [int2tag[i] for i in y_true], 'y_pred': [int2tag[i] for i in y_pred]})

    return p, r, f1, acc, model_output_vs

In [11]:
def P_R_F1(y_pred, y_true, neg_class):
    same = y_pred[y_true==y_pred]
    tp = same[same != neg_class].size
    nb_of_pos_exple = y_true[y_true != neg_class].size
    nb_of_pos_pred = y_pred[y_pred != neg_class].size
    p = r = f1 = 0
    try:
        p = np.round(tp*100/nb_of_pos_pred, 2)
    except ZeroDivisionError:
        print("number of correct positive predictions is 0")
        
    try:
        r = np.round(tp*100/nb_of_pos_exple, 2)
    except ZeroDivisionError:
        print("number of position exple is 0")
        
    try:
        f1 = np.round(2*r*p/(r+p), 2)
    except ZeroDivisionError:
        print("Recall and precision are 0")

    return p, r, f1

In [12]:
def is_mergeable(aListOfConsecutiveTokens, corpus, fingerprints):
    n = len(aListOfConsecutiveTokens)
    if n <= 1:
        return False
    if n == 2:
        w1, w2 = aListOfConsecutiveTokens[0], aListOfConsecutiveTokens[1]
        rep1, rep2 = fingerprints[aListOfConsecutiveTokens[0]], fingerprints[aListOfConsecutiveTokens[1]]
        tag1, tag2 = corpus[corpus.word==w1].iloc[0]['ne-tag'], corpus[corpus.word==w2].iloc[0]['ne-tag']
        if (tag1 == tag2) and (tag1 == "O"): # O + O => False
            return False
        if (tag1 != tag2) and (tag1 != "O") and (tag2 != "O"): # X + Y => False
            return False
        return rep1.equals(rep2)
    else:
        half = int(n / 2)
        return is_mergeable(aListOfConsecutiveTokens[0:half+1], corpus, fingerprints) and is_mergeable(aListOfConsecutiveTokens[half:n], corpus, fingerprints)

In [13]:
def merge(depth, corpus, fingerprint):
    wordDf = corpus[corpus.word != "\n"].word
    nbOfWord = wordDf.shape[0]
    text = list(wordDf)
    X2, target2, tokens = [], [], []
    level, newToken = 1, True
    while level <= depth and newToken:
        i, newToken = 0, False
        limit = nbOfWord - level
        while i < limit:
            if is_mergeable(text[i:i+level+1], corpus, fingerprint):
                tokens.append(" ".join(text[i:i+level+1]))
                newToken = True
            i += 1
        print("level ", level, ":", set(tokens))
        level += 1
    
    X2, target2 = np.array(X2), np.array(target2)
    
    return X2, target2, set(tokens)

In [14]:
def shuffle(X, y):
    indices = [i for i in  range(X.shape[0])]
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [15]:
class TimeHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

In [16]:
from keras.layers import Input

def create_and_train_autoencoder(X_train, code_dim, epochs=10):
    input_data = Input(shape=(X_train.shape[1],))
    encoded = Dense(code_dim, activation="sigmoid")(input_data)
    decoded = Dense(X_train.shape[1], activation="sigmoid")(encoded)
    
    encoder = Model(input_data, encoded)
    autoencoder = Model(input_data, decoded)
    autoencoder.compile(optimizer="sgd", loss="binary_crossentropy")
    autoencoder.fit(X_train, X_train, shuffle=True, epochs=epochs, validation_data=(X_train, X_train))
    
    return encoder

In [17]:
en_corpus, en_nb_of_phrases = load_corpus(en_corpus_file, max_nb_of_phrases)

In [18]:
tagSet = en_corpus["ne-tag"].dropna().unique()
if BINARY:
    tagSet = ['NE', 'O']
tag2int = {j: i for i, j in enumerate(tagSet)}
int2tag = {i: j for i, j in enumerate(tagSet)}
print(tag2int)

{'O': 0, 'MISC': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}


In [19]:
en_nb_of_phrases

210

In [20]:
en_corpus.describe()

Unnamed: 0,ne-tag,word
count,4753,4962
unique,5,913
top,O,","
freq,4362,343


In [21]:
en_corpus.head(10)

Unnamed: 0,ne-tag,word
0,O,The
1,O,Promise
2,O,of
3,O,the
4,MISC,Holy
5,MISC,Spirit
6,,\n
7,O,In
8,O,the
9,O,first


In [22]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 87.91 %
MISC % = 2.12 %
PER % = 4.94 %
LOC % = 0.81 %
ORG % = 0.02 %


In [23]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(en_corpus[en_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / en_corpus[en_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 89.27 %
MISC % = 1.86 %
PER % = 8.87 %
LOC % = 1.97 %
ORG % = 0.11 %


In [24]:
en_corpus[en_corpus.word == "\n"].shape

(209, 2)

In [25]:
print("Nb of bi-phrases", en_nb_of_phrases)
en_fingerprints = corpus_fingerprint(en_corpus, en_nb_of_phrases)

Nb of bi-phrases 210


In [26]:
en_corpus[en_corpus.word != "\n"].shape

(4753, 2)

In [27]:
X2, target2, tokens = merge(max_depth, en_corpus, en_fingerprints)

In [28]:
if is_only_vocab:
    text = list(en_corpus[en_corpus.word != "\n"].word.unique())
else:
    text = list(en_corpus[en_corpus.word != "\n"].word)
en_vocab = pd.DataFrame({'text': text + list(tokens)})
en_vocab.describe()

Unnamed: 0,text
count,912
unique,912
top,Lord
freq,1


In [29]:
if is_only_vocab:
    X = np.zeros((en_vocab.shape[0] * duplication, en_nb_of_phrases))
    target = np.zeros((en_vocab.shape[0] * duplication))
    p=0
    for i, row in en_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X[p] = en_fingerprints[c.split(" ")[0]]
            target[p] = tag2int[getTag(en_corpus[en_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X, target = shuffle(X, target)
    print(X.shape, en_fingerprints.shape, target.shape)

(912, 210) (210, 912) (912,)


In [30]:
en_vocab[-20:]

Unnamed: 0,text
892,Eliud
893,Eleazar
894,Matthan
895,husband
896,fourteen
897,unwilling
898,shame
899,resolved
900,divorce
901,quietly


In [31]:
if not is_only_vocab:
    X, target = corpus2trainingdata(en_corpus[en_corpus.word != "\n"], en_fingerprints)

In [32]:
print(X.shape, target.shape)
if len(X.shape) == len(X2.shape):
    X = np.concatenate((X, X2))
    target = np.concatenate((target, target2))
    X, target = shuffle(X, target)
print(X.shape, target.shape)

(912, 210) (912,)
(912, 210) (912,)


In [33]:
#encoding
encoder = None
if with_encoding:
    encoder = create_and_train_autoencoder(X_train=X, code_dim=105, epochs=20)

Train on 912 samples, validate on 912 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
X_encoded = X.copy()
if with_encoding:
    X_encoded = encoder.predict(X)
    print(X_encoded[0])

[0.5211265  0.5007705  0.48045832 0.46695423 0.5013819  0.5110806
 0.4892157  0.49492922 0.5141073  0.4907691  0.49917176 0.4919977
 0.48330146 0.49402916 0.47615057 0.50871265 0.49408615 0.47685164
 0.49098873 0.5062708  0.4751904  0.48350462 0.4691016  0.48240003
 0.4732721  0.5227527  0.49278572 0.51308554 0.46921358 0.5046807
 0.49613118 0.5050585  0.53322744 0.5327735  0.48609406 0.5270021
 0.49062496 0.5325807  0.4712937  0.53323525 0.5091743  0.52026105
 0.51255244 0.48532775 0.51352906 0.47419164 0.48523787 0.5315397
 0.49447408 0.47977334 0.534224   0.52416867 0.49683645 0.47905055
 0.5243849  0.5168504  0.50817245 0.48420277 0.49042907 0.49662575
 0.49234915 0.49057162 0.49899086 0.47510254 0.4867539  0.4895171
 0.48784363 0.48337126 0.48882484 0.47295913 0.48417526 0.47215062
 0.5110909  0.48278385 0.51594007 0.4765113  0.52458596 0.53033274
 0.47278726 0.5340229  0.5275985  0.5149643  0.5170001  0.4898818
 0.5262383  0.4814509  0.4987726  0.5237311  0.47255147 0.50864685
 0

In [35]:
y = target.copy()
y[0:100]
if not BINARY:
    y = np_utils.to_categorical(y, len(tagSet))
y.shape

(912, 5)

In [36]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_encoded, y, test_size=0.33)
X_train = X_train.reshape(X_train.shape[0], timestep, X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], timestep, X_val.shape[1])
print("X_train.shape =", X_train.shape)
print("y_train.shape =", y_train.shape)
print("X_val.shape =", X_val.shape)
print("y_val.shape =", y_val.shape)

tTarget = np.array([np.argmax(yy) for yy in y_train])
vTarget = np.array([np.argmax(yy) for yy in y_val])

for tag in tagSet:
    print("{0} % in training data = {1} %".format(tag, np.round(tTarget[tTarget==tag2int[tag]].size * 100 / tTarget.shape[0], 2)))
    print("{0} % in validation data = {1} %".format(tag, np.round(vTarget[vTarget==tag2int[tag]].size * 100 / vTarget.shape[0], 2)))

X_train.shape = (611, 1, 105)
y_train.shape = (611, 5)
X_val.shape = (301, 1, 105)
y_val.shape = (301, 5)
O % in training data = 88.54 %
O % in validation data = 88.04 %
MISC % in training data = 1.64 %
MISC % in validation data = 0.33 %
PER % in training data = 7.53 %
PER % in validation data = 10.3 %
LOC % in training data = 2.13 %
LOC % in validation data = 1.33 %
ORG % in training data = 0.16 %
ORG % in validation data = 0.0 %


In [37]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(SimpleRNN(640, input_shape=(None, input_dim), activation='sigmoid'))
    model.add(Dense(160, activation='sigmoid'))
    if BINARY:
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['binary_accuracy'])
    else:
        model.add(Dense(output_dim, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
    return model

In [38]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=epochs):
    time_history_cb = TimeHistory()
    best_model_cp = ModelCheckpoint(best_model_file, save_best_only=True, verbose=2) # saved best model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, callbacks=[best_model_cp, time_history_cb])
    best_model = keras.models.load_model(best_model_file) #loading the best model
    return best_model, time_history_cb

In [39]:
def predict(model, X, y, binary=BINARY):
    if BINARY:
        y_pred = np.round(model.predict(X))
        y_true = y
    else:
        predictions = model.predict(X)
        y_pred = np.array([np.argmax(p) for p in predictions])
        y_true = np.array([np.argmax(t) for t in y ])
    return y_true, y_pred

In [40]:
def model_performance(y_true, y_pred):
    return P_R_F1(y_pred, y_true, tag2int['O']) #precision, recall, f1-score

In [41]:
def model_performace_by_tag(y_true, y_pred, tag):
    p, r, f1 = 0, 0, 0
    
    eq = y_pred[y_pred==y_true]
    correctly_pred = eq[eq==tag].size
    try:
        p = np.round(100 * correctly_pred / y_pred[y_pred==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        r = np.round(100 * correctly_pred / y_true[y_true==tag].size, 2)
    except ZeroDivisionError:
        pass
    
    try:
        f1 = np.round(2 * r * p / (r + p), 2)
    except ZeroDivisionError:
        pass
    
    return p, r, f1

In [42]:
ewo_corpus, ewo_nb_of_phrases = load_corpus(ewo_corpus_file, max_nb_of_phrases)

In [43]:
ewo_nb_of_phrases

210

In [44]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].shape[0], 2)))

O % = 86.37 %
MISC % = 2.18 %
PER % = 5.76 %
LOC % = 0.89 %
ORG % = 0.05 %


In [45]:
for tag in tagSet:
    print("{0} % = {1} %".format(tag, np.round(ewo_corpus[ewo_corpus['ne-tag']==tag].word.unique().shape[0] * 100 / ewo_corpus[ewo_corpus['ne-tag']!='\n'].word.unique().shape[0], 2)))

O % = 90.0 %
MISC % = 1.17 %
PER % = 8.25 %
LOC % = 1.84 %
ORG % = 0.19 %


In [46]:
ewo_corpus.describe()

Unnamed: 0,ne-tag,word
count,4185,4394
unique,5,1030
top,O,","
freq,3795,413


In [47]:
ewo_corpus.head()

Unnamed: 0,ne-tag,word
0,MISC,Mfufub
1,MISC,Nsisim
2,O,ayi
3,O,sò
4,,\n


In [48]:
ewo_fingerprints = corpus_fingerprint(ewo_corpus, en_nb_of_phrases)

In [49]:
X2, target2, tokens = merge(max_depth, ewo_corpus, ewo_fingerprints)

In [50]:
if is_only_vocab:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word.unique())
else:
    text = list(ewo_corpus[ewo_corpus.word != "\n"].word)
ewo_vocab = pd.DataFrame({"text":text + list(tokens)})

In [51]:
if is_only_vocab:
    X_ewo = np.zeros((ewo_vocab.shape[0] * duplication, en_nb_of_phrases))
    ewo_target = np.zeros((ewo_vocab.shape[0] * duplication))
    p=0
    for i, row in ewo_vocab.iterrows():
        c = row.text
        for j in range(duplication):
            X_ewo[p] = ewo_fingerprints[c.split(" ")[0]]
            ewo_target[p] = tag2int[getTag(ewo_corpus[ewo_corpus.word == c.split(" ")[-1:][0]]['ne-tag'].iloc[0])]
            p+=1
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)

In [52]:
ewo_vocab[-20:]

Unnamed: 0,text
1009,nlo
1010,obë
1011,mbara
1012,yabyali
1013,dzili
1014,yasò
1015,oyolëge
1016,kode
1017,dili
1018,atoban


In [53]:
if not is_only_vocab:
    X_ewo, ewo_target = corpus2trainingdata(ewo_corpus[ewo_corpus.word != "\n"], ewo_fingerprints)

In [54]:
print(X_ewo.shape, ewo_target.shape)
if len(X_ewo.shape) == len(X2.shape):
    X_ewo = np.concatenate((X_ewo, X2))
    ewo_target = np.concatenate((ewo_target, target2))
    X_ewo, ewo_target = shuffle(X_ewo, ewo_target)
print(X_ewo.shape, ewo_target.shape)

(1029, 210) (1029,)
(1029, 210) (1029,)


In [55]:
X_ewo_encoded = X_ewo.copy()
if with_encoding:
    X_ewo_encoded = encoder.predict(X_ewo)
    print(X_ewo_encoded.shape)

(1029, 105)


In [56]:
y_ewo = ewo_target.copy()
print(y_ewo.shape, len(ewo_vocab))

(1029,) 1029


In [57]:
y_ewo = ewo_target.copy()
y_ewo[:20]
if not BINARY:
    y_ewo = np_utils.to_categorical(y_ewo)

In [58]:
X_ewo_encoded = X_ewo_encoded.reshape((X_ewo.shape[0], timestep, X_ewo_encoded.shape[1]))

In [59]:
def algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, epochs=epochs, model=None):
    test_precision, train_precision, ewo_precision = [], [], []
    test_recall, train_recall, ewo_recall = [], [], []
    test_fscore, train_fscore, ewo_fscore = [], [], []
    
    test_result_by_tag = {}
    train_result_by_tag = {}
    ewo_result_by_tag = {}
    for t in tagSet:
        f1_key = "F1-"+t
        p_key = "P-"+t
        r_key = "R-"+t
        train_result_by_tag[f1_key], train_result_by_tag[p_key], train_result_by_tag[r_key] = [], [], []
        test_result_by_tag[f1_key], test_result_by_tag[p_key], test_result_by_tag[r_key] = [], [], []
        ewo_result_by_tag[f1_key], ewo_result_by_tag[p_key], ewo_result_by_tag[r_key] = [], [], []

    m, time_history = train_model(model, X_train, y_train, X_val, y_val, epochs=epochs)
        
    y_true, y_pred = predict(m, X_train, y_train)
    p_train, r_train, f1_train = model_performance(y_true, y_pred)
        
    y_true_val, y_pred_val = predict(m, X_val, y_val)
    p_val, r_val, f1_val = model_performance(y_true_val, y_pred_val)
        
    y_true_ewo, y_pred_ewo = predict(m, X_ewo, y_ewo) 
    p_ewo, r_ewo, f1_ewo = model_performance(y_true_ewo, y_pred_ewo)
        
    for t in range(len(int2tag)):
        f1_key = "F1-" + int2tag[t]
        p_key = "P-" + int2tag[t]
        r_key = "R-" + int2tag[t]
            
        p, r, f1 = model_performace_by_tag(y_true, y_pred, t)
        train_result_by_tag[p_key].append(p)
        train_result_by_tag[r_key].append(r)
        train_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_val, y_pred_val, t)
        test_result_by_tag[p_key].append(p)
        test_result_by_tag[r_key].append(r)
        test_result_by_tag[f1_key].append(f1)
            
        p, r, f1 = model_performace_by_tag(y_true_ewo, y_pred_ewo, t)
        ewo_result_by_tag[p_key].append(p)
        ewo_result_by_tag[r_key].append(r)
        ewo_result_by_tag[f1_key].append(f1)
                
    test_precision.append(p_val)
    train_precision.append(p_train)
    ewo_precision.append(p_ewo)
        
    test_recall.append(r_val)
    train_recall.append(r_train)
    ewo_recall.append(r_ewo)
        
    test_fscore.append(f1_val)
    train_fscore.append(f1_train)
    ewo_fscore.append(f1_ewo)
    return pd.DataFrame({
        'P_test': test_precision, 
        'P_train': train_precision, 
        'P_ewo': ewo_precision, 'R_test': test_recall, 'R_train': train_recall, 
        'R_ewo': ewo_recall, 'F1-test': test_fscore, 'F1-train': train_fscore, 'F1-ewo': ewo_fscore}), pd.DataFrame(train_result_by_tag), pd.DataFrame(test_result_by_tag), pd.DataFrame(ewo_result_by_tag), time_history

In [60]:
model = create_model(X_encoded.shape[1], len(tagSet))
if with_encoding:
    resultEval, train_by_tag, test_by_tag, ewo_by_tag, time_history_autoencoder = algoEval(X_train, y_train, X_val, y_val, X_ewo_encoded, y_ewo, model=model, epochs=50)
    pd.DataFrame(time_history_autoencoder.times).to_csv("time_history_autoencoder.csv")
else:
    resultEval, train_by_tag, test_by_tag, ewo_by_tag, time_history_default = algoEval(X_train, y_train, X_val, y_val, X_ewo_encoded, y_ewo, model=model, epochs=50)
    pd.DataFrame(time_history_default.times).to_csv("time_history_default.csv")

Train on 611 samples, validate on 301 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50


Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0


In [61]:
try:
    time_history_default = pd.read_csv("time_history_default.csv", usecols=[1])["0"].values.tolist()
    time_history_autoencoder = pd.read_csv("time_history_autoencoder.csv", usecols=[1])["0"].values.tolist()
    figure, axis = plt.subplots()
    axis.plot(time_history_default, label="default")
    axis.plot(time_history_autoencoder, ls="--", label="autoencoder")
    axis.set_ylabel("Training time")
    axis.set_xlabel("Epochs")
    axis.legend()
    figure.savefig(fname="time-by-epochs.png")
except FileNotFoundError as e:
    print("file not found", e)

file not found File b'time_history_default.csv' does not exist


In [62]:
# resultEval

In [63]:
# train_b`y_tag

In [64]:
# test_by_tag

In [65]:
# ewo_by_tag

In [66]:
# resultEval.mean()

In [67]:
# resultEval.std()

In [68]:
def algoCrossVal(model, X, y, X_ewo, y_ewo, k = 10, repeat=1): 
    block_size = int(X.shape[0] / 4)   
    output = None
    train_by_tags, test_by_tags, ewo_by_tags = None, None, None
    for it in range(repeat):
        print("AlgoCrossValIter -", it+1)
        model = create_model(X.shape[1], len(tagSet))
        results = None
        train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = None, None, None
        for i in range(k):
            X_val, y_val = X[i*block_size:i*block_size+block_size], y[i*block_size:i*block_size+block_size]
            X_train = np.concatenate((X[0:i*block_size], X[i*block_size+block_size:]))
            y_train = np.concatenate((y[0:i*block_size], y[i*block_size+block_size:]))

            X_train = X_train.reshape(X_train.shape[0], timestep, X_train.shape[1])
            X_val = X_val.reshape(X_val.shape[0], timestep, X_val.shape[1])

            result, train_by_tag, test_by_tag, ewo_by_tag, thistory = algoEval(X_train, y_train, X_val, y_val, X_ewo, y_ewo, model=model)
            if results is None:
                results = result.copy()
                train_by_tagsTmp, test_by_tagsTmp, ewo_by_tagsTmp = train_by_tag.copy(), test_by_tag.copy(), ewo_by_tag.copy()
            else:
                results = pd.concat([results, result], ignore_index=True)
                train_by_tagsTmp = pd.concat([train_by_tagsTmp, train_by_tag], ignore_index=True)
                test_by_tagsTmp = pd.concat([test_by_tagsTmp, test_by_tag], ignore_index=True)
                ewo_by_tagsTmp = pd.concat([ewo_by_tagsTmp, ewo_by_tag], ignore_index=True)
        
        if output is None:
            output = results.mean(axis=0).to_frame()
            train_by_tags = train_by_tagsTmp.mean(axis=0).to_frame()
            test_by_tags = test_by_tagsTmp.mean(axis=0).to_frame()
            ewo_by_tags = ewo_by_tagsTmp.mean(axis=0).to_frame()
        else:
            output = pd.concat([output, results.mean(axis=0).to_frame()], axis=1)
            train_by_tags = pd.concat([train_by_tags, train_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            test_by_tags = pd.concat([test_by_tags, test_by_tagsTmp.mean(axis=0).to_frame()], axis=1)
            ewo_by_tags = pd.concat([ewo_by_tags, ewo_by_tagsTmp.mean(axis=0).to_frame()], axis=1)

    return output, train_by_tags, test_by_tags, ewo_by_tags

In [69]:
m = create_model(X_encoded.shape[1], len(tagSet))
resultCrossVal, trainByTagResult, testByTagResult, ewoByTagResult = algoCrossVal(m, X_encoded, y, X_ewo_encoded, y_ewo, repeat=10)

AlgoCrossValIter - 1
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10



Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epo

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
AlgoCrossValIter - 3
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positi

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
AlgoCrossValIter - 4
Train on 684 

Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of pos

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
E

number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positiv

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epo

number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
AlgoCrossValIter - 7
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of cor

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
AlgoCrossValIter - 8
Tr

Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of pos

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
E

number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positiv

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 684 samples, validate on 228 samples
Epoch 1/10
Epoch 2/10


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0
Train on 912 samples, validate on 0 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epo

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of correct positive predictions is 0
number of correct positive predictions is 0
number of position exple is 0
Recall and precision are 0
number of correct positive predictions is 0


In [70]:
if with_encoding:
    resultCrossVal.to_csv("results/merge-{}-encoding.csv".format(max_depth))
else:
    resultCrossVal.to_csv("results/merge-{}-no-encoding.csv".format(max_depth))
resultCrossVal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-ewo,,,,,,,,,,
F1-test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1-train,,,,,,,,,,
P_ewo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P_test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P_train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R_ewo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R_test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R_train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
resultCrossVal.mean(axis=1).to_frame()

Unnamed: 0,0
F1-ewo,
F1-test,0.0
F1-train,
P_ewo,0.0
P_test,0.0
P_train,0.0
R_ewo,0.0
R_test,0.0
R_train,0.0


In [72]:
resultCrossVal.std(axis=1).to_frame()

Unnamed: 0,0
F1-ewo,
F1-test,0.0
F1-train,
P_ewo,0.0
P_test,0.0
P_train,0.0
R_ewo,0.0
R_test,0.0
R_train,0.0


In [73]:
if with_encoding:
    trainByTagResult.to_csv("results/train-by-tag-merge-{}-encoding.csv".format(max_depth))
else:
    trainByTagResult.to_csv("results/train-by-tag-merge-{}-no-encoding.csv".format(max_depth))
trainByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,,,,,
F1-MISC,,,,,,,,,,
F1-O,93.829,93.829,93.829,93.829,93.829,93.829,93.829,93.829,93.829,93.829
F1-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1-PER,,,,,,,,,,
P-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-O,88.379,88.379,88.379,88.379,88.379,88.379,88.379,88.379,88.379,88.379
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
trainByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,
F1-MISC,
F1-O,93.829
F1-ORG,0.0
F1-PER,
P-LOC,0.0
P-MISC,0.0
P-O,88.379
P-ORG,0.0
P-PER,0.0


In [75]:
trainByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,
F1-MISC,
F1-O,1.497956e-14
F1-ORG,0.0
F1-PER,
P-LOC,0.0
P-MISC,0.0
P-O,0.0
P-ORG,0.0
P-PER,0.0


In [76]:
if with_encoding:
    testByTagResult.to_csv("results/test-by-tag-merge-{}-encoding.csv".format(max_depth))
else:
    testByTagResult.to_csv("results/test-by-tag-merge-{}-no-encoding.csv".format(max_depth))
testByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1-O,37.529,37.529,37.529,37.529,37.529,37.529,37.529,37.529,37.529,37.529
F1-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F1-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-O,35.351,35.351,35.351,35.351,35.351,35.351,35.351,35.351,35.351,35.351
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
testByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,0.0
F1-O,37.529
F1-ORG,0.0
F1-PER,0.0
P-LOC,0.0
P-MISC,0.0
P-O,35.351
P-ORG,0.0
P-PER,0.0


In [78]:
testByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,0.0
F1-MISC,0.0
F1-O,0.0
F1-ORG,0.0
F1-PER,0.0
P-LOC,0.0
P-MISC,0.0
P-O,0.0
P-ORG,0.0
P-PER,0.0


In [79]:
if with_encoding:
    ewoByTagResult.to_csv("results/ewo-by-tag-merge-{}-encoding.csv".format(max_depth))
else:
    ewoByTagResult.to_csv("results/ewo-by-tag-merge-{}-no-encoding.csv".format(max_depth))
ewoByTagResult

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
F1-LOC,,,,,,,,,,
F1-MISC,,,,,,,,,,
F1-O,94.46,94.46,94.46,94.46,94.46,94.46,94.46,94.46,94.46,94.46
F1-ORG,,,,,,,,,,
F1-PER,,,,,,,,,,
P-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-O,89.5,89.5,89.5,89.5,89.5,89.5,89.5,89.5,89.5,89.5
P-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
ewoByTagResult.mean(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,
F1-MISC,
F1-O,94.46
F1-ORG,
F1-PER,
P-LOC,0.0
P-MISC,0.0
P-O,89.5
P-ORG,0.0
P-PER,0.0


In [81]:
ewoByTagResult.std(axis=1).to_frame()

Unnamed: 0,0
F1-LOC,
F1-MISC,
F1-O,0.0
F1-ORG,
F1-PER,
P-LOC,0.0
P-MISC,0.0
P-O,0.0
P-ORG,0.0
P-PER,0.0
