In [1]:
from database import *
from pattern.en import conjugate, lemma, lexeme, PAST, SG, PRESENT

from Udep2Mono.binarization import BinaryDependencyTree
from Udep2Mono import polarization
from Udep2Mono.util import btreeToList
from copy import deepcopy

2020-12-21 22:07:44 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | gum       |
| pos       | gum       |
| lemma     | gum       |
| depparse  | gum       |
| ner       | ontonotes |

2020-12-21 22:07:44 INFO: Use device: gpu
2020-12-21 22:07:44 INFO: Loading: tokenize
2020-12-21 22:07:48 INFO: Loading: pos
2020-12-21 22:07:50 INFO: Loading: lemma
2020-12-21 22:07:50 INFO: Loading: depparse
2020-12-21 22:07:52 INFO: Loading: ner
2020-12-21 22:07:53 INFO: Done loading processors!


In [2]:
class ImplicativeGenerator:
    def __init__(self, length, kb, tree):
        self.kb = kb
        self.treeLog = []
        self.polarLog = []
        self.deptree = tree
        self.length = length

    def find_verbs(self, postags):
        verbs = []
        for word in postags:
            if 'VB' in postags[word][1]:
                verbs.append((word, postags[word][0]))
        return verbs
    
    def fix_tense(self, verb, pos):
        if pos == "VBD":
            return conjugate(verb=verb, tense=PAST, person=1)
        elif pos == "VBZ":
            return conjugate(verb=verb, tense=PRESENT, person=3)
        else:
            return verb

    def search(self):
        #verbs = self.find_verbs(postags)
        self.generate(self.deptree)

    
    def save_tree(self, tree=None):
        if tree is not None:
            generated, _, _, _ = btreeToList(tree, self.length, {}, 0)
        else:
            generated, _, _, _ = btreeToList(self.deptree, self.length, {}, 0)
        generated = '[%s]' % ', '.join(map(str, generated)).replace("'", "")
        generated = generated.replace(",", "")
        print("New tree: ", generated)

        if tree is not None:
            return deepcopy(self.deptree)
        else:
            return deepcopy(self.deptree)

    def generate(self, tree):
        if tree.val in ["ccomp", "xcomp"]:
            backup = deepcopy(tree)

            verb = conjugate(verb=tree.right.val, tense=PRESENT, person=1)
            pos = tree.right.npos

            impl_signs = self.kb.find({"Verb": verb})
            if impl_signs:
                sign = impl_signs[0]['Signature'].split('/')
                if sign[0] == "+" and sign[1] == "+":
                    self.treeLog.append(self.save_tree(tree.left.right))
                elif sign[0] == "+" and sign[1] == "-":
                    tree.val = tree.left.right.val
                    tree.mark = tree.left.right.mark
                    tree.id = tree.left.right.id
                    tree.right = tree.left.right.right
                    tree.right.val = self.fix_tense(tree.right.val, pos)
                    tree.left = tree.left.right.left
                    
                    self.treeLog.append(self.save_tree())

                    tree.val = backup.val
                    tree.mark = backup.mark
                    tree.id = backup.id
                    tree.left = deepcopy(backup.left)
                    tree.right = deepcopy(backup.right)
        else:
            if tree.left != "N":
                self.generate(tree.left)

            if tree.right != "N":
                self.generate(tree.right)       

In [3]:
#from Udep2Mono.dependency_parse import dependencyParse
#tree, postags, words = dependencyParse("All dogs eat food", parser="stanza")[0]

sentences = ["I managed to finish my homework", 
             "I recognized that this apple is good",
             "I realized that my homework is hard"]
annotations, _ = polarization.run_polarize_pipeline(
    sentences, verbose=2, parser="stanza")
print()
for annotation in annotations:
    annotated, original, polarized, postags, polarized_tree = annotation
    impgenerator = ImplicativeGenerator(len(original), db.implicative, polarized_tree)
    impgenerator.search()  

100%|██████████| 3/3 [00:01<00:00,  2.82it/s]
New tree:  [nsubj↑ [PRP i↑] [obj↑ [nmod:poss↑ [PRP$ my↑] [NN homework↑]] [VB finished↑]]]
New tree:  [nsubj↑ [det= [DT this=] [NN apple=]] [cop↑ [VBZ is↑] [JJ good↑]]]
New tree:  [nsubj↑ [nmod:poss= [PRP$ my↑] [NN homework↑]] [cop↑ [VBZ is↑] [JJ hard↑]]]



In [1]:
import numpy as np
import os
from tqdm import tqdm

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dense, Activation, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

label_dict = {
    "+/+": 1,
    "+/-": 2,
    "+/o": 3,
    "-/+": 4,
    "-/-": 5,
    "-/o": 6,
    "o/+": 7,
    "o/-": 8,
    "o/o": 9}

In [2]:
def read_verbs(filename, label_dir):
    with open(filename, 'r') as f:
        with open(label_dir, 'r') as l:
            labels = []
            verbs = []
            for line in tqdm(f.readlines()):
                verb_data = line.split()
                verb = verb_data[0]
                verbs.append(verb)

            for line in tqdm(l.readlines()):
                lb = line.strip('\n')
                labels.append(label_dict[lb])

            #self.chars = self.flatten_list(list(map(lambda x: list(x), verbs)))
            #self.chars_size = len(self.chars)
            #self.vocab = sorted(list(set(self.chars)))
            #self.vocab_size = len(self.vocab)
            return verbs, labels
        
data_dir = '../data/VERB/verb.txt'
label_dir = '../data/VERB/signature.txt'

verbs, labels = read_verbs(data_dir, label_dir)
vocab = set(' '.join([str(i) for i in verbs]))
vocab.add('END')
len_vocab = len(vocab)
char_index = dict((c, i) for i, c in enumerate(vocab))

100%|██████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<?, ?it/s]


In [3]:
import numpy as np

X = []
y = []
maxlen=20

signature = [
     ["+/+", 1],
     ["+/-", 2],
     ["+/o", 3],
     ["-/+", 4],
     ["-/-", 5],
     ["-/o", 6],
     ["o/+", 7],
     ["o/-", 8],
     ["o/o", 9]]

# Builds an empty line with a 1 at the index of character
def set_flag(i):
    tmp = np.zeros(len_vocab);
    tmp[i] = 1
    return list(tmp)

# Truncate names and create the matrix
def prepare_X(X):
    new_list = []
    trunc_train_name = [str(i)[0:maxlen] for i in X]

    for i in trunc_train_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        new_list.append(tmp)

    return new_list

def prepare_y(y):
    one_hot = []
    for idx in labels:
        init = np.zeros(9)
        init[idx-1] = 1
        one_hot.append(init)
    return one_hot

X = np.array(prepare_X(verbs))
y = np.array(prepare_y(labels))
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
model = Sequential()
model.add(Bidirectional(LSTM(512, return_sequences=True), backward_layer=LSTM(512, return_sequences=True, go_backwards=True), input_shape=(maxlen,len_vocab)))
model.add(Bidirectional(LSTM(512)))
model.add(Dense(100, activity_regularizer=l2(0.002)))
model.add(Dense(9, activity_regularizer=l2(0.002)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [18]:
callback = EarlyStopping(monitor='accuracy', patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='accuracy', mode='max', save_best_only=True, verbose=1)
reduce_lr_acc = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, min_delta=1e-4, mode='min')

In [19]:
batch_size = 4
history = model.fit(
    X, y, 
    batch_size=batch_size, 
    epochs=50, verbose=1, 
    validation_data =(X, y), 
    callbacks=[callback, mc, reduce_lr_acc]
)

Epoch 1/50

Epoch 00001: accuracy improved from -inf to 0.65323, saving model to best_model.h5
Epoch 2/50

Epoch 00002: accuracy did not improve from 0.65323
Epoch 3/50

Epoch 00003: accuracy did not improve from 0.65323

Epoch 00003: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 4/50

Epoch 00004: accuracy did not improve from 0.65323
Epoch 5/50

Epoch 00005: accuracy did not improve from 0.65323

Epoch 00005: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 6/50

Epoch 00006: accuracy did not improve from 0.65323


In [4]:
new_names = ["manage", "get", "forget"]
X_pred = prepare_X([e for e in new_names])

In [5]:
from tensorflow import keras
model = keras.models.load_model('../model/implicative.h5')
#model.load_weights("../model/implicative.ckpt")
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
loss, acc = model.evaluate(X, y, verbose=2)
print("Trained model, accuracy: {:5.2f}%".format(100 * acc))

4/4 - 2s - loss: 3.6082 - accuracy: 0.3548
Trained model, accuracy: 35.48%


In [6]:
model1 = keras.models.load_model('../model/implicative.h5')
loss, acc = model1.evaluate(X, y, verbose=2)
print("Trained model, accuracy: {:5.2f}%".format(100 * acc))

4/4 - 1s - loss: 3.6082 - accuracy: 0.3548
Trained model, accuracy: 35.48%


In [6]:
def pred(new_names, prediction):
    return_results = []
    k = 0
    for i in prediction:
        if max(i) < 0.65:
            return_results.append([new_names[k], "N"])
        else:
            return_results.append([new_names[k], signature[np.argmax(i)]])
        k += 1
    return return_results

In [22]:
%env PYTHONHASHSEED=0

env: PYTHONHASHSEED=0


In [11]:
loss, acc = model.evaluate(X, y, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100 * acc))

4/4 - 1s - loss: 2.9703 - accuracy: 0.4919
Untrained model, accuracy: 49.19%


In [12]:
prediction = model.predict(X_pred)
pred(new_names, prediction)

[['manage', ['o/o', 9]], ['get', ['o/o', 9]], ['forget', ['o/o', 9]]]

In [112]:
with open('../data/VERB/verb.txt', 'w') as data:
    with open('../data/VERB/signature.txt', 'w') as label:
        verbs = set()
        for index, row in df.iterrows():
            verb = row['verb'] + '\n'
            if verb not in verbs:
                verbs.add(verb)
                data.write(verb)
                label.write(row['signature'] + '\n')