In [153]:
from stanfordnlp.server import CoreNLPClient

In [154]:
client = CoreNLPClient(start_server=False)

In [155]:
import string
punct = set(string.punctuation)
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

In [156]:
def assign_unknown(word):
    if any(char.isdigit() for char in word):
        return 'unk_digit'
    elif any(char in punct for char in word):
        return 'unk_punct'
    elif any(char.isupper() for char in word):
        return 'unk_upper'
    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return 'unk_noun'
    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return 'unk_verb'
    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return 'unk_adj'
    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return 'unk_adv'
    return 'unk'

In [157]:
def preprocess(vocab, path):
    data = []
    file = open(path).readlines()
    
    for index, word in enumerate(file):
        if not word.split():
            word = '--n--'
            data.append(word)
            continue
        elif word.strip() not in vocab:
            word = assign_unknown(word)
            data.append(word)
            continue
        data.append(word.strip())
    return data

In [158]:
vocab = open('vocab.txt').read().split('\n')
print('Number of vocabs:', len(vocab))
vocab[0:5]

Number of vocabs: 23777


['!', '#', '$', '%', '&']

In [159]:
test_words = preprocess(vocab, 'test.txt')
print('Number of test words:', len(test_words))
test_words[0:5]

Number of test words: 34199


['The', 'economy', "'s", 'temperature', 'will']

In [160]:
#code chia file test
with open('quan_test.txt', 'w') as quan:
    with open('anh_test.txt', 'w') as anh:
        with open('gold.txt') as root:
            i = 1
            for r in root.readlines():
                if i <= 60:
                    quan.write(r)
                else:
                    anh.write(r)
                if r == '.\t.\n':
                    i += 1
                    if i > 120:
                        break

In [161]:
test_ = open('anh_test.txt').readlines()
print('Number of gold words:', len(test_))

Number of gold words: 1596


In [162]:
def seperate_word_tag(word_tag, vocab): 
    if not word_tag.split():
        word = '--n--'
        tag = '--s--'
        return word, tag
    else:
        word, tag = word_tag.split()
        if word not in vocab: 
            word = assign_unknown(word)
        return word, tag
    return None

In [163]:
#code tách từ và tag ra riêng
word = []
tag_true = []
for word_tag in test_:
    w, t = seperate_word_tag(word_tag, vocab) 
    word.append(w)
    tag_true.append(t)

In [164]:
#hàm tính độ chính xác input tag_true, tag_pre => accuracy
from sklearn.metrics import precision_score, recall_score, accuracy_score
def evaluate(tag_true, tag_pre):
    precision = precision_score(tag_true, tag_pre, average='micro')
    recal = recall_score(tag_true, tag_pre, average='micro')
    accuracy = accuracy_score(tag_true, tag_pre)
    return precision, recal, accuracy

In [165]:
def predict_from_lib(sentence_test):
    tag_pre = []
    for sentence in sentence_test:
        ann = client.annotate(sentence, annotators=['pos'])
        sent = ann.sentence[0]
        token = sent.token
        tag_pre.append(token[0].pos)
    return tag_pre

In [166]:
tag_pre = predict_from_lib(word)

In [167]:
precision, recal, accuracy = evaluate(tag_pre, tag_true)
print('Precision: %.3f' % precision)
print('Recal: %.3f' % recal)
print('Accuracy: %.3f' % accuracy)

Precision: 0.845
Recal: 0.845
Accuracy: 0.845


In [168]:
def demo_output(sentence):
    output = ''
    for word in sentence.split():
        ann = client.annotate(sentence, annotators=['pos'])
        sent = ann.sentence[0]
        token = sent.token
        output += word + '/' + token[0].pos + ' '
    return output.strip()

In [169]:
#demo output
a = 'i love you'
demo_output(a)

'i/PRP love/PRP you/PRP'