In [517]:
# Interactive convenience only: switch IPython's tab-completer to its "greedy" mode.
%config IPCompleter.greedy=True

In [489]:
# to import conll
import os
import subprocess
import sys
sys.path.insert(0, os.path.abspath('./src/'))

from conll import evaluate
# for nice tables
import pandas as pd

In [490]:
def execute(cmd):
    """
    run a shell command and return its standard output as text
    :param cmd: command line, interpreted by the shell (shell=True)
    :return: decoded standard output of the command
    :raises subprocess.CalledProcessError: if the command exits with a non-zero status
    """
    # sys.stdout can be replaced by a stream whose encoding is None or missing
    # (e.g. io.StringIO); bytes.decode(None) raises TypeError, so fall back to UTF-8
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    return subprocess.check_output(cmd, shell=True).decode(encoding)

In [491]:
# modified version to support fst-output
def read_fst4conll(fst_file, fs="\t", oov='<unk>', otag='O', sep='+', split=False):
    """
    read tagged sentences from printed FSTs (one arc or final state per line)
    :param fst_file: text file with the printed FSTs
    :param fs: field separator
    :param oov: token to map to otag (we need to get rid of <unk> in labels)
    :param otag: otag symbol
    :param sep: separator between the parts of an output symbol
    :param split: if True, replace an output symbol by its second sep-separated part
                  (symbols equal to otag, or without sep, are kept unchanged)
    :return: corpus as list of sentences, each a list of (input, output) tuples
    """
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    # the context manager closes the file even if parsing raises
    with open(fst_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # blank line: close the sentence in progress, if any
                if words:
                    sents.append(words)
                    words = []
                continue

            feats = line.split(fs)
            # the output symbol is the 4th column, so an arc needs at least 4 columns;
            # anything shorter is treated as a final state
            if len(feats) >= 4:
                ist = feats[2]  # 3rd column (input)
                ost = feats[3]  # 4th column (output)
                # replace '<unk>' with 'O'
                ost = otag if ost == oov else ost
                if split and ost != otag:
                    parts = ost.split(sep)
                    # a symbol without the separator is kept instead of raising IndexError
                    ost = parts[1] if len(parts) > 1 else ost
                words.append((ist, ost))
            else:
                # final state: the sentence ends here (appended even when it is empty)
                sents.append(words)
                words = []

    # keep a trailing sentence that was not closed by a final state or a blank line
    if words:
        sents.append(words)
    return sents

In [492]:
def read_corpus_conll(corpus_file, fs="\t"):
    """
    read corpus in CoNLL format
    :param corpus_file: corpus in conll format (one token per line, blank line between sentences)
    :param fs: field separator
    :return: corpus as list of sentences, each a list of feature tuples
    :raises ValueError: if a line does not have the same number of columns as the first one
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    # the context manager closes the file even if the consistency check raises
    with open(corpus_file) as f:
        for line in f:
            line = line.strip()
            if line:
                feats = tuple(line.split(fs))
                if not featn:
                    featn = len(feats)
                elif featn != len(feats):
                    raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))
                words.append(feats)
            elif words:
                sents.append(words)
                words = []

    # a file that does not end with a blank line must not lose its last sentence
    if words:
        sents.append(words)
    return sents

In [493]:
def compute_frequency_list(corpus):
    """
    count the occurrences of every token of a corpus
    :param corpus: corpus as list of lists
    :return: dict mapping each token to its number of occurrences
    """
    counts = {}
    for tok in (t for sentence in corpus for t in sentence):
        if tok in counts:
            counts[tok] += 1
        else:
            counts[tok] = 1
    return counts

In [494]:
def cutoff(corpus, tf_min=2):
    """
    build the lexicon of a corpus, dropping rare tokens
    :param corpus: corpus as list of lists
    :param tf_min: minimum token frequency for lexicon elements (below removed); default 2
    :return: lexicon as sorted list
    """
    counts = compute_frequency_list(corpus)
    lexicon = [tok for tok in counts if counts[tok] >= tf_min]
    lexicon.sort()
    return lexicon

In [495]:
# folder for all intermediate files; paths are built by plain concatenation,
# hence the trailing '/'
temp_folder = 'tmp/'
# sub-folder that farextract fills and compose_fst reads
wdir = temp_folder + 'wdir_wt/'

In [496]:
# folder holding the NL2SparQL4NLU dataset files (trailing '/' required: paths are concatenated)
dpath = 'dataset/'

def init():
    """
    create the temporary folder and copy the dataset files into it under short names
    (trn/tst .txt for the utterances, trn/tst .conll for the annotated corpora)
    """
    execute('mkdir -p ' + temp_folder)

    copy_commands = (
        'cp {}NL2SparQL4NLU.train.utterances.txt {}trn.txt',
        'cp {}NL2SparQL4NLU.test.utterances.txt {}tst.txt',
        'cp {}NL2SparQL4NLU.train.conll.txt {}trn.conll',
        'cp {}NL2SparQL4NLU.test.conll.txt {}tst.conll',
    )
    for template in copy_commands:
        execute(template.format(dpath, temp_folder))

In [497]:
# create the temporary folder and copy the dataset files into it
init()

## Let's start

In [498]:
def create_training_data(min_freq=2):
    """
    write the training sentences in utterance-per-line format with word+tag symbols,
    plus the lists of output (word+tag) and input (word) symbols
    :param min_freq: minimum frequency for a word+tag symbol to enter the lexicon
    """
    corpus = read_corpus_conll(temp_folder + 'trn.conll')

    # every token (tuple of columns) becomes one '+'-joined symbol
    joined_sents = []
    for sent in corpus:
        joined_sents.append(["+".join(token) for token in sent])

    out_symbols = cutoff(joined_sents, min_freq)
    # the input symbol is the part before the first '+'
    in_symbols = [symbol.split('+')[0] for symbol in out_symbols]

    with open(temp_folder + 'trn.wt.txt', 'w') as f:
        f.writelines(" ".join(sent) + "\n" for sent in joined_sents)

    symbol_files = (
        ('osyms.wt.lst.txt', out_symbols),
        ('isyms.wt.lst.txt', in_symbols),
    )
    for file_name, symbols in symbol_files:
        with open(temp_folder + file_name, 'w') as f:
            f.write("\n".join(symbols) + "\n")

In [499]:
def create_symbol_table():
    """
    run ngramsymbols on the output and input symbol lists to produce
    osyms.wt.txt and isyms.wt.txt in the temporary folder
    """
    templates = (
        'ngramsymbols {0}osyms.wt.lst.txt {0}osyms.wt.txt',
        'ngramsymbols {0}isyms.wt.lst.txt {0}isyms.wt.txt',
    )
    for template in templates:
        execute(template.format(temp_folder))

In [500]:
def compile_FAR():
    """
    compile the word+tag training sentences (trn.wt.txt) into trn.wt.far
    with farcompilestrings, using the output symbol table
    """
    # the trailing backslashes continue the string literal: the command is one line
    command = "farcompilestrings \
        --symbols={0}osyms.wt.txt \
        --keep_symbols \
        --unknown_symbol='<unk>' \
        {0}trn.wt.txt {0}trn.wt.far"
    execute(command.format(temp_folder))

In [501]:
def train_ngram_model(ngram_order):
    """
    count n-grams in trn.wt.far, build the language model wt2.lm and run ngraminfo on it
    :param ngram_order: value passed to ngramcount --order
    """
    steps = (
        'ngramcount --order={1} {0}trn.wt.far {0}trn.wt.cnt',
        'ngrammake {0}trn.wt.cnt {0}wt2.lm',
        'ngraminfo {0}wt2.lm',
    )
    # only the first template uses {1}; str.format ignores the extra argument elsewhere
    for step in steps:
        execute(step.format(temp_folder, ngram_order))

In [502]:
def make_w2t_wt(isyms, sep='+', out=temp_folder+'w2wt.tmp'):
    """
    write the text specification of a one-state transducer with one self-loop per symbol:
    input is the part of the symbol before the first sep, output is the whole symbol
    :param isyms: symbol table file; the first tab-separated column of each line is a symbol
    :param sep: separator inside a symbol
    :param out: path of the specification file to write
    """
    special = {'<epsilon>', '<s>', '</s>'}  # symbols that get no arc
    state = '0'    # wfst specification state
    fs = " "       # wfst specification column separator

    # read inside a context manager so the symbol table is closed right away
    with open(isyms, 'r') as f:
        symbols = {line.strip().split("\t")[0] for line in f}
    symbols -= special
    # a blank line in the table would otherwise produce a malformed arc line
    symbols.discard('')
    ist = sorted(symbols)

    with open(out, 'w') as f:
        for e in ist:
            f.write(fs.join([state, state, e.split(sep)[0], e]) + "\n")
        # a line with only the state number marks it as final
        f.write(state + "\n")

In [503]:
def compile_w2wt_wt():
    """
    compile the text specification w2wt_wt.txt into w2wt_wt.bin with fstcompile,
    using (and keeping) the input and output symbol tables
    """
    # the trailing backslashes continue the string literal: the command is one line
    command = 'fstcompile \
        --isymbols={0}isyms.wt.txt \
        --osymbols={0}osyms.wt.txt \
        --keep_isymbols \
        --keep_osymbols \
        {0}w2wt_wt.txt {0}w2wt_wt.bin'
    execute(command.format(temp_folder))

    # to inspect the result:
    # print(execute('fstinfo {0}w2wt_wt.bin | head -n 8'.format(temp_folder)))

In [504]:
def compile_strings_and_extract():
    """
    compile the test utterances (tst.txt) into tst.wt.far with farcompilestrings,
    then extract the archive into a freshly emptied wdir with farextract
    """
    execute("farcompilestrings \
        --symbols={0}isyms.wt.txt \
        --keep_symbols \
        --initial_symbols=false \
        --unknown_symbol='<unk>' \
        {0}tst.txt {0}tst.wt.far".format(temp_folder))

    # start from an empty working directory; 'rm -r' exits non-zero when the directory
    # does not exist yet (first run), which makes execute() raise CalledProcessError,
    # so use -f, and -p for a matching idempotent mkdir
    execute('rm -rf ' + wdir)
    execute('mkdir -p ' + wdir)

    execute('farextract --filename_prefix="{1}" {0}tst.wt.far'.format(temp_folder, wdir))

### Evaluation functions

In [505]:
def compose_fst():
    """
    for every file in wdir (in sorted order): compose it with w2wt_wt.bin and wt2.lm,
    take the shortest path and print it; the concatenated printouts are written to
    w2wt_wt.wt2.out in the temporary folder
    """
    names = []
    for entry in os.listdir(wdir):
        if os.path.isfile(os.path.join(wdir, entry)):
            names.append(entry)
    names.sort()

    # the trailing backslash continues the string literal: the command is one line
    command = 'fstcompose {1}{2} {0}w2wt_wt.bin | fstcompose - {0}wt2.lm |\
            fstshortestpath | fstrmepsilon | fsttopsort | fstprint --isymbols={0}isyms.wt.txt'

    printed = []
    for name in names:
        result = execute(command.format(temp_folder, wdir, name))
        if not result:
            # report inputs for which the pipeline printed nothing
            print("empty", name)
        printed.append(result)

    with open(temp_folder + 'w2wt_wt.wt2.out', 'w+') as f:
        f.write("".join(printed))

In [506]:
def compute_accuracy(refs, hyps):
    """
    token-level accuracy of the hypothesis tags against the reference tags
    :param refs: reference corpus as list of sentences; the tag is element 1 of each token
    :param hyps: hypothesis corpus, aligned with refs sentence by sentence and token by token
    :return: fraction of tokens whose tags match; 0.0 when there are no tokens at all
    :raises AssertionError: if the corpora or a pair of sentences differ in length
    """
    assert len(refs) == len(hyps)

    correct_tags = 0
    total_tags = 0

    for ref, hyp in zip(refs, hyps):
        assert len(ref) == len(hyp)
        correct_tags += sum(1 for r, h in zip(ref, hyp) if h[1] == r[1])
        total_tags += len(ref)

    # an empty corpus used to end in ZeroDivisionError
    if total_tags == 0:
        return 0.0
    return correct_tags / total_tags

In [507]:
def show_metrics(refs, hyps, show_table=False):
    """
    print accuracy and the total precision / recall / F1 returned by conll.evaluate
    :param refs: reference corpus
    :param hyps: hypothesis corpus
    :param show_table: if True, return all results as a DataFrame rounded to 3 decimals
    :return: the results table when show_table is True, otherwise None
    """
    print("Accuracy:", compute_accuracy(refs, hyps))

    results = evaluate(refs, hyps)
    total = results['total']
    for label, key in (("Precision:", 'p'), ("Recall:", 'r'), ("F1-score:", 'f')):
        print(label, total[key])

    if not show_table:
        return None
    # one row per key of the results dict
    table = pd.DataFrame.from_dict(results, orient='index')
    return table.round(decimals=3)

In [508]:
def evaluate_fst():
    """
    score the printed FST output (w2wt_wt.wt2.out) against the test corpus (tst.conll)
    :return: whatever show_metrics returns
    """
    reference_path = temp_folder + 'tst.conll'
    hypothesis_path = temp_folder + 'w2wt_wt.wt2.out'

    reference = read_corpus_conll(reference_path)
    # split=True reduces each word+tag output symbol to its tag
    hypothesis = read_fst4conll(hypothesis_path, split=True)
    return show_metrics(reference, hypothesis)

In [509]:
def compute_SCLM(min_freq, ngram_degree):
    """
    run the whole FST pipeline: training data, symbol tables, language model,
    word -> word+tag transducer, test FSTs, composition and evaluation
    :param min_freq: minimum symbol frequency, passed to create_training_data
    :param ngram_degree: n-gram order, passed to train_ngram_model
    :return: whatever evaluate_fst returns
    """
    # training side: symbols and language model
    create_training_data(min_freq=min_freq)
    create_symbol_table()
    compile_FAR()
    train_ngram_model(ngram_degree)

    # transducer built from the output symbol table
    symbol_table = temp_folder + 'osyms.wt.txt'
    specification = temp_folder + 'w2wt_wt.txt'
    make_w2t_wt(symbol_table, out=specification)
    compile_w2wt_wt()

    # test side: one composition per extracted FST, then scoring
    compile_strings_and_extract()
    compose_fst()

    metrics = evaluate_fst()
    return metrics

## HMM functions

In [510]:
from nltk.corpus.reader.conll import ConllChunkCorpusReader
import nltk.tag.hmm as hmm
import re

In [511]:
def parse_iob(t):
    """
    split a tag at its first '-'
    :param t: tag string
    :return: (part before the first '-', part after it), or (t, None) if the pattern
             does not match (e.g. the tag contains no '-')
    """
    match = re.match(r'^([^-]*)-(.*)$', t)
    if match is None:
        return (t, None)
    return match.groups()

def get_chunks(corpus_file, fs="\t", otag="O"):
    """
    collect the distinct labels (second element returned by parse_iob) of a corpus
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :param otag: tag to skip
    :return: set of labels taken from the last column of every token whose tag is not otag
    """
    chunks = set()
    for sent in read_corpus_conll(corpus_file, fs=fs):
        for token in sent:
            tag = token[-1]
            if tag != otag:
                chunks.add(parse_iob(tag)[1])
    return chunks

In [512]:
def load_dataset_hmm():
    """
    open the train and test conll files of the dataset with NLTK's ConllChunkCorpusReader
    :return: (train reader, test reader)
    """
    # the labels found in the training file are passed to both readers
    concepts = sorted(get_chunks(dpath + 'NL2SparQL4NLU.train.conll.txt'))

    file_ids = (r'NL2SparQL4NLU.train.conll.txt', r'NL2SparQL4NLU.test.conll.txt')
    trn_data, tst_data = (
        ConllChunkCorpusReader(dpath, file_id, concepts) for file_id in file_ids
    )
    return trn_data, tst_data

In [513]:
def train_hmm(trn_data):
    """
    train an NLTK HMM tagger
    :param trn_data: corpus reader providing tagged_sents()
    :return: the tagger returned by HiddenMarkovModelTrainer.train
    """
    return hmm.HiddenMarkovModelTrainer().train(trn_data.tagged_sents())

    
def evaluate_hmm(hmm_tagger, tst_data):
    """
    tag the test sentences with the HMM tagger and report the metrics
    :param hmm_tagger: trained tagger providing tag()
    :param tst_data: corpus reader providing sents() and tagged_sents()
    :return: whatever show_metrics returns
    """
    # the former hmm_tagger.evaluate(...) call was dropped: its result was never used,
    # it tagged the whole test set a second time, and show_metrics prints accuracy anyway
    refs = list(tst_data.tagged_sents())
    hyps = [hmm_tagger.tag(s) for s in tst_data.sents()]

    return show_metrics(refs, hyps)

In [514]:
def compute_HMM():
    """
    load the dataset, train the HMM tagger on the training part and evaluate it on the test part
    :return: whatever evaluate_hmm returns
    """
    train_reader, test_reader = load_dataset_hmm()
    tagger = train_hmm(train_reader)
    metrics = evaluate_hmm(tagger, test_reader)
    return metrics

## Let's start computing

In [515]:
# FST pipeline with min_freq=2 and ngram_degree=2
compute_SCLM(2, 2)

Accuracy: 0.9158353238724182
Precision: 0.7532210109018831
Recall: 0.696608615948671
F1-score: 0.7238095238095239


In [516]:
# train and evaluate the NLTK HMM tagger on the dataset files
compute_HMM()

Accuracy: 0.9086693831670648
Precision: 0.7719112988384371
Recall: 0.6700274977085243
F1-score: 0.717369970559372
