In [992]:
%config IPCompleter.greedy=True

In [993]:
# to import conll
import os
import subprocess
import sys
sys.path.insert(0, os.path.abspath('./src/'))

from conll import evaluate
# for nice tables
import pandas as pd

In [994]:
def execute(cmd):
    return subprocess.check_output(cmd, shell=True).decode(sys.stdout.encoding)

In [995]:
# Let's define a function to simplify working with data
# get column from loaded corpus (tokens are tuples)
def get_column(corpus, column=-1):
    return [[word[column] for word in sent] for sent in corpus]

In [996]:
# modified version to support fst-output
def read_fst4conll(fst_file, fs="\t", oov='<unk>', otag='O', sep='+', split=False):
    """
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :param oov: token to map to otag (we need to get rid of <unk> in labels)
    :param otag: otag symbol
    :param sep: 
    :param split:
    :return: corpus 
    """
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    for line in open(temp_folder + fst_file):
        line = line.strip()
        if len(line.strip()) > 0:
            feats = tuple(line.strip().split(fs))
            # arc has minimum 3 columns, else final state
            if len(feats) >= 3:
                ist = feats[2]  # 3rd column (input)
                ost = feats[3]  # 4th column (output)
                # replace '<unk>' with 'O'
                ost = otag if ost == oov else ost
                # ignore for now
                ost = ost.split(sep)[1] if split and ost != otag else ost
                
                words.append((ist, ost))
            else:
                sents.append(words)
                words = []
        else:
            if len(words) > 0:
                sents.append(words) 
                words = []
    return sents

In [1034]:
def read_corpus_conll(corpus_file, fs="\t"):
    """
    read corpus in CoNLL format
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :return: corpus
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    for line in open(corpus_file):
        line = line.strip()
        if len(line.strip()) > 0:
            feats = tuple(line.strip().split(fs))
            if not featn:
                featn = len(feats)
            elif featn != len(feats) and len(feats) != 0:
                raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))

            words.append(feats)
        else:
            if len(words) > 0:
                sents.append(words)
                words = []
    return sents

In [998]:
def compute_frequency_list(corpus):
    """
    create frequency list for a corpus
    :param corpus: corpus as list of lists
    """
    frequencies = {}
    for sent in corpus:
        for token in sent:
            frequencies[token] = frequencies.setdefault(token, 0) + 1
    return frequencies

In [999]:
def cutoff(corpus, tf_min=2):
    """
    apply min cutoffs
    :param tf_min: minimum token frequency for lexicon elements (below removed); default 2
    :return: lexicon as set
    """
    frequencies = compute_frequency_list(corpus)
    return sorted([token for token, frequency in frequencies.items() if frequency >= tf_min])

In [1000]:
temp_folder = 'tmp/'
wdir = temp_folder + 'wdir_wt/'

In [1001]:
dpath = 'dataset/'

def init():
    execute('mkdir -p ' + temp_folder)

    execute('cp {}NL2SparQL4NLU.train.utterances.txt {}trn.txt'.format(dpath, temp_folder))
    execute('cp {}NL2SparQL4NLU.test.utterances.txt {}tst.txt'.format(dpath, temp_folder))

    execute('cp {}NL2SparQL4NLU.train.conll.txt {}trn.conll'.format(dpath, temp_folder))
    execute('cp {}NL2SparQL4NLU.test.conll.txt {}tst.conll'.format(dpath, temp_folder))

In [1002]:
init()

## SCLM functions

In [1003]:
from enum import Enum

class Baseline(Enum):
    none = 0
    random_path = 1
    output_symbol_priors = 2
    MLE = 3

In [1037]:
def create_training_data(min_freq=2):
    # create training data in utterance-per-line format for output symbols (w+t)
    trn = read_corpus_conll(temp_folder + 'trn.conll')
    wt_sents = [["+".join(w) for w in s] for s in trn]
    wt_osyms = cutoff(wt_sents, min_freq)
    wt_isyms = [w.split('+')[0] for w in wt_osyms]

    
    with open(temp_folder + 'trn.wt.txt', 'w') as f:
        for s in wt_sents:
            f.write(" ".join(s) + "\n")

    with open(temp_folder + 'osyms.wt.lst.txt', 'w') as f:
        f.write("\n".join(wt_osyms) + "\n")

    with open(temp_folder + 'isyms.wt.lst.txt', 'w') as f:
        f.write("\n".join(wt_isyms) + "\n")
        
        
    tags = get_column(trn, column=-1)
    with open(temp_folder + 'trn.t.txt', 'w') as f:
        for s in tags:
            f.write(" ".join(s) + "\n")

    t_osyms = list(set([ x.split('+')[1] for x in wt_osyms ]))
    with open(temp_folder + 'osyms.t.lst.txt', 'w') as f:
        f.write("\n".join(t_osyms) + "\n")

In [1005]:
def create_symbol_table():
    execute('ngramsymbols {0}osyms.wt.lst.txt {0}osyms.wt.txt'.format(temp_folder))
    execute('ngramsymbols {0}isyms.wt.lst.txt {0}isyms.wt.txt'.format(temp_folder))
    
    execute('ngramsymbols {0}osyms.t.lst.txt {0}osyms.t.txt'.format(temp_folder))

In [1039]:
def create_msyms(isyms, osyms, out_msyms, out_trn):
    execute('cat {0}{1} {0}{2} | cut -f 1 | sort | uniq > {0}msyms.m.lst.txt'
            .format(temp_folder, isyms, osyms))
    execute('ngramsymbols {0}msyms.m.lst.txt {0}{1}'.format(temp_folder, out_msyms))

    execute("cat {0}trn.conll | sed '/^$/d' | awk '{{print $2,$1}}' > {0}{1}".format(temp_folder, out_trn))

In [1007]:
def compile_FAR(symbols, inp, out):
    execute("farcompilestrings \
        --symbols={0}{1} \
        --keep_symbols \
        --unknown_symbol='<unk>' \
        {0}{2} {0}{3}".format(temp_folder, symbols, inp, out))

In [1008]:
def train_ngram_model(ngram_order, inp, out):
    execute('ngramcount --order={1} {0}{2} {0}trn.cnt'.format(temp_folder, ngram_order, inp))
    execute('ngrammake {0}trn.cnt {0}{1}'.format(temp_folder, out))
    return execute('ngraminfo {0}{1}'.format(temp_folder, out))

In [1009]:
def create_ngram_probs(symbols, inp, out):
    execute('ngramprint \
        --symbols={0}{1}\
        --negativelogs \
        {0}{2} {0}{3}'.format(temp_folder, symbols, inp, out))

In [1010]:
def make_w2t_wt(isyms, sep='+', out=''):
    special = {'<epsilon>', '<s>', '</s>'}
    oov = '<unk>'  # unknown symbol
    state = '0'    # wfst specification state
    fs = " "       # wfst specification column separator
    
    ist = sorted(list(set([line.strip().split("\t")[0] for line in open(temp_folder + isyms, 'r')]) - special))
    
    
    with open(temp_folder + out, 'w') as f:
        for e in ist:
            f.write(fs.join([state, state, e.split(sep)[0], e]) + "\n")
        f.write(state + "\n")

In [1011]:
def make_w2t(isyms, osyms, out):
    special = {'<epsilon>', '<s>', '</s>'}
    oov = '<unk>'  # unknown symbol
    state = '0'    # wfst specification state
    fs = " "       # wfst specification column separator
    
    ist = sorted(list(set([line.strip().split("\t")[0] for line in open(temp_folder + isyms, 'r')]) - special))
    ost = sorted(list(set([line.strip().split("\t")[0] for line in open(temp_folder + osyms, 'r')]) - special))
    
    with open(temp_folder + out, 'w') as f:
        for i in range(len(ist)):
            for j in range(len(ost)):
                f.write(fs.join([state, state, ist[i], ost[j]]) + "\n")
        f.write(state + "\n")

In [1012]:
def make_w2t_mle(probs, out):
    special = {'<epsilon>', '<s>', '</s>'}
    oov = '<unk>'  # unknown symbol
    state = '0'    # wfst specification state
    fs = " "       # wfst specification column separator
    otag = 'O'
    mcn = 3        # minimum column number
    
    lines = [line.strip().split("\t") for line in open(temp_folder + probs, 'r')]

    with open(temp_folder + out, 'w') as f:
        for line in lines:
            ngram = line[0]
            ngram_words = ngram.split()  # by space
            if len(ngram_words) == 2:
                if set(ngram_words).isdisjoint(set(special)):
                    if ngram_words[0] in [otag, oov]:
                        f.write(fs.join([state, state] + ngram_words + [line[1]]) + "\n")
                    elif ngram_words[0].startswith("B-") or ngram_words[0].startswith("I-"):
                        f.write(fs.join([state, state] + line) + "\n")
        f.write(state + "\n")

In [1013]:
def compile_w2wt_wt(isyms, osyms, inp, out, invert=False):
    execute('fstcompile \
        --isymbols={0}{1} \
        --osymbols={0}{2} \
        --keep_isymbols \
        --keep_osymbols \
        {0}{3} {0}{4}'.format(temp_folder, isyms, osyms, inp, out))
    
    if invert:
        execute('fstinvert {0}{1} {0}{1}'.format(temp_folder, out))

In [1014]:
def compile_testing_and_extract(symbols, inp):
    execute("farcompilestrings \
        --symbols={0}{1} \
        --keep_symbols \
        --initial_symbols=false \
        --unknown_symbol='<unk>' \
        {0}{2} {0}tst.wt.far".format(temp_folder, symbols, inp))

    execute('rm -r -f ' + wdir)
    execute('mkdir ' + wdir)

    execute('farextract --filename_prefix="{1}" {0}tst.wt.far'.format(temp_folder, wdir))

In [1015]:
def compose_fst(baseline:Baseline, inp_bin, model, out):
    fst_files = [f for f in os.listdir(wdir) if os.path.isfile(os.path.join(wdir, f))]

    fst_out = ''
    
    path_strategy = 'fstshortestpath'
    if baseline == Baseline.random_path:
        path_srtategy = 'fstrandgen'

    for f in sorted(fst_files):
        tmp = execute('fstcompose {1}{2} {0}{4} | fstcompose - {0}{5} |\
            {3} | fstrmepsilon | fsttopsort | fstprint --isymbols={0}isyms.wt.txt'
                  .format(temp_folder, wdir, f, path_strategy, inp_bin, model))
            
        fst_out += tmp

    with open(temp_folder + out, 'w+') as f:
        f.write(fst_out)

### Evaluation functions

In [1016]:
def compute_accuracy(refs, hyps):
    assert len(refs) == len(hyps), "Different sizes {} - {}".format(len(refs), len(hyps))

    correct_tags = 0
    wrong_tags = 0
    
    for i in range(len(refs)):
        ref = refs[i]
        hyp = hyps[i]
        assert len(ref) == len(hyp)        
        
        for j in range(len(ref)):
            if hyp[j][1] == ref[j][1]:
                correct_tags += 1
            else:
                wrong_tags += 1


    return correct_tags / (correct_tags + wrong_tags)

In [1058]:
def show_metrics(refs, hyps, show_table=False):
    acc = compute_accuracy(refs, hyps)
    print("\tAccuracy:\t", acc)

    results = evaluate(refs, hyps)

    print("\tPrecision:\t", results['total']['p'])
    print("\tRecall:\t\t", results['total']['r'])    
    print("\tF1-score:\t", results['total']['f'])
    
    if show_table:
        pd_tbl = pd.DataFrame().from_dict(results, orient='index')
        return pd_tbl.round(decimals=3)

In [1033]:
def evaluate_fst(show_table, refs_file, hyps_file, split):
    refs = read_corpus_conll(temp_folder + refs_file)
    hyps = read_fst4conll(hyps_file, split=split)
    
    
    return show_metrics(refs, hyps, show_table)

In [1051]:
def compute_SCLM(min_freq, ngram_degree, baseline=Baseline.none, prob_ngram_degree=2, show_table=False):
    print("\n" + "-" * 50 + "\n")
    print("SCLM | min freq: {} | ngram degree: {} | baseline: {} | prob deg: {}"
          .format(min_freq, ngram_degree, baseline, prob_ngram_degree))
    
    trn_ds = 'trn.wt.txt'

    isyms = 'isyms.wt.txt'
    osyms = 'osyms.wt.txt'
    
    split = True
    

    if baseline in [Baseline.output_symbol_priors, Baseline.MLE]:
        trn_ds = 'trn.t.txt'
        osyms = 'osyms.t.txt'
        split = False


    create_training_data(min_freq=min_freq)
    create_symbol_table()
    compile_FAR(osyms, inp=trn_ds, out='trn.far')
    train_ngram_model(ngram_degree, inp='trn.far', out='model.lm')

    if baseline in [Baseline.none]:
        make_w2t_wt(osyms, out='w2wt.txt')
    elif baseline in [Baseline.random_path, Baseline.output_symbol_priors]:
        make_w2t(isyms, osyms, out='w2wt.txt')

    if baseline != Baseline.MLE:
        compile_w2wt_wt(isyms, osyms, inp='w2wt.txt', out='w2wt.bin')
    else:
        #create ngram probabilities
        create_msyms(isyms, osyms, out_msyms='msyms.t.txt', out_trn='trn.w2t.txt')
        compile_FAR('msyms.t.txt', inp='trn.w2t.txt', out='trn.prob.far')
        train_ngram_model(prob_ngram_degree, inp='trn.prob.far', out='model.prob.lm')
        create_ngram_probs(symbols='msyms.t.txt', inp='model.prob.lm', out='trn.w2t.probs')
        
        make_w2t_mle('trn.w2t.probs', out="w2t_mle.txt")
        compile_w2wt_wt(osyms, isyms, inp='w2t_mle.txt', out='w2wt.bin', invert=True)

    compile_testing_and_extract(symbols=isyms, inp='tst.txt')

    compose_fst(baseline, inp_bin='w2wt.bin', model='model.lm', out='w2wt.wt2.out')

    return evaluate_fst(show_table, refs_file='tst.conll', hyps_file='w2wt.wt2.out', split=split)

## HMM functions

In [1020]:
from nltk.corpus.reader.conll import ConllChunkCorpusReader
import nltk.tag.hmm as hmm
import re

In [1021]:
def parse_iob(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, None)

def get_chunks(corpus_file, fs="\t", otag="O"):
    sents = read_corpus_conll(corpus_file, fs=fs)
    return set([parse_iob(token[-1])[1] for sent in sents for token in sent if token[-1] != otag])

In [1022]:
def load_dataset_hmm():
    trn = dpath + 'NL2SparQL4NLU.train.conll.txt'
    concepts = sorted(get_chunks(trn))

    trn_data = ConllChunkCorpusReader(dpath,  r'NL2SparQL4NLU.train.conll.txt', concepts)
    tst_data = ConllChunkCorpusReader(dpath,  r'NL2SparQL4NLU.test.conll.txt', concepts)

    return trn_data, tst_data

In [1023]:
def train_hmm(trn_data):
    hmm_model = hmm.HiddenMarkovModelTrainer()
    hmm_tagger = hmm_model.train(trn_data.tagged_sents())
    return hmm_tagger

    
def evaluate_hmm(hmm_tagger, tst_data):
    accuracy = hmm_tagger.evaluate(tst_data.tagged_sents())
    
    refs = [s for s in tst_data.tagged_sents()]
    hyps = [hmm_tagger.tag(s) for s in tst_data.sents()]

    return show_metrics(refs, hyps)

In [1024]:
def compute_HMM():
    trn_data, tst_data = load_dataset_hmm()
    
    hmm_tagger = train_hmm(trn_data)
    return evaluate_hmm(hmm_tagger, tst_data)

## Let's start computing

In [1035]:
compute_HMM()

Accuracy: 0.9086693831670648
Precision: 0.7719112988384371
Recall: 0.6700274977085243
F1-score: 0.717369970559372


In [1061]:
for min_freq in range(2, 5):
    for ngram_degree in range(1, 6):
        compute_SCLM(min_freq=min_freq, ngram_degree=ngram_degree, baseline=Baseline.none)
        compute_SCLM(min_freq=min_freq, ngram_degree=ngram_degree, baseline=Baseline.random_path)
        compute_SCLM(min_freq=min_freq, ngram_degree=ngram_degree, baseline=Baseline.output_symbol_priors)
        for prob_ngram_degree in range(2, 5):
            compute_SCLM(min_freq=min_freq, ngram_degree=ngram_degree, baseline=Baseline.MLE, 
                         prob_ngram_degree=prob_ngram_degree)


--------------------------------------------------

SCLM | min freq: 2 | ngram degree: 1 | baseline: Baseline.none | prob deg: 2
	Accuracy:	 0.8804271462694956
	Precision:	 0.553763440860215
	Recall:		 0.5664527956003667
	F1-score:	 0.5600362483008609

--------------------------------------------------

SCLM | min freq: 2 | ngram degree: 1 | baseline: Baseline.random_path | prob deg: 2
	Accuracy:	 0.7215118729801883
	Precision:	 1
	Recall:		 0.0
	F1-score:	 0.0

--------------------------------------------------

SCLM | min freq: 2 | ngram degree: 1 | baseline: Baseline.output_symbol_priors | prob deg: 2
	Accuracy:	 0.7215118729801883
	Precision:	 1
	Recall:		 0.0
	F1-score:	 0.0

--------------------------------------------------

SCLM | min freq: 2 | ngram degree: 1 | baseline: Baseline.MLE | prob deg: 2
	Accuracy:	 0.880567654910777
	Precision:	 0.5546594982078853
	Recall:		 0.5673693858845096
	F1-score:	 0.5609424558223832

--------------------------------------------------

SCLM 

	Accuracy:	 0.7215118729801883
	Precision:	 1
	Recall:		 0.0
	F1-score:	 0.0

--------------------------------------------------

SCLM | min freq: 3 | ngram degree: 1 | baseline: Baseline.MLE | prob deg: 2
	Accuracy:	 0.8679218771954476
	Precision:	 0.558736426456071
	Recall:		 0.5187901008249313
	F1-score:	 0.5380228136882129

--------------------------------------------------

SCLM | min freq: 3 | ngram degree: 1 | baseline: Baseline.MLE | prob deg: 3
	Accuracy:	 0.8679218771954476
	Precision:	 0.558736426456071
	Recall:		 0.5187901008249313
	F1-score:	 0.5380228136882129

--------------------------------------------------

SCLM | min freq: 3 | ngram degree: 1 | baseline: Baseline.MLE | prob deg: 4
	Accuracy:	 0.8679218771954476
	Precision:	 0.558736426456071
	Recall:		 0.5187901008249313
	F1-score:	 0.5380228136882129

--------------------------------------------------

SCLM | min freq: 3 | ngram degree: 2 | baseline: Baseline.none | prob deg: 2
	Accuracy:	 0.9016439511029928
	Preci

	Accuracy:	 0.8541520303498665
	Precision:	 0.5374592833876222
	Recall:		 0.4537121906507791
	F1-score:	 0.4920477137176939

--------------------------------------------------

SCLM | min freq: 4 | ngram degree: 2 | baseline: Baseline.none | prob deg: 2
	Accuracy:	 0.8882956301812561
	Precision:	 0.7283372365339579
	Recall:		 0.5701191567369386
	F1-score:	 0.6395886889460154

--------------------------------------------------

SCLM | min freq: 4 | ngram degree: 2 | baseline: Baseline.random_path | prob deg: 2
	Accuracy:	 0.4964170296473233
	Precision:	 0.026865671641791045
	Recall:		 0.016498625114573784
	F1-score:	 0.020442930153321975

--------------------------------------------------

SCLM | min freq: 4 | ngram degree: 2 | baseline: Baseline.output_symbol_priors | prob deg: 2
	Accuracy:	 0.7215118729801883
	Precision:	 1
	Recall:		 0.0
	F1-score:	 0.0

--------------------------------------------------

SCLM | min freq: 4 | ngram degree: 2 | baseline: Baseline.MLE | prob deg: 2
	Ac