In [67]:
# to import conll
import os
import sys
sys.path.insert(0, os.path.abspath('./src/'))

from conll import evaluate
# for nice tables
import pandas as pd

In [68]:
# modified version to support fst-output
def read_fst4conll(fst_file, fs="\t", oov='<unk>', otag='O', sep='+', split=False):
    """
    Read a file of printed FSTs and turn it into a corpus of (input, output) pairs.

    A line with at least 4 columns is an arc: the 3rd column is the input
    symbol and the 4th the output symbol (any further column is ignored).
    A non-empty line with fewer columns is treated as a final state and
    closes the current sentence; a blank line closes it as well.

    :param fst_file: path to the file holding the printed FSTs
    :param fs: field separator
    :param oov: token to map to otag (we need to get rid of <unk> in labels)
    :param otag: otag symbol
    :param sep: separator between the components of an output symbol
    :param split: if True, replace every output symbol other than otag by its
                  second sep-separated component; a symbol without sep is kept as is
    :return: corpus as a list of sentences, each a list of (input, output) tuples
    """
    sents = []  # list to hold words list sequences
    words = []  # list to hold (input, output) tuples of the current sentence

    # the context manager closes the file even if parsing fails
    with open(fst_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    sents.append(words)
                    words = []
                continue

            feats = line.split(fs)
            # an arc needs the 4th column (output) to be present, else final state
            if len(feats) >= 4:
                ist = feats[2]  # 3rd column (input)
                ost = feats[3]  # 4th column (output)
                if ost == oov:
                    # replace '<unk>' with 'O'
                    ost = otag
                elif split and ost != otag:
                    parts = ost.split(sep)
                    # a symbol without sep has no second component to extract
                    if len(parts) > 1:
                        ost = parts[1]
                words.append((ist, ost))
            else:
                sents.append(words)
                words = []

    # do not lose a sentence that is not terminated by a final state / blank line
    if words:
        sents.append(words)
    return sents

In [69]:
def read_corpus_conll(corpus_file, fs="\t"):
    """
    read corpus in CoNLL format

    Every non-blank line is one token whose columns are separated by fs;
    blank lines separate sentences. All rows must have as many columns as
    the first row of the file.

    :param corpus_file: corpus in conll format
    :param fs: field separator
    :return: corpus as a list of sentences, each a list of column tuples
    :raises ValueError: if a row has a different number of columns than the first row
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    # the context manager closes the file even if the consistency check raises
    with open(corpus_file) as f:
        for line in f:
            line = line.strip()
            if line:
                feats = tuple(line.split(fs))
                if featn is None:
                    featn = len(feats)
                elif featn != len(feats):
                    raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))
                words.append(feats)
            elif words:
                sents.append(words)
                words = []

    # keep the last sentence when the file does not end with a blank line
    if words:
        sents.append(words)
    return sents

In [70]:
def compute_frequency_list(corpus):
    """
    Count how often every token occurs in a corpus.
    :param corpus: corpus as list of lists (tokens are used as dict keys)
    :return: dict mapping each token to its number of occurrences
    """
    counts = {}
    # walk the corpus as one flat stream of tokens
    for tok in (t for sentence in corpus for t in sentence):
        if tok in counts:
            counts[tok] += 1
        else:
            counts[tok] = 1
    return counts

In [71]:
def cutoff(corpus, tf_min=2):
    """
    Build a lexicon from the tokens that are frequent enough.
    :param corpus: corpus as list of lists
    :param tf_min: minimum token frequency for lexicon elements (below removed); default 2
    :return: lexicon as a sorted list
    """
    lexicon = []
    for token, count in compute_frequency_list(corpus).items():
        if count >= tf_min:
            lexicon.append(token)
    lexicon.sort()
    return lexicon

In [72]:
%%bash
# Copy the dataset's train/test files under short names into the working folder.
dpath='dataset/NL2SparQL4NLU'
temp_folder='tmp'

mkdir -p "${temp_folder}"

# utterance files
cp "${dpath}.train.utterances.txt" "${temp_folder}/trn.txt"
cp "${dpath}.test.utterances.txt" "${temp_folder}/tst.txt"

# CoNLL files
cp "${dpath}.train.conll.txt" "${temp_folder}/trn.conll"
cp "${dpath}.test.conll.txt" "${temp_folder}/tst.conll"

## Let's start

In [73]:
# working folder for intermediate files; keep the trailing slash, because paths
# are built by plain string concatenation (e.g. temp_folder + 'trn.conll')
temp_folder = 'tmp/'

In [74]:
# create training data in utterance-per-line format for output symbols (w+t)
trn = read_corpus_conll(temp_folder + 'trn.conll')

# every token becomes a single symbol: its columns joined by '+'
wt_sents = [list(map("+".join, sent)) for sent in trn]
# output lexicon: joined symbols that survive the default frequency cutoff
wt_osyms = cutoff(wt_sents)
# input lexicon: the part before the first '+' of every retained symbol
# (one entry per output symbol, so repeated words are kept)
wt_isyms = [sym.partition('+')[0] for sym in wt_osyms]

# one utterance per line, symbols separated by a space
with open(temp_folder + 'trn.wt.txt', 'w') as f:
    for s in wt_sents:
        print(" ".join(s), file=f)

# one symbol per line
with open(temp_folder + 'osyms.wt.lst.txt', 'w') as f:
    print("\n".join(wt_osyms), file=f)

with open(temp_folder + 'isyms.wt.lst.txt', 'w') as f:
    print("\n".join(wt_isyms), file=f)

In [75]:
%%bash
temp_folder='tmp'

# run ngramsymbols on each symbol list: output side first, then input side
for side in osyms isyms
do
    ngramsymbols "$temp_folder/$side.wt.lst.txt" "$temp_folder/$side.wt.txt"
done

In [76]:
%%bash
temp_folder='tmp'
corpus="$temp_folder/trn.wt.txt"
far="$temp_folder/trn.wt.far"
counts="$temp_folder/trn.wt.cnt"
lm="$temp_folder/wt2.lm"

# compile data into FAR
farcompilestrings \
    --symbols="$temp_folder/osyms.wt.txt" \
    --keep_symbols \
    --unknown_symbol='<unk>' \
    "$corpus" "$far"

# train ngram model: order-2 counts, then the model, then print its summary
ngramcount --order=2 "$far" "$counts"
ngrammake "$counts" "$lm"
ngraminfo "$lm"

# of states                                       1096
# of ngram arcs                                   6179
# of backoff arcs                                 1095
initial state                                     1
unigram state                                     0
# of final states                                 533
ngram order                                       2
# of 1-grams                                      1095
# of 2-grams                                      5617
well-formed                                       y
normalized                                        y


In [77]:
def make_w2t_wt(isyms, sep='+', out=temp_folder+'w2wt.tmp'):
    """
    Write the text specification of a single-state transducer.

    For every symbol of the symbol table (special symbols excluded) one arc
    from state 0 to state 0 is written, with the part of the symbol before
    the first sep as input label and the whole symbol as output label.
    State 0 is also written as the final state.

    :param isyms: path to a symbol table file (symbol in the first tab-separated column)
    :param sep: separator between the components of a symbol
    :param out: path of the specification file to write
    """
    special = {'<epsilon>', '<s>', '</s>'}
    state = '0'    # wfst specification state
    fs = " "       # wfst specification column separator

    # read the table inside a context manager so the handle is closed
    with open(isyms, 'r') as f:
        symbols = {line.strip().split("\t")[0] for line in f}
    # '' comes from blank lines and would produce an arc with empty labels
    ist = sorted(symbols - special - {''})

    with open(out, 'w') as f:
        for e in ist:
            f.write(fs.join([state, state, e.split(sep)[0], e]) + "\n")
        f.write(state + "\n")

In [78]:
# write the transducer specification built from the output symbol table
make_w2t_wt(temp_folder+'osyms.wt.txt', out=temp_folder+'w2wt_wt.txt')

In [79]:
%%bash
temp_folder='tmp'
spec="$temp_folder/w2wt_wt.txt"
fst="$temp_folder/w2wt_wt.bin"

# Let's compile it
fstcompile \
    --isymbols="$temp_folder/isyms.wt.txt" \
    --osymbols="$temp_folder/osyms.wt.txt" \
    --keep_isymbols \
    --keep_osymbols \
    "$spec" "$fst"

# show only the first 8 lines of the summary
fstinfo "$fst" | head -n 8

fst type                                          vector
arc type                                          standard
input symbol table                                tmp/isyms.wt.txt
output symbol table                               tmp/osyms.wt.txt
# of states                                       1
# of arcs                                         1094
initial state                                     0
# of final states                                 1


In [102]:
%%bash
temp_folder='tmp'

# compile the test utterances into a FAR using the input symbol table
farcompilestrings \
    --symbols="$temp_folder/isyms.wt.txt" \
    --keep_symbols \
    --initial_symbols=false \
    --unknown_symbol='<unk>' \
    "$temp_folder/tst.txt" "$temp_folder/tst.wt.far"

wdir="$temp_folder/wdir_wt"
mkdir -p "$wdir"

# extract the FAR's entries into separate files under $wdir
farextract --filename_prefix="$wdir/" "$temp_folder/tst.wt.far"

# keep one extracted file as an example; it has to be placed inside
# $temp_folder, because that is where it is read from afterwards
cp "$wdir/tst.txt-0001" "$temp_folder/sent.wt.fsa"

fstprint "$temp_folder/sent.wt.fsa"

In [95]:
%%bash
temp_folder='tmp'

# compose the sentence with the transducer and then with the language model,
# take the shortest path, remove epsilons, sort topologically and print
fstcompose "$temp_folder/sent.wt.fsa" "$temp_folder/w2wt_wt.bin" |
    fstcompose - "$temp_folder/wt2.lm" |
    fstshortestpath |
    fstrmepsilon |
    fsttopsort |
    fstprint

0	1	star	star+O	7.93891811
1	2	of	of+O	1.55421352
2	3	<unk>	<unk>	2.84977818
3	1.10391009


### Evaluation

In [82]:
%%bash
temp_folder='tmp'
wdir="$temp_folder/wdir_wt"

# Decode every file in $wdir and collect the printed results in one file.
# A quoted glob is used instead of parsing `ls` output, so file names are
# never subject to word splitting or pathname expansion.
for f in "$wdir"/*
do
    # an unmatched glob is left as the literal pattern: skip it
    [ -e "$f" ] || continue
    fstcompose "$f" "$temp_folder/w2wt_wt.bin" | fstcompose - "$temp_folder/wt2.lm" |\
        fstshortestpath | fstrmepsilon | fsttopsort | fstprint --isymbols="$temp_folder/isyms.wt.txt"
done > "$temp_folder/w2wt_wt.wt2.out"

In [83]:
# reference corpus and decoder output (output symbols split into their second component)
refs = read_corpus_conll(temp_folder + 'tst.conll')
hyps = read_fst4conll(temp_folder + 'w2wt_wt.wt2.out', split=True)

results = evaluate(refs, hyps)

# one row per key of the results dict, values rounded for display
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(3)

Unnamed: 0,p,r,f,s
movie.location,0.0,0.0,0.0,7
character.name,0.667,0.267,0.381,15
movie.gross_revenue,0.0,0.0,0.0,5
director.nationality,1.0,0.0,0.0,1
award.category,1.0,0.0,0.0,2
movie.type,1.0,0.0,0.0,4
award.ceremony,0.714,0.714,0.714,7
actor.nationality,1.0,1.0,1.0,1
movie.release_date,0.412,0.483,0.444,29
movie.star_rating,1.0,0.0,0.0,1
