Data and metrics from the is13 repo, converted to Python 3

chsasank · Jun 3, 2016 · 9e9ec55 · 9e9ec55
commit 9e9ec55
Show file tree

Hide file tree

Showing 4 changed files with 178 additions and 0 deletions.
diff --git a/data/__init__.py b/data/__init__.py
diff --git a/data/load.py b/data/load.py
@@ -0,0 +1,91 @@
+from __future__ import print_function
+import gzip
+try: #python2
+    import cPickle as pickle
+    from urllib import urlretrieve
+except ImportError: #python3
+    import pickle
+    from urllib.request import urlretrieve
+
+import os
+import random
+from os.path import isfile
+
+# Prefix prepended to every data file name used in this module; read from
+# the ATISDATA environment variable and empty when that variable is unset.
+PREFIX = os.getenv('ATISDATA', '')
+
+def download(origin):
+    '''
+    Fetch the file at URL `origin` and store it in the current working
+    directory, named after the last '/'-separated component of the URL.
+    '''
+    print('Downloading data from %s' % origin)
+    target = origin.rsplit('/', 1)[-1]
+    urlretrieve(origin, target)
+
+def download_dropbox():
+    '''
+    Fetch atis.pkl from its Dropbox location by shelling out to wget.
+    The file is written to atis.pkl in the current working directory.
+    '''
+    url = 'https://www.dropbox.com/s/3lxl9jsbw0j7h8a/atis.pkl?dl=0'
+    print('Downloading data from %s' % url)
+    os.system('wget -O atis.pkl %s' % url)
+
+def load_dropbox(filename):
+    '''
+    Return a binary file object for the uncompressed pickle `filename`,
+    downloading it from Dropbox first when it does not exist yet.
+
+    download_dropbox() always writes ./atis.pkl, so when `filename` points
+    somewhere else (e.g. ATISDATA is set) the fresh download is moved to
+    `filename` before it is opened.
+
+    The caller is responsible for closing the returned file.
+    '''
+    if not isfile(filename):
+        download_dropbox()
+        downloaded = 'atis.pkl'
+        if os.path.abspath(downloaded) != os.path.abspath(filename):
+            import shutil  # local import: only needed on this rare path
+            shutil.move(downloaded, filename)
+    return open(filename, 'rb')
+
+def load_udem(filename):
+    '''
+    Return a gzip file object (binary mode) for `filename`, downloading
+    the file from the UdeM server first when it does not exist yet.
+
+    Only the base name of `filename` is appended to the server URL, so a
+    directory prefix (e.g. from ATISDATA) does not leak into the URL, and
+    the download is saved directly at `filename`, where it is then opened.
+
+    The caller is responsible for closing the returned file.
+    '''
+    if not isfile(filename):
+        origin = ('http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/'
+                  + os.path.basename(filename))
+        print('Downloading data from %s' % origin)
+        urlretrieve(origin, filename)
+    return gzip.open(filename, 'rb')
+
+
+def atisfull():
+    '''
+    Load the full ATIS pickle (PREFIX + 'atis.pkl', downloaded on demand).
+
+    Returns the tuple (train_set, test_set, dicts) stored in the pickle.
+
+    NOTE: the pickle may have just been downloaded; unpickling executes
+    whatever the file contains, so only use this with a trusted source.
+    '''
+    f = load_dropbox(PREFIX + 'atis.pkl')
+    try:
+        try:
+            train_set, test_set, dicts = pickle.load(f)
+        except UnicodeDecodeError:
+            # The failed attempt may already have consumed part of the
+            # stream, so rewind before retrying with the latin1 decoding.
+            f.seek(0)
+            train_set, test_set, dicts = pickle.load(f, encoding='latin1')
+    finally:
+        f.close()
+    return train_set, test_set, dicts
+
+def atisfold(fold):
+    '''
+    Load one of the five ATIS folds (downloaded on demand).
+
+    fold :: integer in 0..4 selecting 'atis.fold<fold>.pkl.gz'
+
+    Returns the tuple (train_set, valid_set, test_set, dicts) stored in
+    the pickle.
+
+    NOTE: the pickle may have just been downloaded; unpickling executes
+    whatever the file contains, so only use this with a trusted source.
+    '''
+    assert fold in range(5)
+    f = load_udem(PREFIX + 'atis.fold'+str(fold)+'.pkl.gz')
+    try:
+        try:
+            train_set, valid_set, test_set, dicts = pickle.load(f)
+        except UnicodeDecodeError:
+            # The failed attempt may already have consumed part of the
+            # stream, so rewind before retrying with the latin1 decoding.
+            f.seek(0)
+            train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
+    finally:
+        f.close()
+
+    return train_set, valid_set, test_set, dicts
+
+if __name__ == '__main__':
+
+    # Visualize sentences: print each sentence of the train and test
+    # splits as right-aligned WORD / LABEL columns, then drop into pdb.
+
+    import pdb
+
+    # Both loaders are called; the fold-1 data replaces the full-corpus
+    # data and the fold's validation split is ignored.
+    train, test, dic = atisfull()
+    train, _, test, dic = atisfold(1)
+
+    w2idx, labels2idx = dic['words2idx'], dic['labels2idx']
+
+    # Invert the dictionaries to turn indices back into strings.
+    idx2w  = {w2idx[k]:k for k in w2idx}
+    idx2la = {labels2idx[k]:k for k in labels2idx}
+
+    # Explicit name -> split mapping instead of eval() on variable names.
+    splits = {'train': train, 'test': test}
+    wlength = 35
+
+    for e in ['train','test']:
+        split_x, split_ne, split_label = splits[e]
+        for sw, _se, sl in zip(split_x, split_ne, split_label):
+            print('WORD'.rjust(wlength), 'LABEL'.rjust(wlength))
+            for wx, la in zip(sw, sl):
+                print(idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength))
+            print('\n'+'**'*30+'\n')
+            pdb.set_trace()
diff --git a/metrics/__init__.py b/metrics/__init__.py
diff --git a/metrics/accuracy.py b/metrics/accuracy.py
@@ -0,0 +1,87 @@
+import numpy
+import random
+import os
+import stat
+import subprocess
+from os.path import isfile, join
+from os import chmod
+
+# Prefix prepended to 'conlleval.pl' to locate the evaluation script; read
+# from the ATISDATA environment variable and empty when it is unset.
+PREFIX = os.getenv('ATISDATA', '')
+
+def conlleval(p, g, w, filename):
+    '''
+    Write predictions in the format read by conlleval.pl and score them.
+
+    INPUT:
+    p :: predictions (one sequence of label strings per sentence)
+    g :: groundtruth (one sequence of label strings per sentence)
+    w :: corresponding words (one sequence of strings per sentence)
+    filename :: name of the file where the predictions are written;
+    it is the input of the conlleval.pl script
+
+    OUTPUT:
+    the result of get_perf(filename), i.e. precision, recall and f1 score
+
+    Each sentence is written as 'word groundtruth prediction' lines
+    wrapped in 'BOS O O' / 'EOS O O' lines and followed by a blank line.
+    '''
+    lines = []
+    for sl, sp, sw in zip(g, p, w):
+        lines.append('BOS O O\n')
+        # `word` rather than `w`, so the parameter is not shadowed.
+        for wl, wp, word in zip(sl, sp, sw):
+            lines.append(word + ' ' + wl + ' ' + wp + '\n')
+        lines.append('EOS O O\n\n')
+
+    with open(filename,'w') as f:
+        f.writelines(lines)
+
+    return get_perf(filename)
+
+def get_perf(filename):
+    '''
+    Run the conlleval.pl perl script on `filename` and return its scores.
+
+    filename :: file in the format written by conlleval()
+
+    Returns {'p': precision, 'r': recall, 'f1': f1score} as floats, with
+    the values exactly as printed by the script.
+
+    The script is looked up at PREFIX + 'conlleval.pl'; when missing it is
+    downloaded with wget to that same location. Raises OSError when the
+    download fails and ValueError when the script output contains no
+    'accuracy' line.
+    '''
+    _conlleval = PREFIX + 'conlleval.pl'
+    if not isfile(_conlleval):
+        url = 'https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl'
+        # Save the script where it is looked up, not merely in the cwd.
+        if subprocess.call(['wget', '-O', _conlleval, url]) != 0:
+            if isfile(_conlleval):
+                os.remove(_conlleval)  # do not keep a partial download
+            raise OSError('could not download %s' % url)
+        chmod(_conlleval, stat.S_IRWXU) # give the execute permissions
+
+    with open(filename,'rb') as f:
+        data = f.read()
+    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    stdout, _ = proc.communicate(data)
+
+    out = None
+    for line in stdout.decode("utf-8").split('\n'):
+        if 'accuracy' in line:
+            out = line.split()
+            break
+    if out is None:
+        raise ValueError('no accuracy line in conlleval.pl output for %s' % filename)
+
+    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
+    def value_after(label):
+        # Look each score up by its label instead of a fixed position.
+        return out[out.index(label) + 1]
+
+    precision = float(value_after('precision:')[:-2])  # strip trailing '%;'
+    recall    = float(value_after('recall:')[:-2])
+    f1score   = float(value_after('FB1:'))
+
+    return {'p':precision, 'r':recall, 'f1':f1score}
+
+def get_perfo(filename):
+    '''
+    work around for using a PERL script in python
+    dirty but still works.
+
+    Runs conlleval.pl on `filename` through a shell pipeline, keeps the
+    'accuracy' line via grep in a temporary file, and returns
+    {'p': precision, 'r': recall, 'f1': f1score} as floats.
+
+    The script is looked up at PREFIX + 'conlleval.pl'; when missing it is
+    downloaded with wget to that same location. Raises OSError when the
+    download fails and ValueError when no 'accuracy' line was produced.
+
+    NOTE: `filename` is interpolated into a shell command unquoted; do not
+    pass untrusted names.
+    '''
+    script = PREFIX + 'conlleval.pl'
+    downloaded = False
+    if not isfile(script):
+        url = 'https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl'
+        # Save the script where it is looked up, not merely in the cwd.
+        if subprocess.call(['wget', '-O', script, url]) != 0:
+            if isfile(script):
+                os.remove(script)  # do not keep a partial download
+            raise OSError('could not download %s' % url)
+        downloaded = True
+    if downloaded or len(PREFIX) > 0:
+        chmod(script, stat.S_IRWXU) # give the execute permissions
+
+    tmp_name = str(random.randint(1,numpy.iinfo('i').max)) + '.txt'
+    if len(PREFIX) > 0:
+        cmd = PREFIX + 'conlleval.pl < %s | grep accuracy > %s'%(filename,tmp_name)
+    else:
+        cmd = './conlleval.pl < %s | grep accuracy > %s'%(filename,tmp_name)
+    print(cmd)
+    try:
+        os.system(cmd)
+        with open(tmp_name) as f:
+            lines = f.readlines()
+    finally:
+        # Always clean up the temporary file, even when reading it failed.
+        if isfile(tmp_name):
+            os.remove(tmp_name)
+    if not lines:
+        raise ValueError('no accuracy line in conlleval.pl output for %s' % filename)
+    out = lines[0].split()
+
+    def value_after(label):
+        # Look each score up by its label; fixed positions depend on the
+        # exact layout of the accuracy line.
+        return out[out.index(label) + 1]
+
+    precision = float(value_after('precision:')[:-2])  # strip trailing '%;'
+    recall    = float(value_after('recall:')[:-2])
+    f1score   = float(value_after('FB1:'))
+    return {'p':precision, 'r':recall, 'f1':f1score}
+
+if __name__ == '__main__':
+    # Score the predictions stored in valid.txt and print the metrics.
+    perf = get_perf('valid.txt')
+    print(perf)