Skip to content

Commit

Permalink
data, metrics from is13 repo converted to python3
Browse files Browse the repository at this point in the history
  • Loading branch information
chsasank committed Jun 3, 2016
0 parents commit 9e9ec55
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 0 deletions.
Empty file added data/__init__.py
Empty file.
91 changes: 91 additions & 0 deletions data/load.py
@@ -0,0 +1,91 @@
from __future__ import print_function
import gzip
try: #python2
import cPickle as pickle
from urllib import urlretrieve
except ImportError: #python3
import pickle
from urllib.request import urlretrieve

import os
import random
from os.path import isfile

PREFIX = os.getenv('ATISDATA', '')

def download(origin):
    '''
    Fetch the ATIS file located at ``origin``.

    The file is stored under the last '/'-separated component of the URL,
    as a relative path (i.e. in the current working directory).  Files were
    historically served from
    http://www-etud.iro.umontreal.ca/~mesnilgr/atis/
    '''
    print('Downloading data from %s' % origin)
    # Everything after the final slash is the local file name.
    local_name = origin.rsplit('/', 1)[-1]
    urlretrieve(origin, local_name)

def download_dropbox():
    '''
    Download ``atis.pkl`` from Dropbox into the current working directory.

    Interim source ("in the meantime") for the ATIS pickle; shells out to
    ``wget``.

    Raises:
        IOError: if the ``wget`` command exits with a non-zero status.
    '''
    url = 'https://www.dropbox.com/s/3lxl9jsbw0j7h8a/atis.pkl?dl=0'
    target = 'atis.pkl'
    print('Downloading data from %s' % url)
    # The URL is quoted because it contains '?', which an unquoted shell
    # word would treat as a glob character.
    status = os.system('wget -O %s "%s"' % (target, url))
    if status != 0:
        # `wget -O` creates the output file even when the transfer fails;
        # drop the stub so a later isfile() check does not mistake it for
        # real data.
        if os.path.isfile(target):
            os.remove(target)
        raise IOError('wget failed (status %s) while fetching %s' % (status, url))

def load_dropbox(filename):
    '''
    Open the ATIS pickle at ``filename``, downloading it first if missing.

    Args:
        filename: path of the pickle file to open.

    Returns:
        A binary file object opened for reading.  The caller is
        responsible for closing it.
    '''
    if not isfile(filename):
        download_dropbox()
        # download_dropbox() always writes 'atis.pkl' into the current
        # directory.  When a different path was requested (e.g. the
        # ATISDATA prefix is set), move the download to where it is
        # expected so the open() below -- and later runs -- can find it.
        fetched = 'atis.pkl'
        if not isfile(filename) and isfile(fetched):
            import shutil
            shutil.move(fetched, filename)
    # Opened as a plain binary file (not through gzip, unlike load_udem).
    return open(filename, 'rb')

def load_udem(filename):
    '''
    Open a gzip-compressed ATIS file, downloading it first if missing.

    Args:
        filename: local path of the ``.gz`` file; its base name is the
            name requested from the remote server.

    Returns:
        A gzip file object opened for binary reading.  The caller is
        responsible for closing it.
    '''
    if not isfile(filename):
        # Only the base name belongs in the URL: ``filename`` may carry a
        # local directory prefix that does not exist on the server.
        remote_name = os.path.basename(filename)
        download('http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/' + remote_name)
        # download() saves under the URL's last component in the current
        # directory; move it to the requested location when that differs.
        if not isfile(filename) and isfile(remote_name):
            import shutil
            shutil.move(remote_name, filename)
    return gzip.open(filename, 'rb')


def atisfull():
    '''
    Load the full ATIS dataset from ``PREFIX + 'atis.pkl'``.

    Returns:
        The tuple ``(train_set, test_set, dicts)`` stored in the pickle.

    NOTE: unpickling can execute arbitrary code; only load copies of the
    file obtained from a trusted source.
    '''
    with load_dropbox(PREFIX + 'atis.pkl') as f:
        try:
            train_set, test_set, dicts = pickle.load(f)
        except UnicodeDecodeError:
            # Python 3 reading a Python 2 pickle.  The failed attempt has
            # already consumed part of the stream, so rewind before
            # retrying with a byte-preserving encoding.
            f.seek(0)
            train_set, test_set, dicts = pickle.load(f, encoding='latin1')
    return train_set, test_set, dicts

def atisfold(fold):
    '''
    Load one cross-validation fold of the ATIS dataset.

    Args:
        fold: fold index; must be one of 0, 1, 2, 3, 4.

    Returns:
        The tuple ``(train_set, valid_set, test_set, dicts)`` stored in
        ``PREFIX + 'atis.fold<fold>.pkl.gz'``.

    NOTE: unpickling can execute arbitrary code; only load copies of the
    file obtained from a trusted source.
    '''
    assert fold in range(5)
    with load_udem(PREFIX + 'atis.fold' + str(fold) + '.pkl.gz') as f:
        try:
            train_set, valid_set, test_set, dicts = pickle.load(f)
        except UnicodeDecodeError:
            # Python 3 reading a Python 2 pickle.  The failed attempt has
            # already consumed part of the stream, so rewind before
            # retrying with a byte-preserving encoding.
            f.seek(0)
            train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')

    return train_set, valid_set, test_set, dicts

if __name__ == '__main__':

    # Visualize a few sentences: print every word next to its label, one
    # sentence at a time.

    import pdb

    # NOTE(review): this result is overwritten on the next line; the call is
    # kept because it still triggers the download/load of atis.pkl --
    # confirm whether it is needed.
    train, test, dic = atisfull()
    train, _, test, dic = atisfold(1)

    w2idx, ne2idx, labels2idx = dic['words2idx'], dic['tables2idx'], dic['labels2idx']

    # Invert the index dictionaries so integer ids map back to strings.
    idx2w = {idx: word for word, idx in w2idx.items()}
    idx2ne = {idx: ne for ne, idx in ne2idx.items()}
    idx2la = {idx: label for label, idx in labels2idx.items()}

    test_x, test_ne, test_label = test
    train_x, train_ne, train_label = train
    wlength = 35  # column width used to right-align the output

    # Walk the splits explicitly instead of looking variables up via eval().
    splits = (
        (train_x, train_ne, train_label),
        (test_x, test_ne, test_label),
    )
    for split_x, split_ne, split_label in splits:
        for sw, _se, sl in zip(split_x, split_ne, split_label):
            print('WORD'.rjust(wlength), 'LABEL'.rjust(wlength))
            for wx, la in zip(sw, sl):
                print(idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength))
            print('\n'+'**'*30+'\n')
            # NOTE(review): assumed to pause after each sentence so the
            # output can be inspected -- confirm the intended placement.
            pdb.set_trace()
Empty file added metrics/__init__.py
Empty file.
87 changes: 87 additions & 0 deletions metrics/accuracy.py
@@ -0,0 +1,87 @@
import numpy
import random
import os
import stat
import subprocess
from os.path import isfile, join
from os import chmod

PREFIX = os.getenv('ATISDATA', '')

def conlleval(p, g, w, filename):
    '''
    Write predictions in the conlleval input format and score them.

    INPUT:
    p :: predictions, one sequence of label strings per sentence
    g :: groundtruth, one sequence of label strings per sentence
    w :: corresponding words, one sequence of strings per sentence
    filename :: name of the file where the predictions are written;
    it is then passed to get_perf() as input of the conlleval.pl script

    OUTPUT:
    whatever get_perf(filename) returns (precision, recall and f1 score)

    Each sentence is written as a 'BOS O O' line, one
    '<word> <groundtruth> <prediction>' line per token, and an 'EOS O O'
    line followed by a blank line.
    '''
    chunks = []
    for sl, sp, sw in zip(g, p, w):
        chunks.append('BOS O O\n')
        # Distinct loop names avoid shadowing the `w` parameter.
        for gold, pred, word in zip(sl, sp, sw):
            chunks.append(word + ' ' + gold + ' ' + pred + '\n')
        chunks.append('EOS O O\n\n')

    # `with` guarantees the file is flushed and closed before scoring.
    with open(filename, 'w') as f:
        f.write(''.join(chunks))

    return get_perf(filename)

def get_perf(filename):
    '''
    Run the conlleval.pl perl script to obtain precision/recall and F1 score.

    The script is expected at ``PREFIX + 'conlleval.pl'`` and is downloaded
    with ``wget`` when missing.  The contents of ``filename`` are fed to
    the script on stdin and the first output line containing 'accuracy'
    is parsed.

    Returns:
        dict with keys 'p' (precision), 'r' (recall) and 'f1', as floats.

    Raises:
        IOError: if the script has to be downloaded and wget fails.
        RuntimeError: if the script output has no 'accuracy' line.
    '''
    _conlleval = PREFIX + 'conlleval.pl'
    if not isfile(_conlleval):
        url = 'https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl'
        # Save straight to the path that is checked above and executed
        # below, so a non-empty PREFIX works too.
        status = subprocess.call(['wget', '-O', _conlleval, url])
        if status != 0:
            # `wget -O` leaves an empty file behind on failure; remove it so
            # the next call retries instead of running an empty script.
            if isfile(_conlleval):
                os.remove(_conlleval)
            raise IOError('could not download conlleval.pl from ' + url)
        chmod(_conlleval, stat.S_IRWXU)  # give the execute permissions

    with open(filename, 'rb') as predictions:
        payload = predictions.read()

    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(payload)

    out = None
    for line in stdout.decode("utf-8").split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    if out is None:
        raise RuntimeError("no 'accuracy' line in conlleval.pl output for %s" % filename)

    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']

    precision = float(out[3][:-2])  # [:-2] strips the trailing '%;'
    recall = float(out[5][:-2])
    f1score = float(out[7])

    return {'p': precision, 'r': recall, 'f1': f1score}

def get_perfo(filename):
    '''
    work around for using a PERL script in python
    dirty but still works.

    Runs conlleval.pl through the shell with ``filename`` on stdin, keeps
    the output lines matching 'accuracy' in a randomly named temporary
    .txt file, and parses the first of them.

    Returns:
        dict with keys 'p' (precision), 'r' (recall) and 'f1', as floats.

    Raises:
        IOError: if the script has to be downloaded and wget fails.
        RuntimeError: if no 'accuracy' line was produced.
    '''
    tmp_name = str(random.randint(1, numpy.iinfo('i').max)) + '.txt'
    script = PREFIX + 'conlleval.pl'
    if not isfile(script):
        url = 'https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl'
        # Save straight to the path that is checked above and executed
        # below, so a non-empty PREFIX works too.
        status = subprocess.call(['wget', '-O', script, url])
        if status != 0:
            # `wget -O` leaves an empty file behind on failure.
            if isfile(script):
                os.remove(script)
            raise IOError('could not download conlleval.pl from ' + url)
        chmod(script, stat.S_IRWXU)  # give the execute permissions
    if len(PREFIX) > 0:
        chmod(PREFIX + 'conlleval.pl', stat.S_IRWXU)  # give the execute permissions
        cmd = PREFIX + 'conlleval.pl < %s | grep accuracy > %s' % (filename, tmp_name)
    else:
        cmd = './conlleval.pl < %s | grep accuracy > %s' % (filename, tmp_name)
    print(cmd)
    os.system(cmd)
    try:
        with open(tmp_name) as results:
            lines = results.readlines()
    finally:
        # Always clean up the scratch file, without shelling out to `rm`.
        if isfile(tmp_name):
            os.remove(tmp_name)
    if not lines:
        raise RuntimeError("no 'accuracy' line in conlleval.pl output for %s" % filename)
    out = lines[0].split()
    # NOTE(review): these token positions (6/8/10) differ from the 3/5/7
    # used by get_perf for the same script -- confirm which output layout
    # this function expects.
    precision = float(out[6][:-2])
    recall = float(out[8][:-2])
    f1score = float(out[10])
    return {'p': precision, 'r': recall, 'f1': f1score}

if __name__ == '__main__':
    # Score the predictions stored in valid.txt and show the metrics dict.
    scores = get_perf('valid.txt')
    print(scores)

0 comments on commit 9e9ec55

Please sign in to comment.