Skip to content

Commit

Permalink
Add RE2::Set support
Browse files Browse the repository at this point in the history
Adds support for RE2::Set, which is a set of patterns that can all be matched
against at once, returning the indexes of the added patterns that match.

* Add re2.Set class.
* Add re2.UNANCHORED, re2.ANCHOR_START, and re2.ANCHOR_BOTH constants for
  specifying how a pattern should be anchored. This is currently only used by
  re2.Set.
* Add tests for re2.Set.
  • Loading branch information
Gabriel Gironda authored and dreiss committed Apr 29, 2016
1 parent 1b82311 commit 33b2273
Show file tree
Hide file tree
Showing 3 changed files with 302 additions and 0 deletions.
205 changes: 205 additions & 0 deletions _re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
using std::nothrow;

#include <re2/re2.h>
#include <re2/set.h>
using re2::RE2;
using re2::StringPiece;

Expand Down Expand Up @@ -64,6 +65,13 @@ typedef struct _MatchObject2 {
StringPiece* groups;
} MatchObject2;

typedef struct _RegexpSetObject2 {
PyObject_HEAD
// True iff re2_set_obj has been compiled.
bool compiled;
RE2::Set* re2_set_obj;
} RegexpSetObject2;


// Imported from sre_constants.
static PyObject* error_class;
Expand All @@ -86,6 +94,11 @@ static PyObject* match_groupdict(MatchObject2* self, PyObject* args, PyObject* k
static PyObject* match_start(MatchObject2* self, PyObject* args);
static PyObject* match_end(MatchObject2* self, PyObject* args);
static PyObject* match_span(MatchObject2* self, PyObject* args);
static void regexp_set_dealloc(RegexpSetObject2* self);
static PyObject* regexp_set_new(PyTypeObject* type, PyObject* args, PyObject* kwds);
static PyObject* regexp_set_add(RegexpSetObject2* self, PyObject* pattern);
static PyObject* regexp_set_compile(RegexpSetObject2* self);
static PyObject* regexp_set_match(RegexpSetObject2* self, PyObject* text);


static PyMethodDef regexp_methods[] = {
Expand Down Expand Up @@ -139,6 +152,22 @@ static PyMethodDef match_methods[] = {
{NULL} /* Sentinel */
};

static PyMethodDef regexp_set_methods[] = {
{"add", (PyCFunction)regexp_set_add, METH_O,
"add(pattern) --> index or raises an exception.\n"
" Add a pattern to the set, raising if the pattern doesn't parse."
},
{"compile", (PyCFunction)regexp_set_compile, METH_NOARGS,
"compile() --> None or raises an exception.\n"
" Compile the set to prepare it for matching."
},
{"match", (PyCFunction)regexp_set_match, METH_O,
"match(text) --> list\n"
" Match text against the set, returning the indexes of the added patterns."
},
{NULL} /* Sentinel */
};


// Simple method to block setattr.
static int
Expand Down Expand Up @@ -236,6 +265,48 @@ static PyTypeObject Match_Type2 = {
0, /*tp_new*/
};

static PyTypeObject RegexpSet_Type2 = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"_re2.RE2_Set", /*tp_name*/
sizeof(RegexpSetObject2), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)regexp_set_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash*/
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
_no_setattr, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
"RE2 regexp set objects", /*tp_doc*/
0, /*tp_traverse*/
0, /*tp_clear*/
0, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
regexp_set_methods, /*tp_methods*/
0, /*tp_members*/
0, /*tp_getset*/
0, /*tp_base*/
0, /*tp_dict*/
0, /*tp_descr_get*/
0, /*tp_descr_set*/
0, /*tp_dictoffset*/
0, /*tp_init*/
0, /*tp_alloc*/
regexp_set_new, /*tp_new*/
};


static void
regexp_dealloc(RegexpObject2* self)
Expand Down Expand Up @@ -685,6 +756,129 @@ match_span(MatchObject2* self, PyObject* args)
}


static void
regexp_set_dealloc(RegexpSetObject2* self)
{
delete self->re2_set_obj;
PyObject_Del(self);
}

static PyObject*
regexp_set_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
int anchoring = RE2::UNANCHORED;

if (!PyArg_ParseTuple(args, "|I", &anchoring)) {
anchoring = -1;
}

switch (anchoring) {
case RE2::UNANCHORED:
case RE2::ANCHOR_START:
case RE2::ANCHOR_BOTH:
break;
default:
PyErr_SetString(PyExc_ValueError,
"anchoring must be one of re2.UNANCHORED, re2.ANCHOR_START, or re2.ANCHOR_BOTH");
return NULL;
}

RegexpSetObject2* self = (RegexpSetObject2*)type->tp_alloc(type, 0);

if (self == NULL) {
return NULL;
}
self->compiled = false;
self->re2_set_obj = NULL;

RE2::Options options;
options.set_log_errors(false);

self->re2_set_obj = new(nothrow) RE2::Set(options, (RE2::Anchor)anchoring);

if (self->re2_set_obj == NULL) {
PyErr_NoMemory();
Py_DECREF(self);
return NULL;
}

return (PyObject*)self;
}

static PyObject*
regexp_set_add(RegexpSetObject2* self, PyObject* pattern)
{
if (self->compiled) {
PyErr_SetString(PyExc_RuntimeError, "Can't add() on an already compiled Set");
return NULL;
}

const char* raw_pattern = PyString_AsString(pattern);
if (!raw_pattern) {
return NULL;
}
Py_ssize_t len_pattern = PyString_GET_SIZE(pattern);

std::string add_error;
int seq = self->re2_set_obj->Add(StringPiece(raw_pattern, (int)len_pattern), &add_error);

if (seq < 0) {
PyErr_SetString(PyExc_ValueError, add_error.c_str());
return NULL;
}

return PyInt_FromLong(seq);
}

static PyObject*
regexp_set_compile(RegexpSetObject2* self)
{
if (self->compiled) {
Py_RETURN_NONE;
}

bool compiled = self->re2_set_obj->Compile();

if (!compiled) {
PyErr_SetString(PyExc_MemoryError, "Ran out of memory during regexp compile");
return NULL;
}

self->compiled = true;
Py_RETURN_NONE;
}

static PyObject*
regexp_set_match(RegexpSetObject2* self, PyObject* text)
{
if (!self->compiled) {
PyErr_SetString(PyExc_RuntimeError, "Can't match() on an uncompiled Set");
return NULL;
}

const char* raw_text = PyString_AsString(text);
if (!raw_text) {
return NULL;
}
Py_ssize_t len_text = PyString_GET_SIZE(text);

std::vector<int> idxes;
bool matched = self->re2_set_obj->Match(StringPiece(raw_text, (int)len_text), &idxes);

if (matched) {
PyObject* match_indexes = PyList_New(idxes.size());

for(std::vector<int>::size_type i = 0; i < idxes.size(); ++i) {
PyList_SET_ITEM(match_indexes, (Py_ssize_t)i, PyInt_FromLong(idxes[i]));
}

return match_indexes;
} else {
return PyList_New(0);
}
}


static PyObject*
_compile(PyObject* self, PyObject* args, PyObject* kwds)
{
Expand Down Expand Up @@ -735,6 +929,10 @@ init_re2(void)
return;
}

if (PyType_Ready(&RegexpSet_Type2) < 0) {
return;
}

PyObject* sre_mod = PyImport_ImportModuleNoBlock("sre_constants");
if (sre_mod == NULL) {
return;
Expand All @@ -748,4 +946,11 @@ init_re2(void)

Py_INCREF(error_class);
PyModule_AddObject(mod, "error", error_class);

Py_INCREF(&RegexpSet_Type2);
PyModule_AddObject(mod, "Set", (PyObject*)&RegexpSet_Type2);

PyModule_AddIntConstant(mod, "UNANCHORED", RE2::UNANCHORED);
PyModule_AddIntConstant(mod, "ANCHOR_START", RE2::ANCHOR_START);
PyModule_AddIntConstant(mod, "ANCHOR_BOTH", RE2::ANCHOR_BOTH);
}
9 changes: 9 additions & 0 deletions re2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,22 @@
"search",
"match",
"fullmatch",
"Set",
"UNANCHORED",
"ANCHOR_START",
"ANCHOR_BOTH",
]

# Module-private compilation function, for future caching, other enhancements
_compile = _re2._compile

error = _re2.error
escape = _re2.escape
Set = _re2.Set
UNANCHORED = _re2.UNANCHORED
ANCHOR_START = _re2.ANCHOR_START
ANCHOR_BOTH = _re2.ANCHOR_BOTH


def compile(pattern):
"Compile a regular expression pattern, returning a pattern object."
Expand Down
88 changes: 88 additions & 0 deletions tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,91 @@ def test_match_raise(self):
'''test that using the API incorrectly fails'''
r = re2.compile('ab([cde]fg)')
self.assertRaises(TypeError, lambda: re2.match(r, 'abdfghij'))

def test_set_unanchored(self):
s = re2.Set(re2.UNANCHORED)
s_with_default_anchoring = re2.Set()

for re2_set in [s, s_with_default_anchoring]:
re2_set.add('foo')
re2_set.add('bar')
re2_set.add('baz')
re2_set.compile()

self.assertEqual(re2_set.match('foo'), [0])
self.assertEqual(re2_set.match('bar'), [1])
self.assertEqual(re2_set.match('baz'), [2])
self.assertEqual(re2_set.match('afoobaryo'), [0, 1])

self.assertEqual(re2_set.match('ooba'), [])

def test_set_anchor_both(self):
s = re2.Set(re2.ANCHOR_BOTH)
self.assertEqual(s.add('foo'), 0)
self.assertEqual(s.add('bar'), 1)
s.compile()

self.assertEqual(s.match('foobar'), [])
self.assertEqual(s.match('fooba'), [])
self.assertEqual(s.match('oobar'), [])
self.assertEqual(s.match('foo'), [0])
self.assertEqual(s.match('bar'), [1])

def test_set_anchor_start(self):
s = re2.Set(re2.ANCHOR_START)
self.assertEqual(s.add('foo'), 0)
self.assertEqual(s.add('bar'), 1)
s.compile()

self.assertEqual(s.match('foobar'), [0])
self.assertEqual(s.match('oobar'), [])
self.assertEqual(s.match('foo'), [0])
self.assertEqual(s.match('ofoobaro'), [])
self.assertEqual(s.match('baro'), [1])

def test_bad_anchoring(self):
with self.assertRaisesRegexp(ValueError, 'anchoring must be one of.*'):
re2.Set(None)

with self.assertRaisesRegexp(ValueError, 'anchoring must be one of.*'):
re2.Set(15)

with self.assertRaisesRegexp(ValueError, 'anchoring must be one of.*'):
re2.Set({})

def test_match_without_compile(self):
s = re2.Set()
s.add('foo')

with self.assertRaisesRegexp(RuntimeError, 'Can\'t match\(\) on an.*'):
s.match('bar')

def test_add_after_compile(self):
s = re2.Set()
s.add('foo')
s.compile()

with self.assertRaisesRegexp(RuntimeError,
'Can\'t add\(\) on an already compiled Set'):
s.add('bar')

# Segfaults on Debain Jessie
#def test_compile_on_empty(self):
# s = re2.Set()
# s.compile()
# self.assertEqual(s.match('nah'), [])

def test_double_compile(self):
s = re2.Set()
s.add('foo')
s.compile()
s.compile()

def test_add_with_bad_pattern(self):
s = re2.Set()

with self.assertRaisesRegexp(ValueError, 'missing \)'):
s.add('(')

with self.assertRaises(TypeError):
s.add(3)

0 comments on commit 33b2273

Please sign in to comment.