Skip to content

Commit

Permalink
NumericFields replaced with Point queries.
Browse files Browse the repository at this point in the history
  • Loading branch information
coady committed Dec 29, 2017
1 parent de0c885 commit cb94204
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 73 deletions.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ so falling back to using PyLucene directly is always an option, but should never

Usage
==================
PyLucene requries initializing the VM.
PyLucene requires initializing the VM.

.. code-block:: python
Expand Down
10 changes: 1 addition & 9 deletions docs/engine.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ engine

* `TokenFilter`_, `Analyzer`_
* `IndexSearcher`_, `MultiSearcher`_, `IndexWriter`_, `Indexer`_
* `Document`_, `Field`_, `NestedField`_, `NumericField`_, `DateTimeField`_, `SpatialField`_
* `Document`_, `Field`_, `NestedField`_, `DateTimeField`_, `SpatialField`_
* `Query`_


Expand Down Expand Up @@ -151,14 +151,6 @@ NestedField
:show-inheritance:
:members:

NumericField
^^^^^^^^^^^^^
.. versionchanged:: 1.5 recommended to specify initial int or float type
.. versionchanged:: 1.6 custom step removed in favor of numericPrecisionStep
.. autoclass:: NumericField
:show-inheritance:
:members:

DateTimeField
^^^^^^^^^^^^^
.. autoclass:: DateTimeField
Expand Down
6 changes: 3 additions & 3 deletions examples/searching.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import lucene
from lupyne import engine
assert lucene.getVMEnv() or lucene.initVM()
Q = engine.Query

docs = [
{'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},
Expand All @@ -38,7 +39,7 @@
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('year-month-day', engine.NestedField, sep='-')
indexer.set('population', engine.NumericField)
indexer.set('population', dimensions=1)
indexer.set('point', engine.SpatialField)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state.city')
Expand All @@ -63,8 +64,7 @@
assert str(query) == 'year-month-day:[1850-04-10 TO *}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['population'].range(0, 1000000)
assert str(query) == 'population:[0 TO 999999]'
query = Q.ranges('population', (0, 1000000))
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

cities = ['San Francisco', 'Los Angeles', 'Portland']
Expand Down
2 changes: 1 addition & 1 deletion lupyne/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import lucene # flake8: noqa
from .analyzers import Analyzer, TokenFilter
from .queries import Query
from .documents import Document, Field, NestedField, NumericField, DateTimeField, SpatialField
from .documents import Document, Field, NestedField, DateTimeField, SpatialField
from .indexers import IndexSearcher, MultiSearcher, IndexWriter, Indexer

version = tuple(map(int, lucene.VERSION.split('.')))
Expand Down
52 changes: 11 additions & 41 deletions lupyne/engine/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,47 +147,14 @@ def range(self, start, stop, lower=True, upper=False):
return Query.range(self.names[index], start, stop, lower, upper)


class NumericField(Field):
"""Field which indexes numbers in a prefix tree.
class DateTimeField(Field):
"""Field which indexes datetimes as Point fields of timestamps.
:param name: name of field
:param type: optional int, float, or lucene NumericType string
Supports datetimes, dates, and any prefix of time tuples.
"""
def __init__(self, name, dimensions=1, **settings):
Field.__init__(self, name, dimensions=dimensions, **settings)

def range(self, start, stop, lower=True, upper=False):
"""Return lucene NumericRangeQuery."""
if isinstance(start, float) or isinstance(stop, float):
if start is None:
start = Double.NEGATIVE_INFINITY
elif not lower:
start = document.DoublePoint.nextUp(start)
if stop is None:
stop = Double.POSITIVE_INFINITY
elif not upper:
stop = document.DoublePoint.nextDown(stop)
return document.DoublePoint.newRangeQuery(self.name, start, stop)
if start is None:
start = Long.MIN_VALUE
elif not lower:
start += 1
if stop is None:
stop = Long.MAX_VALUE
elif not upper:
stop -= 1
return document.LongPoint.newRangeQuery(self.name, long(start), long(stop))

def term(self, value):
"""Return range query to match single term."""
return self.range(value, value, upper=True)


class DateTimeField(NumericField):
"""Field which indexes datetimes as a NumericField of timestamps.
Supports datetimes, dates, and any prefix of time tuples.
"""
@classmethod
def timestamp(cls, date):
"""Return utc timestamp from date or time tuple."""
Expand All @@ -197,12 +164,12 @@ def timestamp(cls, date):

def items(self, *dates):
"""Generate lucene NumericFields of timestamps."""
return NumericField.items(self, *map(self.timestamp, dates))
return Field.items(self, *map(self.timestamp, dates))

def range(self, start, stop, lower=True, upper=False):
def range(self, start, stop, **inclusive):
"""Return NumericRangeQuery of timestamps."""
start, stop = (date and self.timestamp(date) for date in (start, stop))
return NumericField.range(self, start, stop, lower, upper)
interval = (date and self.timestamp(date) for date in (start, stop))
return Query.ranges(self.name, interval, **inclusive)

def prefix(self, date):
"""Return range query which matches the date prefix."""
Expand Down Expand Up @@ -238,8 +205,11 @@ def within(self, days=0, weeks=0, utc=True, **delta):
return self.duration(date, days, weeks=weeks, **delta)


class SpatialField(NumericField):
class SpatialField(Field):
"""Geospatial points, indexed with optional docvalues."""
def __init__(self, name, dimensions=1, **settings):
Field.__init__(self, name, dimensions=dimensions, **settings)

def items(self, *points):
"""Generate lucene LatLon fields from points (lng, lat)."""
for lng, lat in points:
Expand Down
44 changes: 41 additions & 3 deletions lupyne/engine/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@

import contextlib
import lucene
from java.lang import Integer
from java.lang import Double, Integer, Long
from java.util import Arrays
from org.apache.lucene import index, search, util
from org.apache.lucene import document, index, search, util
from org.apache.lucene.search import spans
from org.apache.pylucene.queryparser.classic import PythonQueryParser
from six import string_types
from six.moves import map, range
from ..utils import method
from ..utils import long, method

lucene6 = lucene.VERSION.startswith('6.')

Expand Down Expand Up @@ -132,6 +132,44 @@ def regexp(cls, name, value, *args):
"""Return lucene RegexpQuery."""
return cls(search.RegexpQuery, index.Term(name, value), *args)

@staticmethod
def points(name, *values):
    """Return lucene set query of one dimensional points.

    :param name: name of field
    :param values: numbers to match; if any value is a float the query is built
        with DoublePoint, otherwise with LongPoint
    """
    if any(isinstance(value, float) for value in values):
        # coerce every value, as the long branch does, so that ints mixed in
        # with floats are passed to the double query as doubles
        return document.DoublePoint.newSetQuery(name, tuple(map(float, values)))
    return document.LongPoint.newSetQuery(name, tuple(map(long, values)))

@staticmethod
def ranges(name, *intervals, **inclusive):
    """Return lucene multidimensional point range query, by default with half-open intervals.

    :param name: name of field
    :param intervals: one (start, stop) pair per dimension; None leaves that end unbounded
    :param inclusive: optional ``lower`` (default True) and ``upper`` (default False)
        flags, applied to every interval

    If any bound in any interval is a float, the whole query is built with
    DoublePoint and every bound is coerced to float; otherwise LongPoint is used.
    """
    lower, upper = inclusive.pop('lower', True), inclusive.pop('upper', False)
    # Materialize each interval: it may be a one-shot iterable, and it is
    # scanned once to pick the point type and again to unpack the bounds.
    intervals = [tuple(interval) for interval in intervals]
    # Decide the point type once for the whole query, looking at both ends of
    # every interval, so the adjusted bounds and the final query always agree.
    floating = any(isinstance(value, float) for interval in intervals for value in interval)
    starts, stops = [], []
    for start, stop in intervals:
        if floating:
            if start is None:
                start = Double.NEGATIVE_INFINITY
            else:
                start = float(start)
                if not lower:
                    # exclusive lower bound: step to the next representable double
                    start = document.DoublePoint.nextUp(start)
            if stop is None:
                stop = Double.POSITIVE_INFINITY
            else:
                stop = float(stop)
                if not upper:
                    # exclusive upper bound: step to the previous representable double
                    stop = document.DoublePoint.nextDown(stop)
        else:
            if start is None:
                start = Long.MIN_VALUE
            elif not lower:
                start += 1
            if stop is None:
                stop = Long.MAX_VALUE
            elif not upper:
                stop -= 1
            start, stop = long(start), long(stop)
        starts.append(start)
        stops.append(stop)
    if floating:
        return document.DoublePoint.newRangeQuery(name, starts, stops)
    return document.LongPoint.newRangeQuery(name, starts, stops)

def constant(self):
    """Wrap this query in a lucene ConstantScoreQuery and return the result."""
    wrapper = search.ConstantScoreQuery
    return Query(wrapper, self)
Expand Down
33 changes: 19 additions & 14 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from org.apache.lucene import analysis, document, index, search, store, util
from six.moves import map
from lupyne import engine
from lupyne.utils import long
Q = engine.Query


Expand Down Expand Up @@ -325,6 +324,19 @@ def test_queries():
assert str(span.containing(span)) == 'SpanContaining(text:lucene, text:lucene)'
assert str(span.within(span)) == 'SpanWithin(text:lucene, text:lucene)'

assert str(Q.points('point', 0.0)) == 'point:{0.0}'
assert str(Q.points('point', 0.0, 1.0)) == 'point:{0.0 1.0}'
assert str(Q.points('point', 0)) == 'point:{0}'
assert str(Q.points('point', 0, 1)) == 'point:{0 1}'
assert str(Q.ranges('point', (0.0, 1.0), (2.0, 3.0), upper=True)) == 'point:[0.0 TO 1.0],[2.0 TO 3.0]'
assert str(Q.ranges('point', (0.0, 1.0), lower=False)).startswith('point:[4.9E-324 TO 0.9999')
assert str(Q.ranges('point', (None, 0.0), upper=True)) == 'point:[-Infinity TO 0.0]'
assert str(Q.ranges('point', (0.0, None))) == 'point:[0.0 TO Infinity]'
assert str(Q.ranges('point', (0, 1), (2, 3), upper=True)) == 'point:[0 TO 1],[2 TO 3]'
assert str(Q.ranges('point', (0, 3), lower=False)) == 'point:[1 TO 2]'
assert str(Q.ranges('point', (None, 0), upper=True)) == 'point:[-9223372036854775808 TO 0]'
assert str(Q.ranges('point', (0, None))) == 'point:[0 TO 9223372036854775807]'


def test_grouping(tempdir, indexer, zipcodes):
field = indexer.fields['location'] = engine.NestedField('state.county.city', docValuesType='sorted')
Expand Down Expand Up @@ -383,7 +395,7 @@ def test_grouping(tempdir, indexer, zipcodes):

def test_spatial(indexer, zipcodes):
for name in ('longitude', 'latitude'):
indexer.set(name, engine.NumericField, stored=True)
indexer.set(name, dimensions=1, stored=True)
field = indexer.set('location', engine.SpatialField, docValuesType='numeric')
for doc in zipcodes:
if doc['state'] == 'CA':
Expand Down Expand Up @@ -421,7 +433,6 @@ def test_fields(indexer, constitution):
document.Field('name', 'value', document.FieldType())
assert str(engine.Field.String('')) == str(document.StringField('', '', document.Field.Store.NO).fieldType())
assert str(engine.Field.Text('')) == str(document.TextField('', '', document.Field.Store.NO).fieldType())
assert str(engine.NumericField('')) == str(document.LongPoint('', long(0)).fieldType())
assert str(engine.DateTimeField('')) == str(document.DoublePoint('', 0.0).fieldType())
settings = {'docValuesType': 'NUMERIC', 'indexOptions': 'DOCS'}
field = engine.Field('', **settings)
Expand Down Expand Up @@ -471,9 +482,9 @@ def test_fields(indexer, constitution):


def test_numeric(indexer, constitution):
indexer.set('amendment', engine.NumericField, stored=True)
indexer.set('amendment', dimensions=1, stored=True)
field = indexer.set('date', engine.DateTimeField, stored=True)
indexer.set('size', engine.NumericField, stored=True, docValuesType='numeric')
indexer.set('size', dimensions=1, stored=True, docValuesType='numeric')
for doc in constitution:
if 'amendment' in doc:
indexer.add(amendment=int(doc['amendment']), date=[tuple(map(int, doc['date'].split('-')))], size=len(doc['text']))
Expand All @@ -491,22 +502,16 @@ def test_numeric(indexer, constitution):
assert indexer.count(field.within(seconds=100)) == indexer.count(field.within(weeks=1)) == 0
query = field.duration([2009], days=-100 * 365)
assert indexer.count(query) == 12
field = indexer.fields['size']
sizes = {id: int(indexer[id]['size']) for id in indexer}
ids = sorted((id for id in sizes if sizes[id] >= 1000), key=sizes.get)
query = field.range(1000, None)
query = Q.ranges('size', (1000, None))
hits = indexer.search(query).sorted(sizes.get)
assert list(hits.ids) == ids
hits = indexer.search(query, count=3, sort=indexer.sortfield('size', type=int))
assert list(hits.ids) == ids[:len(hits)]
query = field.range(None, 1000)
query = Q.ranges('size', (None, 1000))
assert indexer.count(query) == len(sizes) - len(ids)
assert str(field.range(0, None)) == 'size:[0 TO 9223372036854775807]'
assert str(field.range(0, 10, lower=False)) == 'size:[1 TO 9]'
assert str(field.range(0.0, None, lower=False)) == 'size:[4.9E-324 TO Infinity]'
assert str(field.range(None, 0)) == 'size:[-9223372036854775808 TO -1]'
assert str(field.range(None, 0.0, upper=True)) == 'size:[-Infinity TO 0.0]'
hit, = indexer.search(indexer.fields['amendment'].term(1))
hit, = indexer.search(Q.points('amendment', 1))
assert hit['amendment'] == 1


Expand Down
2 changes: 1 addition & 1 deletion tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def test_facets(tempdir, servers, zipcodes):
writer = engine.IndexWriter(tempdir)
writer.commit()
resource = servers.start(servers.ports[0], '-r', tempdir)
writer.set('zipcode', engine.NumericField, stored=True)
writer.set('zipcode', dimensions=1, stored=True)
writer.fields['location'] = engine.NestedField('county.city', docValuesType='sorted')
for doc in zipcodes:
if doc['state'] == 'CA':
Expand Down

0 comments on commit cb94204

Please sign in to comment.