Unanalyzed Fields #34

Closed
wants to merge 10 commits into
from
View
@@ -24,6 +24,11 @@ The other hook is only useful if you are using Solr for geospatial information,
we provide a trait called SolrGeoHash which has two required functions, namely
coverString and rectCoverString. Most people will not need to implement this.
+### Multiple Solr cores & non-standard query paths
+
+Support for multiple cores is done by overriding "core" in the model (which is an Option[String]).
+If you have a non-standard query path you can override queryPath in your model.
+
## Examples
[QueryTest.scala](https://github.com/foursquare/slashem/blob/master/src/test/scala/com/foursquare/slashem/QueryTest.scala) contains sample queries and shows the corresponding query.
@@ -48,11 +53,17 @@ to do this:
## Dependencies
lift, joda-time, junit, finagle, jackson. These dependencies are managed by
-the build system.
+the build system. Note: some of the transitive dependencies may fail to resolve
+from Maven Central. If you are using sbt you can fix this by adding
-## Warnings
+ ivyXML := (
+ <dependencies>
+ <exclude module="jmxtools"/>
+ <exclude module="jmxri"/>
+ </dependencies>
+ )
-This is still a very early version. There are likely bugs (sorry!). Let us know
+## Warnings
+
+This is still a very early version. There are likely bugs (sorry!). Let us know
if you find any. While we can't promise timely fixes, it will help :)
## Maintainers
View
@@ -1,6 +1,6 @@
name := "slashem"
-version := "0.9.13"
+version := "0.9.15a"
organization := "com.foursquare"
@@ -123,4 +123,11 @@ pomExtra := (
<email>aalix@foursquare.com</email>
</developer>
</developers>
+)
+
+ivyXML := (
+<dependencies>
+ <exclude module="jmxtools"/>
+ <exclude module="jmxri"/>
+</dependencies>
)
@@ -7,6 +7,7 @@ import org.elasticsearch.index.query.{FilterBuilder => ElasticFilterBuilder,
QueryBuilder => ElasticQueryBuilder,
QueryBuilders => EQueryBuilders,
QueryStringQueryBuilder}
+import scalaj.collection.Imports._
/**
* Abstract Syntax Tree used to represent queries.
@@ -242,7 +243,7 @@ object Ast {
}
def elasticBoost(): Pair[List[String],String] = {
weight match {
- case 1.0 => Pair(Nil,"(doc['" + fieldName + "'].value)")
+ case 1.0 => Pair(Nil,"doc['" + fieldName + "'].value")
case _ => Pair(Nil,"(doc['" + fieldName + "'].value *" + weight.toString + ")")
}
}
@@ -359,8 +360,10 @@ object Ast {
def extend(): String = "\"\""
/** @inheritdoc */
def elasticExtend(qf: List[WeightedField], pf: List[PhraseWeightedField], mm: Option[String]): ElasticQueryBuilder = {
- val q = EQueryBuilders.queryString(this.extend())
- qf.map(f => q.field(f.fieldName,f.weight.toFloat))
+ //An empty query matches no documents, so it is the same as the negation of matchAll
+ //Note: this is kind of ugly since it is likely inside an OR clause or negated up above
+ //so we should try and avoid generating this
+ val q = EQueryBuilders.boolQuery.mustNot(EQueryBuilders.matchAllQuery())
q
}
}
@@ -405,6 +408,51 @@ object Ast {
}
}
+ /**
+ * A term query. Used for queries that don't need to be analyzed
+ *
+ * By default, elasticFilter() will always be cached!
+ */
+ case class Term[T](query: Iterable[T], escaped: Boolean = true, cached: Boolean = true) extends Query[T] {
+ // hack for single term queries
+ def this(query: T) = this(List(query))
+ /** @inheritdoc */
+ //def extend() = throw new UnimplementedException("Slashem does not support Term queries Solr")
+ def extend(): String = {
+ escaped match {
+ // hack to fix wrapping the queries in a List()
+ case true => {
+ val queries = query.map(q => {'"' + escape(q.toString) + '"'})
+ queries.mkString(" OR ")
+ }
+// case true => {'"' + query.mkString("\" OR \"")
+ case false => '"' + query.mkString(" OR ") + '"'
+ }
+ }
+ /** @inheritdoc */
+ def elasticExtend(qf: List[WeightedField], pf: List[PhraseWeightedField], mm: Option[String]): ElasticQueryBuilder = {
+ val fieldName = qf.head.fieldName
+ val weight = qf.head.weight.toFloat
+ query match {
+ case term::Nil => EQueryBuilders.termQuery(fieldName, term).boost(weight)
+ case terms => {
+ val moarTerms = terms.toSeq.map(_.toString)
+ EQueryBuilders.termsQuery(fieldName, moarTerms: _*).boost(weight)
@holdenk

holdenk May 11, 2012

Contributor

Is this going to work with an empty terms? Or do we need to special case it?

+ }
+ }
+ }
+ /** @inheritdoc */
+ override def elasticFilter(qf: List[WeightedField]): ElasticFilterBuilder = {
+ val fieldName = qf.head.fieldName
+ query match {
+ case term::Nil => EFilterBuilders.termFilter(fieldName, term).cache(cached)
+ case terms => {
+ val moarTerms = terms.toSeq.map(_.toString)
+ EFilterBuilders.termsFilter(fieldName, moarTerms: _*).cache(cached)
@holdenk

holdenk May 11, 2012

Contributor

Is this going to work with an empty terms? Or do we need to special case it?

+ }
+ }
+ }
+ }
case class Range[T](q1: Query[T],q2: Query[T]) extends Query[T] {
/** @inheritdoc */
@@ -485,7 +533,7 @@ object Ast {
}
/**
- * Class representing clauses ANDed together
+ * Class representing queries ANDed together
*/
case class And[T](queries: Query[T]*) extends Query[T] {
/** @inheritdoc */
@@ -505,7 +553,7 @@ object Ast {
}
}
/**
- * Case class representing a list of clauses ORed together
+ * Case class representing a list of queries ORed together
*/
case class Or[T](queries: Query[T]*) extends Query[T] {
/** @inheritdoc */
@@ -55,6 +55,12 @@ case class SolrResponseException(code: Int, reason: String, solrName: String, qu
}
}
+case class UnimplementedException(reason: String) extends RuntimeException {
+ override def getMessage(): String = {
+ "Not implemented: %s".format(reason)
+ }
+}
+
/** The response header. There are normally more fields in the response header we could extract, but
* we don't at present. */
case class ResponseHeader @JsonCreator()(@JsonProperty("status")status: Int, @JsonProperty("QTime")QTime: Int)
@@ -433,8 +439,8 @@ trait SolrGeoHash {
}
//Default geohash, does nothing.
object NoopSolrGeoHash extends SolrGeoHash {
- def coverString (geoLat: Double, geoLong: Double, radiusInMeters: Int, maxCells: Int ): Seq[String] = List("pleaseUseaRealGeoHash")
- def rectCoverString(topRight: (Double, Double), bottomLeft: (Double, Double), maxCells: Int = 0, minLevel: Int = 0, maxLevel: Int = 0): Seq[String] = List("pleaseUseaRealGeoHash")
+ def coverString (geoLat: Double, geoLong: Double, radiusInMeters: Int, maxCells: Int ): Seq[String] = List("pleaseUseaRealGeoHash", "thisIsForFunctionalityTests")
+ def rectCoverString(topRight: (Double, Double), bottomLeft: (Double, Double), maxCells: Int = 0, minLevel: Int = 0, maxLevel: Int = 0): Seq[String] = List("pleaseUseaRealGeoHash", "thisIsForFunctionalityTests")
}
trait SlashemSchema[M <: Record[M]] extends Record[M] {
@@ -782,28 +788,54 @@ trait SolrSchema[M <: Record[M]] extends SlashemSchema[M] {
}
+/**
+ * A field type for unanalyzed queries. Results in using Term[V] queries.
@holdenk

holdenk May 11, 2012

Contributor

Maybe add available for ES only.

@adamalix

adamalix May 11, 2012

Contributor

Then do we want separate SlashemGeoFields for Solr / ES? This should behave the same for ES / Solr with the current impl.

@holdenk

holdenk May 11, 2012

Contributor

If SlashemGeoFields generate Term[V] queries, then Term[V] queries need to
have query-gen support for Solr.

On Thu, May 10, 2012 at 5:32 PM, Adam Alix <
reply@reply.github.com

wrote:

@@ -782,28 +788,54 @@ trait SolrSchema[M <: Record[M]] extends
SlashemSchema[M] {

}

+/**

  • * A field type for unanalyzed queries. Results in using Term[V]
    queries.

Then do we want separate SlashemGeoFields for Solr / ES? This should
behave the same for ES / Solr with the current impl.


Reply to this email directly or view it on GitHub:
https://github.com/foursquare/slashem/pull/34/files#r806027

Cell : 425-233-8271

@adamalix

adamalix May 11, 2012

Contributor

It does, you're not seeing the latest because of the way this thing does reviews.

@holdenk

holdenk May 11, 2012

Contributor

Fuck github. Sorry for the dumb comment :p

+ */
+trait SlashemUnanalyzedField[V, M <: Record[M]] extends SlashemField[V, M] {
+ self: Field[V, M] =>
+
+ override val unanalyzed = true
+}
trait SlashemField[V, M <: Record[M]] extends OwnedField[M] {
self: Field[V, M] =>
import Helpers._
- //Note eqs and neqs results in phrase queries!
- def eqs(v: V) = Clause[V](self.queryName, Group(Phrase(v)))
- def neqs(v: V) = Clause[V](self.queryName, Phrase(v),false)
+ // Override this value to produce unanalyzed queries!
+ val unanalyzed = false
+
+ def produceQuery(v: V): Query[V] = {
+ unanalyzed match {
+ // use new to use Term's additional non-default constructor
+ case true => new Term(v)
+ case false => Phrase(v)
+ }
+ }
+
+ def produceGroupedQuery(v: Iterable[V]): Query[V] = {
+ unanalyzed match {
+ // we don't want to groupWithOr and instead take advantage of "terms" queries
+ case true => Term(v)
+ case false => groupWithOr(v.map({x: V => produceQuery(x)}))
+ }
+ }
+
+ def eqs(v: V) = Clause[V](self.queryName, Group(produceQuery(v)))
+ def neqs(v: V) = Clause[V](self.queryName, produceQuery(v),false)
//With a boost
- def eqs(v: V, b: Float) = Clause[V](self.queryName, Boost(Group(Phrase(v)),b))
- def neqs(v: V, b:Float) = Clause[V](self.queryName, Boost(Phrase(v),b),false)
+ def eqs(v: V, b: Float) = Clause[V](self.queryName, Boost(Group(produceQuery(v)),b))
+ def neqs(v: V, b:Float) = Clause[V](self.queryName, Boost(produceQuery(v),b),false)
//This allows for bag of words style matching.
def contains(v: V) = Clause[V](self.queryName, Group(BagOfWords(v)))
def contains(v: V, b: Float) = Clause[V](self.queryName, Boost(Group(BagOfWords(v)),b))
- def in(v: Iterable[V]) = Clause[V](self.queryName, groupWithOr(v.map({x: V => Phrase(x)})))
- def nin(v: Iterable[V]) = Clause[V](self.queryName, groupWithOr(v.map({x: V => Phrase(x)})),false)
+ def in(v: Iterable[V]) = Clause[V](self.queryName, produceGroupedQuery(v))
+ def nin(v: Iterable[V]) = Clause[V](self.queryName, produceGroupedQuery(v),false)
- def in(v: Iterable[V], b: Float) = Clause[V](self.queryName, Boost(groupWithOr(v.map({x: V => Phrase(x)})),b))
- def nin(v: Iterable[V], b: Float) = Clause[V](self.queryName, Boost(groupWithOr(v.map({x: V => Phrase(x)})),b),false)
+ def in(v: Iterable[V], b: Float) = Clause[V](self.queryName, Boost(produceGroupedQuery(v),b))
+ def nin(v: Iterable[V], b: Float) = Clause[V](self.queryName, Boost(produceGroupedQuery(v),b),false)
def inRange(v1: V, v2: V) = Clause[V](self.queryName, Group(Range(BagOfWords(v1),BagOfWords(v2))))
def ninRange(v1: V, v2: V) = Clause[V](self.queryName, Group(Range(BagOfWords(v1),BagOfWords(v2))),false)
@@ -843,6 +875,17 @@ trait SlashemField[V, M <: Record[M]] extends OwnedField[M] {
//Slashem field types
class SlashemStringField[T <: Record[T]](owner: T) extends StringField[T](owner, 0) with SlashemField[String, T]
+/**
+ * Field type that can be queried without analyzing.
+ *
+ * Ex: multi-value field or a whitespace tokenized field where
+ * search terms are always for a specific token.
+ *
+ * @see SlashemStringField
+ */
+class SlashemUnanalyzedStringField[T <: Record[T]](owner: T)
+ extends StringField[T](owner, 0) with SlashemUnanalyzedField[String, T]
+
//Allows for querying against the default field in solr. This field doesn't have a name
class SlashemDefaultStringField[T <: Record[T]](owner: T) extends StringField[T](owner, 0) with SlashemField[String, T] {
override def name = ""
@@ -951,7 +994,7 @@ class SlashemPointField[T <: Record[T]](owner: T) extends PointField[T](owner) w
class SlashemBooleanField[T <: Record[T]](owner: T) extends BooleanField[T](owner) with SlashemField[Boolean, T]
class SlashemDateTimeField[T <: Record[T]](owner: T) extends JodaDateTimeField[T](owner) with SlashemField[DateTime, T]
//More restrictive type so we can access the geohash
-class SlashemGeoField[T <: SlashemSchema[T]](owner: T) extends StringField[T](owner,0) with SlashemField[String, T] {
+class SlashemGeoField[T <: SlashemSchema[T]](owner: T) extends SlashemUnanalyzedStringField[T](owner) {
def inRadius(geoLat: Double, geoLong: Double, radiusInMeters: Int, maxCells: Int = owner.geohash.maxCells) = {
val cellIds = owner.geohash.coverString(geoLat, geoLong, radiusInMeters, maxCells = maxCells)
//If we have an empty cover we default to everything.
@@ -79,6 +79,14 @@ class ElasticQueryTest extends SpecsMatchers with ScalaCheckMatchers {
}
@Test
+ def simpleBoostTest {
+ val fullQuery = ESimplePanda.where(_.name contains "lol")
+ .limit(5).boostField(_.followers)
+ val r = fullQuery fetch()
+ }
+
+
+ @Test
def testEmptySearch {
try {
val r = ESimplePanda where (_.name eqs "lolsdonotinsertsomethingwiththisinit") fetch()
@@ -305,10 +313,26 @@ class ElasticQueryTest extends SpecsMatchers with ScalaCheckMatchers {
def testListFieldIn {
val response1 = ESimplePanda where (_.favnums in List(2, 3, 4, 5)) fetch()
val response2 = ESimplePanda where (_.favnums in List(99)) fetch()
- //val response3 = ESimplePanda where (_.favnums in List()) fetch()
+ val response3 = ESimplePanda where (_.termsfield in List("termhit", "lol")) fetch()
Assert.assertEquals(response1.response.results.length, 2)
Assert.assertEquals(response2.response.results.length, 0)
- //Assert.assertEquals(response3.response.results.length, 0)
+ Assert.assertEquals(response3.response.results.length, 1)
+ }
+
+ @Test
+ def testIntListFieldEmptyIn {
+ val response1 = ESimplePanda where (_.favnums in List()) fetch()
+ val response2 = ESimplePanda where (_.termsfield in List()) fetch()
+ Assert.assertEquals(response1.response.results.length, 0)
+ Assert.assertEquals(response2.response.results.length, 0)
+ }
+
+ @Test
+ def testIntListFieldEmptyNin {
+ val response1 = ESimplePanda where (_.favnums nin List()) fetch()
+ val response2 = ESimplePanda where (_.termsfield nin List()) fetch()
+ Assert.assertEquals(response1.response.results.length, 8)
+ Assert.assertEquals(response2.response.results.length, 8)
}
@Test
@@ -326,6 +350,26 @@ class ElasticQueryTest extends SpecsMatchers with ScalaCheckMatchers {
val ids2 = response2.response.oids
// All three docs with favnums should be returned, none contain 99
Assert.assertEquals(ids2.intersect(idsWithFavNums).length, 3)
+
+ val response3 = ESimplePanda where (_.termsfield nin List("termhit")) fetch()
+ val ids3 = response3.response.oids
+ // Only two of the docs with favnums should be returned; the one containing "termhit" is excluded
+ Assert.assertEquals(ids3.intersect(idsWithFavNums).length, 2)
+ }
+
+ @Test
+ def testTermQueries {
+ val res1 = ESimplePanda where (_.termsfield eqs "termhit") fetch()
+ val res2 = ESimplePanda where (_.termsfield in List("randomterm", "termhit")) fetch()
+ Assert.assertEquals(res1.response.results.length, 1)
+ Assert.assertEquals(res2.response.results.length, 1)
+ }
+
+ @Test
+ def testTermFilters {
+ // grab 2 results, filter to 1
+ val res1 = ESimplePanda where (_.hugenums contains 1L) filter(_.termsfield in List("termhit", "randomterm")) fetch()
+ Assert.assertEquals(res1.response.results.length, 1)
}
@Before
@@ -376,6 +420,7 @@ class ElasticQueryTest extends SpecsMatchers with ScalaCheckMatchers {
val favnums1 = List(1, 2, 3, 4, 5).asJava
val favnums2 = List(1, 2, 3, 4, 5).asJava
val favnums3 = List(6, 7, 8, 9, 10).asJava
+ val terms1 = List("termhit", "nohit").asJava
val nicknames1 = List("jerry", "dawg", "xzibit").asJava
val nicknames2 = List("xzibit", "alvin").asJava
val nicknames3 = List("alvin", "nathaniel", "joiner").asJava
@@ -389,6 +434,7 @@ class ElasticQueryTest extends SpecsMatchers with ScalaCheckMatchers {
.field("favnums", favnums1)
.field("nicknames", nicknames1)
.field("hugenums", hugenums1)
+ .field("termsfield", terms1)
.endObject()
).execute()
.actionGet();
@@ -18,6 +18,7 @@ class ESimplePanda extends ElasticSchema[ESimplePanda] {
object favnums extends SlashemIntListField(this)
object nicknames extends SlashemStringListField(this)
object hugenums extends SlashemLongListField(this)
+ object termsfield extends SlashemUnanalyzedStringField(this)
}
object ESimpleGeoPanda extends ESimpleGeoPanda with ElasticMeta[ESimpleGeoPanda] {
Oops, something went wrong.