feat: support pipeline analyzer (#96) (#97)
* pipeline analyzer added

* stopwords analyzer added
muhammad-davatgar committed Sep 21, 2022
1 parent 01b009a commit 71cb2dc
Showing 2 changed files with 162 additions and 12 deletions.
112 changes: 101 additions & 11 deletions src/analyzer.rs
@@ -1,38 +1,38 @@
use serde::{Deserialize, Serialize};
use typed_builder::TypedBuilder;

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum AnalyzerFeature {
Frequency,
Norm,
Position,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum AnalyzerCase {
Lower,
None,
Upper,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum NgramStreamType {
Binary,
Utf8,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum GeoJsonType {
Shape,
Centroid,
Point,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct DelimiterAnalyzerProperties {
/// The value will be used as delimiter to split text into tokens as specified
@@ -42,14 +42,14 @@ pub struct DelimiterAnalyzerProperties {
pub delimiter: Option<String>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct StemAnalyzerProperties {
/// Format: `language[_COUNTRY][.encoding][@variant]`
pub locale: String,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct NormAnalyzerProperties {
/// Format: `language[_COUNTRY][.encoding][@variant]`
@@ -66,7 +66,7 @@ pub struct NormAnalyzerProperties {
pub accent: Option<bool>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct NgramAnalyzerProperties {
@@ -85,7 +85,7 @@ pub struct NgramAnalyzerProperties {
pub stream_type: Option<NgramStreamType>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct TextAnalyzerProperties {
@@ -122,7 +122,7 @@ pub struct TextAnalyzerProperties {
pub stemming: Option<bool>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct GeoJsonAnalyzerProperties {
/// Whether to index all GeoJSON geometry types, just the centroid, or just points
@@ -132,6 +132,22 @@ pub struct GeoJsonAnalyzerProperties {
// Skip the options as they "generally should remain unchanged"
}

#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct PipelineAnalyzerProperties {
pub pipeline: Vec<PipelineAnalyzers>,
}
#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct StopwordsAnalyzerProperties {
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub hex: Option<bool>,
pub stopwords: Vec<String>,
}

#[derive(Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase", tag = "type")]
pub enum AnalyzerInfo {
@@ -201,9 +217,83 @@ pub enum AnalyzerInfo {
#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<GeoJsonAnalyzerProperties>,
},
Stopwords {
name: String,
properties: StopwordsAnalyzerProperties,
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
Pipeline {
name: String,
properties: PipelineAnalyzerProperties,
},
}

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AnalyzerDescription {
pub name: String,
}

// These are the same analyzer types, customized for use inside a pipeline analyzer:
// a pipeline's sub-analyzers do not take a `name`, so the `name` field is omitted.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase", tag = "type")]
pub enum PipelineAnalyzers {
/// The `identity` Analyzer does not take additional properties.
Identity {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
Delimiter {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<DelimiterAnalyzerProperties>,
},

Stem {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<StemAnalyzerProperties>,
},

Norm {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<NormAnalyzerProperties>,
},

Ngram {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<NgramAnalyzerProperties>,
},

Text {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<TextAnalyzerProperties>,
},

Geojson {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<GeoJsonAnalyzerProperties>,
},
Stopwords {
properties: StopwordsAnalyzerProperties,
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
}
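For reference, a minimal sketch (not part of this commit) of how the new Stopwords variant could be put together from the types added above; the analyzer name and the stopword list are purely illustrative:

let stopwords_analyzer = AnalyzerInfo::Stopwords {
    // Hypothetical name, chosen only for this sketch.
    name: "illustrative_stopwords".to_string(),
    properties: StopwordsAnalyzerProperties::builder()
        // `hex` has a builder default (None); call `.hex(true)` instead if the
        // stopwords are supplied as hex-encoded strings.
        .stopwords(vec!["the".to_string(), "of".to_string()])
        .build(),
    features: Some(vec![AnalyzerFeature::Frequency]),
};

Registering it would then presumably go through the same database.create_analyzer(...) call that the tests below use for the other analyzer types.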
62 changes: 61 additions & 1 deletion tests/analyzer.rs
@@ -10,7 +10,8 @@ use uclient::ClientExt;

use arangors::analyzer::{
AnalyzerCase, AnalyzerFeature, AnalyzerInfo, GeoJsonAnalyzerProperties, GeoJsonType,
-NgramAnalyzerProperties, NgramStreamType, NormAnalyzerProperties,
+NgramAnalyzerProperties, NgramStreamType, NormAnalyzerProperties, PipelineAnalyzerProperties,
+PipelineAnalyzers,
};
use arangors::{
collection::{
@@ -83,6 +84,43 @@ async fn create_geo_analyzer<C: ClientExt>(
database.create_analyzer(info).await
}

#[maybe_async]
async fn create_pipeline_analyzer<C: ClientExt>(
database: &Database<C>,
analyzer_name: String,
) -> Result<AnalyzerInfo, ClientError> {
let norm_analyzer = PipelineAnalyzers::Norm {
features: Some(vec![AnalyzerFeature::Frequency, AnalyzerFeature::Norm]),
properties: Some(
NormAnalyzerProperties::builder()
.locale("en.utf-8".to_string())
.case(AnalyzerCase::Lower)
.build(),
),
};

let ngram_analyzer = PipelineAnalyzers::Ngram {
features: Some(vec![AnalyzerFeature::Frequency, AnalyzerFeature::Norm]),
properties: Some(
NgramAnalyzerProperties::builder()
.min(2)
.max(2)
.preserve_original(false)
.stream_type(NgramStreamType::Utf8)
.build(),
),
};

let pipe = AnalyzerInfo::Pipeline {
name: analyzer_name,
properties: PipelineAnalyzerProperties::builder()
.pipeline(vec![norm_analyzer, ngram_analyzer])
.build(),
};

database.create_analyzer(pipe).await
}

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
@@ -149,6 +187,28 @@ async fn test_create_and_drop_geo_analyzer() {
assert_eq!(result.is_err(), false);
}

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
async(any(feature = "surf_async"), async_std::test)
)]
async fn test_create_and_drop_pipeline_analyzer() {
test_setup();
let analyzer_name = "test_analyzer_pipeline_create".to_string();
let conn = connection().await;
let database = conn.db("test_db").await.unwrap();

let analyzer = create_pipeline_analyzer(&database, analyzer_name.clone()).await;

trace!("{:?}", analyzer);

assert_eq!(analyzer.is_err(), false);

let result = database.drop_analyzer(&analyzer_name).await;

assert_eq!(result.is_err(), false);
}
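As a rough illustration of the wire format these derives produce (again, not part of the commit), serializing the pipeline definition built by create_pipeline_analyzer with serde_json should give the internally tagged layout selected by the tag = "type" attributes; the exact fields depend on the builder calls:

// Sketch only: assumes serde_json is available and `info` is the
// AnalyzerInfo::Pipeline value constructed as in create_pipeline_analyzer.
let json = serde_json::to_string_pretty(&info).expect("serialization should not fail");
// Expected shape, roughly:
// {
//   "type": "pipeline",
//   "name": "...",
//   "properties": {
//     "pipeline": [
//       { "type": "norm",  "features": ["frequency", "norm"], "properties": { ... } },
//       { "type": "ngram", "features": ["frequency", "norm"], "properties": { ... } }
//     ]
//   }
// }
println!("{json}");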

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
