feat: support pipeline analyzer (#96) (#97)
* pipeline analyzer added

* stopwords analyzer added
muhammad-davatgar committed Sep 21, 2022
1 parent 01b009a commit 71cb2dc
Showing 2 changed files with 162 additions and 12 deletions.
112 changes: 101 additions & 11 deletions src/analyzer.rs
@@ -1,38 +1,38 @@
use serde::{Deserialize, Serialize};
use typed_builder::TypedBuilder;

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum AnalyzerFeature {
Frequency,
Norm,
Position,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum AnalyzerCase {
Lower,
None,
Upper,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum NgramStreamType {
Binary,
Utf8,
}

-#[derive(Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum GeoJsonType {
Shape,
Centroid,
Point,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct DelimiterAnalyzerProperties {
/// The value will be used as delimiter to split text into tokens as specified
@@ -42,14 +42,14 @@ pub struct DelimiterAnalyzerProperties {
pub delimiter: Option<String>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct StemAnalyzerProperties {
/// Format: `language[_COUNTRY][.encoding][@variant]`
pub locale: String,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct NormAnalyzerProperties {
/// Format: `language[_COUNTRY][.encoding][@variant]`
@@ -66,7 +66,7 @@ pub struct NormAnalyzerProperties {
pub accent: Option<bool>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct NgramAnalyzerProperties {
@@ -85,7 +85,7 @@ pub struct NgramAnalyzerProperties {
pub stream_type: Option<NgramStreamType>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct TextAnalyzerProperties {
@@ -122,7 +122,7 @@ pub struct TextAnalyzerProperties {
pub stemming: Option<bool>,
}

-#[derive(Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
pub struct GeoJsonAnalyzerProperties {
/// Whether to index all GeoJSON geometry types, just the centroid, or just points
@@ -132,6 +132,22 @@ pub struct GeoJsonAnalyzerProperties {
// Skip the options as they "generally should remain unchanged"
}

#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct PipelineAnalyzerProperties {
pub pipeline: Vec<PipelineAnalyzers>,
}
#[derive(Clone, Debug, Serialize, Deserialize, TypedBuilder, PartialEq)]
#[builder(doc)]
#[serde(rename_all = "camelCase")]
pub struct StopwordsAnalyzerProperties {
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub hex: Option<bool>,
pub stopwords: Vec<String>,
}

#[derive(Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase", tag = "type")]
pub enum AnalyzerInfo {
@@ -201,9 +217,83 @@ pub enum AnalyzerInfo {
#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<GeoJsonAnalyzerProperties>,
},
Stopwords {
name: String,
properties: StopwordsAnalyzerProperties,
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
Pipeline {
name: String,
properties: PipelineAnalyzerProperties,
},
}

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AnalyzerDescription {
pub name: String,
}

// These are the same analyzer types, customized for use inside a pipeline analyzer:
// a pipeline's sub-analyzers do not take a `name`, so the `name` field is omitted.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase", tag = "type")]
pub enum PipelineAnalyzers {
/// The `identity` Analyzer does not take additional properties.
Identity {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
Delimiter {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<DelimiterAnalyzerProperties>,
},

Stem {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<StemAnalyzerProperties>,
},

Norm {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<NormAnalyzerProperties>,
},

Ngram {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<NgramAnalyzerProperties>,
},

Text {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<TextAnalyzerProperties>,
},

Geojson {
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,

#[serde(skip_serializing_if = "Option::is_none")]
properties: Option<GeoJsonAnalyzerProperties>,
},
Stopwords {
properties: StopwordsAnalyzerProperties,
#[serde(skip_serializing_if = "Option::is_none")]
features: Option<Vec<AnalyzerFeature>>,
},
}
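For reference, a minimal sketch (not part of this commit) of how the new Stopwords variant could be put together from the types added above; the analyzer name and the stopword list are purely illustrative:

let stopwords_analyzer = AnalyzerInfo::Stopwords {
    // Hypothetical name, chosen only for this sketch.
    name: "illustrative_stopwords".to_string(),
    properties: StopwordsAnalyzerProperties::builder()
        // `hex` has a builder default (None); call `.hex(true)` instead if the
        // stopwords are supplied as hex-encoded strings.
        .stopwords(vec!["the".to_string(), "of".to_string()])
        .build(),
    features: Some(vec![AnalyzerFeature::Frequency]),
};

Registering it would then presumably go through the same database.create_analyzer(...) call that the tests below use for the other analyzer types.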
62 changes: 61 additions & 1 deletion tests/analyzer.rs
@@ -10,7 +10,8 @@ use uclient::ClientExt;

use arangors::analyzer::{
AnalyzerCase, AnalyzerFeature, AnalyzerInfo, GeoJsonAnalyzerProperties, GeoJsonType,
-NgramAnalyzerProperties, NgramStreamType, NormAnalyzerProperties,
+NgramAnalyzerProperties, NgramStreamType, NormAnalyzerProperties, PipelineAnalyzerProperties,
+PipelineAnalyzers,
};
use arangors::{
collection::{
@@ -83,6 +84,43 @@ async fn create_geo_analyzer<C: ClientExt>(
database.create_analyzer(info).await
}

#[maybe_async]
async fn create_pipeline_analyzer<C: ClientExt>(
database: &Database<C>,
analyzer_name: String,
) -> Result<AnalyzerInfo, ClientError> {
let norm_analyzer = PipelineAnalyzers::Norm {
features: Some(vec![AnalyzerFeature::Frequency, AnalyzerFeature::Norm]),
properties: Some(
NormAnalyzerProperties::builder()
.locale("en.utf-8".to_string())
.case(AnalyzerCase::Lower)
.build(),
),
};

let ngram_analyzer = PipelineAnalyzers::Ngram {
features: Some(vec![AnalyzerFeature::Frequency, AnalyzerFeature::Norm]),
properties: Some(
NgramAnalyzerProperties::builder()
.min(2)
.max(2)
.preserve_original(false)
.stream_type(NgramStreamType::Utf8)
.build(),
),
};

let pipe = AnalyzerInfo::Pipeline {
name: analyzer_name,
properties: PipelineAnalyzerProperties::builder()
.pipeline(vec![norm_analyzer, ngram_analyzer])
.build(),
};

database.create_analyzer(pipe).await
}

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
@@ -149,6 +187,28 @@ async fn test_create_and_drop_geo_analyzer() {
assert_eq!(result.is_err(), false);
}

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
async(any(feature = "surf_async"), async_std::test)
)]
async fn test_create_and_drop_pipeline_analyzer() {
test_setup();
let analyzer_name = "test_analyzer_pipeline_create".to_string();
let conn = connection().await;
let database = conn.db("test_db").await.unwrap();

let analyzer = create_pipeline_analyzer(&database, analyzer_name.clone()).await;

trace!("{:?}", analyzer);

assert_eq!(analyzer.is_err(), false);

let result = database.drop_analyzer(&analyzer_name).await;

assert_eq!(result.is_err(), false);
}
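As a rough illustration of the wire format these derives produce (again, not part of the commit), serializing the pipeline definition built by create_pipeline_analyzer with serde_json should give the internally tagged layout selected by the tag = "type" attributes; the exact fields depend on the builder calls:

// Sketch only: assumes serde_json is available and `info` is the
// AnalyzerInfo::Pipeline value constructed as in create_pipeline_analyzer.
let json = serde_json::to_string_pretty(&info).expect("serialization should not fail");
// Expected shape, roughly:
// {
//   "type": "pipeline",
//   "name": "...",
//   "properties": {
//     "pipeline": [
//       { "type": "norm",  "features": ["frequency", "norm"], "properties": { ... } },
//       { "type": "ngram", "features": ["frequency", "norm"], "properties": { ... } }
//     ]
//   }
// }
println!("{json}");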

#[maybe_async::test(
any(feature = "reqwest_blocking"),
async(any(feature = "reqwest_async"), tokio::test),
