elastic · sethmlarson · May 4, 2020 · Apr 23, 2020 · Apr 25, 2020 · Apr 25, 2020
diff --git a/eland/dataframe.py b/eland/dataframe.py
@@ -266,6 +266,40 @@ def tail(self, n: int = 5) -> "DataFrame":
         """
         return DataFrame(query_compiler=self._query_compiler.tail(n))
 
+    def sample(self, n=None, frac=None) -> "DataFrame":
+        """
+        Return `n` randomly sampled rows, or the specified fraction of rows.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of documents from index to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of rows to return, between 0 and 1 (inclusive).
+            Cannot be used with `n`.
+
+        Returns
+        -------
+        eland.DataFrame:
+            eland DataFrame containing the randomly sampled rows
+
+        Raises
+        ------
+        ValueError
+            If `frac` is outside [0, 1], `n` is not a whole number, or both
+            `n` and `frac` are given.
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.sample`
+        """
+        # There is no `replace` option here, so a fraction above 1 (upsampling)
+        # can never be satisfied; a negative fraction is rejected up front rather
+        # than being rounded to a row count first.
+        if frac is not None and not 0 <= frac <= 1:
+            raise ValueError(
+                "`frac` must be between 0 and 1 (inclusive): sampling with "
+                "replacement (upsampling) is not supported."
+            )
+        if n is not None and frac is None and n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+        if n is not None and frac is not None:
+            raise ValueError("Please enter a value for `frac` OR `n`, not both")
+
+        return DataFrame(query_compiler=self._query_compiler.sample(n=n, frac=frac))
+
     def drop(
         self,
         labels=None,

diff --git a/eland/operations.py b/eland/operations.py
@@ -34,6 +34,7 @@
 from eland.tasks import (
     HeadTask,
     TailTask,
+    SampleTask,
     BooleanFilterTask,
     ArithmeticOpFieldsTask,
     QueryTermsTask,
@@ -91,6 +92,10 @@ def tail(self, index, n):
         task = TailTask(index.sort_field, n)
         self._tasks.append(task)
 
+    def sample(self, n):
+        """Queue a SampleTask for ``n`` rows on this operations list."""
+        self._tasks.append(SampleTask(n))
+
     def arithmetic_op_fields(self, display_name, arithmetic_series):
         if self._arithmetic_op_fields_task is None:
             self._arithmetic_op_fields_task = ArithmeticOpFieldsTask(

diff --git a/eland/query.py b/eland/query.py
@@ -17,6 +17,7 @@
 from typing import Optional, Dict, List, Any
 
 from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
+from eland.score import RandomScore
 
 
 class Query:
@@ -169,5 +170,8 @@ def update_boolean_filter(self, boolean_filter: BooleanFilter) -> None:
         else:
             self._query = self._query & boolean_filter
 
+    def random_score(self) -> None:
+        """Replace the current query with a RandomScore wrapping it."""
+        randomized = RandomScore(self._query)
+        self._query = randomized
+
     def __repr__(self) -> str:
         return repr(self.to_search_body())
diff --git a/eland/query_compiler.py b/eland/query_compiler.py
@@ -19,10 +19,10 @@
 import numpy as np
 import pandas as pd
 
-from eland import Index
 from eland.field_mappings import FieldMappings
-from eland.operations import Operations
 from eland.filter import QueryFilter
+from eland.operations import Operations
+from eland import Index
 from eland.common import (
     ensure_es_client,
     DEFAULT_PROGRESS_REPORTING_NUM_ROWS,
@@ -403,6 +403,28 @@ def tail(self, n):
 
         return result
 
+    def sample(self, n=None, frac=None):
+        """
+        Return a copy of this object with a random-sampling operation queued.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of rows to sample. Defaults to 1 when `frac` is also None.
+        frac : float, optional
+            Fraction of the documents in the index pattern to sample.
+            Only consulted when `n` is None.
+
+        Returns
+        -------
+        The object returned by ``self.copy()``, with the sample operation
+        appended to its operations.
+
+        Raises
+        ------
+        ValueError
+            If the resolved number of rows is negative.
+        """
+        result = self.copy()
+
+        if n is None:
+            if frac is None:
+                n = 1
+            else:
+                # Ask the count API for the number of documents in the index.
+                # The `indexing.index_total` statistic is a running total of
+                # indexing operations, not a document count, so it must not be
+                # used to size the sample.
+                # NOTE(review): this counts the whole index pattern; filters
+                # already applied to this frame are not taken into account.
+                index_length = self._client.count(index=self._index_pattern)["count"]
+                n = int(round(frac * index_length))
+
+        if n < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide positive value."
+            )
+
+        result._operations.sample(n)
+
+        return result
+
     def es_query(self, query):
         return self._update_query(QueryFilter(query))
 

diff --git a/eland/score.py b/eland/score.py
@@ -0,0 +1,31 @@
+# Copyright 2020 Elasticsearch BV
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class RandomScore:
+    """
+    Wraps a query in a ``function_score`` body that uses ``random_score``.
+
+    The wrapped object must provide ``empty()`` and ``build()``.
+    """
+
+    def __init__(self, query):
+        # Fall back to matching every document when the wrapped query is empty.
+        inner = {"match_all": {}} if query.empty() else query.build()
+        self._score = {"function_score": {"query": inner, "random_score": {}}}
+
+    def empty(self):
+        """Return True only if no score body has been set."""
+        return self._score is None
+
+    def build(self):
+        """Return the ``function_score`` body as a dict."""
+        return self._score
diff --git a/eland/tasks.py b/eland/tasks.py
@@ -19,7 +19,6 @@
 from eland.actions import HeadAction, TailAction, SortIndexAction
 from eland.arithmetics import ArithmeticSeries
 
-
 if TYPE_CHECKING:
     from .actions import PostProcessingAction  # noqa: F401
     from .filter import BooleanFilter  # noqa: F401
@@ -185,6 +184,33 @@ def __repr__(self) -> str:
         return f"('{self._task_type}': ('sort_field': '{self._sort_field}', 'count': {self._count}))"
 
 
+class SampleTask(SizeTask):
+    """
+    Task that randomly samples up to ``count`` rows.
+
+    Resolving the task asks the query to apply random scoring and caps the
+    query size at ``count``.
+    """
+
+    def __init__(self, count: int) -> None:
+        super().__init__("sample")
+        self._count = count
+
+    def resolve_task(
+        self,
+        query_params: QUERY_PARAMS_TYPE,
+        post_processing: List["PostProcessingAction"],
+        query_compiler: "QueryCompiler",
+    ) -> RESOLVED_TASK_TYPE:
+        # Have the query score documents randomly.
+        query_params["query"].random_score()
+
+        # Never enlarge a size limit that is already present in query_params.
+        existing_size = query_params.get("query_size")
+        if existing_size is None:
+            query_params["query_size"] = self._count
+        else:
+            query_params["query_size"] = min(self._count, existing_size)
+
+        return query_params, post_processing
+
+    def size(self) -> int:
+        """Return the number of rows requested by this task."""
+        return self._count
+
+    def __repr__(self) -> str:
+        return f"('{self._task_type}': ('count': {self._count}))"
+
+
 class QueryIdsTask(Task):
     def __init__(self, must: bool, ids: List[str]):
         """

diff --git a/eland/tests/dataframe/test_sample_pytest.py b/eland/tests/dataframe/test_sample_pytest.py
@@ -0,0 +1,55 @@
+# Copyright 2020 Elasticsearch BV
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+from eland.tests.common import assert_pandas_eland_frame_equal
+
+
+class TestDataFrameSample(TestData):
+    """Tests for ``DataFrame.sample`` run against the flights test data."""
+
+    def test_sample_basic(self):
+        ed_flights = self.ed_flights()
+        sample_ed_flights = ed_flights.sample(n=10)._to_pandas()
+        assert len(sample_ed_flights) == 10
+
+    def test_sample_on_boolean_filter(self):
+        # NOTE: despite its name, this samples a column selection rather than
+        # a boolean filter.
+        ed_flights = self.ed_flights()
+        columns = ["timestamp", "OriginAirportID", "DestAirportID", "FlightDelayMin"]
+        shape = ed_flights[columns].sample(n=5)._to_pandas().shape
+        assert (5, 4) == shape
+
+    def test_sample_head(self):
+        # Sampling 5 rows out of a 5-row head must return exactly those rows.
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        pd_head_5 = pd_flights.head(5)
+        ed_head_5 = ed_flights.head(5).sample(5)
+        assert_pandas_eland_frame_equal(pd_head_5, ed_head_5)
+
+    def test_sample_frac_values(self):
+        # frac=1 on a 5-row head must return exactly those rows.
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        pd_head_5 = pd_flights.head(5)
+        ed_head_5 = ed_flights.head(5).sample(frac=1)
+        assert_pandas_eland_frame_equal(pd_head_5, ed_head_5)
+
+    def test_sample_frac_is(self):
+        frac = 0.1
+        ed_flights = self.ed_flights()
+
+        ed_flights_sample = ed_flights.sample(frac=frac)._to_pandas()
+        size = len(ed_flights._to_pandas())
+        assert len(ed_flights_sample) <= int(round(frac * size))
+
+    def test_sample_invalid_arguments(self):
+        # Each of these argument combinations must be rejected with ValueError:
+        # upsampling, a non-integer row count, and `n` together with `frac`.
+        ed_flights = self.ed_flights()
+
+        for kwargs in ({"frac": 1.5}, {"n": 2.5}, {"n": 1, "frac": 0.5}):
+            try:
+                ed_flights.sample(**kwargs)
+            except ValueError:
+                continue
+            raise AssertionError(f"sample(**{kwargs}) did not raise ValueError")