Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing var, std and flaky tests #185

Merged
merged 5 commits on Apr 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
45 changes: 45 additions & 0 deletions eland/operations.py
Expand Up @@ -15,6 +15,8 @@
import copy
import warnings

import numpy as np

import pandas as pd
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
from elasticsearch.helpers import scan
Expand Down Expand Up @@ -285,6 +287,49 @@ def _metric_aggs(
results[field] = response["aggregations"][
"percentiles_" + field
]["values"]["50.0"]

# If 0-length dataframe we get None here
if results[field] is None:
results[field] = np.float64(np.NaN)
elif func[1] == "variance":
stevedodson marked this conversation as resolved.
Show resolved Hide resolved
# pandas computes the sample variance
# Elasticsearch computes the population variance
count = response["aggregations"][func[0] + "_" + field][
"count"
]

results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]

# transform population variance into sample variance
if count <= 1:
results[field] = np.float64(np.NaN)
else:
results[field] = count / (count - 1.0) * results[field]
elif func[1] == "std_deviation":
# pandas computes the sample std
# Elasticsearch computes the population std
count = response["aggregations"][func[0] + "_" + field][
"count"
]

results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]

# transform population std into sample std
# sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
# population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
# sample_std=\sqrt{\frac{N}{N-1}}\cdot population_std
if count <= 1:
results[field] = np.float64(np.NaN)
else:
results[field] = np.sqrt(
(count / (count - 1.0))
* results[field]
* results[field]
)
else:
results[field] = response["aggregations"][
func[0] + "_" + field
Expand Down
37 changes: 36 additions & 1 deletion eland/tests/dataframe/test_metrics_pytest.py
Expand Up @@ -37,11 +37,46 @@ def test_flights_extended_metrics(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

# Test on reduced set of data for more consistent
# median behaviour + better var, std test for sample vs population
pd_flights = pd_flights[pd_flights.DestAirportID == "AMS"]
ed_flights = ed_flights[ed_flights.DestAirportID == "AMS"]

for func in self.extended_funcs:
pd_metric = getattr(pd_flights, func)(numeric_only=True)
ed_metric = getattr(ed_flights, func)(numeric_only=True)

assert_series_equal(pd_metric, ed_metric, check_less_precise=True)
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)

def test_flights_extended_metrics_nan(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

# Test on single row to test NaN behaviour of sample std/variance
pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"]
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"]

for func in self.extended_funcs:
pd_metric = getattr(pd_flights_1, func)(numeric_only=True)
ed_metric = getattr(ed_flights_1, func)(numeric_only=True)

assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)

# Test on zero rows to test NaN behaviour of sample std/variance
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"]
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"]

for func in self.extended_funcs:
pd_metric = getattr(pd_flights_0, func)(numeric_only=True)
ed_metric = getattr(ed_flights_0, func)(numeric_only=True)

assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)

def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric
Expand Down
12 changes: 6 additions & 6 deletions eland/tests/ml/test_imported_ml_model_pytest.py
Expand Up @@ -42,7 +42,7 @@ def test_decision_tree_classifier(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
Expand All @@ -66,7 +66,7 @@ def test_decision_tree_regressor(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
Expand All @@ -90,7 +90,7 @@ def test_random_forest_classifier(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
Expand All @@ -114,7 +114,7 @@ def test_random_forest_regressor(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
Expand All @@ -138,7 +138,7 @@ def test_xgb_classifier(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
Expand All @@ -162,7 +162,7 @@ def test_xgb_regressor(self):
)
es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=4)
np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()