Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues following update to pandas 1.0.1 #141

Merged
merged 6 commits into from Mar 27, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -43,6 +43,7 @@ ipython_config.py
# Environments
.env
.venv
.nox
env/
venv/
ENV/
Expand Down
2 changes: 1 addition & 1 deletion docs/requirements-docs.txt
@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
Expand Down
238 changes: 130 additions & 108 deletions docs/source/examples/demo_notebook.ipynb

Large diffs are not rendered by default.

100 changes: 79 additions & 21 deletions eland/dataframe.py
Expand Up @@ -22,7 +22,7 @@
from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
Expand Down Expand Up @@ -249,12 +249,19 @@ def tail(self, n=5):
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
>>> df.tail()
Origin Dest
13054 Pisa International Airport Xi'an Xianyang International Airport
13055 Winnipeg / James Armstrong Richardson Internat... Zurich Airport
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base
13057 Itami Airport Ministro Pistarini International Airport
13058 Adelaide International Airport Washington Dulles International Airport
Origin \\
13054 Pisa International Airport
13055 Winnipeg / James Armstrong Richardson International Airport
13056 Licenciado Benito Juarez International Airport
13057 Itami Airport
13058 Adelaide International Airport
<BLANKLINE>
Dest
13054 Xi'an Xianyang International Airport
13055 Zurich Airport
13056 Ukrainka Air Base
13057 Ministro Pistarini International Airport
13058 Washington Dulles International Airport
<BLANKLINE>
[5 rows x 2 columns]
"""
Expand Down Expand Up @@ -602,22 +609,25 @@ def info(
<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, 0 to 4674
Data columns (total 2 columns):
customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customer_first_name 4675 non-null object
1 geoip.city_name 4094 non-null object
dtypes: object(2)
memory usage: ...
memory usage: 80.0 bytes
"""
if buf is None: # pragma: no cover
buf = sys.stdout

lines = [str(type(self)), self._index_summary()]

if len(self.columns) == 0:
lines.append(f"Empty {type(self).__name__}")
lines.append("Empty {name}".format(name=type(self).__name__))
sethmlarson marked this conversation as resolved.
Show resolved Hide resolved
fmt.buffer_put_lines(buf, lines)
return

cols = self.columns
col_count = len(self.columns)

# hack
if max_cols is None:
Expand All @@ -637,30 +647,74 @@ def _put_str(s, space):

def _verbose_repr():
lines.append(f"Data columns (total {len(self.columns)} columns):")
space = max(len(pprint_thing(k)) for k in self.columns) + 4

id_head = " # "
column_head = "Column"
col_space = 2

max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space

max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None

tmpl = "{count}{dtype}"
header = _put_str(id_head, space_num) + _put_str(column_head, space)
sethmlarson marked this conversation as resolved.
Show resolved Hide resolved
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
f"Columns must equal counts "
f"({len(cols):d} != {len(counts):d})"
"Columns must equal counts "
"({cols:d} != {counts:d})".format(
cols=len(cols), counts=len(counts)
)
)
tmpl = "{count} non-null {dtype}"
count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"

dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)

lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)

dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)

line_no = _put_str(" {num}".format(num=i), space_num)

count = ""
if show_counts:
count = counts.iloc[i]

lines.append(
_put_str(col, space) + tmpl.format(count=count, dtype=dtype)
line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
)

def _non_verbose_repr():
Expand All @@ -670,9 +724,13 @@ def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ["bytes", "KB", "MB", "GB", "TB"]:
if num < 1024.0:
return f"{num:3.1f}{size_qualifier} {x}"
return "{num:3.1f}{size_q} " "{x}".format(
sethmlarson marked this conversation as resolved.
Show resolved Hide resolved
num=num, size_q=size_qualifier, x=x
)
num /= 1024.0
return f"{num:3.1f}{size_qualifier} PB"
return "{num:3.1f}{size_q} {pb}".format(
num=num, size_q=size_qualifier, pb="PB"
)

if verbose:
_verbose_repr()
Expand Down Expand Up @@ -769,7 +827,7 @@ def to_html(
df = self._build_repr(max_rows + 1)

if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()

Expand Down Expand Up @@ -866,7 +924,7 @@ def to_string(
df = self._build_repr(max_rows + 1)

if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()

Expand Down
40 changes: 20 additions & 20 deletions eland/ndframe.py
Expand Up @@ -238,16 +238,16 @@ def min(self, numeric_only=True):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.min()
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
AvgTicketPrice 100.021
Cancelled False
DistanceKilometers 0
DistanceMiles 0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0
FlightTimeMin 0
dayOfWeek 0
dtype: object
"""
return self._query_compiler.min(numeric_only=numeric_only)

Expand All @@ -270,16 +270,16 @@ def max(self, numeric_only=True):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.max()
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
AvgTicketPrice 1199.73
Cancelled True
DistanceKilometers 19881.5
DistanceMiles 12353.8
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715
FlightTimeMin 1902.9
dayOfWeek 6
dtype: object
"""
return self._query_compiler.max(numeric_only=numeric_only)

Expand Down
23 changes: 20 additions & 3 deletions eland/operations.py
Expand Up @@ -126,10 +126,14 @@ def sum(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)

def max(self, query_compiler, numeric_only=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the above frames this keep_domain change looks pretty good, would also like @stevedodson to confirm here.

return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
)

def min(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
)

def nunique(self, query_compiler):
return self._metric_aggs(
Expand All @@ -142,13 +146,22 @@ def value_counts(self, query_compiler, es_size):
def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins)

def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None):
def _metric_aggs(
self,
query_compiler,
func,
field_types=None,
numeric_only=None,
keep_original_dtype=False,
):
"""
Parameters
----------
field_types: str, default None
if `aggregatable`, use only field_names whose fields in Elasticsearch are aggregatable.
If `None`, use only numeric fields.
keep_original_dtype : bool, default False
If `True`, the output values should keep the same domain as the input values, i.e. booleans should be booleans.

Returns
-------
Expand Down Expand Up @@ -235,6 +248,10 @@ def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None
results[field] = elasticsearch_date_to_pandas_date(
response["aggregations"][field]["value_as_string"], date_format
)
elif keep_original_dtype:
sethmlarson marked this conversation as resolved.
Show resolved Hide resolved
results[field] = pd_dtype.type(
response["aggregations"][field]["value"]
)
else:
results[field] = response["aggregations"][field]["value"]

Expand Down
4 changes: 2 additions & 2 deletions eland/series.py
Expand Up @@ -35,7 +35,7 @@

import numpy as np
import pandas as pd
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path

import eland.plotting
from eland import NDFrame
Expand Down Expand Up @@ -365,7 +365,7 @@ def to_string(
temp_series = self._build_repr(max_rows + 1)

if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()

Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
nbval
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -187,6 +187,6 @@
classifiers=CLASSIFIERS,
keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"],
install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
python_requires=">=3.6",
)