elastic · sethmlarson · Mar 27, 2020 · Mar 15, 2020 · Mar 27, 2020 · Mar 27, 2020
diff --git a/.gitignore b/.gitignore
@@ -43,6 +43,7 @@ ipython_config.py
 # Environments
 .env
 .venv
+.nox
 env/
 venv/
 ENV/

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
@@ -1,5 +1,5 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
 pytest>=5.2.1
 git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master

diff --git a/docs/source/examples/demo_notebook.ipynb b/docs/source/examples/demo_notebook.ipynb
diff --git a/eland/dataframe.py b/eland/dataframe.py
@@ -22,7 +22,7 @@
 from pandas.core.computation.eval import eval
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.indexing import check_bool_indexer
-from pandas.io.common import _expand_user, _stringify_path
+from pandas.io.common import _expand_user, stringify_path
 from pandas.io.formats import console
 from pandas.io.formats import format as fmt
 from pandas.io.formats.printing import pprint_thing
@@ -249,12 +249,19 @@ def tail(self, n=5):
         --------
         >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
         >>> df.tail()
-                                                          Origin                                      Dest
-        13054                         Pisa International Airport      Xi'an Xianyang International Airport
-        13055  Winnipeg / James Armstrong Richardson Internat...                            Zurich Airport
-        13056     Licenciado Benito Juarez International Airport                         Ukrainka Air Base
-        13057                                      Itami Airport  Ministro Pistarini International Airport
-        13058                     Adelaide International Airport   Washington Dulles International Airport
+                                                                    Origin  \\
+        13054                                   Pisa International Airport   
+        13055  Winnipeg / James Armstrong Richardson International Airport   
+        13056               Licenciado Benito Juarez International Airport   
+        13057                                                Itami Airport   
+        13058                               Adelaide International Airport   
+        <BLANKLINE>
+                                                   Dest  
+        13054      Xi'an Xianyang International Airport  
+        13055                            Zurich Airport  
+        13056                         Ukrainka Air Base  
+        13057  Ministro Pistarini International Airport  
+        13058   Washington Dulles International Airport  
         <BLANKLINE>
         [5 rows x 2 columns]
         """
@@ -602,22 +609,25 @@ def info(
         <class 'eland.dataframe.DataFrame'>
         Index: 4675 entries, 0 to 4674
         Data columns (total 2 columns):
-        customer_first_name    4675 non-null object
-        geoip.city_name        4094 non-null object
+         #   Column               Non-Null Count  Dtype 
+        ---  ------               --------------  ----- 
+         0   customer_first_name  4675 non-null   object
+         1   geoip.city_name      4094 non-null   object
         dtypes: object(2)
-        memory usage: ...
+        memory usage: 80.0 bytes
         """
         if buf is None:  # pragma: no cover
             buf = sys.stdout
 
         lines = [str(type(self)), self._index_summary()]
 
         if len(self.columns) == 0:
-            lines.append(f"Empty {type(self).__name__}")
+            lines.append("Empty {name}".format(name=type(self).__name__))
             fmt.buffer_put_lines(buf, lines)
             return
 
         cols = self.columns
+        col_count = len(self.columns)
 
         # hack
         if max_cols is None:
@@ -637,30 +647,74 @@ def _put_str(s, space):
 
         def _verbose_repr():
             lines.append(f"Data columns (total {len(self.columns)} columns):")
-            space = max(len(pprint_thing(k)) for k in self.columns) + 4
+
+            id_head = " # "
+            column_head = "Column"
+            col_space = 2
+
+            max_col = max(len(pprint_thing(k)) for k in cols)
+            len_column = len(pprint_thing(column_head))
+            space = max(max_col, len_column) + col_space
+
+            max_id = len(pprint_thing(col_count))
+            len_id = len(pprint_thing(id_head))
+            space_num = max(max_id, len_id) + col_space
             counts = None
 
-            tmpl = "{count}{dtype}"
+            header = _put_str(id_head, space_num) + _put_str(column_head, space)
             if show_counts:
                 counts = self.count()
                 if len(cols) != len(counts):  # pragma: no cover
                     raise AssertionError(
-                        f"Columns must equal counts "
-                        f"({len(cols):d} != {len(counts):d})"
+                        "Columns must equal counts "
+                        "({cols:d} != {counts:d})".format(
+                            cols=len(cols), counts=len(counts)
+                        )
                     )
-                tmpl = "{count} non-null {dtype}"
+                count_header = "Non-Null Count"
+                len_count = len(count_header)
+                non_null = " non-null"
+                max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
+                space_count = max(len_count, max_count) + col_space
+                count_temp = "{count}" + non_null
+            else:
+                count_header = ""
+                space_count = len(count_header)
+                len_count = space_count
+                count_temp = "{count}"
+
+            dtype_header = "Dtype"
+            len_dtype = len(dtype_header)
+            max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
+            space_dtype = max(len_dtype, max_dtypes)
+            header += _put_str(count_header, space_count) + _put_str(
+                dtype_header, space_dtype
+            )
+
+            lines.append(header)
+            lines.append(
+                _put_str("-" * len_id, space_num)
+                + _put_str("-" * len_column, space)
+                + _put_str("-" * len_count, space_count)
+                + _put_str("-" * len_dtype, space_dtype)
+            )
 
             dtypes = self.dtypes
             for i, col in enumerate(self.columns):
                 dtype = dtypes.iloc[i]
                 col = pprint_thing(col)
 
+                line_no = _put_str(" {num}".format(num=i), space_num)
+
                 count = ""
                 if show_counts:
                     count = counts.iloc[i]
 
                 lines.append(
-                    _put_str(col, space) + tmpl.format(count=count, dtype=dtype)
+                    line_no
+                    + _put_str(col, space)
+                    + _put_str(count_temp.format(count=count), space_count)
+                    + _put_str(dtype, space_dtype)
                 )
 
         def _non_verbose_repr():
@@ -670,9 +724,13 @@ def _sizeof_fmt(num, size_qualifier):
             # returns size in human readable format
             for x in ["bytes", "KB", "MB", "GB", "TB"]:
                 if num < 1024.0:
-                    return f"{num:3.1f}{size_qualifier} {x}"
+                    return "{num:3.1f}{size_q} " "{x}".format(
+                        num=num, size_q=size_qualifier, x=x
+                    )
                 num /= 1024.0
-            return f"{num:3.1f}{size_qualifier} PB"
+            return "{num:3.1f}{size_q} {pb}".format(
+                num=num, size_q=size_qualifier, pb="PB"
+            )
 
         if verbose:
             _verbose_repr()
@@ -769,7 +827,7 @@ def to_html(
         df = self._build_repr(max_rows + 1)
 
         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()
 
@@ -866,7 +924,7 @@ def to_string(
         df = self._build_repr(max_rows + 1)
 
         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()
 

diff --git a/eland/ndframe.py b/eland/ndframe.py
@@ -238,16 +238,16 @@ def min(self, numeric_only=True):
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
         >>> df.min()
-        AvgTicketPrice        100.020531
-        Cancelled               0.000000
-        DistanceKilometers      0.000000
-        DistanceMiles           0.000000
-        FlightDelay             0.000000
-        FlightDelayMin          0.000000
-        FlightTimeHour          0.000000
-        FlightTimeMin           0.000000
-        dayOfWeek               0.000000
-        dtype: float64
+        AvgTicketPrice        100.021
+        Cancelled               False
+        DistanceKilometers          0
+        DistanceMiles               0
+        FlightDelay             False
+        FlightDelayMin              0
+        FlightTimeHour              0
+        FlightTimeMin               0
+        dayOfWeek                   0
+        dtype: object
         """
         return self._query_compiler.min(numeric_only=numeric_only)
 
@@ -270,16 +270,16 @@ def max(self, numeric_only=True):
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
         >>> df.max()
-        AvgTicketPrice         1199.729004
-        Cancelled                 1.000000
-        DistanceKilometers    19881.482422
-        DistanceMiles         12353.780273
-        FlightDelay               1.000000
-        FlightDelayMin          360.000000
-        FlightTimeHour           31.715034
-        FlightTimeMin          1902.901978
-        dayOfWeek                 6.000000
-        dtype: float64
+        AvgTicketPrice        1199.73
+        Cancelled                True
+        DistanceKilometers    19881.5
+        DistanceMiles         12353.8
+        FlightDelay              True
+        FlightDelayMin            360
+        FlightTimeHour         31.715
+        FlightTimeMin          1902.9
+        dayOfWeek                   6
+        dtype: object
         """
         return self._query_compiler.max(numeric_only=numeric_only)
 

diff --git a/eland/operations.py b/eland/operations.py
@@ -126,10 +126,14 @@ def sum(self, query_compiler, numeric_only=True):
         return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
 
     def max(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only)
+        return self._metric_aggs(
+            query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
+        )
 
     def min(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only)
+        return self._metric_aggs(
+            query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
+        )
 
     def nunique(self, query_compiler):
         return self._metric_aggs(
@@ -142,13 +146,22 @@ def value_counts(self, query_compiler, es_size):
     def hist(self, query_compiler, bins):
         return self._hist_aggs(query_compiler, bins)
 
-    def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None):
+    def _metric_aggs(
+        self,
+        query_compiler,
+        func,
+        field_types=None,
+        numeric_only=None,
+        keep_original_dtype=False,
+    ):
         """
         Parameters
         ----------
         field_types: str, default None
             if `aggregatable` use only field_names whose fields in elasticseach are aggregatable.
             If `None`, use only numeric fields.
+        keep_original_dtype : bool, default False
+            If `True`, cast each aggregated value back to the type of its source field, e.g. boolean fields return booleans instead of floats.
 
         Returns
         -------
@@ -235,6 +248,10 @@ def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None
                     results[field] = elasticsearch_date_to_pandas_date(
                         response["aggregations"][field]["value_as_string"], date_format
                     )
+                elif keep_original_dtype:
+                    results[field] = pd_dtype.type(
+                        response["aggregations"][field]["value"]
+                    )
                 else:
                     results[field] = response["aggregations"][field]["value"]
 

diff --git a/eland/series.py b/eland/series.py
@@ -35,7 +35,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.io.common import _expand_user, _stringify_path
+from pandas.io.common import _expand_user, stringify_path
 
 import eland.plotting
 from eland import NDFrame
@@ -365,7 +365,7 @@ def to_string(
         temp_series = self._build_repr(max_rows + 1)
 
         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,5 +1,5 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
 pytest>=5.2.1
 nbval

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
diff --git a/setup.py b/setup.py
@@ -187,6 +187,6 @@
     classifiers=CLASSIFIERS,
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
-    install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"],
+    install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
     python_requires=">=3.6",
 )