Skip to content

Commit

Permalink
ENH: Add DataFrame tabular repr
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Oct 8, 2016
1 parent 5895750 commit 6cccf4f
Show file tree
Hide file tree
Showing 3 changed files with 403 additions and 12 deletions.
80 changes: 79 additions & 1 deletion dask/dataframe/core.py
Expand Up @@ -270,7 +270,7 @@ def __setstate__(self, state):
def _keys(self):
    """Return one ``(name, partition_index)`` tuple per partition."""
    name = self._name
    return [(name, part) for part in range(self.npartitions)]

def __repr__(self):
def _repr_header(self):
name = self._name if len(self._name) < 10 else self._name[:7] + '...'
if self.known_divisions:
div_text = ', divisions=%s' % repr_long_list(self.divisions)
Expand All @@ -280,6 +280,28 @@ def __repr__(self):
return ("dd.%s<%s, npartitions=%s%s>" %
(self.__class__.__name__, name, self.npartitions, div_text))

@property
def _repr_data(self):
    """Data shown in the structure section of ``__repr__``.

    This implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError()

@property
def _repr_divisions(self):
    """Return a ``pd.Index`` named ``'divisions'`` used to label repr rows.

    With known divisions the index holds ``self.divisions``; otherwise it
    holds ``npartitions + 1`` placeholder entries.
    """
    if not self.known_divisions:
        # Use the *string* 'None' as the placeholder so the entries are
        # not converted to NaN.
        placeholders = ['None'] * (self.npartitions + 1)
        return pd.Index(placeholders, name='divisions')
    return pd.Index(self.divisions, name='divisions')

def __repr__(self):
    """Return the header line followed by a structure section.

    The structure section is titled ``Dask <ClassName> Structure:`` and
    contains ``repr(self._repr_data)``.
    """
    template = "{name}\n\nDask {klass} Structure:\n{data}"
    header = self._repr_header()
    class_name = self.__class__.__name__
    body = repr(self._repr_data)
    return template.format(name=header, klass=class_name, data=body)

@property
def index(self):
"""Return dask Index instance"""
Expand Down Expand Up @@ -1467,6 +1489,27 @@ def __array__(self, dtype=None, **kwargs):
def __array_wrap__(self, array, context=None):
    """Wrap ``array`` in a ``pd.Series`` carrying this object's name.

    ``context`` is accepted but not used.
    """
    wrapped = pd.Series(array, name=self.name)
    return wrapped

@cache_readonly
def _repr_data(self):
    """Build the ``pd.Series`` shown in the structure section of the repr.

    The first row holds the dtype as a string; one ``'...'`` row follows
    for each partition. Rows are labelled by ``self._repr_divisions``.
    """
    dtype_row = [str(self.dtype)]
    filler_rows = ['...'] * self.npartitions
    return pd.Series(dtype_row + filler_rows,
                     index=self._repr_divisions,
                     name=self.name)

def __repr__(self):
    """Return the header, the structure table and a custom footer.

    The footer reads ``Name: <name>, dtype: <dtype>``, or just
    ``dtype: <dtype>`` when ``self.name`` is None. The table is rendered
    with ``to_string()`` and the footer is appended explicitly here.
    """
    if self.name is None:
        footer = "dtype: {dtype}".format(dtype=self.dtype)
    else:
        footer = "Name: {name}, dtype: {dtype}".format(name=self.name,
                                                       dtype=self.dtype)
    template = "{name}\n\nDask {klass} Structure:\n{data}\n{footer}"
    return template.format(name=self._repr_header(),
                           klass=self.__class__.__name__,
                           data=self._repr_data.to_string(),
                           footer=footer)

@cache_readonly
def dt(self):
    """Return a ``DatetimeAccessor`` constructed from this object."""
    accessor = DatetimeAccessor(self)
    return accessor
Expand Down Expand Up @@ -1648,6 +1691,10 @@ def to_frame(self, name=None):
return self.map_partitions(M.to_frame, name,
meta=self._meta.to_frame(name))

@derived_from(pd.Series)
def to_string(self):
    # Render the structure table (self._repr_data) as plain text; takes no
    # formatting arguments.
    table = self._repr_data
    return table.to_string()

@classmethod
def _bind_operator_method(cls, name, op):
""" bind operator method like DataFrame.add to this class """
Expand Down Expand Up @@ -2128,6 +2175,10 @@ def to_bag(self, index=False):
from .io import to_bag
return to_bag(self, index)

@derived_from(pd.DataFrame)
def to_string(self):
    # Render the structure table (self._repr_data) as plain text; takes no
    # formatting arguments.
    table = self._repr_data
    return table.to_string()

def _get_numeric_data(self, how='any', subset=None):
# calculate columns to avoid unnecessary calculation
numerics = self._meta._get_numeric_data()
Expand Down Expand Up @@ -2389,6 +2440,29 @@ def info(self, buf=None, verbose=False, memory_usage=False):

put_lines(buf, lines)

@cache_readonly
def _repr_data(self):
    """Build the ``pd.DataFrame`` shown in the structure section of the repr.

    Each column's first row holds that column's dtype; one ``'...'`` row
    follows for each partition. Rows are labelled by
    ``self._repr_divisions`` and columns follow ``self.columns``.
    """
    dtypes = self.dtypes
    filler_rows = ['...'] * self.npartitions
    per_column = {}
    for column, dtype in zip(dtypes.index, dtypes.values):
        per_column[column] = [dtype] + filler_rows
    return pd.DataFrame(per_column,
                        index=self._repr_divisions,
                        columns=self.columns)

# HTML template filled by to_html / _repr_html_: the escaped header line,
# a bold section title, then the rendered table.
_HTML_FMT = (
    "{name}\n"
    "<div><strong>Dask DataFrame Structure:</strong></div>\n"
    "{data}"
)

@derived_from(pd.DataFrame)
def to_html(self):
    # Defined for frames only: pd.Series has no HTML representation.
    # The header is escaped before being placed into the HTML template.
    header = _escape_html_tag(self._repr_header())
    table = self._repr_data.to_html()
    return self._HTML_FMT.format(name=header, data=table)

def _repr_html_(self):
    """Return the HTML form of the repr.

    The escaped header is followed by the structure table, rendered via
    the table's own ``_repr_html_``.
    """
    header = _escape_html_tag(self._repr_header())
    table = self._repr_data._repr_html_()
    return self._HTML_FMT.format(name=header, data=table)


# bind operators
for op in [operator.abs, operator.add, operator.and_, operator_div,
Expand Down Expand Up @@ -3417,3 +3491,7 @@ def safe_head(df, n):
"`npartitions` to `head`.")
warnings.warn(msg.format(n, len(r)))
return r


def _escape_html_tag(s):
    """Escape ``&``, ``<`` and ``>`` in ``s`` so it is safe as HTML text.

    Every occurrence is escaped, not just the first ``<`` and ``>``. A
    header may contain further angle brackets or ampersands (for example
    inside the repr of string divisions), and leaving those raw would
    produce broken markup.

    Parameters
    ----------
    s : str
        Text to embed in HTML.

    Returns
    -------
    str
        ``s`` with ``&``, ``<`` and ``>`` replaced by ``&amp;``, ``&lt;``
        and ``&gt;``.
    """
    # '&' must be replaced first so the entities produced for '<' and '>'
    # are not themselves re-escaped.
    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
11 changes: 0 additions & 11 deletions dask/dataframe/tests/test_dataframe.py
Expand Up @@ -108,17 +108,6 @@ def test_Series():
assert repr(d.a).startswith('dd.Series')


def test_repr():
    """Check the repr of a frame, its index and one of its columns.

    Each repr must mention the class name, the first five characters of
    the collection's ``_name`` and the partition count, and must be
    shorter than 80 characters.
    """
    pdf = pd.DataFrame({'x': list(range(100))})
    ddf = dd.from_pandas(pdf, 3)

    for collection in (ddf, ddf.index, ddf.x):
        text = repr(collection)
        assert type(collection).__name__ in text
        assert collection._name[:5] in text
        assert str(collection.npartitions) in text
        assert len(text) < 80


def test_Index():
for case in [pd.DataFrame(np.random.randn(10, 5), index=list('abcdefghij')),
pd.DataFrame(np.random.randn(10, 5),
Expand Down

0 comments on commit 6cccf4f

Please sign in to comment.