Skip to content

Commit

Permalink
BUG, DOC: Improve dialect handling in read_csv
Browse files Browse the repository at this point in the history
1) Update documentation about how the dialect
parameter is handled.

2) Verify that the dialect parameter passed in
is valid before accessing the dialect attributes.

Closes pandas-devgh-14898.
  • Loading branch information
gfyoung committed Dec 29, 2016
1 parent 975a440 commit 4a6bcce
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 51 deletions.
7 changes: 5 additions & 2 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,11 @@ encoding : str, default ``None``
Python standard encodings
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
dialect : str or :class:`python:csv.Dialect` instance, default ``None``
If ``None`` defaults to Excel dialect. Ignored if sep longer than 1 char. See
:class:`python:csv.Dialect` documentation for more details.
If provided, this parameter will override values (default or not) for the
following parameters: `delimiter`, `doublequote`, `escapechar`,
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
documentation for more details.
tupleize_cols : boolean, default ``False``
Leave a list of tuples on columns as is (default is to convert to a MultiIndex
on the columns).
Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ Other API Changes
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
- ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)

- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)

.. _whatsnew_0200.deprecations:

Deprecations
Expand Down Expand Up @@ -288,6 +290,8 @@ Bug Fixes
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)



- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
Expand Down
40 changes: 32 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,11 @@
standard encodings
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
dialect : str or csv.Dialect instance, default None
If None defaults to Excel dialect. Ignored if sep longer than 1 char
See csv.Dialect documentation for more details
If provided, this parameter will override values (default or not) for the
following parameters: `delimiter`, `doublequote`, `escapechar`,
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See csv.Dialect
documentation for more details.
tupleize_cols : boolean, default False
Leave a list of tuples on columns as is (default is to convert to
a Multi Index on the columns)
Expand Down Expand Up @@ -698,12 +701,33 @@ def __init__(self, f, engine=None, **kwds):
dialect = kwds['dialect']
if dialect in csv.list_dialects():
dialect = csv.get_dialect(dialect)
kwds['delimiter'] = dialect.delimiter
kwds['doublequote'] = dialect.doublequote
kwds['escapechar'] = dialect.escapechar
kwds['skipinitialspace'] = dialect.skipinitialspace
kwds['quotechar'] = dialect.quotechar
kwds['quoting'] = dialect.quoting

# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
for param in ('delimiter', 'doublequote', 'escapechar',
'skipinitialspace', 'quotechar', 'quoting'):
try:
dialect_val = getattr(dialect, param)
except AttributeError:
raise ValueError("Invalid dialect '{dialect}' provided"
.format(dialect=kwds['dialect']))
provided = kwds.get(param, _parser_defaults[param])

# Messages for conflicting values between the dialect instance
# and the actual parameters provided.
conflict_msgs = []

if dialect_val != provided:
conflict_msgs.append((
"Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val)))

if conflict_msgs:
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
stacklevel=2)
kwds[param] = dialect_val

if kwds.get('header', 'infer') == 'infer':
kwds['header'] = 0 if kwds.get('names') is None else None
Expand Down
35 changes: 0 additions & 35 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,41 +77,6 @@ def test_read_csv(self):
fname = prefix + compat.text_type(self.csv1)
self.read_csv(fname, index_col=0, parse_dates=True)

def test_dialect(self):
data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
df = self.read_csv(StringIO(data), dialect=dia)

data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
exp = self.read_csv(StringIO(data))
exp.replace('a', '"a', inplace=True)
tm.assert_frame_equal(df, exp)

def test_dialect_str(self):
data = """\
fruit:vegetable
apple:brocolli
pear:tomato
"""
exp = DataFrame({
'fruit': ['apple', 'pear'],
'vegetable': ['brocolli', 'tomato']
})
dia = csv.register_dialect('mydialect', delimiter=':') # noqa
df = self.read_csv(StringIO(data), dialect='mydialect')
tm.assert_frame_equal(df, exp)
csv.unregister_dialect('mydialect')

def test_1000_sep(self):
data = """A|B|C
1|2,334|5
Expand Down
68 changes: 68 additions & 0 deletions pandas/io/tests/parser/dialect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-

"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv

from pandas import DataFrame
from pandas.compat import StringIO
from pandas.io.common import ParserWarning

import pandas.util.testing as tm


class DialectTests(object):

def test_dialect(self):
data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
with tm.assert_produces_warning(ParserWarning):
df = self.read_csv(StringIO(data), dialect=dia)

data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
exp = self.read_csv(StringIO(data))
exp.replace('a', '"a', inplace=True)
tm.assert_frame_equal(df, exp)

def test_dialect_str(self):
data = """\
fruit:vegetable
apple:brocolli
pear:tomato
"""
exp = DataFrame({
'fruit': ['apple', 'pear'],
'vegetable': ['brocolli', 'tomato']
})
csv.register_dialect('mydialect', delimiter=':')
with tm.assert_produces_warning(ParserWarning):
df = self.read_csv(StringIO(data), dialect='mydialect')

tm.assert_frame_equal(df, exp)
csv.unregister_dialect('mydialect')

def test_dialect_conflict(self):
data = 'a,b\n1,2'
dialect = 'excel'
exp = DataFrame({'a': [1], 'b': [2]})

with tm.assert_produces_warning(None):
df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
tm.assert_frame_equal(df, exp)

with tm.assert_produces_warning(ParserWarning):
df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
tm.assert_frame_equal(df, exp)
13 changes: 7 additions & 6 deletions pandas/io/tests/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .common import ParserTests
from .header import HeaderTests
from .comment import CommentTests
from .dialect import DialectTests
from .quoting import QuotingTests
from .usecols import UsecolsTests
from .skiprows import SkipRowsTests
Expand All @@ -26,12 +27,12 @@


class BaseParser(CommentTests, CompressionTests,
ConverterTests, HeaderTests,
IndexColTests, MultithreadTests,
NAvaluesTests, ParseDatesTests,
ParserTests, SkipRowsTests,
UsecolsTests, QuotingTests,
DtypeTests):
ConverterTests, DialectTests,
HeaderTests, IndexColTests,
MultithreadTests, NAvaluesTests,
ParseDatesTests, ParserTests,
SkipRowsTests, UsecolsTests,
QuotingTests, DtypeTests):
def read_csv(self, *args, **kwargs):
raise NotImplementedError

Expand Down

0 comments on commit 4a6bcce

Please sign in to comment.