Skip to content

Commit

Permalink
BUG, DOC: Improve dialect handling in read_csv
Browse files Browse the repository at this point in the history
1) Update documentation about how the dialect
parameter is handled.

2) Verify that the dialect parameter passed in
is valid before accessing the dialect attributes.

Closes gh-14898.
  • Loading branch information
gfyoung committed Dec 18, 2016
1 parent e503d40 commit a3f3a88
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 49 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ Map on Index types now return other Index types
Other API Changes
^^^^^^^^^^^^^^^^^

- ``pd.read_csv()`` will now issue a ``UserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)

.. _whatsnew_0200.deprecations:

Expand Down Expand Up @@ -236,6 +237,7 @@ Bug Fixes

- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)



Expand Down
38 changes: 30 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,9 @@
standard encodings
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
dialect : str or csv.Dialect instance, default None
If None defaults to Excel dialect. Ignored if sep longer than 1 char
See csv.Dialect documentation for more details
If provided, this parameter will override values for the following
parameters: `delimiter`, `doublequote`, `escapechar`, `skipinitialspace`,
`quotechar`, and `quoting`. See csv.Dialect documentation for more details.
tupleize_cols : boolean, default False
Leave a list of tuples on columns as is (default is to convert to
a Multi Index on the columns)
Expand Down Expand Up @@ -692,12 +693,33 @@ def __init__(self, f, engine=None, **kwds):
dialect = kwds['dialect']
if dialect in csv.list_dialects():
dialect = csv.get_dialect(dialect)
kwds['delimiter'] = dialect.delimiter
kwds['doublequote'] = dialect.doublequote
kwds['escapechar'] = dialect.escapechar
kwds['skipinitialspace'] = dialect.skipinitialspace
kwds['quotechar'] = dialect.quotechar
kwds['quoting'] = dialect.quoting

# Any valid dialect should have these attributes.
# If any are missing, raise a ValueError naming the invalid dialect.
for param in ('delimiter', 'doublequote', 'escapechar',
'skipinitialspace', 'quotechar', 'quoting'):
try:
dialect_val = getattr(dialect, param)
except AttributeError:
raise ValueError("Invalid dialect '{dialect}' provided"
.format(dialect=kwds['dialect']))
provided = kwds.get(param, _parser_defaults[param])

# Messages for conflicting values between the dialect instance
# and the actual parameters provided.
conflict_msgs = []

if dialect_val != provided:
conflict_msgs.append((
"Conflicting values for '{param}': '{val}' was "
"provided, but the dialect specifies '{diaval}'. "
"Using the dialect-specified value.".format(
param=param, val=provided, diaval=dialect_val)))

if conflict_msgs:
warnings.warn('\n\n'.join(conflict_msgs), UserWarning,
stacklevel=2)
kwds[param] = dialect_val

if kwds.get('header', 'infer') == 'infer':
kwds['header'] = 0 if kwds.get('names') is None else None
Expand Down
35 changes: 0 additions & 35 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,41 +77,6 @@ def test_read_csv(self):
fname = prefix + compat.text_type(self.csv1)
self.read_csv(fname, index_col=0, parse_dates=True)

def test_dialect(self):
    """Parse with a ``csv.Dialect`` instance that has quoting disabled.

    The first data row contains a stray double quote; the expected frame
    keeps that quote as literal text in the parsed value.
    """
    quoted_csv = ('label1,label2,label3\n'
                  'index1,"a,c,e\n'
                  'index2,b,d,f\n')
    plain_csv = ('label1,label2,label3\n'
                 'index1,a,c,e\n'
                 'index2,b,d,f\n')

    no_quote_dialect = csv.excel()
    no_quote_dialect.quoting = csv.QUOTE_NONE
    result = self.read_csv(StringIO(quoted_csv), dialect=no_quote_dialect)

    # Build the expectation from the quote-free data, then put the literal
    # quote character back where the dialect is expected to preserve it.
    expected = self.read_csv(StringIO(plain_csv))
    expected.replace('a', '"a', inplace=True)
    tm.assert_frame_equal(result, expected)

def test_dialect_str(self):
    """Parse using a dialect that is referenced by its registered name."""
    data = ('fruit:vegetable\n'
            'apple:brocolli\n'
            'pear:tomato\n')
    exp = DataFrame({
        'fruit': ['apple', 'pear'],
        'vegetable': ['brocolli', 'tomato']
    })

    # register_dialect() returns None, so there is nothing useful to bind.
    csv.register_dialect('mydialect', delimiter=':')
    try:
        df = self.read_csv(StringIO(data), dialect='mydialect')
        tm.assert_frame_equal(df, exp)
    finally:
        # The csv dialect registry is process-wide: always clean up so a
        # failure here cannot leak 'mydialect' into other tests.
        csv.unregister_dialect('mydialect')

def test_1000_sep(self):
data = """A|B|C
1|2,334|5
Expand Down
67 changes: 67 additions & 0 deletions pandas/io/tests/parser/dialect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-

"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv

from pandas import DataFrame
from pandas.compat import StringIO

import pandas.util.testing as tm


class DialectTests(object):
    """Dialect-handling tests shared by the parser test classes.

    This class does not define ``read_csv`` itself; every test calls
    ``self.read_csv``, so it must be combined with a class that provides it.
    """

    def test_dialect(self):
        """A ``csv.Dialect`` instance overrides the default quoting.

        A ``UserWarning`` is expected because the dialect's ``quoting``
        differs from the value ``read_csv`` would otherwise use.
        """
        data = ('label1,label2,label3\n'
                'index1,"a,c,e\n'
                'index2,b,d,f\n')

        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        with tm.assert_produces_warning(UserWarning):
            df = self.read_csv(StringIO(data), dialect=dia)

        # Build the expectation from quote-free data, then put the literal
        # quote character back where the dialect is expected to keep it.
        data = ('label1,label2,label3\n'
                'index1,a,c,e\n'
                'index2,b,d,f\n')
        exp = self.read_csv(StringIO(data))
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)

    def test_dialect_str(self):
        """A dialect may be passed by its registered name.

        A ``UserWarning`` is expected because the dialect's ``':'``
        delimiter differs from the value ``read_csv`` would otherwise use.
        """
        data = ('fruit:vegetable\n'
                'apple:brocolli\n'
                'pear:tomato\n')
        exp = DataFrame({
            'fruit': ['apple', 'pear'],
            'vegetable': ['brocolli', 'tomato']
        })

        csv.register_dialect('mydialect', delimiter=':')
        try:
            with tm.assert_produces_warning(UserWarning):
                df = self.read_csv(StringIO(data), dialect='mydialect')

            tm.assert_frame_equal(df, exp)
        finally:
            # The csv dialect registry is process-wide: always clean up so
            # a failure here cannot leak 'mydialect' into other tests.
            csv.unregister_dialect('mydialect')

    def test_dialect_conflict(self):
        """Conflicts between ``delimiter`` and the dialect produce a warning.

        When the explicit ``delimiter`` matches the dialect, no warning is
        expected. When it differs, a ``UserWarning`` is expected and the
        result must still match the dialect's delimiter.
        """
        data = 'a,b\n1,2'
        dialect = 'excel'
        exp = DataFrame({'a': [1], 'b': [2]})

        with tm.assert_produces_warning(None):
            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
            tm.assert_frame_equal(df, exp)

        with tm.assert_produces_warning(UserWarning):
            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
            tm.assert_frame_equal(df, exp)
13 changes: 7 additions & 6 deletions pandas/io/tests/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .common import ParserTests
from .header import HeaderTests
from .comment import CommentTests
from .dialect import DialectTests
from .quoting import QuotingTests
from .usecols import UsecolsTests
from .skiprows import SkipRowsTests
Expand All @@ -26,12 +27,12 @@


class BaseParser(CommentTests, CompressionTests,
ConverterTests, HeaderTests,
IndexColTests, MultithreadTests,
NAvaluesTests, ParseDatesTests,
ParserTests, SkipRowsTests,
UsecolsTests, QuotingTests,
DtypeTests):
ConverterTests, DialectTests,
HeaderTests, IndexColTests,
MultithreadTests, NAvaluesTests,
ParseDatesTests, ParserTests,
SkipRowsTests, UsecolsTests,
QuotingTests, DtypeTests):
def read_csv(self, *args, **kwargs):
raise NotImplementedError

Expand Down

0 comments on commit a3f3a88

Please sign in to comment.