Skip to content

Commit

Permalink
BUG: fixes issue pandas-dev#4322
Browse files Browse the repository at this point in the history
Adds support for the thousands character in csv parser for floats.

Updated docs to reflect bug fix.
  • Loading branch information
guyrt committed Aug 23, 2013
1 parent cba88ed commit 0922599
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 25 deletions.
20 changes: 11 additions & 9 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,11 @@ They can take a number of arguments:
date_converters.py
- ``dayfirst``: if True then uses the DD/MM international/European date format
(This is False by default)
- ``thousands``: sepcifies the thousands separator. If not None, then parser
will try to look for it in the output and parse relevant data to integers.
Because it has to essentially scan through the data again, this causes a
- ``thousands``: specifies the thousands separator. If not None, this character will
be stripped from numeric dtypes. However, if it is the first character in a field,
that column will be imported as a string. In the PythonParser, if not None,
then parser will try to look for it in the output and parse relevant data to numeric
dtypes. Because it has to essentially scan through the data again, this causes a
significant performance hit so only use if necessary.
- ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser
- ``quotechar`` : string, The character to used to denote the start and end of a quoted item.
Expand Down Expand Up @@ -506,8 +508,8 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:

Thousand Separators
~~~~~~~~~~~~~~~~~~~
For large integers that have been written with a thousands separator, you can
set the ``thousands`` keyword to ``True`` so that integers will be parsed
For large numbers that have been written with a thousands separator, you can
set the ``thousands`` keyword to a string of length 1 so that integers will be parsed
correctly:

.. ipython:: python
Expand All @@ -521,7 +523,7 @@ correctly:
with open('tmp.csv', 'w') as fh:
fh.write(data)
By default, integers with a thousands separator will be parsed as strings
By default, numbers with a thousands separator will be parsed as strings

.. ipython:: python
Expand Down Expand Up @@ -1123,7 +1125,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
- ``numpy`` : direct decoding to numpy arrays. default is False;
Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
None. By default the timestamp precision will be detected, if this is not desired
then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to
seconds, milliseconds, microseconds or nanoseconds respectively.
Expand Down Expand Up @@ -1201,11 +1203,11 @@ nanoseconds
dfju
# Let Pandas detect the correct precision
dfju = pd.read_json(json)
dfju = pd.read_json(json)
dfju
# Or specify that all timestamps are in nanoseconds
dfju = pd.read_json(json, date_unit='ns')
dfju = pd.read_json(json, date_unit='ns')
dfju
.. ipython:: python
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`)
- Fix assignment with iloc/loc involving a dtype change in an existing column (:issue:`4312`)
have internal setitem_with_indexer in core/indexing to use Block.setitem
- Fixed bug where the thousands separator was not handled correctly for floating point numbers
  in csv_import (:issue:`4322`)

pandas 0.12
===========
Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,9 @@ Bug Fixes

- Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)

- Fixed bug where the thousands separator was not handled correctly for floating point numbers
  in csv_import (:issue:`4322`)

See the :ref:`full release notes
<release>` or issue tracker
on GitHub for a complete list.
92 changes: 87 additions & 5 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import pandas.io.parsers as parsers
from pandas.io.parsers import (read_csv, read_table, read_fwf,
TextFileReader, TextParser)
from pandas.util.testing import (assert_almost_equal,
from pandas.util.testing import (assert_equal,
assert_almost_equal,
assert_series_equal,
makeCustomDataframe as mkdf,
network,
Expand Down Expand Up @@ -67,6 +68,35 @@ def setUp(self):
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
self.xls1 = os.path.join(self.dirpath, 'test.xls')

def test_multi_character_decimal_marker(self):
    """A decimal marker longer than one character must be rejected."""
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    with self.assertRaises(ValueError):
        read_csv(StringIO(data), decimal=',,')

def test_empty_decimal_marker(self):
    """An empty string is not a valid decimal marker."""
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    with self.assertRaises(ValueError):
        read_csv(StringIO(data), decimal='')

def test_empty_thousands_marker(self):
    """An empty string is not a valid thousands marker."""
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    with self.assertRaises(ValueError):
        read_csv(StringIO(data), thousands='')


def test_multi_character_thousands_marker(self):
    """A thousands marker longer than one character must be rejected.

    NOTE: renamed from ``test_multi_character_decimal_marker`` — that
    name is already defined earlier in this class, and duplicate method
    names in a class body shadow each other, so only one of the two
    tests would ever be collected and run.  This one exercises the
    ``thousands`` keyword, so name it accordingly.
    """
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,')

def test_empty_string(self):
data = """\
One,Two,Three
Expand Down Expand Up @@ -164,14 +194,48 @@ def test_1000_sep(self):
1|2,334|5
10|13|10.
"""
expected = [[1, 2334., 5],
[10, 13, 10]]
expected = DataFrame({
'A': [1, 10],
'B': [2334, 13],
'C': [5, 10.]
})

df = self.read_csv(StringIO(data), sep='|', thousands=',')
assert_almost_equal(df.values, expected)
tm.assert_frame_equal(df, expected)

df = self.read_table(StringIO(data), sep='|', thousands=',')
assert_almost_equal(df.values, expected)
tm.assert_frame_equal(df, expected)

def test_1000_sep_with_decimal(self):
    """Thousands and decimal markers can be combined in a single read."""
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes the fixture frame is supposed to carry.
    for col, dtype in [('A', 'int64'), ('B', 'float'), ('C', 'float')]:
        assert_equal(expected[col].dtype, dtype)

    for reader in (self.read_csv, self.read_table):
        result = reader(StringIO(data), sep='|', thousands=',', decimal='.')
        tm.assert_frame_equal(result, expected)

    # Same values with the roles of '.' and ',' swapped.
    data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
    for reader in (self.read_csv, self.read_table):
        result = reader(StringIO(data_with_odd_sep), sep='|', thousands='.',
                        decimal=',')
        tm.assert_frame_equal(result, expected)

def test_squeeze(self):
data = """\
Expand Down Expand Up @@ -1862,6 +1926,24 @@ def test_1000_fwf(self):
thousands=',')
assert_almost_equal(df.values, expected)

def test_1000_sep_with_decimal(self):
    """A ',' thousands separator must not break float parsing."""
    data = """A|B|C
1|2,334.01|5
10|13|10.
"""

    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    for reader in (self.read_csv, self.read_table):
        result = reader(StringIO(data), sep='|', thousands=',')
        tm.assert_frame_equal(result, expected)

def test_comment_fwf(self):
data = """
1 2. 4 #hello world
Expand Down
8 changes: 4 additions & 4 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ cdef extern from "parser/tokenizer.h":
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)

inline int to_double(char *item, double *p_value,
char sci, char decimal)
char sci, char decimal, char thousands)
inline int to_complex(char *item, double *p_real,
double *p_imag, char sci, char decimal)
inline int to_longlong(char *item, long long *p_value)
Expand Down Expand Up @@ -355,7 +355,7 @@ cdef class TextReader:

if thousands is not None:
if len(thousands) != 1:
raise ValueError('Only length-1 decimal markers supported')
raise ValueError('Only length-1 thousands markers supported')
self.parser.thousands = ord(thousands)

if escapechar is not None:
Expand Down Expand Up @@ -1397,7 +1397,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
na_count += 1
data[0] = NA
else:
error = to_double(word, data, parser.sci, parser.decimal)
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
if strcasecmp(word, cinf) == 0:
data[0] = INF
Expand All @@ -1413,7 +1413,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
else:
for i in range(lines):
word = COLITER_NEXT(it)
error = to_double(word, data, parser.sci, parser.decimal)
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
if error != 1:
if strcasecmp(word, cinf) == 0:
data[0] = INF
Expand Down
19 changes: 13 additions & 6 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1633,7 +1633,7 @@ void test_count_lines(char *fname) {


// forward declaration
static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing);
static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing);


P_INLINE void lowercase(char *p) {
Expand Down Expand Up @@ -1661,11 +1661,11 @@ P_INLINE void uppercase(char *p) {
*
*/

int to_double(char *item, double *p_value, char sci, char decimal)
// Parse the whole of `item` as a double, honoring a caller-supplied
// scientific-notation marker, decimal point, and thousands separator
// ('\0' disables thousands handling).  The result is stored through
// p_value.  Returns nonzero (true) only when the entire string was
// consumed and xstrtod reported no range error via errno.
int to_double(char *item, double *p_value, char sci, char decimal, char tsep)
{
    char *stop;

    *p_value = xstrtod(item, &stop, decimal, sci, tsep, TRUE);

    return (errno == 0) && (!*stop);
}
Expand All @@ -1675,7 +1675,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
{
char *p_end;

*p_real = xstrtod(item, &p_end, decimal, sci, FALSE);
*p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE);
if (*p_end == '\0') {
*p_imag = 0.0;
return errno == 0;
Expand All @@ -1689,7 +1689,7 @@ int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, ch
if (*p_end == '+') {
++p_end;
}
*p_imag = xstrtod(p_end, &p_end, decimal, sci, FALSE);
*p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE);
if (errno || ((*p_end != 'i') && (*p_end != 'j'))) {
return FALSE;
}
Expand Down Expand Up @@ -1856,10 +1856,12 @@ int main(int argc, char *argv[])
// * Added decimal and sci arguments.
// * Skip trailing spaces.
// * Commented out the other functions.
// Modifications by Richard T Guy, August 2013:
// * Add tsep argument for thousands separator
//

static double xstrtod(const char *str, char **endptr, char decimal,
char sci, int skip_trailing)
char sci, char tsep, int skip_trailing)
{
double number;
int exponent;
Expand Down Expand Up @@ -1894,6 +1896,11 @@ static double xstrtod(const char *str, char **endptr, char decimal,
number = number * 10. + (*p - '0');
p++;
num_digits++;

if (tsep != '\0' && *p == tsep)
{
++p;
}
}

// Process decimal part
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep);
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error);

int P_INLINE to_double(char *item, double *p_value, char sci, char decimal);
int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep);
int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal);
int P_INLINE to_longlong(char *item, long long *p_value);
int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep);
Expand Down

0 comments on commit 0922599

Please sign in to comment.