From 5e1a2417b220df1f4f38e8579f850aa9198767af Mon Sep 17 00:00:00 2001 From: Vitor Baptista Date: Tue, 13 Feb 2018 18:32:53 +0000 Subject: [PATCH 1/5] Run pylama inside tox --- Makefile | 1 - setup.py | 1 - tox.ini | 9 +++++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c11350f3..df361b23 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,6 @@ list: @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' test: - pylama $(PACKAGE) tox version: diff --git a/setup.py b/setup.py index 666a8bbf..e477f2e8 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,6 @@ def read(*paths): 'lxml>=3.0,<4.0', ] TESTS_REQUIRE = [ - 'pylama', 'tox', ] README = read('README.md') diff --git a/tox.ini b/tox.ini index d2e061e2..47696390 100644 --- a/tox.ini +++ b/tox.ini @@ -6,6 +6,7 @@ envlist= py34 py35 py36 + lint [testenv] extras= @@ -27,3 +28,11 @@ commands= --cov-config tox.ini \ --cov-report term-missing \ {posargs} + +[testenv:lint] +deps= + pylama +commands= + pylama \ + {[tox]package} \ + {posargs} From b426088345c0325339b4d20a5ec33b6b32b8c623 Mon Sep 17 00:00:00 2001 From: Vitor Baptista Date: Tue, 13 Feb 2018 18:35:35 +0000 Subject: [PATCH 2/5] Review and reorganise README and move API reference to docstrings --- .gitignore | 2 + README.md | 1054 +++++++++++++++++---------------------- tabulator/cli.py | 2 - tabulator/exceptions.py | 20 +- tabulator/loader.py | 27 +- tabulator/parser.py | 65 ++- tabulator/stream.py | 150 ++++-- tabulator/validate.py | 16 +- tabulator/writer.py | 23 +- tests/test_helpers.py | 5 +- 10 files changed, 698 insertions(+), 666 deletions(-) diff --git a/.gitignore b/.gitignore index 6c45bcaa..1c00eee3 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.pytest_cache/ .coverage .coverage.* .cache @@ -74,3 +75,4 @@ tmp # Extra datapackage +.#* diff --git a/README.md b/README.md index 70380356..1116201a 100644 --- a/README.md +++ b/README.md @@ -9,485 +9,294 @@ A library for reading and writing tabular data (csv/xls/json/etc). ## Features -- supports various formats: csv/tsv/xls/xlsx/json/ndjson/ods/gsheet/inline/sql/etc -- reads data from local, remote, stream or text sources -- streams data instead of using a lot of memory -- processes data via simple user processors -- saves data using the same interface -- custom loaders, parsers and writers -- support for compressed files +- **Supports most common tabular formats**: CSV, XLS, ODS, JSON, Google Sheets, SQL, and others. +- **Loads local and remote data**: Supports HTTP and FTP. +- **Low memory usage**: Only the current row is kept in memory, so you can + large datasets. +- **Supports compressed files**: Using ZIP or GZIP algorithms. +- **Extensible**: You can add support for custom file formats and loaders (e.g. + FTP). 
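+
+For example, you can stream a gzipped CSV directly from the web without loading
+the whole file into memory (a minimal sketch; the URL is only a placeholder):
+
+```python
+from tabulator import Stream
+
+# The https scheme and gz compression are inferred from the source string
+with Stream('https://example.com/table.csv.gz', headers=1) as stream:
+    for row in stream:
+        print(row)  # [value1, value2, ..]
+```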
+
+
+## Table of Contents
+
+- [Features](#features)
+- [Getting started](#getting-started)
+  - [Installation](#installation)
+  - [Running on CLI](#running-on-cli)
+  - [Running on Python](#running-on-python)
+- [Documentation](#documentation)
+  - [Stream](#stream)
+    - [Options](#options)
+  - [Supported schemes](#supported-schemes)
+    - [file](#file)
+    - [http/https/ftp/ftps](#httphttpsftpftps)
+    - [stream](#stream)
+    - [text](#text)
+  - [Supported file formats](#supported-file-formats)
+    - [csv (read & write)](#csv-read--write)
+    - [xls/xlsx (read only)](#xlsxlsx-read-only)
+    - [ods (read only)](#ods-read-only)
+    - [gsheet (read only)](#gsheet-read-only)
+    - [sql (read only)](#sql-read-only)
+    - [Data Package (read only)](#data-package-read-only)
+    - [inline (read only)](#inline-read-only)
+    - [json (read only)](#json-read-only)
+    - [ndjson (read only)](#ndjson-read-only)
+    - [tsv (read only)](#tsv-read-only)
+  - [Adding support for new file sources, formats, and writers](#adding-support-for-new-file-sources-formats-and-writers)
+    - [Custom loaders](#custom-loaders)
+    - [Custom parsers](#custom-parsers)
+    - [Custom writers](#custom-writers)
+  - [Validate](#validate)
+  - [Exceptions](#exceptions)
+- [API Reference](#api-reference)
+- [Contributing](#contributing)
+- [Changelog](#changelog)
+
+
 ## Getting started
 
 ### Installation
 
-The package use semantic versioning. It means that major versions could include breaking changes. It's highly recommended to specify `tabulator` version range if you use `setup.py` or `requirements.txt` file e.g. `tabulator<2.0`.
+```bash
+$ pip install tabulator
+```
+
+### Running on CLI
+
+Tabulator ships with a simple CLI called `tabulator` to read tabular data. For
+example:
 
 ```bash
-$ pip install tabulator # OR "sudo pip install tabulator"
+$ tabulator https://github.com/frictionlessdata/tabulator-py/raw/4c1b3943ac98be87b551d87a777d0f7ca4904701/data/table.csv.gz
+id,name
+1,english
+2,中国人
 ```
 
-### Examples
+You can see all supported options by running `tabulator --help`.
 
-It's pretty simple to start with `tabulator`:
+### Running on Python
 
 ```python
 from tabulator import Stream
 
-with Stream('path.csv', headers=1) as stream:
+with Stream('data.csv', headers=1) as stream:
     stream.headers # [header1, header2, ..]
     for row in stream:
-        row # [value1, value2, ..]
+        print(row) # [value1, value2, ..]
 ```
 
-There is an [examples](https://github.com/frictionlessdata/tabulator-py/tree/master/examples) directory containing other code listings.
+You can find other examples in the [examples][examples-dir] directory.
 
 ## Documentation
 
-The whole public API of this package is described here and follows semantic versioning rules. Everyting outside of this readme are private API and could be changed without any notification on any new version.
+In the following sections, we'll walk through some usage examples of
+this library. All examples were tested with Python 3.6, but should
+run fine with Python 3.3+.
 
 ### Stream
 
-The `Stream` class represents a tabular stream. It takes the `source` argument in a form of source string or object:
+The `Stream` class represents a tabular stream. It takes the file path as the
+`source` argument. For example:
 
 ```
 <scheme>://path/to/file.<format>
 ```
 
-and uses corresponding `Loader` and `Parser` to open and start to iterate over the tabular stream. Also user can pass `scheme` and `format` explicitly as constructor arguments. There are also alot other options described in sections below.
- -Let's create a simple stream object to read csv file: - -```python -from tabulator import Stream - -stream = Stream('data.csv') -``` - -This action just instantiate a stream instance. There is no actual IO interactions or source validity checks. We need to open the stream object. - -```python -stream.open() -``` - -This call will validate data source, open underlaying stream and read the data sample (if it's not disabled). All possible exceptions will be raised on `stream.open` call not on constructor call. - -After work with the stream is done it could be closed: - -```python -stream.close() -``` - -The `Stream` class supports Python context manager interface so calls above could be written using `with` syntax. It's a common and recommended way to use `tabulator` stream: - -```pytnon -with Stream('data.csv') as stream: - # use stream -``` - -Now we could iterate over rows in our tabular data source. It's important to understand that `tabulator` uses underlaying streams not loading it to memory (just one row at time). So the `stream.iter()` interface is the most effective way to use the stream: - -```python -for row in stream.iter(): - row # [value1, value2, ..] -``` - -But if you need all the data in one call you could use `stream.read()` function instead of `stream.iter()` function. But if you just run it after code snippet above the `stream.read()` call will return an empty list. That another important following of stream nature of `tabulator` - the `Stream` instance just iterates over an underlaying stream. The underlaying stream has internal pointer (for example as file-like object has). So after we've iterated over all rows in the first listing the pointer is set to the end of stream. - -```python -stream.read() # [] -``` - -The recommended way is to iterate (or read) over stream just once (and save data to memory if needed). But there is a possibility to reset the steram pointer. For some sources it will not be effective (another HTTP request for remote source). But if you work with local file as a source for example it's just a cheap `file.seek()` call: - -``` -stream.reset() -stream.read() # [[value1, value2, ..], ..] -``` - -The `Stream` class supports saving tabular data stream to the filesystem. Let's reset stream again (dont' forget about the pointer) and save it to the disk: - -``` -stream.reset() -stream.save('data-copy.csv') -``` - -The full session will be looking like this: - -```python -from tabulator import Stream - -with Stream('data.csv') as stream: - for row in stream.iter(): - row # [value1, value2, ..] - stream.reset() - stream.read() # [[value1, value2, ..], ..] - stream.reset() - stream.save('data-copy.csv') -``` - -It's just a pretty basic `Stream` introduction. Please read the full documentation below and about `Stream` arguments in more detail in following sections. There are many other goodies like headers extraction, keyed output, post parse processors and many more! - -#### `Stream(source, **options)` - -Create stream class instance. - -- `source (any)` - stream source in a form based on `scheme` argument -- `headers (list/int)` - headers list or row number containing headers or row numbers range containing headers. If number is given for plain source headers row and all rows before will be removed and for keyed source no rows will be removed. See [headers](https://github.com/frictionlessdata/tabulator-py#headers) section. -- `scheme (str)` - source scheme with `file` as default. For the most cases scheme will be inferred from source. 
See a list of supported schemas below. See [schemes](https://github.com/frictionlessdata/tabulator-py#schemes) section. -- `format (str)` - source format with `None` (detect) as default. For the most cases format will be inferred from source. See a list of supported formats below. See [formats](https://github.com/frictionlessdata/tabulator-py#formats) section. -- `encoding (str)` - source encoding with `None` (detect) as default. See [encoding](https://github.com/frictionlessdata/tabulator-py#encoding) section. -- `compression (str)` - source compression like `zip` with `None` (detect) as default. See [compression](https://github.com/frictionlessdata/tabulator-py#compression) section. -- `allow_html (bool)` - a flag to allow html. See [allow html](https://github.com/frictionlessdata/tabulator-py#allow-html) section. -- `sample_size (int)` - rows count for table.sample. Set to "0" to prevent any parsing activities before actual table.iter call. In this case headers will not be extracted from the source. See [sample size](https://github.com/frictionlessdata/tabulator-py#sample-size) section. -- `bytes_sample_size (int)` - sample size in bytes for operations like encoding detection. See [bytes sample size](https://github.com/frictionlessdata/tabulator-py#bytes-sample-size) section. -- `ignore_blank_headers (bool)` - a flag to ignore any column having a blank header. See [ignore blank headers](https://github.com/frictionlessdata/tabulator-py#ignore-blank-headers) section. -- `force_strings (bool)` - if `True` all output will be converted to strings. See [force strings](https://github.com/frictionlessdata/tabulator-py#force-strings) section. -- `force_parse (bool)` - if `True` on row parsing error a stream will return an empty row instead of raising an exception. See [force parse](https://github.com/frictionlessdata/tabulator-py#force-parse) section. -- `skip_rows (int/str[])` - list of rows to skip by row number or row comment. Example: `skip_rows=[1, 2, -1, -3, '#', '//']` - rows 1, 2 and rows 1, 3 from the end and all rows started with `#` and `//` will be skipped. See [skip rows](https://github.com/frictionlessdata/tabulator-py#skip-rows) section. -- `post_parse (generator[])` - post parse processors (hooks). Signature to follow is `processor(extended_rows) -> yield (row_number, headers, row)` which should yield one extended row per yield instruction. See [post parse](https://github.com/frictionlessdata/tabulator-py#post-parse) section. -- `custom_loaders (dict)` - loaders keyed by scheme. See a section below. See [custom loaders](https://github.com/frictionlessdata/tabulator-py#custom-loaders) section. -- `custom_parsers (dict)` - custom parsers keyed by format. See a section below. See [custom parsers](https://github.com/frictionlessdata/tabulator-py#custom-parsers) section. -- `custom_writers (dict)` - custom writers keyed by format. See a section below. See [custom writers](https://github.com/frictionlessdata/tabulator-py#custom-writers) section. -- ` ()` - loader/parser options. See in the scheme/format section -- `(Stream)` - returns Stream class instance - -#### `stream.closed` - -- `(bool)` - returns`True` if underlaying stream is closed - -#### `stream.open()` - -Open stream by opening underlaying stream. - -#### `stream.close()` - -Close stream by closing underlaying stream. - -#### `stream.reset()` - -Reset stream pointer to the first row. 
- -#### `stream.headers` - -- `(str[])` - returns data headers - -#### `stream.scheme` - -- `(str)` - returns an actual scheme - -#### `stream.format` - -- `(str)` - returns an actual format - -#### `stream.encoding` - -- `(str)` - returns an actual encoding - -#### `stream.sample` - -- `(list)` - returns data sample - -#### `stream.iter(keyed=False, extended=False)` - -Iter stream rows. See [keyed and extended rows](https://github.com/frictionlessdata/tabulator-py#https://github.com/frictionlessdata/tabulator-py#keyed-and-extended-rows) section. - -- `keyed (bool)` - if True yield keyed rows -- `extended (bool)` - if True yield extended rows -- `(any[]/any{})` - yields row/keyed row/extended row - -#### `stream.read(keyed=False, extended=False, limit=None)` - -Read table rows with count limit. See [keyed and extended rows](https://github.com/frictionlessdata/tabulator-py#https://github.com/frictionlessdata/tabulator-py#keyed-and-extended-rows) section. - -- `keyed (bool)` - return keyed rows -- `extended (bool)` - return extended rows -- `limit (int)` - rows count limit -- `(list)` - returns rows/keyed rows/extended rows - -#### `stream.save(target, format=None, encoding=None, **options)` - -Save stream to filesystem. - -- `target (str)` - stream target -- `format (str)` - saving format. See supported formats -- `encoding (str)` - saving encoding -- `options (dict)` - writer options - -### Schemes - -There is a list of all supported schemes. - -#### file - -The default scheme. Source should be a file in local filesystem. You could provide a string or a `pathlib.Path` instance: - -```python -stream = Stream('data.csv') -stream = Stream(pathlib.Path('data.csv')) -``` - -#### http/https/ftp/ftps - -> In Python 2 `tabulator` can't stream remote data source because of underlaying libraries limitation. The whole data source will be loaded to the memory. In Python 3 there is no such a problem and `tabulator` is able to stream remote data source as expected. - -Source should be a file available via one of this protocols in the web. - -```python -stream = Stream('http://example.com/data.csv') -``` - -Options: -- http_session - a `requests.Session` object. Read more in the `requests` [docs](http://docs.python-requests.org/en/master/user/advanced/#session-objects). -- http_stream - use HTTP streaming when possible. It's enabled by default. Disable if you'd like to preload the whole file into memory first. -#### stream - -Source should be a file-like python object which supports corresponding protocol. +It uses this path to determine the file format (e.g. CSV or XLS) and scheme +(e.g. HTTP or postgresql). If necessary, you also can define these explicitly. +Let's try it out. First, we create a `Stream` object passing the path to a CSV file. ```python -stream = Stream(open('data.csv')) -``` +import tabulator -#### text - -Source should be a string containing tabular data. In this case `format` has to be explicitely passed because it's not possible to infer it from source string. - - -```python -stream = Stream('text://name,age\nJohn, 21\n', format='csv') +stream = tabulator.Stream('data.csv') ``` -### Formats - -There is a list of all supported formats. Formats support `read` operation could be opened by `Stream.open()` and formats support `write` operation could be used in `Stream.save()`. - -#### csv - -Source should be parsable by csv parser. 
 ```python
-stream = Stream('data.csv', delimiter=',')
-```
-
-Operations:
-- read
-- write
-
-Options:
-- delimiter
-- doublequote
-- escapechar
-- quotechar
-- quoting
-- skipinitialspace
-- lineterminator
-
-See options reference in [Python documentation](https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters).
-
-#### datapackage
-
-> This format is not included to package by default. To use it please install `tabulator` with an `datapackage` extras: `$ pip install tabulator[datapackage]`
-
-Source should be a valid Tabular Data Package see (https://frictionlessdata.io).
+At this point, the file hasn't been read yet. Let's open the stream so we can
+read the contents.
 
 ```python
-stream = Stream('datapackage.json', resource=1)
+try:
+    stream.open()
+except tabulator.TabulatorException as e:
+    pass # Handle exception
 ```
 
-Operations:
-- read
-
-Options:
-- resource - resource index (starting from 0) or resource name
-
-#### gsheet
+This will open the underlying data stream, read a small sample to detect the
+file encoding, and prepare the data to be read. We catch
+`tabulator.TabulatorException` here, in case something goes wrong.
 
-Source should be a link to publicly available Google Spreadsheet.
+We can now read the file contents. To iterate over each row, we do:
 
 ```python
-stream = Stream('https://docs.google.com/spreadsheets/d/?usp=sharing')
-stream = Stream('https://docs.google.com/spreadsheets/d/edit#gid=')
+for row in stream.iter():
+    print(row) # [value1, value2, ...]
 ```
 
-#### inline
+The `stream.iter()` method will return each row's data as a list of values. If
+you prefer, you could call `stream.iter(keyed=True)` instead, which returns a
+dictionary with the column names as keys. Either way, this method keeps only a
+single row in memory at a time. This means it can handle large files
+without consuming too much memory.
 
-Source should be a list of lists or a list of dicts.
+If you want to read the entire file, use `stream.read()`. It accepts the same
+arguments as `stream.iter()`, but returns all rows at once.
 
 ```python
-stream = Stream([['name', 'age'], ['John', 21], ['Alex', 33]])
-stream = Stream([{'name': 'John', 'age': 21}, {'name': 'Alex', 'age': 33}])
+stream.reset()
+rows = stream.read()
 ```
 
-Operations:
-- read
-
-#### json
+Notice that we called `stream.reset()` before reading the rows. This is because
+internally, tabulator only keeps a pointer to its current location in the file.
+If we didn't reset this pointer, we would read starting from where we stopped.
+For example, if we ran `stream.read()` again, we would get an empty list, as
+the internal file pointer is at the end of the file (because we've already read
+it all). Depending on the file location, it might be necessary to download the
+file again to rewind (e.g. when the file was loaded from the web).
 
-Source should be a valid JSON document containing array of arrays or array of objects (see `inline` format example).
+After we're done, close the stream with:
 
 ```python
-stream = Stream('data.json', property='key1.key2')
+stream.close()
 ```
 
-Operations:
-- read
-
-Options:
-- property - path to tabular data property separated by dots. For example having data structure like `{"response": {"data": [...]}}` you should set property to `response.data`.
-
-#### ndjson
-
-Source should be parsable by ndjson parser.
+The entire example looks like: ```python -stream = Stream('data.ndjson') -``` +import tabulator -Operations: -- read - -#### ods +stream = tabulator.Stream('data.csv') +try: + stream.open() +except tabulator.TabulatorException as e: + pass # Handle exception -> This format is not included to package by default. To use it please install `tabulator` with an `ods` extras: `$ pip install tabulator[ods]` +for row in stream.iter(): + print(row) # [value1, value2, ...] -Source should be a valid Open Office document. +stream.reset() # Rewind internal file pointer +rows = stream.read() -```python -stream = Stream('data.ods', sheet=1) +stream.close() ``` -Operations: -- read - -Options: -- sheet - sheet number starting from 1 OR sheet name - -#### sql - -Source should be a valid database URL supported by `sqlalchemy`. +It could be rewritten to use Python's context manager interface as: ```python -stream = Stream('postgresql://name:pass@host:5432/database', table='data') -``` - -Operations: -- read - -Options: -- table - database table name to read data (REQUIRED) -- order_by - SQL expression to order rows e.g. `name desc` - -#### tsv +import tabulator -Source should be parsable by tsv parser. +try: + with tabulator.Stream('data.csv') as stream: + for row in stream.iter(): + print(row) -```python -stream = Stream('data.tsv') + stream.reset() + rows = stream.read() +except tabulator.TabulatorException as e: + pass ``` -Operations: -- read - -#### xls/xlsx - -> For `xls` format `tabulator` can't stream data source because of underlaying libraries limitation. The whole data source will be loaded to the memory. For `xlsx` format there is no such a problem and `tabulator` is able to stream data source as expected. +This is the preferred way, as Python closes the stream automatically, even if some exception was thrown along the way. -Source should be a valid Excel document. +The full API documentation is available as docstrings in the [Stream source code][stream.py]. -```python -stream = Stream('data.xls', sheet=1) -``` +#### Options -Operations: -- read +On this section, we'll see all different options supported by the `Stream` +class. -Options: -- sheet - sheet number starting from 1 OR sheet name -- fill_merged_cells - if `True` it will unmerge and fill all merged cells by a visible value. With this option enabled the parser can't stream data and load the whole document into memory. +##### Headers -### Headers - -By default `Stream` considers all data source rows as values: +By default, tabulator considers that all file rows are values (i.e. there is no +header). ```python -with Stream([['name', 'age'], ['Alex', 21]]): +with Stream([['name', 'age'], ['Alex', 21]]) as stream: stream.headers # None stream.read() # [['name', 'age'], ['Alex', 21]] ``` -To alter this behaviour `headers` argument is supported by `Stream` constructor. This argument could be an integer - row number starting from 1 containing headers: +If you have a header row, you can use the `headers` argument with the its row +number (starting from 1). 
 ```python
 # Integer
-with Stream([['name', 'age'], ['Alex', 21]], headers=1):
+with Stream([['name', 'age'], ['Alex', 21]], headers=1) as stream:
     stream.headers # ['name', 'age']
     stream.read() # [['Alex', 21]]
 ```
 
-Or it could be a list of strings - user-defined headers:
+You can also pass a list of strings to define the headers explicitly:
 
 ```python
-with Stream([['Alex', 21]], headers=['name', 'age']):
+with Stream([['Alex', 21]], headers=['name', 'age']) as stream:
     stream.headers # ['name', 'age']
     stream.read() # [['Alex', 21]]
 ```
 
-It's possible to use multiline headers:
+Tabulator also supports multiline headers for the `xls` and `xlsx` formats.
 
 ```python
-with Stream('data.xlsx', headers=[1,3], fill_merged_cells=True):
+with Stream('data.xlsx', headers=[1, 3], fill_merged_cells=True) as stream:
     stream.headers # ['header from row 1-3']
     stream.read() # [['value1', 'value2', 'value3']]
 ```
 
-If `headers` is a row number/range and data source is not keyed all rows before headers and headers will be removed from data stream (see first example).
+##### Encoding
 
-### Encoding
-
-`Stream` constructor accepts `encoding` argument to ensure needed encoding will be used. As a value argument supported by python encoding name (e.g. 'latin1', 'utf-8', ..) could be used:
+You can specify the file encoding (e.g. `utf-8` and `latin1`) via the `encoding`
+argument.
 
 ```python
 with Stream(source, encoding='latin1') as stream:
     stream.read()
 ```
 
-By default an encoding will be detected automatically. If you experience a *UnicodeDecodeError* parsing your file, try setting this argument to 'utf-8'.
+If this argument isn't set, Tabulator will try to infer it from the data. If you
+get a `UnicodeDecodeError` while loading a file, try setting the encoding to
+`utf-8`.
 
-### Compression
+##### Compression (Python3-only)
 
-`Stream` constructor accepts `compression` argument to ensure that needed compression will be used. By default compression will be inferred from file name:
+Tabulator supports both ZIP and GZIP compression methods. By default it'll infer from the file name:
 
 ```python
 with Stream('http://example.com/data.csv.zip') as stream:
     stream.read()
 ```
 
-Provide user defined compression e.g. `gz`:
+You can also set it explicitly:
 
 ```python
-with Stream('data.csv.ext', compression='zip') as stream:
+with Stream('data.csv.ext', compression='gz') as stream:
     stream.read()
 ```
 
-At the moment `tabulator` supports:
-- `zip` compression (Python3)
-- `gz` compression (Python3)
-
-### Allow html
+##### Allow html
 
-By default `Stream` will raise `exceptions.FormatError` on `stream.open()` call if html contents is detected. It's not a tabular format and for example providing link to csv file inside html (e.g. GitHub page) is a common mistake.
+The `Stream` class raises `tabulator.exceptions.FormatError` if it detects HTML
+contents. This helps avoid the relatively common mistake of trying to load a
+CSV file inside an HTML page, for example on GitHub.
 
-But sometimes this default behaviour is not what is needed. For example you write custom parser which should support html contents. In this case `allow_html` option for `Stream` could be used:
+You can disable this behaviour using the `allow_html` option:
 
 ```python
 with Stream(sorce_with_html, allow_html=True) as stream:
     stream.read() # no exception on open
 ```
 
-### Sample size
+##### Sample size
 
-By default `Stream` will read some data on `stream.open()` call in advance. This data is provided as `stream.sample`.
The size of this sample could be set in rows using `sample_size` argument of stream constructor: +To detect the file's headers, and run other checks like validating that the file +doesn't contain HTML, Tabulator reads a sample of rows on the `stream.open()` +method. This data is available via the `stream.sample` property. The number of +rows used can be defined via the `sample_size` parameters (defaults to 100). ```python with Stream(two_rows_source, sample_size=1) as stream: @@ -495,85 +304,77 @@ with Stream(two_rows_source, sample_size=1) as stream: stream.read() # first and second rows ``` -Data sample could be really useful if you want to implement some initial data checks without moving stream pointer as `stream.iter/read` do. But if you don't want any interactions with an actual source before first `stream.iter/read` call just disable data smapling with `sample_size=0`. +You can disable this by setting `sample_size` to zero. This way, no data will be +read on `stream.open()`. -### Bytes sample size +##### Bytes sample size -On initial reading stage `tabulator` should detect contents encoding. The argument `bytes_sample_size` customizes how many bytes will be read to detect encoding: +Tabulator needs to read a part of the file to infer its encoding. The +`bytes_sample_size` arguments controls how many bytes will be read for this +detection (defaults to 10000). ```python source = 'data/special/latin1.csv' with Stream(source) as stream: stream.encoding # 'iso8859-2' -with Stream(source, sample_size=0, bytes_sample_size=10) as stream: - stream.encoding # 'utf-8' ``` -In this example our data file doesn't include `iso8859-2` characters in first 10 bytes. So we could see the difference in encoding detection. Note `sample_size` usage here - these two parameters are independent. Here we use `sample_size=0` to prevent rows sample creation (will fail with bad encoding). +You can disable this by setting `bytes_sample_size` to zero, in which case it'll +use the machine locale's default encoding. -### Ignore blank headers +##### Ignore blank headers -Some data tables could have blank headers. For example it could be an empty strings in `csv` or `None` values in inline data. By default `tabulator` processes it as an ordinary header: +When `True`, tabulator will ignore columns that have blank headers (defaults to +`False`). -``` +```python +# Default behaviour source = 'text://header1,,header3\nvalue1,value2,value3' with Stream(source, format='csv', headers=1) as stream: stream.headers # ['header1', '', 'header3'] stream.read(keyed=True) # {'header1': 'value1', '': 'value2', 'header3': 'value3'} -``` -But sometimes it's not a desired behavior. You could ignore columns with a blank header completely using an `ignore_blank_headers` flag: - -``` +# Ignoring columns with blank headers source = 'text://header1,,header3\nvalue1,value2,value3' with Stream(source, format='csv', headers=1, ignore_blank_headers=True) as stream: stream.headers # ['header1', 'header3'] stream.read(keyed=True) # {'header1': 'value1', 'header3': 'value3'} ``` -### Force strings +##### Force strings -Because `tabulator` support not only sources with string data representation as `csv` but also sources supporting different data types as `json` or `inline` there is a `Stream` option `force_strings` to stringify all data values on reading. - -Here how stream works without forcing strings: +When `True`, all rows' values will be converted to strings (defaults to +`False`). 
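+Temporal values, such as dates and times, are rendered in their ISO
+representation.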
```python -with Stream([['string', 1, datetime.time(17, 00)]]) as stream: - stream.read() # [['string', 1, datetime.time(17, 00)]] -``` +# Default behaviour +with Stream([['string', 1, datetime.datetime(2017, 12, 1, 17, 00)]]) as stream: + stream.read() # [['string', 1, datetime.dateime(2017, 12, 1, 17, 00)]] -The same data source using `force_strings` option: - -```python +# Forcing rows' values as strings with Stream([['string', 1]], force_strings=True) as stream: - stream.read() # [['string', '1', '17:00:00']] + stream.read() # [['string', '1', '2017-12-01 17:00:00']] ``` -For all temporal values stream will use ISO format. But if your data source doesn't support temporal values (for instance `json` format) `Stream` just returns it as it is without converting to ISO format. - -### Force parse +##### Force parse -Some data source could be partially mailformed for a parser. For example `inline` source could have good rows (lists or dicts) and bad rows (for example strings). By default `stream.iter/read` will raise `exceptions.SourceError` on the first bad row: +When `True`, don't raise an exception when parsing a malformed row, but simply +return an empty row. Otherwise, tabulator raises +`tabulator.exceptions.SourceError` when a row can't be parsed. Defaults to `False`. ```python +# Default behaviour with Stream([[1], 'bad', [3]]) as stream: - stream.read() # raise exceptions.SourceError -``` - -With `force_parse` option for `Stream` constructor this default behaviour could be changed. If it's set to `True` non-parsable rows will be returned as empty rows: + stream.read() # raises tabulator.exceptions.SourceError -```python -with Stream([[1], 'bad', [3]]) as stream: +# With force_parse +with Stream([[1], 'bad', [3]], force_parse=True) as stream: stream.read() # [[1], [], [3]] ``` -### Skip rows - -It's a very common situation when your tabular data contains some rows you want to skip. It could be blank rows or commented rows. `Stream` constructors accepts `skip_rows` argument to make it possible. Value of this argument should be a list of integers and strings where: -- integer is a row number (1 is the first row, -1 is the last) -- string is a first row chars indicating that row is a comment +##### Skip rows -Let's skip first, second, last and commented by '#' symbol rows: +List of row numbers and/or strings to skip. If it's a string, all rows that begin with it will be skipped (e.g. '#' and '//'). ```python source = [['John', 1], ['Alex', 2], ['#Sam', 3], ['Mike', 4], ['John', 5]] @@ -581,9 +382,12 @@ with Stream(source, skip_rows=[1, 2, -1, '#']) as stream: stream.read() # [['Mike', 4]] ``` -### Post parse +##### Post parse -Skipping rows is a very basic ETL (extrac-transform-load) feature. For more advanced data transormations there are post parse processors. +List of functions that can filter or transform rows after they are parsed. These +functions receive the `extended_rows` containing the row's number, headers +list, and the row values list. They then process the rows, and yield or discard +them, modified or not. 
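+For example, the processors below first drop the odd-numbered rows and then
+double every value in the remaining ones:
+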
```python def skip_odd_rows(extended_rows): @@ -591,360 +395,423 @@ def skip_odd_rows(extended_rows): if not row_number % 2: yield (row_number, headers, row) -def multiply_on_two(extended_rows): +def multiply_by_two(extended_rows): for row_number, headers, row in extended_rows: - yield (row_number, headers, list(map(lambda value: value * 2, row))) - - -with Stream([[1], [2], [3], [4]], post_parse=[skip_odd_rows, multiply_on_two]) as stream: + doubled_row = list(map(lambda value: value * 2, row)) + yield (row_number, headers, doubled_row) + +rows = [ + [1], + [2], + [3], + [4], +] +with Stream(rows, post_parse=[skip_odd_rows, multiply_by_two]) as stream: stream.read() # [[4], [8]] ``` -Post parse processor gets extended rows (`[row_number, headers, row]`) iterator and must yields updated extended rows back. This interface is very powerful because every processors have full control on iteration process could skip rows, catch exceptions etc. +These functions are applied in order, as a simple data pipeline. In the example +above, `multiply_by_two` just sees the rows yielded by `skip_odd_rows`. -Processors will be applied to source from left to right. For example in listing above `multiply_on_two` processor gets rows from `skip_odd_rows` processor. +##### Keyed and extended rows -### Keyed and extended rows +The methods `stream.iter()` and `stream.read()` accept the `keyed` and +`extended` flag arguments to modify how the rows are returned. -Stream methods `stream.iter/read()` accept `keyed` and `extended` flags to vary data structure of output data row. - -By default a stream returns every row as a list: +By default, every row is returned as a list of its cells values: ```python with Stream([['name', 'age'], ['Alex', 21]]) as stream: stream.read() # [['Alex', 21]] ``` -With `keyed=True` a stream returns every row as a dict: +With `keyed=True`, the rows are returned as dictionaries, mapping the column names to their values in the row: ```python with Stream([['name', 'age'], ['Alex', 21]]) as stream: stream.read(keyed=True) # [{'name': 'Alex', 'age': 21}] ``` -And with `extended=True` a stream returns every row as a tuple contining row number starting from 1, headers as a list and row as a list: +And with `extended=True`, the rows are returned as a tuple of `(row_number, +headers, row)`, there `row_number` is the current row number (starting from 1), +`headers` is a list with the headers names, and `row` is a list with the rows +values: ```python with Stream([['name', 'age'], ['Alex', 21]]) as stream: stream.read(extended=True) # (1, ['name', 'age'], ['Alex', 21]) ``` -### Custom loaders +### Supported schemes -To create a custom loader `Loader` interface should be implemented and passed to `Stream` constructor as `custom_loaders={'scheme': CustomLoader}` argument. +#### file -For example let's implement a custom loader: +The default scheme, a file in the local filesystem. ```python -from tabulator import Loader +stream = Stream('data.csv') +``` -class CustomLoader(Loader): - options = [] - def __init__(self, bytes_sample_size, **options): - pass - def load(self, source, mode='t', encoding=None): - # load logic +#### http/https/ftp/ftps -with Stream(source, custom_loaders={'custom': CustomLoader}) as stream: - stream.read() +> In Python 2, `tabulator` can't stream remote data sources because of a limitation in the underlying libraries. The whole data source will be loaded to the memory. In Python 3 there is no such problem and remote files are streamed. 
+ +```python +stream = Stream('https://example.com/data.csv') ``` -There are more examples in internal `tabulator.loaders` module. +##### Options +- **http\_session** - a `requests.Session` object. Read more in the [requests docs][requests-session]. +- **http\_stream** - Enables or disables HTTP streaming, when possible (enabled by default). Disable it if you'd like to preload the whole file into memory. -#### `Loader.options` +#### stream -List of supported custom options. +The source is a file-like Python object. -#### `Loader(bytes_sample_size, **options)` -- `bytes_sample_size (int)` - sample size in bytes -- `options (dict)` - loader options -- `(Loader)` - returns `Loader` class instance +```python +with open('data.csv') as fp: + stream = Stream(fp) +``` -#### `loader.load(source, mode='t', encoding=None)` +#### text -- `source (str)` - table source -- `mode (str)` - text stream mode: 't' or 'b' -- `encoding (str)` - encoding of source -- `(file-like)` - returns file-like object of bytes or chars based on mode argument +The source is a string containing the tabular data. Both `scheme` and `format` +must be set expliticly, as it's not possible to infer them. -### Custom parsers +```python +stream = Stream( + 'name,age\nJohn, 21\n', + scheme='text', + format='csv' +) +``` + +### Supported file formats -To create a custom parser `Parser` interface should be implemented and passed to `Stream` constructor as `custom_parsers={'format': CustomParser}` argument. +In this section, we'll describe the supported file formats, and their respective +configuration options and operations. Some formats only support read operations, +while others support both reading and writing. -For example let's implement a custom parser: +#### csv (read & write) ```python -from tabulator import Parser +stream = Stream('data.csv', delimiter=',') +``` -class CustomParser(Parser): - options = [] - def __init__(self, loader, force_parse, **options): - self.__loader = loader - @property - def closed(self): - return False - def open(self, source, encoding=None): - # open logic - def close(self): - # close logic - def reset(self): - raise NotImplemenedError() - @property - def extended_rows(): - # extended rows logic +##### Options -with Stream(source, custom_parsers={'custom': CustomParser}) as stream: - stream.read() +It supports all options from the Python CSV library. Check [their +documentation][pydoc-csv] for more information. + +#### xls/xlsx (read only) + +> Tabulator is unable to stream `xls` files, so the entire file is loaded in +> memory. Streaming is supported for `xlsx` files. + +```python +stream = Stream('data.xls', sheet=1) ``` -There are more examples in internal `tabulator.parsers` module. +##### Options -#### `Parser.options` +- **sheet**: Sheet name or number (starting from 1) +- **fill_merged_cells**: if `True` it will unmerge and fill all merged cells by + a visible value. With this option enabled the parser can't stream data and + load the whole document into memory. -List of supported custom options. +#### ods (read only) -#### `Parser(loader, force_parse, **options)` +> This format is not included to package by default. To use it please install `tabulator` with an `ods` extras: `$ pip install tabulator[ods]` -Create parser class instance. +Source should be a valid Open Office document. 
-- `loader (Loader)` - loader instance -- `force_parse (bool)` - if True parser must yield (row_number, None, []) if there is an row in parsing error instead of stopping the iteration by raising an exception -- `options (dict)` - parser options -- `(Parser)` - returns `Parser` class instance +```python +stream = Stream('data.ods', sheet=1) +``` -#### `parser.closed` +##### Options -- `(bool)` - returns `True` if parser is closed +- **sheet**: Sheet name or number (starting from 1) -#### `parser.open(source, encoding=None)` +#### gsheet (read only) -Open underlaying stream. Parser gets byte or text stream from loader -to start emit items from this stream. +A publicly-accessible Google Spreadsheet. -- `source (str)` - table source -- `encoding (str)` - encoding of source +```python +stream = Stream('https://docs.google.com/spreadsheets/d/?usp=sharing') +stream = Stream('https://docs.google.com/spreadsheets/d/edit#gid=') +``` + +#### sql (read only) -#### `parser.close()` +Any database URL supported by [sqlalchemy][sqlalchemy]. + +```python +stream = Stream('postgresql://name:pass@host:5432/database', table='data') +``` -Close underlaying stream. +##### Options -#### `parser.reset()` +- **table (required)**: Database table name +- **order_by**: SQL expression for row ordering (e.g. `name DESC`) -Reset items and underlaying stream. After reset call iterations over items will start from scratch. +#### Data Package (read only) -#### `parser.encoding` +> This format is not included to package by default. You can enable it by +> installing tabulator using `pip install tabulator[datapackage]`. -- `(str)` - returns an actual encoding +A [Tabular Data Package][tdp]. -#### `parser.extended_rows` +```python +stream = Stream('datapackage.json', resource=1) +``` -- `(iterator)` - returns extended rows iterator +##### Options -### Custom writers +- **resource**: Resource name or index (starting from 0) -To create a custom writer `Writer` interface should be implemented and passed to `Stream` constructor as `custom_writers={'format': CustomWriter}` argument. +#### inline (read only) -For example let's implement a custom writer: +Either a list of lists, or a list of dicts mapping the colum names to their +respective values. ```python -from tabulator import Writer +stream = Stream([['name', 'age'], ['John', 21], ['Alex', 33]]) +stream = Stream([{'name': 'John', 'age': 21}, {'name': 'Alex', 'age': 33}]) +``` -class CustomWriter(Writer): - options = [] - def __init__(self, **options): - pass - def save(self, source, target, headers=None, encoding=None): - # save logic +#### json (read only) -with Stream(source, custom_writers={'custom': CustomWriter}) as stream: - stream.save(target) +JSON document containing a list of lists, or a list of dicts mapping the column +names to their respective values (see the `inline` format for an example). + +```python +stream = Stream('data.json', property='key1.key2') ``` -There are more examples in internal `tabulator.writers` module. +##### Options + +- **property**: JSON Path to the property containing the tabular data. For example, considering the JSON `{"response": {"data": [...]}}`, the `property` should be set to `response.data`. -#### `Writer.options` +#### ndjson (read only) + +```python +stream = Stream('data.ndjson') +``` -List of supported custom options. +#### tsv (read only) -#### `Writer(**options)` +```python +stream = Stream('data.tsv') +``` -Create writer class instance. 
+### Adding support for new file sources, formats, and writers -- `options (dict)` - writer options -- `(Writer)` - returns `Writer` class instance +Tabulator is written with extensibility in mind, allowing you to add support for +new tabular file formats, schemes (e.g. ssh), and writers (e.g. MongoDB). There +are three components that allow this: -#### `writer.save(source, target, headers=None, encoding=None)` +* Loaders + * Loads a stream from some location (e.g. ssh) +* Parsers + * Parses a stream of tabular data in some format (e.g. xls) +* Writers + * Writes tabular data to some destination (e.g. MongoDB) -Save source data to target. +In this section, we'll see how to write custom classes to extend any of these components. -- `source (str)` - data source -- `source (str)` - save target -- `headers (str[])` - optional headers -- `encoding (str)` - encoding of source +#### Custom loaders -### Validate +You can add support for a new scheme (e.g. ssh) by creating a custom loader. +Custom loaders are implemented by inheriting from the `Loader` class, and +implementing its methods. This loader can then be used by `Stream` to load data +by passing it via the `custom_loaders={'scheme': CustomLoader}` argument. -For cases you don't need open the source but want to know is it supported by `tabulator` or not you could use `validate` function. It also let you know what exactly is not supported raising correspondig exception class. +The skeleton of a custom loader looks like: ```python -from tabulator import validate, exceptions +from tabulator import Loader -try: - tabular = validate('data.csv') -except exceptions.TabulatorException: - tabular = False +class CustomLoader(Loader): + options = [] + + def __init__(self, bytes_sample_size, **options): + pass + + def load(self, source, mode='t', encoding=None): + # load logic + +with Stream(source, custom_loaders={'custom': CustomLoader}) as stream: + stream.read() ``` -#### `validate(source, scheme=None, format=None)` +You can see examples of how the loaders are implemented by looking in the +`tabulator.loaders` module. -Validate if this source has supported scheme and format. +#### Custom parsers -- `source (any)` - data source -- `scheme (str)` - data scheme -- `format (str)` - data format -- `(exceptions.SchemeError)` - raises if scheme is not supported -- `(exceptions.FormatError)` - raises if format is not supported -- `(bool)` - returns `True` if scheme/format is supported +You can add support for a new file format by creating a custom parser. Similarly +to custom loaders, custom parsers are implemented by inherinting from the +`Parser` class, and implementing its methods. This parser can then be used by +`Stream` to parse data by passing it via the `custom_parsers={'format': +CustomParser}` argument. -### Exceptions +The skeleton of a custom parser looks like: -#### `exceptions.TabulatorException` +```python +from tabulator import Parser -Base class for all `tabulator` exceptions. +class CustomParser(Parser): + options = [] -#### `exceptions.IOError` + def __init__(self, loader, force_parse, **options): + self.__loader = loader -All underlaying input-output errors. + def open(self, source, encoding=None): + # open logic -#### `exceptions.HTTPError` + def close(self): + # close logic -All underlaying HTTP errors. + def reset(self): + # reset logic -#### `exceptions.SourceError` + @property + def closed(self): + return False -This class of exceptions covers all source errors like bad data structure for JSON. 
+ @property + def extended_rows(self): + # extended rows logic -#### `exceptions.SchemeError` +with Stream(source, custom_parsers={'custom': CustomParser}) as stream: + stream.read() +``` -For example this exceptions will be used if you provide not supported source scheme like `bad://source.csv`. +You can see examples of how parsers are implemented by looking in the +`tabulator.parsers` module. -#### `exceptions.FormatError` +#### Custom writers -For example this exceptions will be used if you provide not supported source format like `http://source.bad`. +You can add support to write files in a specific format by creating a custom +writer. The custom writers are implemented by inheriting from the base `Writer` +class, and implementing its methods. This writer can then be used by `Stream` to +write data via the `custom_writers={'format': CustomWriter}` argument. -#### `exceptions.EncodingError` +The skeleton of a custom writer looks like: -All errors related to encoding problems. +```python +from tabulator import Writer -### CLI +class CustomWriter(Writer): + options = [] -> It's a provisional API. If you use it as a part of other program please pin concrete `goodtables` version to your requirements file. + def __init__(self, **options): + pass -The library ships with a simple CLI to read tabular data: + def save(self, source, target, headers=None, encoding=None): + # save logic -```bash -$ tabulator data/table.csv -id, name -1, english -2, 中国人 +with Stream(source, custom_writers={'custom': CustomWriter}) as stream: + stream.save(target) ``` -#### `$ tabulator` +You can see examples of how parsers are implemented by looking in the +`tabulator.writers` module. -```bash -Usage: cli.py [OPTIONS] SOURCE +### Validate + +You can check if a source can be loaded by tabulator using the `validate` function. + +```python +from tabulator import validate, exceptions -Options: - --headers INTEGER - --scheme TEXT - --format TEXT - --encoding TEXT - --limit INTEGER - --help Show this message and exit. +try: + tabular = validate('data.csv') +except exceptions.SchemeError: + # The file scheme isn't supported +except exceptions.FormatError: + # The file format isn't supported ``` -## Contributing +### Exceptions -The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). +All the exceptions thrown by tabulator inherit from +`tabulator.exceptions.TabulatorException`, so you can use it as a way to catch +any tabulator exception. You can learn about the other exceptions thrown by +looking into the [tabulator.exceptions][tabulator.exceptions] module. -Recommended way to get started is to create and activate a project virtual environment. To install package and development dependencies into active environment: +## API Reference -``` -$ make install -``` +The API reference is written as docstrings in the tabulator classes. A good +place to start is the [Stream](tabulator/stream.py) class, which manages all +loading and parsing of data files. -To run tests with linting and coverage: +## Contributing -```bash -$ make test -``` +This project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). -For linting `pylama` configured in `pylama.ini` is used. On this stage it's already installed into your environment and could be used separately with more fine-grained control as described in documentation - https://pylama.readthedocs.io/en/latest/. 
+We recommend you to use `virtualenv` to isolate this project from the rest of the +packages in your machine. -For example to sort results by error type: +To install the project and its development dependencies, run: ```bash -$ pylama --sort +$ make install ``` -For testing `tox` configured in `tox.ini` is used. It's already installed into your environment and could be used separately with more fine-grained control as described in documentation - https://testrun.org/tox/latest/. - -For example to check subset of tests against Python 2 environment with increased verbosity. All positional arguments and options after `--` will be passed to `py.test`: +To run the tests, use: ```bash -tox -e py27 -- -v tests/ +$ make test ``` -Under the hood `tox` uses `pytest` configured in `pytest.ini`, `coverage` and `mock` packages. This packages are available only in tox envionments. - ## Changelog -Here described only breaking and the most important changes. The full changelog and documentation for all released versions could be found in nicely formatted [commit history](https://github.com/frictionlessdata/tabulator-py/commits/master). - ### v1.13 New API added: -- the `skip_rows` argument now supports negative numbers to skip rows from the end +- The `skip_rows` argument now supports negative numbers to skip rows starting from the end ### v1.12 Updated behaviour: -- Now `UserWarning` will be emitted on bad options instead of raising an exception +- Instead of raising an exception, a `UserWarning` warning will be emitted if an option isn't recognized. ### v1.11 New API added: -- Added `http_session` argument for `http/https` format (it now uses `requests`) -- Added support for multiline headers: `headers` argument now accepts ranges like `[1,3]` +- Added `http_session` argument for the `http/https` format (it uses `requests` now) +- Added support for multiline headers: `headers` argument accept ranges like `[1,3]` ### v1.10 New API added: -- Added support for compressed files i.e. `zip` and `gz` for Python3 +- Added support for compressed files i.e. 
`zip` and `gz` on Python3 - The `Stream` constructor now accepts a `compression` argument - The `http/https` scheme now accepts a `http_stream` flag ### v1.9 Improved behaviour: -- Now the `headers` argument allows to set order for keyed sources and cherry-pick values +- The `headers` argument allows to set the order for keyed sources and cherry-pick values ### v1.8 New API added: -- Formats `XLS/XLSX/ODS` now supports a sheet name passed as a `sheet` argument -- The `Stream` constructor now accepts an `ignore_blank_headers` option +- Formats `XLS/XLSX/ODS` supports sheet names passed via the `sheet` argument +- The `Stream` constructor accepts an `ignore_blank_headers` option ### v1.7 Improved behaviour: -- Rebased `datapackage` format on `datapackage@1` libarry +- Rebased `datapackage` format on `datapackage@1` library ### v1.6 New API added: -- Argument `source` for the `Stream` constructor now could be a `pathlib.Path` +- Argument `source` for the `Stream` constructor can be a `pathlib.Path` ### v1.5 @@ -954,7 +821,7 @@ New API added: ### v1.4 Improved behaviour: -- updated encoding name to a canonical form +- Updated encoding name to a canonical form ### v1.3 @@ -972,29 +839,38 @@ Promoted provisional API to stable API: ### v1.2 Improved behaviour: -- autodetect common csv delimiters +- Autodetect common CSV delimiters ### v1.1 New API added: -- added `fill_merged_cells` argument to `xls/xlsx` formats +- Added `fill_merged_cells` option to `xls/xlsx` formats ### v1.0 New API added: - published `Loader/Parser/Writer` API -- added `Stream` argument `force_strings` -- added `Stream` argument `force_parse` -- added `Stream` argument `custom_writers` +- Added `Stream` argument `force_strings` +- Added `Stream` argument `force_parse` +- Added `Stream` argument `custom_writers` Deprecated API removal: - removed `topen` and `Table` - use `Stream` instead - removed `Stream` arguments `loader/parser_options` - use `**options` instead Provisional API changed: -- updated `Loader/Parser/Writer` API - please use an updated version +- Updated the `Loader/Parser/Writer` API - please use an updated version ### v0.15 Provisional API added: -- unofficial support for `Stream` arguments `custom_loaders/parsers` +- Unofficial support for `Stream` arguments `custom_loaders/parsers` + + +[stream.py]: tabulator/stream.py +[examples-dir]: examples "Examples" +[requests-session]: https://docs.puthon-requests.org/en/master/user/advanced/#session-objects +[pydoc-csv]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters "Python CSV options" +[sqlalchemy]: https://www.sqlalchemy.org/ +[tdp]: https://frictionlessdata.io/specs/tabular-data-package/ "Tabular Data Package" +[tabulator.exceptions]: tabulator/exceptions.py "Tabulator Exceptions" diff --git a/tabulator/cli.py b/tabulator/cli.py index 124956f6..3572f626 100644 --- a/tabulator/cli.py +++ b/tabulator/cli.py @@ -20,8 +20,6 @@ @click.option('--limit', type=click.INT) @click.version_option(tabulator.__version__, message='%(version)s') def cli(source, limit, **options): - """https://github.com/frictionlessdata/tabulator-py#cli - """ options = {key: value for key, value in options.items() if value is not None} with tabulator.Stream(source, **options) as stream: cast = str diff --git a/tabulator/exceptions.py b/tabulator/exceptions.py index c9660402..2700631f 100644 --- a/tabulator/exceptions.py +++ b/tabulator/exceptions.py @@ -8,44 +8,34 @@ # Module API class TabulatorException(Exception): - 
"""https://github.com/frictionlessdata/tabulator-py#exceptions - """ + '''Base class for all tabulator exceptions.''' pass class IOError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ pass -class HTTPError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ +class HTTPError(IOError): pass class SourceError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ + '''The source file could not be parsed correctly.''' pass class SchemeError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ + '''The file scheme is not supported.''' pass class FormatError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ + '''The file format is unsupported or invalid.''' pass class EncodingError(TabulatorException): - """https://github.com/frictionlessdata/tabulator-py#exceptions - """ pass diff --git a/tabulator/loader.py b/tabulator/loader.py index d175bff8..8328791a 100644 --- a/tabulator/loader.py +++ b/tabulator/loader.py @@ -12,18 +12,37 @@ @add_metaclass(ABCMeta) class Loader(object): + '''Abstract class implemented by the data loaders + + The loaders inherit and implement this class' methods to add support for a + new scheme (e.g. ssh). + + Args: + bytes_sample_size (int): Sample size in bytes + **options (dict): Loader options + + Returns: + Loader: Loader instance. + ''' # Public options = [] def __init__(self, bytes_sample_size, **options): - """https://github.com/frictionlessdata/tabulator-py#custom-loaders - """ pass @abstractmethod def load(self, source, mode='t', encoding=None): - """https://github.com/frictionlessdata/tabulator-py#custom-loaders - """ + '''Load source file. + + Args: + source (str): Path to tabular source file. + mode (str, optional): Text stream mode, `t` (text) or `b` (binary). + Defaults to `t`. + encoding (str, optional): Source encoding. Auto-detect by default. + + Returns: + Union[TextIO, BinaryIO]: I/O stream opened either as text or binary. + ''' pass diff --git a/tabulator/parser.py b/tabulator/parser.py index 8504b8e1..a8d48f4b 100644 --- a/tabulator/parser.py +++ b/tabulator/parser.py @@ -12,51 +12,88 @@ @add_metaclass(ABCMeta) class Parser(object): + '''Abstract class implemented by the data parsers. + + The parsers inherit and implement this class' methods to add support for a + new file type. + + Args: + loader (tabulator.Loader): Loader instance to read the file. + force_parse (bool): When `True`, the parser yields an empty extended + row tuple `(row_number, None, [])` when there is an error parsing a + row. Otherwise, it stops the iteration by raising the exception + `tabulator.exceptions.SourceError`. + **options (dict): Loader options + + Returns: + Parser: Parser instance. + ''' # Public options = [] def __init__(self, loader, force_parse, **options): - """https://github.com/frictionlessdata/tabulator-py#custom-parsers - """ pass @property @abstractmethod def closed(self): - """https://github.com/frictionlessdata/tabulator-py#custom-parsers - """ + '''Flag telling if the parser is closed.''' pass # pragma: no cover @abstractmethod def open(self, source, encoding=None): - """https://github.com/frictionlessdata/tabulator-py#custom-parsers - """ + '''Open underlying file stream in the beginning of the file. + + The parser gets a byte or text stream from the `tabulator.Loader` + instance and start emitting items. + + Args: + source (str): Path to source table. 
+            encoding (str, optional): Source encoding. Auto-detect by default.
+
+        Returns:
+            None
+        '''
        pass  # pragma: no cover

    @abstractmethod
    def close(self):
-        """https://github.com/frictionlessdata/tabulator-py#custom-parsers
-        """
+        '''Closes underlying file stream.'''
        pass  # pragma: no cover

    @abstractmethod
    def reset(self):
-        """https://github.com/frictionlessdata/tabulator-py#custom-parsers
-        """
+        '''Resets underlying stream and current items list.
+
+        After `reset()` is called, iterating over the items will start from the
+        beginning.
+        '''
        pass  # pragma: no cover

    @property
    @abstractmethod
    def encoding(self):
-        """https://github.com/frictionlessdata/tabulator-py#custom-parsers
-        """
        pass  # pragma: no cover

    @property
    @abstractmethod
    def extended_rows(self):
-        """https://github.com/frictionlessdata/tabulator-py#custom-parsers
-        """
+        '''Returns the extended rows iterator.
+
+        The extended rows are tuples containing `(row_number, headers, row)`.
+
+        Yields:
+            Tuple[int, List[str], List[Any]]: Extended rows containing
+                `(row_number, headers, row)`, where `headers` is a list of the
+                header names (can be `None`), and `row` is a list of row
+                values.
+
+        Raises:
+            `tabulator.exceptions.SourceError`: If `force_parse` is `False` and
+                a row can't be parsed, this exception will be raised.
+                Otherwise, an empty extended row is returned (i.e.
+                `(row_number, None, [])`).
+        '''
        pass  # pragma: no cover
diff --git a/tabulator/stream.py b/tabulator/stream.py
index 830c77cd..37e3177e 100644
--- a/tabulator/stream.py
+++ b/tabulator/stream.py
@@ -21,6 +21,64 @@
# Module API

class Stream(object):
+    '''Stream of tabular data.
+
+    This is the main `tabulator` class. It loads a data source, and allows you
+    to stream its parsed contents.
+
+    Args:
+        source (str): Path to file as ``<scheme>://path/to/file.<format>``. If
+            not explicitly set, the scheme (file, http, ...) and format (csv, xls,
+            ...) are inferred from the source string.
+        headers (Union[int, List[int], List[str]], optional): Either a row
+            number or list of row numbers (in case of multi-line headers) to be
+            considered as headers (rows start counting at 1), or the actual
+            headers defined as a list of strings. If not set, all rows will be
+            treated as containing values.
+        scheme (str, optional): Scheme for loading the file (file, http, ...).
+            If not set, it'll be inferred from `source`.
+        format (str, optional): File source's format (csv, xls, ...). If not
+            set, it'll be inferred from `source`.
+        encoding (str, optional): Source encoding. If not set, it'll be inferred.
+        compression (str, optional): Source file compression (zip, ...). If not
+            set, it'll be inferred.
+        allow_html (bool, optional): Allow the file source to be an HTML page.
+            If False, raises ``exceptions.FormatError`` if the loaded file is
+            an HTML page. Defaults to False.
+        sample_size (int, optional): Controls the number of sample rows used to
+            infer properties from the data (headers, encoding, etc.). Set to
+            ``0`` to disable sampling, in which case nothing will be inferred
+            from the data. Defaults to ``config.DEFAULT_SAMPLE_SIZE``.
+        bytes_sample_size (int, optional): Same as `sample_size`, but instead
+            of number of rows, controls number of bytes. Defaults to
+            ``config.DEFAULT_BYTES_SAMPLE_SIZE``.
+        ignore_blank_headers (bool, optional): When True, ignores all columns
+            that have blank headers. Defaults to False.
+        force_strings (bool, optional): When True, casts all data to strings.
+            Defaults to False.
+        force_parse (bool, optional): When True, don't raise exceptions when
+            parsing malformed rows, simply returning an empty value. Defaults
+            to False.
+        skip_rows (List[Union[int, str]], optional): List of row numbers and
+            strings to skip. If a string, it'll skip rows that begin with it
+            (e.g. '#' and '//').
+        post_parse (List[function], optional): List of generator functions that
+            receive a list of rows and headers, process them, and yield them
+            (or not). Useful to pre-process the data. Defaults to None.
+        custom_loaders (dict, optional): Dictionary with keys as scheme names,
+            and values as their respective ``Loader`` class implementations.
+            Defaults to None.
+        custom_parsers (dict, optional): Dictionary with keys as format names,
+            and values as their respective ``Parser`` class implementations.
+            Defaults to None.
+        custom_writers (dict, optional): Dictionary with keys as writer format
+            names, and values as their respective ``Writer`` class
+            implementations. Defaults to None.
+        **options (Any, optional): Extra options passed to the loaders and parsers.
+
+    Returns:
+        Stream: The Stream instance.
+    '''

    # Public

@@ -43,8 +101,6 @@ def __init__(self,
                 custom_parsers={},
                 custom_writers={},
                 **options):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """

        # Set headers
        self.__headers = None
@@ -102,32 +158,24 @@ def __init__(self,
        self.__row_number = 0

    def __enter__(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        if self.closed:
            self.open()
        return self

    def __exit__(self, type, value, traceback):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        if not self.closed:
            self.close()

    def __iter__(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        return self.iter()

    @property
    def closed(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
+        '''Returns True if the underlying stream is closed, False otherwise.'''
        return not self.__parser or self.__parser.closed

    def open(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
+        '''Opens the stream for reading.'''
        options = copy(self.__options)

        # Get scheme and format
@@ -221,14 +269,12 @@ def open(self):
        return self

    def close(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
+        '''Closes the stream.'''
        self.__parser.close()
        self.__row_number = 0

    def reset(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
+        '''Resets the stream pointer to the beginning of the file.'''
        if self.__row_number > self.__sample_size:
            self.__parser.reset()
            self.__extract_sample()
@@ -237,32 +283,27 @@ def reset(self):

    @property
    def headers(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        return self.__headers

    @property
    def scheme(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        return self.__actual_scheme

    @property
    def format(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        return self.__actual_format

    @property
    def encoding(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
        return self.__actual_encoding

    @property
    def sample(self):
-        """https://github.com/frictionlessdata/tabulator-py#stream
-        """
+        '''Returns the stream's rows used as a sample.
+
+        These sample rows are used internally to infer characteristics of the
+        source file (e.g. encoding, headers, ...).
+ ''' sample = [] iterator = iter(self.__sample_extended_rows) iterator = self.__apply_processors(iterator) @@ -271,8 +312,30 @@ def sample(self): return sample def iter(self, keyed=False, extended=False): - """https://github.com/frictionlessdata/tabulator-py#stream - """ + '''Iterate over the rows. + + Each row is returned in a format that depends on the arguments `keyed` + and `extended`. By default, each row is returned as list of their + values. + + Args: + keyed (bool, optional): When True, each returned row will be a + `dict` mapping the header name to its value in the current row. + For example, `[{'name': 'J Smith', 'value': '10'}]`. Ignored if + ``extended`` is True. Defaults to False. + extended (bool, optional): When True, returns each row as a tuple + with row number (starts at 1), list of headers, and list of row + values. For example, `(1, ['name', 'value'], ['J Smith', '10'])`. + Defaults to False. + + Returns: + Iterator[Union[List[Any], Dict[str, Any], Tuple[int, List[str], List[Any]]]]: + The row itself. The format depends on the values of `keyed` and + `extended` arguments. + + Raises: + exceptions.TabulatorException: If the stream is closed. + ''' # Error if closed if self.closed: @@ -299,8 +362,19 @@ def iter(self, keyed=False, extended=False): yield row def read(self, keyed=False, extended=False, limit=None): - """https://github.com/frictionlessdata/tabulator-py#stream - """ + '''Returns a list of rows. + + Args: + keyed (bool, optional): See :func:`Stream.iter`. + extended (bool, optional): See :func:`Stream.iter`. + limit (int, optional): Number of rows to return. If None, returns + all rows. Defaults to None. + + Returns: + List[Union[List[Any], Dict[str, Any], Tuple[int, List[str], List[Any]]]]: + The list of rows. The format depends on the values of `keyed` + and `extended` arguments. + ''' result = [] rows = self.iter(keyed=keyed, extended=extended) for count, row in enumerate(rows, start=1): @@ -310,8 +384,16 @@ def read(self, keyed=False, extended=False, limit=None): return result def save(self, target, format=None, encoding=None, **options): - """https://github.com/frictionlessdata/tabulator-py#stream - """ + '''Save stream to the local filesystem. + + Args: + target (str): Path where to save the stream. + format (str, optional): The format the stream will be saved as. If + None, detects from the ``target`` path. Defaults to None. + encoding (str, optional): Saved file encoding. Defaults to + ``config.DEFAULT_ENCODING``. + **options: Extra options passed to the writer. + ''' # Get encoding/format if encoding is None: @@ -447,11 +529,11 @@ def builtin_processor(extended_rows): # Skip nagative rows processor def skip_negative_rows(extended_rows): - """ + ''' This processor will skip rows which counts from the end, e.g. -1: skip last row, -2: skip pre-last row, etc. Rows to skip are taken from Stream.__skip_rows_by_numbers - """ + ''' rows_to_skip = [n for n in self.__skip_rows_by_numbers if n < 0] buffer_size = abs(min(rows_to_skip)) # collections.deque - takes O[1] time to push/pop values from any side. diff --git a/tabulator/validate.py b/tabulator/validate.py index 8d73c9b8..eca6f729 100644 --- a/tabulator/validate.py +++ b/tabulator/validate.py @@ -12,8 +12,20 @@ # Module API def validate(source, scheme=None, format=None): - """https://github.com/frictionlessdata/tabulator-py#validate - """ + '''Check if tabulator is able to load the source. + + Args: + source (Union[str, IO]): The source path or IO object. + scheme (str, optional): The source scheme. 
Auto-detect by default. + format (str, optional): The source file format. Auto-detect by default. + + Returns: + bool: Whether tabulator is able to load the source file. + + Raises: + `tabulator.exceptions.SchemeError`: The file scheme is not supported. + `tabulator.exceptions.FormatError`: The file format is not supported. + ''' # Get scheme and format detected_scheme, detected_format = helpers.detect_scheme_and_format(source) diff --git a/tabulator/writer.py b/tabulator/writer.py index f637f824..a332b4f5 100644 --- a/tabulator/writer.py +++ b/tabulator/writer.py @@ -12,18 +12,33 @@ @add_metaclass(ABCMeta) class Writer(object): + '''Abstract class implemented by the data writers. + + The writers inherit and implement this class' methods to add support for a + new file destination. + + Args: + **options (dict): Writer options. + + Returns: + Writer: Writer instance. + ''' # Public options = [] def __init__(self, **options): - """https://github.com/frictionlessdata/tabulator-py#custom-writers - """ pass @abstractmethod def write(self, source, target, headers=None, encoding=None): - """https://github.com/frictionlessdata/tabulator-py#custom-writers - """ + '''Writes source data to target. + + Args: + source (str): Source data. + target (str): Write target. + headers (List[str], optional): List of header names. + encoding (str, optional): Source file encoding. + ''' pass diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 67679aed..3382395d 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -39,8 +39,9 @@ def test_detect_scheme_and_format(source, scheme, format): def test_detect_encoding(): - sample = io.open('README.md', 'rb').read(config.DEFAULT_BYTES_SAMPLE_SIZE) - assert helpers.detect_encoding(sample) == 'utf-8' + with io.open('Makefile', 'rb') as fp: + sample = fp.read(config.DEFAULT_BYTES_SAMPLE_SIZE) + assert helpers.detect_encoding(sample) == 'utf-8' def test_detect_encoding_windows_1252(): From 185db6b57f62a04a8968bdcecd30301fa15e0918 Mon Sep 17 00:00:00 2001 From: Vitor Baptista Date: Tue, 13 Feb 2018 18:36:02 +0000 Subject: [PATCH 3/5] Fix CSV output of CLI tool Previously, we were joining the CSV cells with ", " (notice the whitespace after the comma). This modifies the data, as this extra whitespace can be parsed differently. This commit changes so we just join cells using "," (no whitespace). 
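For example, a standard CSV reader keeps the extra whitespace as part of the
cell value, so the old output did not round-trip cleanly (illustrative snippet,
not part of this patch):

```python
import csv

# Old output: cells joined with ', ' -- the space survives parsing
print(next(csv.reader(['1, english'])))  # ['1', ' english']

# New output: cells joined with ',' -- values round-trip unchanged
print(next(csv.reader(['1,english'])))   # ['1', 'english']
```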
--- tabulator/cli.py | 2 +- tests/test_cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tabulator/cli.py b/tabulator/cli.py index 3572f626..7aba2014 100644 --- a/tabulator/cli.py +++ b/tabulator/cli.py @@ -28,7 +28,7 @@ def cli(source, limit, **options): if stream.headers: click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True)) for count, row in enumerate(stream, start=1): - click.echo(', '.join(map(cast, row))) + click.echo(','.join(map(cast, row))) if count == limit: break diff --git a/tests/test_cli.py b/tests/test_cli.py index 3dd92df2..92463af7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -14,7 +14,7 @@ def test_cli(): runner = CliRunner() result = runner.invoke(cli, ['data/table.csv']) assert result.exit_code == 0 - assert result.output.startswith('id, name\n1, english\n2,') + assert result.output.startswith('id,name\n1,english\n2,') def test_cli_version(): From 9d5ee066967dc580a6ba2e45d934cba4d2f4fbb5 Mon Sep 17 00:00:00 2001 From: Vitor Baptista Date: Tue, 13 Feb 2018 18:37:40 +0000 Subject: [PATCH 4/5] Limit openpyxl to versions < 2.5 because of a bug on 2.5 See https://bitbucket.org/openpyxl/openpyxl/issues/977/version-25-breaks-with --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e477f2e8..0f5530f1 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def read(*paths): # Format: xls 'xlrd>=1.0,<2.0', # Format: xlsx - 'openpyxl>=2.4,<3.0', + 'openpyxl>=2.4,<2.5', ] INSTALL_FORMAT_DATAPACKAGE_REQUIRES = [ 'datapackage>=1.1.3,<2.0', From 57635b0a5cf7b54665c7066021e3e52192a0bb63 Mon Sep 17 00:00:00 2001 From: Serah Rono Date: Wed, 18 Jul 2018 17:55:19 +0300 Subject: [PATCH 5/5] minor edits --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1116201a..0bf1b253 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A library for reading and writing tabular data (csv/xls/json/etc). ## Features -- **Supports most common tabular formats**: CSV, XLS, ODS, JSON, Google Sheets, SQL, and others. +- **Supports most common tabular formats**: CSV, XLS, ODS, JSON, Google Sheets, SQL, and others. See complete list [below](#supported-file-formats). - **Loads local and remote data**: Supports HTTP and FTP. - **Low memory usage**: Only the current row is kept in memory, so you can large datasets. @@ -232,7 +232,7 @@ with Stream([['name', 'age'], ['Alex', 21]], headers=1) as stream: stream.read() # [['Alex', 21]] ``` -You can also pass a lists of strings to define the headers expliticly: +You can also pass a lists of strings to define the headers explicitly: ```python with Stream([['Alex', 21]], headers=['name', 'age']) as stream: @@ -287,7 +287,7 @@ CSV file inside an HTML page, for example on GitHub. You can disable this behaviour using the `allow_html` option: ```python -with Stream(sorce_with_html, allow_html=True) as stream: +with Stream(source_with_html, allow_html=True) as stream: stream.read() # no exception on open ``` @@ -477,7 +477,7 @@ with open('data.csv') as fp: #### text The source is a string containing the tabular data. Both `scheme` and `format` -must be set expliticly, as it's not possible to infer them. +must be set explicitly, as it's not possible to infer them. 
```python stream = Stream( @@ -573,7 +573,7 @@ stream = Stream('datapackage.json', resource=1) #### inline (read only) -Either a list of lists, or a list of dicts mapping the colum names to their +Either a list of lists, or a list of dicts mapping the column names to their respective values. ```python @@ -652,7 +652,7 @@ You can see examples of how the loaders are implemented by looking in the #### Custom parsers You can add support for a new file format by creating a custom parser. Similarly -to custom loaders, custom parsers are implemented by inherinting from the +to custom loaders, custom parsers are implemented by inheriting from the `Parser` class, and implementing its methods. This parser can then be used by `Stream` to parse data by passing it via the `custom_parsers={'format': CustomParser}` argument.
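
To make the custom-parser hook concrete, here is a minimal sketch based on the
`Parser` interface shown in `tabulator/parser.py` above. The colon-separated
file, the `'custom'` format name, and the exact way `Stream` instantiates the
parser are illustrative assumptions, not part of these patches:

```python
from tabulator import Stream
from tabulator.parser import Parser


class ColonSeparatedParser(Parser):
    # Illustrative parser for a hypothetical colon-separated text format.
    options = []

    def __init__(self, loader, force_parse=False, **options):
        self.__loader = loader
        self.__force_parse = force_parse
        self.__chars = None

    @property
    def closed(self):
        return self.__chars is None or self.__chars.closed

    def open(self, source, encoding=None):
        self.close()
        # The loader returns a text stream (see Loader.load above)
        self.__chars = self.__loader.load(source, encoding=encoding)

    def close(self):
        if not self.closed:
            self.__chars.close()

    def reset(self):
        self.__chars.seek(0)

    @property
    def encoding(self):
        return getattr(self.__chars, 'encoding', None)

    @property
    def extended_rows(self):
        # Yield (row_number, headers, row) tuples, as described in
        # Parser.extended_rows; headers are left as None here.
        for number, line in enumerate(self.__chars, start=1):
            yield (number, None, line.rstrip('\n').split(':'))


# Assumed usage, following the README's custom_parsers example above
with Stream('data.txt', format='custom',
            custom_parsers={'custom': ColonSeparatedParser}) as stream:
    print(stream.read())
```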