From a0b5ddd858da758c8e3ac57ca8ded7de2c18b57a Mon Sep 17 00:00:00 2001 From: Michiaki Ariga Date: Wed, 24 May 2017 09:21:11 +0900 Subject: [PATCH] [Experimental] Add multiple_tables mode. close #2 This feature is experimental because this approach is not robust against future changes to tabula-java. By setting the `multiple_tables` option, we can extract a list of DataFrames. --- README.md | 3 +++ tabula/wrapper.py | 26 +++++++++++++++++++++++++- tests/test_read_pdf_table.py | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 231ddbc..7429644 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,9 @@ See [example notebook](./examples/tabula_example.ipynb) - Same as `--outfile` option of tabula-java. - java_options (`list`, optional): - Set java options like `-Xmx256m`. +- multiple_tables (`bool`, optional): + - (Experimental) Extract multiple tables. + - This option uses JSON as an intermediate format, so if the tabula-java output format changes, this option may stop working. 
## FAQ diff --git a/tabula/wrapper.py b/tabula/wrapper.py index 8771959..b55773c 100644 --- a/tabula/wrapper.py +++ b/tabula/wrapper.py @@ -48,6 +48,11 @@ def read_pdf(input_path, **kwargs): elif output_format == 'json': kwargs['format'] = 'JSON' + multiple_tables = kwargs.get('multiple_tables') + if multiple_tables: + kwargs.pop('multiple_tables', None) + kwargs['format'] = 'JSON' + java_options = kwargs.get('java_options', []) if isinstance(java_options, str): java_options = [java_options] @@ -69,7 +74,11 @@ def read_pdf(input_path, **kwargs): fmt = kwargs.get('format') if fmt == 'JSON': - return json.loads(output.decode(encoding)) + if multiple_tables: + return extract_from(json.loads(output.decode(encoding))) + + else: + return json.loads(output.decode(encoding)) else: return pd.read_csv(io.BytesIO(output), encoding=encoding) @@ -130,6 +139,21 @@ def convert_into(input_path, output_path, **kwargs): if is_url: os.unlink(path) +def extract_from(raw_json): + '''Extract tables from json. + + Args: + raw_json (list): + Decoded list from tabula-java JSON. 
+ ''' + + data_frames = [] + + for table in raw_json: + list_data = [[e['text'] for e in row] for row in table['data']] + data_frames.append(pd.DataFrame(list_data[1:-1], columns=list_data[0])) + + return data_frames def localize_file(path): is_url = False diff --git a/tests/test_read_pdf_table.py b/tests/test_read_pdf_table.py index 7a7f529..7afcc02 100644 --- a/tests/test_read_pdf_table.py +++ b/tests/test_read_pdf_table.py @@ -46,6 +46,12 @@ def test_read_pdf_with_java_option(self): self.assertTrue(tabula.read_pdf(pdf_path, pages=1, java_options=['-Xmx256m'] ).equals(pd.read_csv(expected_csv1))) + def test_read_pdf_for_multiple_tables(self): + pdf_path = 'tests/resources/data.pdf' + self.assertEqual(len(tabula.read_pdf(pdf_path, pages=2, multiple_tables=True)), 2) + with self.assertRaises(pd.parser.CParserError): + tabula.read_pdf(pdf_path, pages=2) + def test_convert_from(self): pdf_path = 'tests/resources/data.pdf' expected_csv = 'tests/resources/data_1.csv'