Skip to content

Commit

Permalink
[Experimental] Add multiple_tables mode. close #2
Browse files Browse the repository at this point in the history
This feature is experimental because this approach is not robust against
future changes to tabula-java.
By setting the `multiple_tables` option, we can extract a list of DataFrames.
  • Loading branch information
chezou committed May 24, 2017
1 parent e8c2205 commit a0b5ddd
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 1 deletion.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ See [example notebook](./examples/tabula_example.ipynb)
- Same as `--outfile` option of tabula-java.
- java_options (`list`, optional):
- Set java options like `-Xmx256m`.
- multiple_tables (bool, optional):
- (Experimental) Extract multiple tables.
- This option uses JSON as an intermediate format, so if the tabula-java output format changes, this option may stop working.


## FAQ
Expand Down
26 changes: 25 additions & 1 deletion tabula/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def read_pdf(input_path, **kwargs):
elif output_format == 'json':
kwargs['format'] = 'JSON'

multiple_tables = kwargs.get('multiple_tables')
if multiple_tables:
kwargs.pop('multiple_tables', None)
kwargs['format'] = 'JSON'

java_options = kwargs.get('java_options', [])
if isinstance(java_options, str):
java_options = [java_options]
Expand All @@ -69,7 +74,11 @@ def read_pdf(input_path, **kwargs):

fmt = kwargs.get('format')
if fmt == 'JSON':
return json.loads(output.decode(encoding))
if multiple_tables:
return extract_from(json.loads(output.decode(encoding)))

else:
return json.loads(output.decode(encoding))

else:
return pd.read_csv(io.BytesIO(output), encoding=encoding)
Expand Down Expand Up @@ -130,6 +139,21 @@ def convert_into(input_path, output_path, **kwargs):
if is_url:
os.unlink(path)

def extract_from(raw_json):
    '''Extract tables from json.

    Args:
        raw_json (list):
            Decoded list from tabula-java JSON. Each element is read as a
            table dict whose ``'data'`` entry is a list of rows, and each
            row is a list of cell dicts carrying a ``'text'`` entry.

    Returns:
        list of pandas.DataFrame:
            One DataFrame per table, in input order. The first row of a
            table supplies the column labels. A table without any rows
            yields an empty DataFrame so the result stays aligned with the
            input list.
    '''

    data_frames = []

    for table in raw_json:
        list_data = [[e['text'] for e in row] for row in table['data']]

        if not list_data:
            # There is no header row to take column labels from; indexing
            # list_data[0] here would raise IndexError.
            data_frames.append(pd.DataFrame())
            continue

        # NOTE(review): the last row is discarded together with the header.
        # Presumably tabula-java emits a trailing non-data row -- confirm,
        # otherwise the final data row of every table is being lost.
        data_frames.append(pd.DataFrame(list_data[1:-1], columns=list_data[0]))

    return data_frames

def localize_file(path):
is_url = False
Expand Down
6 changes: 6 additions & 0 deletions tests/test_read_pdf_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def test_read_pdf_with_java_option(self):
self.assertTrue(tabula.read_pdf(pdf_path, pages=1, java_options=['-Xmx256m']
).equals(pd.read_csv(expected_csv1)))

def test_read_pdf_for_multiple_tables(self):
    '''Check that `multiple_tables=True` returns one entry per table.

    The fixture page is expected to hold two tables. Without the option,
    the same page is expected to fail while being parsed as CSV.
    '''
    pdf_path = 'tests/resources/data.pdf'

    tables = tabula.read_pdf(pdf_path, pages=2, multiple_tables=True)
    self.assertEqual(len(tables), 2)

    # `pandas.parser` was a deprecated shim and is gone from current
    # pandas; `pandas.errors.ParserError` is the public exception name.
    with self.assertRaises(pd.errors.ParserError):
        tabula.read_pdf(pdf_path, pages=2)

def test_convert_from(self):
pdf_path = 'tests/resources/data.pdf'
expected_csv = 'tests/resources/data_1.csv'
Expand Down

0 comments on commit a0b5ddd

Please sign in to comment.