In [None]:
import camelot
import matplotlib.pyplot as plt

# Camelot advanced stuff

Everything here comes from the [Camelot advanced usage docs](https://camelot-py.readthedocs.io/en/master/user/advanced.html). This is basically "Soma cut and pasted some code and downloaded the PDFs for me," although I didn't copy over most of the text that actually helps you understand the code.

# Backgrounds

In [None]:
# Baseline: read background_lines.pdf with no extra options and display
# the returned table list.
tables = camelot.read_pdf('background_lines.pdf')
tables

In [None]:
# Same PDF as the baseline, this time with process_background=True.
# The cell output is the first table's `.df`.
tables = camelot.read_pdf(
    'background_lines.pdf',
    process_background=True,
)
tables[0].df

In [None]:
# Second table from the `tables` result currently in scope
# (the process_background=True read when cells are run in order).
tables[1].df

In [None]:
# Read foo.pdf; the plotting cells that follow all draw tables[0] from this result.
tables = camelot.read_pdf('foo.pdf')
tables[0]

In [None]:
# kind='text' plot of the first table.
camelot.plot(
    tables[0],
    kind='text',
)
plt.show()

In [None]:
# Disabled cell: the same kind='text' plot, preceded by the
# `%matplotlib notebook` magic.
# NOTE(review): presumably kept as an optional interactive variant — delete
# this cell if it is not needed.
# %matplotlib notebook
# camelot.plot(tables[0], kind='text')
# plt.show()

In [None]:
# kind='grid' plot of the first table.
camelot.plot(
    tables[0],
    kind='grid',
)
plt.show()

In [None]:
# kind='contour' plot of the first table.
camelot.plot(
    tables[0],
    kind='contour',
)
plt.show()

In [None]:
# kind='line' plot of the first table.
camelot.plot(
    tables[0],
    kind='line',
)
plt.show()

In [None]:
# kind='joint' plot of the first table.
camelot.plot(
    tables[0],
    kind='joint',
)
plt.show()

# `table_areas`

> In cases such as these, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.

In [None]:
# Baseline: read table_areas.pdf with no extra options and display the
# returned table list, for comparison with the table_areas read below.
tables = camelot.read_pdf('table_areas.pdf')
tables

In [None]:
# Stream flavor, restricted to one explicit table area.
# NOTE(review): the area string is presumably "x1,y1,x2,y2" (top-left then
# bottom-right corner), as the quoted text above suggests — confirm.
tables = camelot.read_pdf(
    'table_areas.pdf',
    flavor='stream',
    table_areas=['316,499,566,337'],
)
tables[0].df

# `table_regions`

> However there may be cases where the table might not lie at the exact coordinates every time but in an approximate region.
>
> You can use the table_regions keyword argument to read_pdf() to solve for such cases. When table_regions is specified, Camelot will only analyze the specified regions to look for tables.

In [None]:
# Baseline: read table_regions.pdf with no extra options and display the
# returned table list, for comparison with the table_regions read below.
tables = camelot.read_pdf('table_regions.pdf')
tables

In [None]:
# Limit the search to a single region via table_regions, then show the
# first table found there.
tables = camelot.read_pdf(
    'table_regions.pdf',
    table_regions=['170,370,560,270'],
)
tables[0].df

# `column_separators`

In [None]:
# Baseline Stream read of column_separators.pdf, before any column hints.
# The result must be assigned: without it, `tables` on the next line would
# still hold the table_regions.pdf result from the previous section and the
# cell would display the wrong thing.
tables = camelot.read_pdf("column_separators.pdf", flavor='stream')
tables

In [None]:
# Stream read with an explicit `columns` string (one comma-separated
# entry in the list); output is the first table's `.df`.
tables = camelot.read_pdf(
    'column_separators.pdf',
    flavor='stream',
    columns=['72,95,209,327,442,529,566,606,683'],
)
tables[0].df

# `split_text`

> To deal with cases like the output from the previous section, you can pass split_text=True to read_pdf(), which will split any strings that lie in different cells but have been assigned to a single cell (as a result of being merged together by PDFMiner).



In [None]:
# Same column hints as the previous read, plus split_text=True so strings
# that were merged into one cell are split across the cells they lie in
# (see the quoted explanation above).
tables = camelot.read_pdf(
    'column_separators.pdf',
    flavor='stream',
    columns=['72,95,209,327,442,529,566,606,683'],
    split_text=True,
)
tables[0].df

# Superscript

In [None]:
# Baseline Stream read of superscript.pdf, for comparison with the
# flag_size=True read that follows.
tables = camelot.read_pdf(
    'superscript.pdf',
    flavor='stream',
)
tables[0].df

In [None]:
# Same PDF and flavor, now with flag_size=True.
# NOTE(review): presumably this marks super/subscript text in the output —
# confirm against the Camelot docs.
tables = camelot.read_pdf(
    'superscript.pdf',
    flavor='stream',
    flag_size=True,
)
tables[0].df

# Strip unwanted characters

> You can strip unwanted characters like spaces, dots and newlines from a string using the strip_text keyword argument. Take a look at this PDF as an example, the text at the start of each row contains a lot of unwanted spaces, dots and newlines.



In [None]:
# Baseline Stream read of 12s0324.pdf, before any characters are stripped.
tables = camelot.read_pdf(
    '12s0324.pdf',
    flavor='stream',
)
tables[0].df

In [None]:
# Same read with strip_text set to the three characters to strip:
# space, dot and newline.
tables = camelot.read_pdf(
    '12s0324.pdf',
    flavor='stream',
    strip_text=' .\n',
)
tables[0].df

# Improve guessed table areas

In [None]:
# Baseline Stream read of edge_tol.pdf with the default edge_tol.
tables = camelot.read_pdf(
    'edge_tol.pdf',
    flavor='stream',
)
tables[0].df

In [None]:
# kind='contour' plot of the first table from the read currently held in `tables`.
camelot.plot(
    tables[0],
    kind='contour',
)
plt.show()

> To improve the detected area, you can increase the edge_tol (default: 50) value to counter the effect of text being placed relatively far apart vertically. Larger edge_tol will lead to longer textedges being detected, leading to an improved guess of the table area. Let’s use a value of 500.



In [None]:
# Re-read with edge_tol=500 (the quoted text above gives the default as 50),
# then draw the contour plot again for comparison.
tables = camelot.read_pdf(
    'edge_tol.pdf',
    flavor='stream',
    edge_tol=500,
)
camelot.plot(
    tables[0],
    kind='contour',
)
plt.show()

# Short lines

In [None]:
# Baseline: read short_lines.pdf with no extra options and show the first
# table's `.df`.
tables = camelot.read_pdf('short_lines.pdf')
tables[0].df

In [None]:
# kind='grid' plot of the first table from the read currently held in `tables`.
camelot.plot(
    tables[0],
    kind='grid',
)
plt.show()

> There might be cases while using Lattice when smaller lines don’t get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page’s dimensions with a scaling factor called line_scale. By default, its value is 15.
> 
> As you can guess, the larger the line_scale, the smaller the size of lines getting detected.

In [None]:
# Re-read with line_scale=40 (the quoted text above gives the default as 15;
# a larger value detects smaller lines), then redraw the grid plot.
tables = camelot.read_pdf(
    'short_lines.pdf',
    line_scale=40,
)
camelot.plot(
    tables[0],
    kind='grid',
)
plt.show()

In [None]:
# First table's `.df` from the `tables` result currently in scope
# (the line_scale=40 read when cells are run in order).
tables[0].df

# Shifting text

> By default, the Lattice method shifts text in spanning cells, first to the left and then to the top, as you can observe in the output table above. However, this behavior can be changed using the shift_text keyword argument. Think of it as setting the gravity for a table — it decides the direction in which the text will move and finally come to rest.
> 
> shift_text expects a list with one or more characters from the following set: ('', 'l', 'r', 't', 'b'), which are then applied in order. The default, as we discussed above, is ['l', 't'].
>
> We’ll use the PDF from the previous example. Let’s pass shift_text=[''], which basically means that the text will experience weightlessness! (It will remain in place.)



In [None]:
# shift_text=[''] : per the quoted text above, spanning-cell text stays where it is.
tables = camelot.read_pdf(
    'short_lines.pdf',
    line_scale=40,
    shift_text=[''],
)
tables[0].df

> No surprises there — it did remain in place (observe the strings “2400” and “All the available individuals”). Let’s pass shift_text=['r', 'b'] to set the gravity to right-bottom and move the text in that direction.



In [None]:
# shift_text=['r', 'b'] : per the quoted text above, spanning-cell text is
# moved right and then to the bottom.
tables = camelot.read_pdf(
    'short_lines.pdf',
    line_scale=40,
    shift_text=['r', 'b'],
)
tables[0].df

# Copy text in spanning cells

In [None]:
# Baseline: read copy_text.pdf with no extra options, for comparison with
# the copy_text=['v'] read below.
tables = camelot.read_pdf('copy_text.pdf')
tables[0].df

In [None]:
# Same PDF with copy_text=['v'].
# NOTE(review): presumably 'v' copies spanning-cell text in the vertical
# direction — confirm against the Camelot docs.
tables = camelot.read_pdf(
    'copy_text.pdf',
    copy_text=['v'],
)
tables[0].df