Skip to content

Commit

Permalink
tabula-py java work-around
Browse files Browse the repository at this point in the history
  • Loading branch information
dcalde committed Jan 4, 2021
1 parent 4c48703 commit 15efc7c
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 6 deletions.
8 changes: 5 additions & 3 deletions Dockerfile
@@ -1,11 +1,13 @@
FROM joyzoursky/python-chromedriver:3.6-selenium
#FROM joyzoursky/python-chromedriver:3.6-selenium
FROM openaustralia/buildstep:early_release

COPY requirements.txt .

RUN apt update -y \
&& apt install -y default-jre
&& apt-get install -y python3-pip
# && apt install -y default-jre

RUN pip install -r requirements.txt
RUN pip3 install -r requirements.txt

RUN useradd morph

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -5,3 +5,4 @@ bs4
requests
selenium
splinter>=0.7.3
install-jdk
19 changes: 16 additions & 3 deletions scraper.py
@@ -1,17 +1,30 @@
import os
import re
import stat
from functools import partial
from typing import List


import numpy as np
import pandas as pd
# from selenium.webdriver import ChromeOptions
import tabula
from selenium.webdriver.chrome.options import Options

from splinter import Browser
from sqlalchemy import create_engine
from tabula import read_pdf

import tabula_custom

# NOTE: work-around because the morph early_release image doesn't have Java
# installed, and tabula's _run() function has the "java" executable name
# hard-coded. When no "java" executable can be found on PATH, a JRE is
# installed under /tmp/.jre and tabula's runner is replaced with a variant
# that accepts an explicit path to the java binary.
import shutil

# shutil.which() searches PATH for an executable; checking os.path.isfile("java")
# would only look for a file literally named "java" in the working directory.
if shutil.which("java") is None:
    print("Java not found. Installing JRE.")
    import jdk

    jre_dir = jdk.install('11', jre=True, path='/tmp/.jre')
    tabula.io._run = partial(
        tabula_custom._run, java_path=os.path.join(jre_dir, 'bin', 'java')
    )

URL = "https://www.perth.wa.gov.au/develop/planning-and-building-applications/building-and-development-applications"
DATABASE = "data.sqlite"
DATA_TABLE = "data"
Expand All @@ -38,9 +51,9 @@ def clean_address(address: str) -> str:
def clean_description(description: str) -> str:
    """Return *description* with every carriage return replaced by a space."""
    pieces = description.split("\r")
    return " ".join(pieces)

print(os.popen('whoami').read())
print(stat.filemode(os.stat('/usr/local/bin/chromedriver').st_mode))
print(stat.filemode(os.stat('/usr/bin/google-chrome').st_mode))
# print(os.popen('whoami').read())
# print(stat.filemode(os.stat('/usr/local/bin/chromedriver').st_mode))
# print(stat.filemode(os.stat('/usr/bin/google-chrome').st_mode))

# can no longer use a simple request to get the page content. Need headless browser
# chrome_options = ChromeOptions()
Expand Down
47 changes: 47 additions & 0 deletions tabula_custom.py
@@ -0,0 +1,47 @@
import os
import subprocess

from tabula.errors import JavaNotFoundError
from tabula.io import JAVA_NOT_FOUND_ERROR, _jar_path, build_options, logger


def _run(java_options, options, path=None, encoding="utf-8", java_path: str = None):
    """Call tabula-java with the given lists of Java options and tabula-py
    options, as well as an optional path to pass to tabula-java as a regular
    argument and an optional encoding to use for any required output sent to
    stderr.

    tabula-py options are translated into tabula-java options, see
    :func:`build_options` for more information.

    Args:
        java_options: Sequence of options placed before ``-jar`` on the
            command line. It is copied, never modified in place.
        options: Mapping of tabula-py options; forwarded to
            :func:`build_options` as keyword arguments. A truthy ``"silent"``
            entry adds logging-suppression flags to the Java options.
        path: Optional final positional argument for tabula-java.
        encoding: Encoding used to decode the subprocess's stderr for logging.
        java_path: Executable to invoke; falls back to ``"java"`` when falsy.

    Returns:
        The raw bytes the subprocess wrote to stdout.

    Raises:
        JavaNotFoundError: If the java executable cannot be found.
        subprocess.CalledProcessError: If the subprocess exits with a
            non-zero status (its stderr is logged before re-raising).
    """
    # Work on a copy so repeated calls do not keep appending the silent-mode
    # flags to a list owned by the caller.
    java_options = list(java_options)

    # Workaround to enforce the silent option. See:
    # https://github.com/tabulapdf/tabula-java/issues/231#issuecomment-397281157
    if options.get("silent"):
        java_options.extend(
            (
                "-Dorg.slf4j.simpleLogger.defaultLogLevel=off",
                "-Dorg.apache.commons.logging.Log"
                "=org.apache.commons.logging.impl.NoOpLog",
            )
        )

    built_options = build_options(**options)
    args = [java_path or "java", *java_options, "-jar", _jar_path(), *built_options]
    if path:
        args.append(path)

    try:
        result = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            check=True,
        )
    except FileNotFoundError as err:
        raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR) from err
    except subprocess.CalledProcessError as e:
        # errors="replace" keeps an undecodable byte in stderr from hiding the
        # real failure behind a UnicodeDecodeError.
        logger.error(
            "Error from tabula-java:\n{}\n".format(
                e.stderr.decode(encoding, errors="replace")
            )
        )
        raise

    if result.stderr:
        logger.warning(
            "Got stderr: {}".format(result.stderr.decode(encoding, errors="replace"))
        )
    return result.stdout

0 comments on commit 15efc7c

Please sign in to comment.