Commit 096e03d: fix gov.il rendering
erlichsefi committed Apr 18, 2024
2 parents 3ddfcd3 + 27466dd
Showing 8 changed files with 49 additions and 60 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ logging.log
 temp*/
 .vscode/settings.json
 .DS_Store
+test_dump
7 changes: 3 additions & 4 deletions Dockerfile
@@ -1,6 +1,6 @@
 #syntax=docker/dockerfile:1
 
-FROM debian:latest as base
+FROM node:20-bookworm as base
 WORKDIR /usr/src/app
 ARG PY_VERSION="3.11.0"
 
@@ -10,9 +10,8 @@ RUN apt-get update -y && \
     apt-get install libxml2-dev -y && \
     apt-get install libxslt-dev -y
 
-# setting the C++
-# RUN apt-get install gcc-10 -y && \
-#     apt-get install g++-10 -y
+# playwright
+RUN npx -y playwright@1.43.0 install --with-deps
 
 # setting python and more
 RUN apt-get install python3-pip -y && \
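Note: the base image switches from debian:latest to node:20-bookworm so that npx is available to install Playwright's pinned browsers together with their system libraries (--with-deps); Python is then layered on top through apt. A minimal sanity check, assuming it runs inside the built image with the Python requirements installed:

    # sanity_check.py - hypothetical helper, not part of this commit.
    # Verifies the Chromium build installed by `npx playwright install`
    # is visible to the Python playwright package.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        print("chromium", browser.version)  # prints the bundled Chromium version
        browser.close()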
8 changes: 4 additions & 4 deletions il_supermarket_scarper/engines/multipage_web.py
@@ -1,7 +1,8 @@
 from urllib.parse import urlsplit
 import re
 import ntpath
-import lxml
+from lxml import html as lxml_html
+
 import requests
 
 from il_supermarket_scarper.utils.connection import url_connection_retry
@@ -44,8 +45,7 @@ def get_number_of_pages(self, url, timeout=15):
             raise ValueError(
                 f"Fetching resources failed from {url}, status code: {response.status_code}"
             )
-
-        html_body = lxml.html.fromstring(response.content)
+        html_body = lxml_html.fromstring(response.content)
 
         total_pages = self.get_total_pages(html_body)
         Logger.info(f"Found {total_pages} pages")
@@ -120,7 +120,7 @@ def process_links_before_download(
         """additional processing to the links before download"""
         response = self.session_with_cookies_by_chain(page)
 
-        html = lxml_html.fromstring(response.text)
+        html = lxml_html.fromstring(response.text)
 
         file_links, filenames = self.collect_files_details_from_page(html)
         Logger.info(f"Page {page}: Found {len(file_links)} files")
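Note: plain `import lxml` does not load the lxml.html submodule, so lxml.html.fromstring can raise AttributeError depending on what was imported earlier; the explicit `from lxml import html as lxml_html` removes that import-order dependence. A small illustration of the working form, assuming nothing beyond lxml itself:

    from lxml import html as lxml_html

    tree = lxml_html.fromstring("<html><body><p>42 pages</p></body></html>")
    print(tree.findtext(".//p"))  # -> "42 pages"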
15 changes: 14 additions & 1 deletion il_supermarket_scarper/utils/connection.py
@@ -13,7 +13,7 @@
 import subprocess
 import requests
 
-
+from playwright.sync_api import sync_playwright
 from urllib3.exceptions import ReadTimeoutError
 from requests.exceptions import (
     ReadTimeout,
@@ -209,6 +209,19 @@ def session_with_cookies(url, timeout=15, chain_cookie_name=None):
     return response_content
 
 
+def render_webpage(url, extraction):
+    """render website with playwright"""
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+        page.goto(url)
+        page.wait_for_load_state("networkidle")
+        content = extraction(page)
+        browser.close()
+        return content
+
+
 @url_connection_retry()
 def session_and_check_status(url, timeout=15):
     """use a session to load the response and check status"""
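Note: a usage sketch for the new helper (illustrative, not part of the commit). render_webpage opens headless Chromium, waits for network idle, then hands the live Playwright Page to the caller's extraction callback, so the page is still open while selectors run:

    from il_supermarket_scarper.utils.connection import render_webpage

    def first_heading(page):
        # hypothetical callback: text of the first <h1> on the rendered page
        return page.locator("h1").first.inner_text()

    print(render_webpage("https://example.com", extraction=first_heading))

Passing a callback instead of returning raw HTML is what lets get_statue_page below run locator and evaluate calls against the rendered DOM.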
66 changes: 22 additions & 44 deletions il_supermarket_scarper/utils/status.py
@@ -1,23 +1,33 @@
 import datetime
-import difflib
 import re
 import os
 import enum
 import holidays
 import pytz
-import lxml.html as lh
-from bs4 import BeautifulSoup
 
-from lxml.html.clean import clean_html  # pylint: disable=no-name-in-module
 from .logger import Logger
-from .connection import session_with_cookies
+from .connection import render_webpage
 
 
-def get_statue_page():
+def get_statue_page(extraction_type):
     """fetch the gov.il site"""
     url = "https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations"
-    # Create a handle, page, to handle the contents of the website
-    return session_with_cookies(url, chain_cookie_name="gov_il")
+
+    def get_from_playwrite(page):
+        if extraction_type == "update_date":
+            content = page.locator('//*[@id="metaData_updateDate_0"]').last.inner_text()
+        elif extraction_type == "links_name":
+            content = page.evaluate(
+                """() => {
+                const links = Array.from(document.querySelectorAll('a'));
+                return links.map(link => link.textContent.trim());
+            }"""
+            )
+        else:
+            raise ValueError(f"type '{extraction_type}' is not valid.")
+        return content
+
+    return render_webpage(url, extraction=get_from_playwrite)
 
 
 def get_cached_page():
@@ -37,12 +47,10 @@ def get_cached_page():
 
 def get_status():
     """get the number of scarper listed on the gov.il site"""
-    page = get_statue_page()
+    links_text = get_statue_page(extraction_type="links_name")
     # Store the contents of the website under doc
-    doc = BeautifulSoup(page.content, features="lxml")
-    # Parse data that are stored between <tr>..</tr> of HTML
     count = 0
-    for element in doc.find_all("strong"):
+    for element in links_text:
         if "לצפייה במחירים" in str(element) or "לצפיה במחירים" in str(element):
             count += 1
 
@@ -51,19 +59,8 @@ def get_status_date():
 
 def get_status_date():
     """get the date change listed on the gov.il site"""
-    page = get_statue_page()
-
-    if page.status_code != 200:
-        Logger.error(f"request as failed, page body is {page}.")
-        raise ValueError("Failed reading the gov.il site.")
-    line_with_date = (
-        lh.fromstring(page.content)
-        .xpath(
-            r"""/html/body/section/div/
-            div[3]/div/span"""
-        )[0]
-        .text
-    )
+    line_with_date = get_statue_page(extraction_type="update_date")
+    print(line_with_date)
     Logger.info(f"line_with_date: {line_with_date}")
 
     dates = re.findall(
@@ -78,25 +75,6 @@ def get_status_date():
     return datetime.datetime.strptime("".join(dates[0]), "%d.%m.%Y")
 
 
-def compute_page_diff():
-    """compute the diff between the page in the cache and the webpage"""
-    page = get_statue_page()
-    cache = get_cached_page()
-
-    cache_text = (
-        "".join(lh.fromstring(clean_html(cache)).itertext())
-        .replace("\n", "")
-        .replace("\r", "")
-    )
-    page_text = (
-        "".join(lh.fromstring(clean_html(page.content.decode("utf-8"))).itertext())
-        .replace("\n", "")
-        .replace("\r", "")
-    )
-
-    return [li for li in difflib.ndiff(cache_text, page_text) if li[0] != " "]
-
-
 def get_output_folder(chain_name):
     """the the folder to write the chain fils in"""
     return os.path.join(_get_dump_folder(), chain_name)
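Note: with this change the status helpers no longer parse static HTML. A sketch of the new flow, assuming the gov.il page structure holds: get_status counts the rendered link texts, and get_status_date reads the #metaData_updateDate_0 element instead of a brittle absolute XPath.

    from il_supermarket_scarper.utils.status import get_statue_page

    links = get_statue_page(extraction_type="links_name")
    count = sum(
        1
        for text in links
        if "לצפייה במחירים" in text or "לצפיה במחירים" in text
    )
    print(f"{count} chains currently listed on gov.il")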
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,9 +1,10 @@
 retry==0.9.2
 mock==4.0.3
 requests==2.31.0
-lxml==4.9.1
+lxml==5.2.1
 beautifulsoup4==4.10.0
 pymongo==4.2.0
 pytz==2022.4
 holidays==0.16
-cachetools==5.2.0
+cachetools==5.2.0
+pytest-playwright==0.4.4
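Note: the lxml bump from 4.9.1 to 5.2.1 lines up with dropping the lxml.html.clean import in status.py: as of lxml 5.2 the clean module lives in the separate lxml_html_clean package. pytest-playwright==0.4.4 pulls in the Python playwright package that connection.py now imports; the browser binaries themselves come from the npx playwright install step in the Dockerfile.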
2 changes: 1 addition & 1 deletion setup.py
@@ -28,7 +28,7 @@
     tests_require=dev_required,
     extras_require={"test": ["pytest"]},
     # *strongly* suggested for sharing
-    version="0.3.7",
+    version="0.3.8",
     # The license can be anything you like
     license="MIT",
     description="python package that implement a scraping for israeli supermarket data",
5 changes: 1 addition & 4 deletions tests/test_integration.py
@@ -2,7 +2,6 @@
 from il_supermarket_scarper.utils.status import (
     get_status,
     get_status_date,
-    compute_page_diff,
 )
 from il_supermarket_scarper.scrappers_factory import ScraperFactory
 from il_supermarket_scarper.utils.connection import disable_when_outside_israel
@@ -30,6 +29,4 @@ def test_all_chain_id_unqiue():
 def test_update_date():
     """test date the site update"""
     date = get_status_date()
-    assert (
-        date.date() == datetime.datetime(2024, 2, 11).date()
-    ), f"gov il site changed, please check it out, {compute_page_diff()}"
+    assert date.date() == datetime.datetime(2024, 2, 11).date(), "gov il site changed"