65 changes: 23 additions & 42 deletions bases/ecoindex/cli/app.py
@@ -1,4 +1,4 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from asyncio import run
 from datetime import datetime
 from multiprocessing import cpu_count
 from os.path import dirname
@@ -16,9 +16,9 @@
     get_window_sizes_from_args,
 )
 from ecoindex.cli.console_output import display_result_synthesis
-from ecoindex.cli.helper import run_page_analysis
 from ecoindex.cli.report import Report
 from ecoindex.models import ExportFormat, Language
+from ecoindex.scraper.helper import bulk_analysis
 from ecoindex.utils.files import write_results_to_file, write_urls_to_file
 from loguru import logger
 from rich.progress import (
@@ -165,7 +165,9 @@ def analyze(
             urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
         )
     elif sitemap:
-        secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
+        secho(
+            f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA
+        )
         urls = get_urls_from_sitemap(main_url=sitemap)
     (
         file_prefix,
@@ -220,47 +222,26 @@ def analyze(
TextColumn("•"),
TimeRemainingColumn(),
) as progress:
count_errors = 0
task = progress.add_task("Processing", total=len(urls) * len(window_sizes))

with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_analysis = {}

for url in urls:
for window_size in window_sizes:
future_to_analysis[
executor.submit(
run_page_analysis,
url,
window_size,
wait_after_scroll,
wait_before_scroll,
logger,
)
] = (
url,
window_size,
wait_after_scroll,
wait_before_scroll,
logger,
)
count_errors = 0

for future in as_completed(future_to_analysis):
try:
result, success = future.result()

if not success:
count_errors += 1

else:
results.append(result)

except Exception as e:
count_errors += 1
url, _, _, _, _ = future_to_analysis[future]
logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")

progress.update(task, advance=1)
analysis_results = run(
bulk_analysis(
max_workers=max_workers,
urls=urls,
window_sizes=window_sizes,
wait_after_scroll=wait_after_scroll,
wait_before_scroll=wait_before_scroll,
logger=logger,
)
)

for result, success in analysis_results:
results.append(result)
if not success:
count_errors += 1

progress.update(task, advance=1)

if count_errors > 0:
secho(
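One detail worth noting when reusing `bulk_analysis` elsewhere: it is declared as an async generator, and `asyncio.run` only accepts coroutines, so the usual bridge from synchronous code is a thin coroutine wrapper that drains the generator. A minimal, self-contained sketch (the `collect` helper and the example arguments are illustrative, not part of this PR):

from asyncio import run

from ecoindex.models.compute import WindowSize
from ecoindex.scraper.helper import bulk_analysis

async def collect(**kwargs):
    # Drain the async generator into a list of (Result, success) tuples
    # so plain synchronous code can iterate over it.
    return [item async for item in bulk_analysis(**kwargs)]

analysis_results = run(
    collect(
        max_workers=2,
        urls=["https://www.ecoindex.fr"],
        window_sizes=[WindowSize(width=1920, height=1080)],
    )
)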
43 changes: 3 additions & 40 deletions bases/ecoindex/cli/helper.py
@@ -1,51 +1,14 @@
-from asyncio import run
 from ecoindex.config import Settings
-
-from ecoindex.models import Result, WindowSize, CliHost
-from ecoindex.scraper import EcoindexScraper
-
-
-def run_page_analysis(
-    url: str,
-    window_size: WindowSize,
-    wait_after_scroll: int = 3,
-    wait_before_scroll: int = 3,
-    logger=None,
-) -> tuple[Result, bool]:
-    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
-    scraper = EcoindexScraper(
-        url=str(url),
-        window_size=window_size,
-        wait_after_scroll=wait_after_scroll,
-        wait_before_scroll=wait_before_scroll,
-        page_load_timeout=20,
-    )
-    try:
-        return (run(scraper.get_page_analysis()), True)
-    except Exception as e:
-        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-    return (
-        Result(
-            url=url,
-            water=0,
-            width=window_size.width,
-            height=window_size.height,
-            size=0,
-            nodes=0,
-            requests=0,
-        ),
-        False,
-    )
+from ecoindex.models import CliHost
 
 
 def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
     if Settings().DOCKER_CONTAINER and "localhost" in netloc:
         domain = "host.docker.internal"
         netloc = netloc.replace("localhost", domain)
-    elif "localhost" in netloc :
+    elif "localhost" in netloc:
         domain = "localhost"
-    else :
+    else:
         domain = netloc
 
     return CliHost(domain=domain, netloc=netloc)
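For reference, a quick usage sketch of the remaining helper (the printed values assume the CLI is running inside Docker, i.e. Settings().DOCKER_CONTAINER is true):

from ecoindex.cli.helper import replace_localhost_with_hostdocker

host = replace_localhost_with_hostdocker("localhost:8000")
# Inside a Docker container this yields domain="host.docker.internal"
# and netloc="host.docker.internal:8000"; outside Docker the netloc
# is left untouched and domain is "localhost".
print(host.domain, host.netloc)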
74 changes: 74 additions & 0 deletions components/ecoindex/scraper/helper.py
@@ -0,0 +1,74 @@
+from asyncio import run
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import AsyncGenerator
+
+from ecoindex.models.compute import Result, WindowSize
+from ecoindex.scraper.scrap import EcoindexScraper
+
+
+def run_page_analysis(
+    url: str,
+    window_size: WindowSize,
+    wait_after_scroll: int = 3,
+    wait_before_scroll: int = 3,
+    logger=None,
+) -> tuple[Result, bool]:
+    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
+    scraper = EcoindexScraper(
+        url=str(url),
+        window_size=window_size,
+        wait_after_scroll=wait_after_scroll,
+        wait_before_scroll=wait_before_scroll,
+        page_load_timeout=20,
+    )
+    try:
+        return (run(scraper.get_page_analysis()), True)
+    except Exception as e:
+        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
+
+    return (
+        Result(
+            url=url,
+            water=0,
+            width=window_size.width,
+            height=window_size.height,
+            size=0,
+            nodes=0,
+            requests=0,
+        ),
+        False,
+    )
+
+
+async def bulk_analysis(
+    max_workers,
+    urls,
+    window_sizes,
+    wait_after_scroll: int = 0,
+    wait_before_scroll: int = 0,
+    logger=None,
+) -> AsyncGenerator[tuple[Result, bool], None]:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_analysis = {}
+
+        for url in urls:
+            for window_size in window_sizes:
+                future_to_analysis[
+                    executor.submit(
+                        run_page_analysis,
+                        url,
+                        window_size,
+                        wait_after_scroll,
+                        wait_before_scroll,
+                        logger,
+                    )
+                ] = (
+                    url,
+                    window_size,
+                    wait_after_scroll,
+                    wait_before_scroll,
+                    logger,
+                )
+
+        for future in as_completed(future_to_analysis):
+            yield future.result()
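A note on the concurrency model here: each worker thread calls `run_page_analysis`, which starts its own event loop via `asyncio.run`, so scrapes proceed in parallel across threads rather than on one shared loop. The pattern in isolation (a self-contained sketch, not code from this PR):

from asyncio import run, sleep
from concurrent.futures import ThreadPoolExecutor, as_completed

async def fake_scrape(url: str) -> str:
    await sleep(0.1)  # stand-in for a Playwright page analysis
    return url

def scrape_in_thread(url: str) -> str:
    # Each thread owns a private event loop for the duration of the call.
    return run(fake_scrape(url))

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(scrape_in_thread, u) for u in ("a", "b", "c")]
    for future in as_completed(futures):
        print(future.result())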
4 changes: 3 additions & 1 deletion components/ecoindex/scraper/scrap.py
@@ -25,6 +25,7 @@ def __init__(
         screenshot_uid: int | None = None,
         screenshot_gid: int | None = None,
         page_load_timeout: int = 20,
+        headless: bool = True,
     ):
         self.url = url
         self.window_size = window_size
@@ -39,6 +40,7 @@ def __init__(
         self.har_temp_file_path = (
             f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
         )
+        self.headless = headless
 
     @deprecated("This method is useless with new version of EcoindexScraper")
     def init_chromedriver(self):
@@ -64,7 +66,7 @@ async def get_requests_by_category(self) -> MimetypeAggregation:

     async def scrap_page(self) -> PageMetrics:
         async with async_playwright() as p:
-            browser = await p.chromium.launch()
+            browser = await p.chromium.launch(headless=self.headless)
             self.page = await browser.new_page(
                 record_har_path=self.har_temp_file_path,
                 screen=self.window_size.model_dump(),
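The new `headless` flag defaults to True, so existing callers keep the previous behavior; passing False opens a visible Chromium window, which can help when debugging a scrape locally. A minimal sketch (the URL is arbitrary):

import asyncio

from ecoindex.scraper import EcoindexScraper

scraper = EcoindexScraper(url="https://www.ecoindex.fr", headless=False)
result = asyncio.run(scraper.get_page_analysis())  # browser window stays visible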
13 changes: 12 additions & 1 deletion development/ecoindex_scraper.py
@@ -1,6 +1,17 @@
 import asyncio
 from pprint import pprint
+from uuid import uuid1
 
+from ecoindex.models.compute import ScreenShot
 from ecoindex.scraper import EcoindexScraper
 
-pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis()))
+scraper = EcoindexScraper(
+    url="https://www.kiabi.com",
+    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
+)
+
+result = asyncio.run(scraper.get_page_analysis())
+all_requests = asyncio.run(scraper.get_all_requests())
+requests_by_category = asyncio.run(scraper.get_requests_by_category())
+
+pprint(result)
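An equivalent formulation of this development script drives all three calls on a single event loop and prints each result; a sketch, assuming the three scraper methods behave the same when awaited sequentially in one loop:

import asyncio
from pprint import pprint
from uuid import uuid1

from ecoindex.models.compute import ScreenShot
from ecoindex.scraper import EcoindexScraper

async def main() -> None:
    scraper = EcoindexScraper(
        url="https://www.kiabi.com",
        screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
    )
    pprint(await scraper.get_page_analysis())
    pprint(await scraper.get_all_requests())
    pprint(await scraper.get_requests_by_category())

asyncio.run(main())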