22 changes: 22 additions & 0 deletions bases/ecoindex/cli/app.py
@@ -11,6 +11,7 @@
get_file_prefix_input_file_logger_file,
get_url_from_args,
get_urls_from_file,
get_urls_from_sitemap,
get_urls_recursive,
get_window_sizes_from_args,
)
@@ -37,6 +38,9 @@
@app.command()
def analyze(
url: list[str] = Option(default=None, help="List of urls to analyze"),
sitemap: str = Option(
default=None, help="Sitemap url of the website you want to analyze"
),
window_size: list[str] = Option(
default=["1920,1080"],
help=(
@@ -117,6 +121,16 @@ def analyze(
default=True,
)

if sitemap and not no_interaction:
confirm(
text=(
"You are about to read urls from a website sitemap. "
"This can take a long time. Are you sure to want to proceed?"
),
abort=True,
default=True,
)

try:
window_sizes = get_window_sizes_from_args(window_size)
tmp_folder = "/tmp/ecoindex-cli"
@@ -150,6 +164,14 @@ def analyze(
) = get_file_prefix_input_file_logger_file(
urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
)
elif sitemap:
secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
urls = get_urls_from_sitemap(main_url=sitemap)
(
file_prefix,
input_file,
logger_file,
) = get_file_prefix_input_file_logger_file(urls=urls)

else:
secho("🔥 You must provide an url...", fg=colors.RED)
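
As a usage note, a minimal invocation exercising the new option could look like the sketch below. It assumes the CLI's existing `--no-interaction` flag is what backs the `no_interaction` parameter checked above; with it, the new sitemap confirmation prompt is skipped.

```bash
# Sketch (assumed flag): skip the sitemap confirmation prompt added above.
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml --no-interaction
```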
23 changes: 22 additions & 1 deletion bases/ecoindex/cli/arguments_handler.py
@@ -4,6 +4,7 @@

from ecoindex.cli.crawl import EcoindexSpider
from ecoindex.cli.helper import replace_localhost_with_hostdocker
from ecoindex.cli.sitemap import EcoindexSitemapSpider
from ecoindex.models import WindowSize

from pydantic import AnyHttpUrl, validate_call
@@ -54,10 +55,30 @@ def get_urls_recursive(main_url: str) -> Set[str]:
process.start()
temp_file.seek(0)
urls = temp_file.readlines()

return validate_list_of_urls(urls) # type: ignore


def get_urls_from_sitemap(main_url: str) -> Set[str]:
process = CrawlerProcess()
if "sitemap" not in main_url or not main_url.endswith(".xml"):
raise ValueError("The provided url is not a valid sitemap url")

with NamedTemporaryFile(mode="w+t") as temp_file:
process.crawl(
crawler_or_spidercls=EcoindexSitemapSpider,
sitemap_urls=[main_url],
temp_file=temp_file,
)
process.start()
temp_file.seek(0)
urls = list()
str_urls = temp_file.readlines()
for url in str_urls:
urls.append(AnyHttpUrl(url))

return validate_list_of_urls(urls)


@validate_call
def get_url_from_args(urls_arg: list[AnyHttpUrl]) -> set[AnyHttpUrl]:
urls_from_args = set()
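
Taken on its own, the new helper could be exercised as in the short sketch below (assuming the ecoindex CLI package is importable; the sitemap URL is purely illustrative). Note that Scrapy's underlying Twisted reactor cannot be restarted, so, as with `get_urls_recursive`, the helper can in practice only be invoked once per Python process.

```python
# Minimal sketch: collect the set of URLs listed in a sitemap.
# Assumes the ecoindex CLI package is installed and the sitemap URL is reachable.
from ecoindex.cli.arguments_handler import get_urls_from_sitemap

urls = get_urls_from_sitemap(main_url="https://www.ecoindex.fr/sitemap.xml")
print(f"{len(urls)} url(s) collected from the sitemap")
```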
22 changes: 22 additions & 0 deletions bases/ecoindex/cli/sitemap.py
@@ -0,0 +1,22 @@
from tempfile import NamedTemporaryFile
from scrapy.spiders import SitemapSpider


class EcoindexSitemapSpider(SitemapSpider):
name = "EcoindexSitemapSpider"
custom_settings = {"LOG_ENABLED": False}

def __init__(
self,
sitemap_urls: list[str],
temp_file: NamedTemporaryFile, # type: ignore
*a,
**kw,
):
self.sitemap_urls = sitemap_urls
self.temp_file = temp_file
super().__init__(*a, **kw)

def parse(self, response):
self.temp_file.write(f"{response.url}\n")
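
The spider itself is intentionally thin: `SitemapSpider` handles fetching and parsing the sitemap, while `parse` only records the URL of each crawled page in the shared temporary file. Below is a minimal sketch of driving it directly, mirroring `get_urls_from_sitemap` above (the sitemap URL is illustrative).

```python
# Minimal sketch of running the spider on its own, mirroring get_urls_from_sitemap.
from tempfile import NamedTemporaryFile

from scrapy.crawler import CrawlerProcess

from ecoindex.cli.sitemap import EcoindexSitemapSpider

with NamedTemporaryFile(mode="w+t") as temp_file:
    process = CrawlerProcess()
    process.crawl(
        crawler_or_spidercls=EcoindexSitemapSpider,
        sitemap_urls=["https://www.ecoindex.fr/sitemap.xml"],
        temp_file=temp_file,
    )
    process.start()  # blocks until the crawl has finished
    temp_file.seek(0)
    print(temp_file.read())  # one crawled URL per line
```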

32 changes: 31 additions & 1 deletion projects/ecoindex_cli/README.md
@@ -6,7 +6,6 @@
![PyPI - Downloads](https://img.shields.io/pypi/dm/ecoindex-cli?style=social&logo=pypi)
![Docker Pulls](https://img.shields.io/docker/pulls/vvatelot/ecoindex-cli?style=social&logo=docker)


This tool provides an easy way to analyze websites with [Ecoindex](https://www.ecoindex.fr) from your local computer using multi-threading. You have the ability to:

- Make the analysis on multiple pages
@@ -113,6 +112,37 @@ There are 2 url(s), do you want to process? [Y/n]:

</details>

### Make a website analysis based on the website's sitemap

The CLI can analyze a website from its sitemap: given the sitemap URL, the app discovers and analyzes all the pages listed in it. ⚠️ This can take a very long time! **Use it at your own risk!**

```bash
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml
```

<details><summary>Result</summary>

```bash
You are about to read urls from a website sitemap. This can take a long time. Are you sure you want to proceed? [Y/n]:
⏲️ Crawling sitemap url https://www.ecoindex.fr/sitemap.xml -> Wait a minute!
2024-02-20 18:38:16 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-02-20 18:38:16 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.12.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.12.2 (main, Feb 13 2024, 08:34:52) [GCC 12.2.0], pyOpenSSL 24.0.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.3, Platform Linux-6.4.16-linuxkit-aarch64-with-glibc2.36
📁️ Urls recorded in file `/tmp/ecoindex-cli/input/www.ecoindex.fr.csv`
There are 22 url(s), do you want to process? [Y/n]: y
22 urls for 1 window size with 10 maximum workers
100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 22/22 • 0:00:20 • 0:00:00
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓
┃ Total analysis ┃ Success ┃ Failed ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━┩
│ 22 │ 22 │ 0 │
└────────────────┴─────────┴────────┘
🙌️ File /tmp/ecoindex-cli/output/www.ecoindex.fr/2024-02-20_183842/results.csv written !
```

</details>

Note: when the sitemap method is used together with the `--recursive` flag, the recursive flag has no effect.
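
For instance, in a hypothetical invocation combining both options, only the sitemap URLs are analyzed and the recursive crawl is skipped:

```bash
# Hypothetical combination: the sitemap wins, --recursive has no effect.
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml --recursive
```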

### Make a recursive analysis

You can make a recursive analysis of a given website. The app will try to discover all the pages of your website and launch an analysis on each of them. ⚠️ This can take a very long time! **Use it at your own risk!**
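
As a sketch, assuming the `--url` option and the `--recursive` flag referenced above:

```bash
# Sketch (flags assumed from context): crawl the site and analyze every discovered page.
ecoindex-cli analyze --url https://www.ecoindex.fr --recursive
```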