22 changes: 22 additions & 0 deletions bases/ecoindex/cli/app.py
@@ -11,6 +11,7 @@
get_file_prefix_input_file_logger_file,
get_url_from_args,
get_urls_from_file,
get_urls_from_sitemap,
get_urls_recursive,
get_window_sizes_from_args,
)
@@ -37,6 +38,9 @@
@app.command()
def analyze(
url: list[str] = Option(default=None, help="List of urls to analyze"),
sitemap: str = Option(
default=None, help="Sitemap url of the website you want to analyze"
),
window_size: list[str] = Option(
default=["1920,1080"],
help=(
@@ -117,6 +121,16 @@ def analyze(
default=True,
)

if sitemap and not no_interaction:
confirm(
text=(
"You are about to read urls from a website sitemap. "
"This can take a long time. Are you sure to want to proceed?"
),
abort=True,
default=True,
)

try:
window_sizes = get_window_sizes_from_args(window_size)
tmp_folder = "/tmp/ecoindex-cli"
@@ -150,6 +164,14 @@ def analyze(
) = get_file_prefix_input_file_logger_file(
urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
)
elif sitemap:
secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
urls = get_urls_from_sitemap(main_url=sitemap)
(
file_prefix,
input_file,
logger_file,
) = get_file_prefix_input_file_logger_file(urls=urls)

else:
secho("🔥 You must provide an url...", fg=colors.RED)
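
As a usage note, a minimal invocation exercising the new option could look like the sketch below. It assumes the CLI's existing `--no-interaction` flag is what backs the `no_interaction` parameter checked above; with it, the new sitemap confirmation prompt is skipped.

```bash
# Sketch (assumed flag): skip the sitemap confirmation prompt added above.
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml --no-interaction
```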
23 changes: 22 additions & 1 deletion bases/ecoindex/cli/arguments_handler.py
@@ -4,6 +4,7 @@

from ecoindex.cli.crawl import EcoindexSpider
from ecoindex.cli.helper import replace_localhost_with_hostdocker
from ecoindex.cli.sitemap import EcoindexSitemapSpider
from ecoindex.models import WindowSize

from pydantic import AnyHttpUrl, validate_call
@@ -54,10 +55,30 @@ def get_urls_recursive(main_url: str) -> Set[str]:
process.start()
temp_file.seek(0)
urls = temp_file.readlines()

return validate_list_of_urls(urls) # type: ignore


def get_urls_from_sitemap(main_url: str) -> Set[str]:
process = CrawlerProcess()
if "sitemap" not in main_url or not main_url.endswith(".xml"):
raise ValueError("The provided url is not a valid sitemap url")

with NamedTemporaryFile(mode="w+t") as temp_file:
process.crawl(
crawler_or_spidercls=EcoindexSitemapSpider,
sitemap_urls=[main_url],
temp_file=temp_file,
)
process.start()
temp_file.seek(0)
urls = list()
str_urls = temp_file.readlines()
for url in str_urls:
urls.append(AnyHttpUrl(url))

return validate_list_of_urls(urls)


@validate_call
def get_url_from_args(urls_arg: list[AnyHttpUrl]) -> set[AnyHttpUrl]:
urls_from_args = set()
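
Taken on its own, the new helper could be exercised as in the short sketch below (assuming the ecoindex CLI package is importable; the sitemap URL is purely illustrative). Note that Scrapy's underlying Twisted reactor cannot be restarted, so, as with `get_urls_recursive`, the helper can in practice only be invoked once per Python process.

```python
# Minimal sketch: collect the set of URLs listed in a sitemap.
# Assumes the ecoindex CLI package is installed and the sitemap URL is reachable.
from ecoindex.cli.arguments_handler import get_urls_from_sitemap

urls = get_urls_from_sitemap(main_url="https://www.ecoindex.fr/sitemap.xml")
print(f"{len(urls)} url(s) collected from the sitemap")
```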
22 changes: 22 additions & 0 deletions bases/ecoindex/cli/sitemap.py
@@ -0,0 +1,22 @@
from tempfile import NamedTemporaryFile
from scrapy.spiders import SitemapSpider


class EcoindexSitemapSpider(SitemapSpider):
name = "EcoindexSitemapSpider"
custom_settings = {"LOG_ENABLED": False}

def __init__(
self,
sitemap_urls: list[str],
temp_file: NamedTemporaryFile, # type: ignore
*a,
**kw,
):
self.sitemap_urls = sitemap_urls
self.temp_file = temp_file
super().__init__(*a, **kw)

def parse(self, response):
self.temp_file.write(f"{response.url}\n")
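
The spider itself is intentionally thin: `SitemapSpider` handles fetching and parsing the sitemap, while `parse` only records the URL of each crawled page in the shared temporary file. Below is a minimal sketch of driving it directly, mirroring `get_urls_from_sitemap` above (the sitemap URL is illustrative).

```python
# Minimal sketch of running the spider on its own, mirroring get_urls_from_sitemap.
from tempfile import NamedTemporaryFile

from scrapy.crawler import CrawlerProcess

from ecoindex.cli.sitemap import EcoindexSitemapSpider

with NamedTemporaryFile(mode="w+t") as temp_file:
    process = CrawlerProcess()
    process.crawl(
        crawler_or_spidercls=EcoindexSitemapSpider,
        sitemap_urls=["https://www.ecoindex.fr/sitemap.xml"],
        temp_file=temp_file,
    )
    process.start()  # blocks until the crawl has finished
    temp_file.seek(0)
    print(temp_file.read())  # one crawled URL per line
```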

32 changes: 31 additions & 1 deletion projects/ecoindex_cli/README.md
@@ -6,7 +6,6 @@
![PyPI - Downloads](https://img.shields.io/pypi/dm/ecoindex-cli?style=social&logo=pypi)
![Docker Pulls](https://img.shields.io/docker/pulls/vvatelot/ecoindex-cli?style=social&logo=docker)


This tool provides an easy way to analyze websites with [Ecoindex](https://www.ecoindex.fr) from your local computer using multi-threading. You have the ability to:

- Make the analysis on multiple pages
@@ -113,6 +112,37 @@ There are 2 url(s), do you want to process? [Y/n]:

</details>

### Make a website analysis based on the website's sitemap

The CLI can analyze a website from its sitemap: given the sitemap URL, the app discovers and analyzes all the pages listed in it. ⚠️ This can take a very long time! **Use it at your own risk!**

```bash
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml
```

<details><summary>Result</summary>

```bash
You are about to read urls from a website sitemap. This can take a long time. Are you sure you want to proceed? [Y/n]:
⏲️ Crawling sitemap url https://www.ecoindex.fr/sitemap.xml -> Wait a minute!
2024-02-20 18:38:16 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-02-20 18:38:16 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.12.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.12.2 (main, Feb 13 2024, 08:34:52) [GCC 12.2.0], pyOpenSSL 24.0.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.3, Platform Linux-6.4.16-linuxkit-aarch64-with-glibc2.36
📁️ Urls recorded in file `/tmp/ecoindex-cli/input/www.ecoindex.fr.csv`
There are 22 url(s), do you want to process? [Y/n]: y
22 urls for 1 window size with 10 maximum workers
100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 22/22 • 0:00:20 • 0:00:00
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓
┃ Total analysis ┃ Success ┃ Failed ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━┩
│ 22 │ 22 │ 0 │
└────────────────┴─────────┴────────┘
🙌️ File /tmp/ecoindex-cli/output/www.ecoindex.fr/2024-02-20_183842/results.csv written !
```

</details>

Note: when the sitemap method is used together with the `--recursive` flag, the recursive flag has no effect.
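
For instance, in a hypothetical invocation combining both options, only the sitemap URLs are analyzed and the recursive crawl is skipped:

```bash
# Hypothetical combination: the sitemap wins, --recursive has no effect.
ecoindex-cli analyze --sitemap https://www.ecoindex.fr/sitemap.xml --recursive
```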

### Make a recursive analysis

You can make a recursive analysis of a given website. The app will try to discover all the pages of your website and launch an analysis on each of them. ⚠️ This can take a very long time! **Use it at your own risk!**
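
As a sketch, assuming the `--url` option and the `--recursive` flag referenced above:

```bash
# Sketch (flags assumed from context): crawl the site and analyze every discovered page.
ecoindex-cli analyze --url https://www.ecoindex.fr --recursive
```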