From 6739dd7db72b78d098ce295251dbee08ab928b23 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Thu, 19 Jun 2025 18:56:19 +0000
Subject: [PATCH] Speed up `codespell:ignore` check by skipping the regex in
 most cases

The codespell codebase unsurprisingly spends the vast majority of its
runtime in various regex-related code such as `search` and `finditer`.

The best way to optimize runtime spent in regexes is to not run a regex
in the first place, since the regex engine has a rather steep overhead
compared to regular string primitives (which are faster at the cost of
flexibility). If the regex rarely matches and there is a very easy
static substring that can be used to rule out the match, then you can
speed up the code by using `substring in string` as a conditional to
skip the regex. This is assuming the regex is used enough for the
performance to matter.

An obvious choice here falls on the `codespell:ignore` regex, because
it has a very distinctive substring in the form of `codespell:ignore`,
which will rule out almost all lines that will not match.

With this little trick, runtime goes from ~5.4s to ~4.5s on the corpus
mentioned in #3419.
---
 codespell_lib/_codespell.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 98696c0f5d..94ab65d068 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -54,7 +54,13 @@
 uri_regex_def = (
     r"(\b(?:https?|[ts]?ftp|file|git|smb)://[^\s]+(?=$|\s)|\b[\w.%+-]+@[\w.-]+\b)"
 )
-inline_ignore_regex = re.compile(r"[^\w\s]\s*codespell:ignore\b(\s+(?P<words>[\w,]*))?")
+codespell_ignore_tag = "codespell:ignore"
+inline_ignore_regex = re.compile(
+    rf"[^\w\s]\s*{codespell_ignore_tag}\b(\s+(?P<words>[\w,]*))?"
+)
+USAGE = """
+\t%prog [OPTIONS] [file1 file2 ... fileN]
+"""
 
 supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
 supported_languages = supported_languages_en
@@ -904,7 +910,9 @@ def parse_lines(
         line_number = fragment_line_number + i
 
         extra_words_to_ignore = set()
-        match = inline_ignore_regex.search(line)
+        match = (
+            inline_ignore_regex.search(line) if codespell_ignore_tag in line else None
+        )
         if match:
             extra_words_to_ignore = set(
                 filter(None, (match.group("words") or "").split(","))