In [2]:
import requests


def fetch_words(url, timeout=10):
    """Download a newline-delimited word list.

    Args:
        url: Address of the raw text file to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive host cannot block the cell forever.

    Returns:
        The response body split into lines when the status code is 200,
        otherwise an empty list.

    Exceptions raised by ``requests.get`` itself (connection failures,
    timeouts) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Do not treat an error page as a word list.
        return []
    return response.text.splitlines()


# Raw word lists, one entry per line. Each download goes through fetch_words so
# that a non-200 response contributes an empty list instead of the lines of an
# error page being merged into the final word list.
google_en = fetch_words(
    "https://raw.githubusercontent.com/coffee-and-fun/google-profanity-words/main/data/en.txt"
)
ldnoobw_en = fetch_words(
    "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
)
ldnoobw_rus = fetch_words(
    "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/ru"
)
# Add other URLs as needed


def sanitize_wordlist(words):
    """Return the unique non-blank entries of ``words``, trimmed, lower-cased and sorted."""
    unique_entries = set()
    for raw in words:
        trimmed = raw.strip()
        if trimmed:
            unique_entries.add(trimmed.lower())
    return sorted(unique_entries)


# Persist the merged, cleaned word list with one entry per line.
with open("bad_words.txt", mode="w", encoding="utf-8") as out_file:
    merged_words = google_en + ldnoobw_en + ldnoobw_rus
    out_file.write("\n".join(sanitize_wordlist(merged_words)))

In [11]:
from pybloom_live import ScalableBloomFilter


class OptimizedBloomModerator:
    def __init__(self, phrase_length=5):
        # Grows dynamically while maintaining error rate
        self.filter = ScalableBloomFilter(
            initial_capacity=5000,  # Increased for phrases
            error_rate=0.001,
            mode=ScalableBloomFilter.LARGE_SET_GROWTH,
        )
        self.phrase_length = phrase_length
        self._load_words()

    def _load_words(self):
        with open("bad_words.txt", "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().lower()
                # Add single words
                self.filter.add(line)
                # Add phrases split by underscores (if present)
                if "_" in line:
                    self.filter.add(line.replace("_", " "))

    def check_text(self, text):
        words = text.lower().split()

        # Check single words
        if any(word in self.filter for word in words):
            return True

        # Check multi-word phrases
        for i in range(len(words)):
            for j in range(1, self.phrase_length + 1):
                if i + j > len(words):
                    continue
                phrase = " ".join(words[i : i + j])
                if phrase in self.filter:
                    return True
        return False


# Build the moderator used by the cells below; the constructor loads
# bad_words.txt (relative path) into its filter.
moderator = OptimizedBloomModerator()

In [13]:
# Spot-check the moderator on a mixed English/Russian sentence.
moderator.check_text("sin на in фиг python")

False