diff --git a/.gitignore b/.gitignore
index 8d5d5ae..c82996a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 /stash/
 /gharchive-dump/
 /export/
+/scratchpad/
 *.sh
 
 *.pyc
diff --git a/bin/bag_messages.py b/bin/bag_messages.py
new file mode 100644
index 0000000..e0020cc
--- /dev/null
+++ b/bin/bag_messages.py
@@ -0,0 +1,403 @@
+import argparse
+import datetime
+import json
+import time
+from pathlib import Path
+import os
+import sys
+import re
+from typing import List, Tuple, Generator
+
+
+from tqdm import tqdm
+
+ROOT_DIR = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_DIR))
+
+from src.bagofwords import BagOfWords
+from src.file_iter import iter_ndjson
+
+
+RE_REPLACE_WHITESPACE = (
+    # urls
+    re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&#+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
+    # emails (unanchored: with ^...$ only a message consisting of nothing but an address matched)
+    re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
+    # long hex numbers like commit refs
+    re.compile(r"[a-f0-9]{30,1000}"),
+)
+
+USELESS_MESSAGE_TAILS = (  # Main.normalize_text() cuts a message at the first occurrence of each marker
+    "Merge pull request",
+    "Signed-off-by:",
+    "Commit reference:",
+    "Sync branch \"",
+    "Authored-by:",
+    "Co-authored-by:",
+    "Reported-by:",
+    "Acked-by:",
+    "Reviewed By:",
+    "Merged PRs:",
+    "Excluded PRs:",
+    "Merge branch '",
+    "Merge remote-tracking branch '",
+    "Deploying to gh-pages from @",
+    "This is an automated commit",
+    "Set up CI with Azure Pipelines\n",
+    "Source-Link: http",
+    "Change log: http",
+    "issuer signature:",
+    "*By submitting this pull request, I confirm",
+    "# All SDK Contribution checklist:",  # Azure SDK
+    "Update Composer dependencies (",
+    "--BEGIN PGP SIGNATURE--",
+)
+
+TOPICS = [  # each entry is one token or a tuple of tokens; Main names the topic bag after the first token
+    (
+        "game", "shotgun", "sprite", "missile", "witch", "witches", "zombie", "zombies",
+        "spaceship", "spaceships", "demon", "demons", "player", "players", "player's",
+        "joystick", "spawn", "spawned", "spawning", "respawning", "dragon",
+        "lair", "monster", "monsters", "survivor", "assassin", "assassins",
+        "billiard", "shooter", "knight", "knights", "knight's", "projectile",
+        "damage", "minecraft", "bukkit", "spigot", "multiplayer",
+        "explosive", "hostile", "gameloop", "grenade",
+    ),
+    (
+        "emotion", "sadness", "pain", "depression", "depressed",
+        "despair", "desperation", "loving", "frustrating", "frustration",
+        "paranoia", "forgiving", "suffering",
+    ),
+    (
+        "fun", "funny", "hilarious", "ridiculous", "joke", "joking",
+        "laugh", "laughing", "kidding",
+    ),
+    (
+        "music", "musical", "song", "melody",
+    ),
+    (
+        "code", "function", "refactor", "kernel", "status", "thread", "settings",
+        "threads", "maintenance", "synchronisation", "revision",
+        "function", "functions", "variable", "variables", "upstream", "implementation",
+        "driver", "struct", "cleanup", "issues", "fixes", "process", "processing", "buffer", "module",
+        "command", "cpu", "arm64", "documentation", "reference", "align", "pointer",
+        "architecture", "server", "hardware", "compiler", "directory", "register",
+        "integration", "deployment", "inline", "coverage", "recursive", "dependencies",
+        "compilation", "bytes", "bits", "generator", "functional", "compatibility",
+        "extension", "instructions", "operation", "management", "interrupt", "macro",
+        "syntax", "initialization", "install", "overflow", "merged", "endpoint", "pipeline",
+    ),
+    (
+        "blog", "article"
+    ),
+    ("fixed", "fixes", "fixing"),
+    "microsoft", "apple", "amazon", "facebook",
+    "segfault",
+    ("css", "scss", "html", "http", "javascript"),
+    ("amazing", "superb", "stunning", "excellent", "inspiring", "superb"),
+    (
+        "curse", "cursed", "damn", "awful", "hate", "hated", "disgusting", "bloody",
+        "silly", "silliness", "stupid", "stupidity", "terrible",
+        "shit", "shitty", "bullshit", "suck", "sucks", "sucking", "sucker",
+        "fuck", "fucking", "fucked", "fuckoff", "fuckyou",
+    ),
+    (
+        "politics", "government", "governments", "governmental", "politics", "political"
+    ),
+    (
+        "oops", "ah", "aah", "oh", "ohh", "woah"
+    ),
+    (
+        "corona", "pandemic",
+    ),
+    (
+        "tanh", "tan", "sin", "asin", "sinh", "cos", "acos", "cosh", "tan", "tanh", "sqrt",
+        "ceil",
+    )
+]
+
+TOPICS_3 = [  # alternative topic list in the same shape as TOPICS; not referenced anywhere else in this module
+    ("honest", "honestly", "frankly"),
+    ("amazing", "superb", "stunning", "excellent", "inspiring"),
+    ("shit", "shitty", "bullshit"),
+    ("depressed", "depression", "depressive", "depressing"),
+    ("code", "coding", "programming"),
+    ("confess", "confessing", "confession"),
+    ("idea", "ideas", "inspiration", "inspiring"),
+    "feedback",
+]
+
+TOPICS_2 = [  # alternative topic list in the same shape as TOPICS; not referenced anywhere else in this module
+    "female", "male",
+    "girl", "boy",
+    "girlfriend", "boyfriend",
+    "sister", "brother",
+    "college", "school",
+    ("mystery", "mysteries"),
+    ("friend", "friends"),
+    ("dream", "dreams", "dreamy"),
+    "thoughts", "grief",
+    "tears",
+    ("criminal", "criminals"),
+    "antifa",
+    ("sex", "sexual"),
+    (),  # NOTE(review): empty topic; Main.__init__ reads topic[0], so this entry would raise IndexError if this list were used
+    ("fuck", "fucking", "fucked", "fuckoff", "fuckyou"),
+    ("corona", "pandemic", "covid"),
+    "personal",
+
+    ("god", "gods"),
+    ("hell", "hellish", "hellfire"),
+    ("guitar", "saxophone", "banjo", "ukulele", "flute"),
+    ("praise", "praised"),
+    ("church", "churches"),
+    "love", "hate", "hatred",
+    "loving",
+    ("fire", "fires"),
+    "kernel",
+    "python",
+]
+
+
+def parse_args() -> dict:
+    """Parse the command line and return the options as a plain dict."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "commit_file", type=str,
+        help="ndjson of exported commit messages"
+    )
+    parser.add_argument(
+        "--repos", type=str, nargs="+",
+        help="Instead of the topic-bags, build one for each repo",
+    )
+    # A plain flag: with type=bool/nargs="?" a following positional argument
+    # was swallowed as the flag's value and any non-empty string meant True.
+    parser.add_argument(
+        "-v", "--verbose", action="store_true",
+        help="Display stuff during run"
+    )
+
+    return vars(parser.parse_args())
+
+
+def append_filename(filename: Path, suffix: str) -> Path:
+    """Return a sibling path: drop a trailing ".gz" and the last extension, then append suffix."""
+    name = filename.name
+    if name.lower().endswith(".gz"):
+        name = name[:-3]
+
+    # rsplit keeps an extension-less name intact; the former split/join
+    # approach reduced e.g. "commits" to an empty stem.
+    stem = name.rsplit(".", 1)[0]
+
+    return filename.parent / (stem + suffix)
+
+
+class Main:
+    def __init__(
+            self,
+            commit_file: str,
+            repos: List[str],
+            verbose: bool,
+    ):
+        self.commit_file = Path(commit_file)
+        self.bag_file = append_filename(self.commit_file, "-big-bag.json")
+        self.commit_bag_file = append_filename(self.commit_file, "-bags.ndjson")  # not read anywhere else in this module
+        self.verbose = verbose
+        self.repos = repos  # None when --repos is not given on the command line
+        # Load the cached big bag if its file exists; otherwise build it and write the cache.
+        if self.bag_file.exists():
+            self.bag = BagOfWords.load_json(self.bag_file)
+        else:
+            self.bag = self.render_bag()
+            self.bag.save_json(self.bag_file)
+        self.bag_norm = self.bag.normalized()
+
+        print(f"bag: {self.bag.size():,} / {self.bag.count():,}")
+        # Without repos: two empty bags per topic, keyed "<first token>" and "<first token>_r"; with repos: one per repo.
+        self.topic_bags = dict()
+        if not self.repos:
+            for topic in sorted(TOPICS, key=lambda t: t[0] if isinstance(t, tuple) else t):
+                if isinstance(topic, str):
+                    topic = [topic]  # a bare string is a single-token topic
+                for suffix in ("", "_r"):
+                    self.topic_bags[topic[0] + suffix] = {
+                        "tokens": topic,
+                        "bag": BagOfWords()
+                    }
+        else:
+            for repo in self.repos:
+                name = repo.replace("/", "-")  # the key is later used as a file name by _save_topic_bags()
+                self.topic_bags[name] = {
+                    "repo": repo,
+                    "bag": BagOfWords()
+                }
+        self.scan_commits()  # the whole scan runs from the constructor
+
+    @classmethod
+    def normalize_text(cls, text: str) -> str:
+        """Cut known tails, lowercase, replace RE_REPLACE_WHITESPACE matches and line breaks by spaces."""
+        for marker in USELESS_MESSAGE_TAILS:
+            cut = text.find(marker)
+            if cut != -1:
+                # everything from the marker onwards is dropped
+                text = text[:cut]
+        lowered = text.lower()
+        for pattern in RE_REPLACE_WHITESPACE:
+            lowered = pattern.sub(" ", lowered)
+        flattened = lowered.replace("\n", " ")
+        return flattened.replace("\r", " ")
+
+    @classmethod
+    def skip_message(cls, text: str) -> bool:
+        """Return True for empty text or text too dense in '/', '(', '[', '.' or '>'."""
+        if not text:
+            return True
+
+        # (character, minimum average number of characters per occurrence)
+        limits = (
+            ("/", 40),  # exclude file listings
+            ("(", 60),  # exclude other weird stuff
+            ("[", 60),
+            (".", 40),
+            (">", 60),
+        )
+        length = len(text)
+        return any(
+            text.count(symbol) / length >= 1. / spacing
+            for symbol, spacing in limits
+        )
+
+    def iter_commits(self, desc: str) -> Generator[Tuple[dict, str, str], None, None]:
+        """Yield (commit, raw message, normalized message) for every commit that passes the filters."""
+        source = iter_ndjson(self.commit_file)
+        if self.verbose:
+            source = tqdm(source, desc=desc)
+
+        for commit in source:
+            # With a repo filter, keep a commit when a filter entry containing "/" equals
+            # the commit's repo, or when the repo name ends with "/<entry>".
+            if self.repos and not any(
+                    ("/" in wanted and commit["repo"] == wanted)
+                    or commit["repo"].endswith("/" + wanted)
+                    for wanted in self.repos
+            ):
+                continue
+
+            # skip commits whose author name contains "[bot]"
+            if "[bot]" in commit["author"]:
+                continue
+
+            raw_message = commit["message"]
+            normalized = self.normalize_text(raw_message)
+            if self.skip_message(normalized):
+                continue
+
+            yield commit, raw_message, normalized
+
+    def render_bag(self) -> BagOfWords:
+        """
+        Build one bag-of-words from the normalized text of every commit
+        yielded by iter_commits().
+
+        :return: the accumulated BagOfWords
+        """
+        big_bag = BagOfWords()
+        num_bytes_written = 0  # nothing is written here; only kept for the stats line
+        # Pre-bound so dump_stats() cannot hit an unbound name when no commit was yielded.
+        commit = None
+
+        def dump_stats():
+            print()
+            print(
+                f"\ndate:      {commit['date'] if commit else '-'}"
+                f"\ntokens:    {big_bag.size():,} / {big_bag.count():,}"
+                f"\nwritten:   {num_bytes_written:,}"
+                "\n"
+            )
+
+        for i, (commit, message, message_n) in enumerate(self.iter_commits(desc="creating big bag-of-words")):
+            big_bag += message_n
+            if self.verbose and i % 300_000 == 0:
+                dump_stats()
+
+        if self.verbose:
+            dump_stats()
+        return big_bag
+
+    def scan_commits(self):
+        try:
+            self._scan_commits_topic()
+        except KeyboardInterrupt:
+            pass  # Ctrl-C only stops the scan early; the bags are still saved below
+        # write whatever has been collected so far
+        self._save_topic_bags()
+
+    def _save_topic_bags(self):
+        path = Path(__file__).resolve().parent.parent / "export" / "topic-bags"  # "export/topic-bags" beside this file's directory
+        os.makedirs(path, exist_ok=True)
+        for topic, bag in self.topic_bags.items():
+            bag["bag"].save_json(path / f"{topic}.json")  # one json file per topic_bags key
+
+    def _scan_commits_topic(self):
+        """
+        Fill self.topic_bags from the commits yielded by iter_commits().
+        Repo mode adds each commit's bag to every matching repo entry. Topic mode
+        counts each word of a message once for every topic that has a token in the
+        message; "_r" bags only receive the words returned by get_subset().
+        """
+        # Pre-bound so dump_stats() cannot hit an unbound name when no commit was yielded.
+        commit = None
+
+        def dump_stats():
+            msg = f"\n\ndate:      {commit['date'] if commit else '-'}\n"
+            for token, bag in self.topic_bags.items():
+                if not token.endswith("_r") or self.repos:
+                    msg += f"{token+':':20}"
+                msg += f" {bag['bag'].size():10,d} / {bag['bag'].count():10,d}"
+                if token.endswith("_r") or self.repos:
+                    msg += "\n"
+            print(msg)
+
+        last_print_time = time.time()
+        last_save_time = time.time()
+        for commit, message, message_n in self.iter_commits(desc="building topic bags"):
+            bag = BagOfWords(message_n)
+            subset = None  # computed lazily, at most once per message
+            for topic, topic_bag in self.topic_bags.items():
+                if self.repos:
+                    if (
+                            ("/" in topic_bag["repo"] and commit["repo"] == topic_bag["repo"])
+                            or (commit["repo"].endswith("/" + topic_bag["repo"]))
+                    ):
+                        topic_bag["bag"] += bag
+                    continue
+                if not any(t in bag.bag for t in topic_bag["tokens"]):
+                    continue
+
+                if topic.endswith("_r"):
+                    if subset is None:
+                        subset = bag.get_subset(
+                            self.bag_norm,
+                            max_freq=0.03,
+                            min_freq_mult=20,
+                        )
+                    words = subset
+                else:
+                    words = bag.bag
+                for key in words:
+                    topic_bag["bag"].add_word(key)
+
+            cur_time = time.time()
+            if self.verbose and cur_time - last_print_time > 10:
+                last_print_time = cur_time
+                dump_stats()
+            if cur_time - last_save_time > 60:
+                last_save_time = cur_time
+                self._save_topic_bags()
+        if self.verbose:
+            dump_stats()
+
+
+if __name__ == "__main__":
+    Main(**parse_args())
diff --git a/bin/export_messages.py b/bin/export_messages.py
index 2d71ff5..ccdae54 100644
--- a/bin/export_messages.py
+++ b/bin/export_messages.py
@@ -12,8 +12,6 @@
 sys.path.append(str(ROOT_DIR))
 
 from src.gharchive import GHArchive
-from src.good_messages import GoodMessages
-from src.update_index import update_index, get_message_files
 
 
 def parse_args() -> dict:
diff --git a/bin/scan_messages.py b/bin/scan_messages.py
new file mode 100644
index 0000000..cd71faa
--- /dev/null
+++ b/bin/scan_messages.py
@@ -0,0 +1,164 @@
+import argparse
+import datetime
+import json
+import time
+import glob
+from pathlib import Path
+import os
+import sys
+import re
+import hashlib
+from typing import List, Tuple, Generator, Optional
+
+
+from tqdm import tqdm
+
+ROOT_DIR = Path(__file__).resolve().parent.parent
+sys.path.append(str(ROOT_DIR))
+
+from src.bagofwords import BagOfWords
+from src.file_iter import iter_ndjson
+from bin.bag_messages import Main as BagMain, append_filename
+
+
+def parse_args() -> dict:
+    """Parse the command line and return the options as a plain dict."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "commit_file", type=str,
+        help="ndjson of exported commit messages"
+    )
+    # nargs="+" keeps the value a list; without it "--bags DIR" produced a plain
+    # string that Main then iterated character by character.
+    parser.add_argument(
+        "--bags", type=str, nargs="+", default=["export/topic-bags-5"],
+        help="paths to topic bag-of-words json files"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true",
+        help="Display stuff during run"
+    )
+
+    return vars(parser.parse_args())
+
+
+class Main:
+    def __init__(
+            self,
+            commit_file: str,
+            bags: List[str],
+            verbose: bool,
+    ):
+        self.verbose = verbose
+        self.commit_file = Path(commit_file)
+        self.bag_file = append_filename(self.commit_file, "-big-bag.json")
+        # The big bag is loaded unconditionally; there is no fallback here if the file is missing.
+        self.bigbag = BagOfWords.load_json(self.bag_file)
+        self.bigbag_n = self.bigbag.normalized()
+        self.topic_sets = dict()
+        self.message_hashes = set()  # md5 digests of messages already seen by iter_commits()
+        topic_bags = dict()
+        # Load every "*.json" bag found in the given directories, skipping names that end with "_r".
+        for path in bags:
+            for file in sorted(glob.glob(str(Path(path) / "*.json"))):
+                name = Path(file).name.split(".")[0]
+                if name.endswith("_r"):
+                    continue
+                if self.verbose:
+                    print("loading", file)
+                bag = BagOfWords.load_json(file)
+                #bag = bag.normalized()
+                #bag.subtract(self.bigbag_n, 4)
+                topic_bags[name] = bag
+
+        if not topic_bags:
+            raise ValueError("No topic-bags found")
+        # NOTE(review): the "to" entries look like a size reference for scaling the other bag - confirm what BagOfWords returns for a missing word
+        orig_size = dict()
+        self.topic_bags = dict()
+        for name1, bag1 in topic_bags.items():
+            orig_size[name1] = bag1.size()
+            bag = bag1.copy()
+            for name2, bag2 in topic_bags.items():
+                if name1 != name2:
+                    bag.subtract(bag2, bag1["to"] / bag2["to"] / len(topic_bags))
+
+            self.topic_bags[name1] = bag
+        # A word belongs to a topic's set when its remaining value exceeds 5.
+        for name, bag in self.topic_bags.items():
+            self.topic_sets[name] = set(key for key, value in bag.items() if value > 5)
+            if self.verbose:
+                print(
+                    f"{name:25}: {bag.size():8} / {orig_size[name]:8}",
+                    ", ".join(list(bag.sort().bag.keys())[:10])
+                )
+        #self.topic_bags["politics"].dump()
+        input()  # waits for a line on stdin before the scan starts
+        self.run()
+
+    def run(self):
+        for i, (commit, message, message_n) in enumerate(self.iter_commits(desc="scanning messages", min_length=1000)):
+            bag = BagOfWords(message_n)
+            if bag.size() < 30:
+                continue
+            # rows: [topic, overlap count, overlap count / bag size, overlapping words]
+            rows = []
+            bag_set = set(bag.bag)
+            for topic, topic_set in self.topic_sets.items():
+                intersection = topic_set & bag_set
+                intersection_length = len(intersection)
+                if intersection_length > 1:
+                    rows.append([topic, intersection_length, intersection_length / bag.size(), intersection])
+            rows.sort(key=lambda r: -r[1])  # largest overlap first
+            # Print the message when its top topic is enabled below and its overlap ratio is at least 0.3.
+            if rows and rows[0][0] in (
+                    #"fuck",
+                    #"personal",
+                    #"loving",
+                    #"code", "fixed", "curse", "segfault", "oops",
+                    #"emotion",
+                    #"corona",
+                    "music",
+                    #"amazing",
+                    #"facebook",
+                    #"repo_spiral",
+                    #"curse"
+                    #"politics",
+            ) and rows[0][2] >= 0.3:
+                print(f"\n\n{commit['date']} {commit['repo']}/commit/{commit['sha']} {commit['author']}\n")
+                print(message_n)
+                print(f"\n{'message:':20} {bag.size():6,} / 1.0")
+                for topic, size, ratio, intersection in rows:
+                    print(
+                        f"{topic + ':':20} {size:6,} / {str(round(ratio, 3)):7}"
+                        f" / {', '.join(sorted(intersection))}"
+                    )
+
+    def iter_commits(self, desc: str, min_length: Optional[int] = None) -> Generator[Tuple[dict, str, str], None, None]:
+        """Yield (commit, stripped message, normalized message), skipping bots, short and repeated messages."""
+        source = iter_ndjson(self.commit_file)
+        if self.verbose:
+            source = tqdm(source, desc=desc)
+
+        for commit in source:
+            if "[bot]" in commit["author"]:
+                continue
+
+            stripped = commit["message"].strip()
+            if min_length and len(stripped) < min_length:
+                continue
+
+            # an md5 digest identifies messages that were already seen
+            digest = hashlib.md5(stripped.encode("utf-8")).hexdigest()
+            if digest in self.message_hashes:
+                continue
+            self.message_hashes.add(digest)
+
+            normalized = BagMain.normalize_text(stripped)
+            if BagMain.skip_message(normalized):
+                continue
+            yield commit, stripped, normalized
+
+
+if __name__ == "__main__":
+    Main(**parse_args())
diff --git a/src/bagofwords.py b/src/bagofwords.py
index 81ceddf..7ec3491 100644
--- a/src/bagofwords.py
+++ b/src/bagofwords.py
@@ -41,10 +41,13 @@ def add_token(token):
 class BagOfWords:
 
     def __init__(self, data: Optional[WordBagArgument] = None):
-        self.bag = dict()
         self.is_normalized = False
-        if data:
-            self += data
+        if isinstance(data, dict):
+            self.bag = data
+        else:
+            self.bag = dict()
+            if data:
+                self += data
 
     def __copy__(self) -> "BagOfWords":
         bag = BagOfWords()
@@ -125,10 +128,10 @@ def normalized(self, copy: bool = False) -> "BagOfWords":
         if self.is_normalized:
             return self.__copy__() if copy else self
 
-        bag = BagOfWords()
-        count = self.count() or 1
+        bag = self.copy()
+        factor = 1 / (self.count() or 1)
         bag.bag = {
-            key: value / count
+            key: value * factor
             for key, value in self.bag.items()
         }
         bag.is_normalized = True
@@ -157,70 +160,78 @@ def limited(self, min_count: Optional[int] = None, max_count: Optional[int] = No
                     bag.bag[key] = value
         return bag
 
-    def sort(self):
+    def sort(self) -> "BagOfWords":
         self.bag = {
             key: self.bag[key]
             for key in sorted(sorted(self.bag), key=lambda k: -self.bag[k])
         }
+        return self
 
     def add_word(self, word: str, count: int = 1):
         self.bag[word] = self.bag.get(word, 0) + count
 
-    def subtract(self, other: WordBagArgument, amount: Optional[Union[str, int, float]] = None) -> "BagOfWords":
+    def subtract(self, other: WordBagArgument, amount: Optional[Number] = None) -> "BagOfWords":
         """
         Subtract value of other
         :param other: text, tokens, dict or BagOfWords
-        :param amount: None to leave values untouched, number to multiply,
-            "all" to remove all keys that are in 'other'
+        :param amount: optional number to multiply other's values
         :return: self
         """
         self.is_normalized = False
         other_dict = self._as_dict(other)
 
         if self.size() > len(other_dict):
-            for key, value in other_dict.items():
-                if key not in self.bag:
-                    continue
+            if amount is None:
+                for key, value in other_dict.items():
+                    if key not in self.bag:
+                        continue
 
-                if amount == "all":
-                    value = -1
-                elif amount is None:
                     value = self.bag[key] - value
-                else:
+                    if value <= 0:
+                        del self.bag[key]
+                    else:
+                        self.bag[key] = value
+            else:
+                for key, value in other_dict.items():
+                    if key not in self.bag:
+                        continue
+
                     value = self.bag[key] - amount * value
+                    if value <= 0:
+                        del self.bag[key]
+                    else:
+                        self.bag[key] = value
+        else:
+            new_dict = dict()
+            self._subtract_dict(new_dict, other_dict, amount)
+            self.bag = new_dict
+        return self
+
+    def subtracted(self, other: WordBagArgument, amount: Optional[Number] = None) -> "BagOfWords":
+        other_dict = self._as_dict(other)
+        bag = BagOfWords()
+        self._subtract_dict(bag.bag, other_dict, amount)
+        return bag
 
-                if value <= 0:
-                    del self.bag[key]
+    def _subtract_dict(self, new_bag: dict, other: dict, amount: Optional[Number]):
+        if amount is None:
+            for key, value in self.items():
+                if key not in other:
+                    new_bag[key] = value
                 else:
-                    self.bag[key] = value
+                    value -= other[key]
+
+                    if value > 0:
+                        new_bag[key] = value
         else:
-            has_zeros = False
             for key, value in self.items():
-                if key in other_dict:
-
-                    if amount == "all":
-                        value = -1
-                    elif amount is None:
-                        value = value - other_dict[key]
-                    else:
-                        value = value - amount * other_dict[key]
-
-                    self.bag[key] = value
-                    if value <= 0:
-                        has_zeros = True
-
-            if has_zeros:
-                self.bag = {
-                    key: value
-                    for key, value in self.items()
-                    if value > 0
-                }
-        return self
+                if key not in other:
+                    new_bag[key] = value
+                else:
+                    value -= other[key] * amount
 
-    def subtracted(self, other: WordBagArgument, amount: Optional[float] = None) -> "BagOfWords":
-        new_bag = self.__copy__()
-        new_bag.subtract(other, amount=amount)
-        return new_bag
+                    if value > 0:
+                        new_bag[key] = value
 
     def union(self, other: WordBagArgument):
         bag = self.__copy__()
diff --git a/src/tests/test_bags.py b/src/tests/test_bags.py
index cbe70e7..8ce6b0f 100644
--- a/src/tests/test_bags.py
+++ b/src/tests/test_bags.py
@@ -47,35 +47,77 @@ def test_bag_subtract(self):
 
         self.assertEqual(
             {"b": 2, "c": 2, "d": 4},
-            bag.subtracted({"a": 1, "c": 1}).bag
+            (bag - {"a": 1, "c": 1}).bag
         )
         self.assertEqual(
             {"c": 2, "d": 2},
-            bag.subtracted({"a": 1, "b": 5, "c": 1, "d": 2, "e": 1}).bag
+            (bag - {"a": 1, "b": 5, "c": 1, "d": 2, "e": 1}).bag
         )
 
-    def test_speed_subtract(self):
+    def test_bag_isubtract(self):
+        bag = BagOfWords({"a": 1, "b": 2, "c": 3, "d": 4})
+        bag -= {"a": 1, "c": 3, "d": 1}
+        self.assertEqual(
+            {"b": 2, "d": 3},
+            bag.bag
+        )
+
+        bag = BagOfWords({"a": 1, "b": 2, "c": 3, "d": 4})
+        bag -= {"a": 1, "b": 5, "c": 1, "d": 2, "e": 1}
+        self.assertEqual(
+            {"c": 2, "d": 2},
+            bag.bag
+        )
+
+    def test_speed(self):
         iterations = 1000
-        print()
-        for bag_size in [10, 100, 10_000]:
-            bag1 = BagOfWords({str(i): i for i in range(bag_size)})
-            bag2 = bag1.copy()
-            bag1["extra"] *= 1
 
-            start_time = time.time()
+        def func_normalized(bag1, bag2):
             for i in range(iterations):
                 bag1.normalized()
-            fps = iterations / (time.time() - start_time)
-            print(f"normalized  with bag size {bag_size:7} @ {fps:12,.0f} fps")
 
-            start_time = time.time()
+        def func_big_minus_small(bag1, bag2):
             for i in range(iterations):
                 bag1 - bag2
-            fps = iterations / (time.time() - start_time)
-            print(f"big - small with bag size {bag_size:7} @ {fps:12,.0f} fps")
 
-            start_time = time.time()
+        def func_small_minus_big(bag1, bag2):
             for i in range(iterations):
                 bag2 - bag1
-            fps = iterations / (time.time() - start_time)
-            print(f"small - big with bag size {bag_size:7} @ {fps:12,.0f} fps")
+
+        def func_big_minus_small_inplace(bag1, bag2):
+            for i in range(iterations):
+                bag1 -= bag2
+
+        def func_small_minus_big_inplace(bag1, bag2):
+            for i in range(iterations):
+                bag2 -= bag1
+
+        def func_big_union_small(bag1, bag2):
+            for i in range(iterations):
+                bag1.union(bag2)
+
+        def func_small_union_big(bag1, bag2):
+            for i in range(iterations):
+                bag2.union(bag1)
+
+        functions = (
+            func_normalized,
+            func_big_minus_small,
+            func_small_minus_big,
+            func_big_minus_small_inplace,
+            func_small_minus_big_inplace,
+            func_big_union_small,
+            func_small_union_big,
+        )
+
+        print()
+        for bag_size in [10, 100, 10_000]:
+            for func in functions:
+
+                bag1 = BagOfWords({str(i): i for i in range(bag_size*2)})
+                bag2 = BagOfWords({str(i): i for i in range(bag_size)})
+
+                start_time = time.time()
+                func(bag1, bag2)
+                fps = iterations / (time.time() - start_time)
+                print(f"{func.__name__:30}: bag size {bag_size:7} @ {fps:12,.0f} fps")
diff --git a/src/words.py b/src/words.py
index ee1e54d..0f1ba70 100644
--- a/src/words.py
+++ b/src/words.py
@@ -18,73 +18,88 @@
         " i m ", " i am ", " i ve ", " i have", " i d ", " i had ",
         " i ll ", " i will ", " i won't ", " i wont ",
 
-        " today",
+        " today ", " fairly ",
+    ],
+    .3: [
+        " wrong ", " extremely ", " hope ", " why ",
+        " enough ", " free ", " good ", " wouldn t ",
     ],
     1: [
-        " i want ",
-        " my ", " myself", " he ", " she ", " his ", " her ", " you ", " you d ", " you ll ",
-        " we ", " we ll ", " we d ",
+        " my ", " myself", " he ", " she ", " his ", " her ",
+        " you ", " you d ", " you ll ", " you re ",
+        " we ", " we ll ", " we d ", " we re ",
         " we should ", " i should", " he should", " she should",
-        " think", " thinking", " feeling", " beautiful", " happy", " sad ",
+        " think", " thinking", " feeling", " opinion ", " beautiful",
+        " happy", " sad ", " sadly ",
         "paradox", " worry", " wise", " sane ", " weird ", " wierd "  # !sic
         " days ", " hours ", " yesterday", " tomorrow", " year",
+        " summer", " winter", " autumn",
         "music", " personal", " cool ", " interesting",
         " offend", " frankly", " fortunate", " however ",
         "didn't help", "thanks", "thank you", "anyway", " enjoy ",
         re.compile(" a+h+ "), re.compile(" o+h+ "), re.compile(" oo+ps "),
         " do the tricks ", " garbage", " depressing ",
-        " love", "love ", " appease ",
+        " love", "love ", " appease ", " suspicion ", " deeply ",
+        " buttload ", " dunno ", " quirk ",
+        " literally ", " giant ", " illusion ", " brute ",
+        " turf ", " turfs ", " outright ", " repetitious ", " adware ",
+        " bugged ", " heart ", " sleep ", " poor ", " juicy ", " facts ",
     ],
     2: [
         re.compile(" a+rr+g+h* "), re.compile(" mm+h+ "),
         re.compile(" ba+h+ "),
         re.compile(" ha\s*ha "),re.compile(" ha\s*ha\s*ha "),
         re.compile(" har\s*har "),
-        " my brain",
+        " i want ", " my brain", " shot ",
         "pandemic", " war ", "science", "scientist",
         " hack", " hacks", " wild ", " claw", " crazy ", " beast ",
-        "drinking", " exhaustive", " headache",
+        "drinking", " exhaustive", " headache", " shocked ", " puzzled ",
         " poorly", " unreadable", " theory", " theorize",
         " diary", " wonder ", " depression ", " revenge ",
-        " intelligent machine", "psychedelic", " deeply ",
+        " intelligent machine", "psychedelic", " exciting ", " angel ",
+        " quirky ", " mistake ", " fluctuation", " twinkle ", " twinkling ", " hookin ", #! sic
+        " enjoy ", " artists ",
     ],
     3: [
         " yeah", re.compile(" yah+ "),
         " my opinion", " not that", "kinda ", "to be honest", "honestly",
         " doesn't make sense", " confess", " remember",
         " funny", " hilarious", " ridiculous", " amazing", " wonky",
-        " stupid", " awful", " silly", " ugly", " clunk", " creep",
+        " stupid", " awful", " silly", " ugly", " ugliness ", " clunk", " creep",
         " shut up", " suck", " sucking", " sick ", " screw you", " idiot", " idiotic",
         " sadness", " emotion", " pain", " miracle", " despair", " despar",  # !sic
         " insane", " insanit", " rage ", " die ", " lynch",
+        " commercialized ", " ideology ", " suicid", " revolution ",
+        " joke ", " joking ", " jokes ", " laugh", " lousy ", " guru",
 
         " lovely", " hate ", " hateful", " hating", " curse", "damn", " joy ",
         " frustrati", "humbug", " gosh ", " blood", " annoy", " trouble",
         " evil", " god ", " devil ", " god s ", "praise", " holy ", "church", " verse ", " faith ",
-        re.compile(" omg+ "),
+        re.compile(" omg+ "), " salvation ", " terror",
         " bless", "ascetic", " spirit", "demonic", " demon ",
         " heaven", " hell ", " hellish", "fantasy", " mystery", " magic",
         " moral", " immoral", " boy", " girl", "friend", "acquaintance",
-        " shame", "thoughts", "sorry",
+        " shame", "thoughts", " sorry",
         " female", " male ", " cock", " ass ", " arse ", "bitch",
-
-        " life ", " dream", " grief", " tears",
+        " smoker ", " violent",
+        " life ", " dream", " grief", " tears", " angels ",
         "paranoia", "paranoid", " society", " social", " forgive", " forgiving",
 
-        " cheat", "singularity", " hot potato",
+        " cheat", "singularity", " hot potato", " crisis ", " espionage ",
         " breakfast", " dinner", " lunch", " morning", " evening", "night ",
-
+        " my parents",  # comma required: without it this literal fuses with " sister" on the next line
         " sister", " brother", " mama ", " papa ",
         " stunned", " stunning", " horribl",
         " humiliate", " inspiration", "experience", " darkness", " misery", " suffering",
         " struggling", " obsess", "ignorant",
-        " please understand",
+        " please understand", " armageddon",
     ],
     4: [
-        "fuck", " shit", "bullshit", " sin ", "facepalm",
-        "cthulhu",
+        "fuck", " shit", "bullshit", " sinner ", "facepalm",
+        "cthulhu", " junkie",
         re.compile("notes? to .*self"), " hate myself",
-        "dear diary",
+        "dear diary", " tinfoil ",
+        " should be sleep", " my parents", " why me ",
     ],
     10: [
         " hate my life", "fuck you",