# Patterns that are blanked out of a commit message before it is tokenized:
# every match is substituted by a single space.
RE_REPLACE_WHITESPACE = (
    # urls
    re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&#+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
    # emails
    # The pattern must NOT be anchored with ^...$: anchored, it only matches
    # when the whole message is a single address, so addresses embedded in
    # running text are never removed.  The negative lookbehind makes a match
    # start only at the beginning of a run of local-part characters, which
    # keeps scanning of long '@'-less tokens linear instead of quadratic.
    re.compile(r"(?<![a-zA-Z0-9_.+-])([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"),
    # long hex numbers like commit refs
    re.compile(r"[a-f0-9]{30,1000}"),
)
"projectile", + "damage", "minecraft", "bukkit", "spigot", "multiplayer", + "explosive", "hostile", "gameloop", "grenade", + ), + ( + "emotion", "sadness", "pain", "depression", "depressed", + "despair", "desperation", "loving", "frustrating", "frustration", + "paranoia", "forgiving", "suffering", + ), + ( + "fun", "funny", "hilarious", "ridiculous", "joke", "joking", + "laugh", "laughing", "kidding", + ), + ( + "music", "musical", "song", "melody", "" + ), + ( + "code", "function", "refactor", "kernel", "status", "thread", "settings", + "threads", "maintenance", "synchronisation", "revision", + "function", "functions", "variable", "variables", "upstream", "implementation", + "driver", "struct", "cleanup", "issues", "fixes", "process", "processing", "buffer", "module", + "command", "cpu", "arm64", "documentation", "reference", "align", "pointer", + "architecture", "server", "hardware", "compiler", "directory", "register", + "integration", "deployment", "inline", "coverage", "recursive", "dependencies", + "compilation", "bytes", "bits", "generator", "functional", "compatibility", + "extension", "instructions", "operation", "management", "interrupt", "macro", + "syntax", "initialization", "install", "overflow", "merged", "endpoint", "pipeline", + ), + ( + "blog", "article" + ), + ("fixed", "fixes", "fixing"), + "microsoft", "apple", "amazon", "facebook", + "segfault", + ("css", "scss", "html", "http", "javascript"), + ("amazing", "superb", "stunning", "excellent", "inspiring", "superb"), + ( + "curse", "cursed", "damn", "awful", "hate", "hated", "disgusting", "bloody", + "silly", "silliness", "stupid", "stupidity", "terrible", + "shit", "shitty", "bullshit", "suck", "sucks", "sucking", "sucker" + "fuck", "fucking", "fucked", "fuckoff", "fuckyou", + ), + ( + "politics", "government", "governments", "governmental", "politics", "political" + ), + ( + "oops", "ah", "aah", "oh", "ohh", "woah" + ), + ( + "corona", "pandemic", + ), + ( + "tanh", "tan", "sin", "asin", 
def append_filename(filename: Path, suffix: str) -> Path:
    """
    Build a sibling path by swapping the file extension for `suffix`.

    A trailing ".gz" (any letter case) is dropped first, then the last
    remaining extension is removed and `suffix` is appended to what is left,
    e.g. ``commits.ndjson.gz`` + ``-big-bag.json`` -> ``commits-big-bag.json``.

    :param filename: path of the source file
    :param suffix: text appended to the extension-less file name
    :return: path in the same directory as `filename`
    """
    name = filename.name
    if name.lower().endswith(".gz"):
        name = name[:-3]

    # Everything before the last dot. This is empty when the name has no
    # dot at all (or only a leading one); in that case keep the whole name,
    # otherwise every extension-less input would map to the bare suffix.
    stem = name.rpartition(".")[0]
    if not stem:
        stem = name

    return filename.parent / (stem + suffix)
@classmethod
def skip_message(cls, text: str) -> bool:
    """
    Decide whether a message should be left out.

    :param text: the message text to inspect
    :return: True for an empty text, or when one of the characters
        ``/ ( [ . >`` occurs on average at least once per N characters
        (N being the number paired with it below); False otherwise
    """
    if not text:
        return True

    length = len(text)
    # (character, N): the text is rejected once the character's share of
    # the text reaches 1/N
    density_limits = (
        # exclude file listings
        ("/", 40),
        # exclude other weird stuff
        ("(", 60),
        ("[", 60),
        (".", 40),
        (">", 60),
    )
    return any(
        text.count(symbol) / length >= 1. / spacing
        for symbol, spacing in density_limits
    )
token.endswith("_r") or self.repos: + msg += "\n" + print(msg) + + last_print_time = time.time() + last_save_time = time.time() + for i, (commit, message, message_n) in enumerate(self.iter_commits(desc="building topic bags")): + + bag = BagOfWords(message_n) + subset = None + for topic, topic_bag in self.topic_bags.items(): + if self.repos: + if ( + ("/" in topic_bag["repo"] and commit["repo"] == topic_bag["repo"]) + or (commit["repo"].endswith("/" + topic_bag["repo"])) + ): + topic_bag["bag"] += bag + + else: + found = False + for t in topic_bag["tokens"]: + if t in bag.bag: + found = True + break + if not found: + continue + + if topic.endswith("_r"): + if subset is None: + subset = bag.get_subset( + self.bag_norm, + max_freq=0.03, + min_freq_mult=20, + ) + for key in subset: + topic_bag["bag"].add_word(key) + else: + for key in bag.bag: + topic_bag["bag"].add_word(key) + + cur_time = time.time() + if self.verbose and cur_time - last_print_time > 10: + last_print_time = cur_time + dump_stats() + + if cur_time - last_save_time > 60: + last_save_time = cur_time + self._save_topic_bags() + + if self.verbose: + dump_stats() + + +if __name__ == "__main__": + Main(**parse_args()) diff --git a/bin/export_messages.py b/bin/export_messages.py index 2d71ff5..ccdae54 100644 --- a/bin/export_messages.py +++ b/bin/export_messages.py @@ -12,8 +12,6 @@ sys.path.append(str(ROOT_DIR)) from src.gharchive import GHArchive -from src.good_messages import GoodMessages -from src.update_index import update_index, get_message_files def parse_args() -> dict: diff --git a/bin/scan_messages.py b/bin/scan_messages.py new file mode 100644 index 0000000..cd71faa --- /dev/null +++ b/bin/scan_messages.py @@ -0,0 +1,164 @@ +import argparse +import datetime +import json +import time +import glob +from pathlib import Path +import os +import sys +import re +import hashlib +from typing import List, Tuple, Generator, Optional + + +from tqdm import tqdm + +ROOT_DIR = 
def parse_args() -> dict:
    """
    Parse the command line of the message scanner.

    :return: dict with the keys
        ``commit_file`` (str), ``bags`` (list of str) and ``verbose`` (bool)
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "commit_file", type=str,
        help="ndjson of exported commit messages"
    )
    parser.add_argument(
        # nargs="+" so a user-supplied value is a list like the default;
        # without it a single str is stored and iterating over it yields
        # single characters instead of paths
        "--bags", type=str, nargs="+", default=["export/topic-bags-5"],
        help="paths to topic bag-of-words json files"
    )
    parser.add_argument(
        # a plain flag: the former type=bool/nargs="?" form consumed a
        # following positional argument as the flag's value and turned any
        # non-empty value (even "False") into True
        "-v", "--verbose", action="store_true",
        help="Display stuff during run"
    )

    return vars(parser.parse_args())
def iter_commits(self, desc: str, min_length: Optional[int] = None) -> Generator[Tuple[dict, str, str], None, None]:
    """
    Yield the commits of `self.commit_file` that are worth scanning.

    A commit is dropped when its author contains "[bot]", when its stripped
    message is shorter than `min_length`, when the same stripped message
    was already seen (tracked through `self.message_hashes`), or when
    `BagMain.skip_message` rejects the normalized message.

    :param desc: progress-bar caption, only used if `self.verbose` is set
    :param min_length: optional minimum length of the stripped message
    :return: generator of (commit, stripped message, normalized message)
    """
    commits = iter_ndjson(self.commit_file)
    if self.verbose:
        commits = tqdm(commits, desc=desc)

    for commit in commits:
        if "[bot]" in commit["author"]:
            continue

        message = commit["message"].strip()
        if min_length and len(message) < min_length:
            continue

        # the digest is registered before the skip check below, so a
        # rejected message also counts as "seen"
        digest = hashlib.md5(message.encode("utf-8")).hexdigest()
        if digest in self.message_hashes:
            continue
        self.message_hashes.add(digest)

        message_n = BagMain.normalize_text(message)
        if not BagMain.skip_message(message_n):
            yield commit, message, message_n
b/src/bagofwords.py index 81ceddf..7ec3491 100644 --- a/src/bagofwords.py +++ b/src/bagofwords.py @@ -41,10 +41,13 @@ def add_token(token): class BagOfWords: def __init__(self, data: Optional[WordBagArgument] = None): - self.bag = dict() self.is_normalized = False - if data: - self += data + if isinstance(data, dict): + self.bag = data + else: + self.bag = dict() + if data: + self += data def __copy__(self) -> "BagOfWords": bag = BagOfWords() @@ -125,10 +128,10 @@ def normalized(self, copy: bool = False) -> "BagOfWords": if self.is_normalized: return self.__copy__() if copy else self - bag = BagOfWords() - count = self.count() or 1 + bag = self.copy() + factor = 1 / (self.count() or 1) bag.bag = { - key: value / count + key: value * factor for key, value in self.bag.items() } bag.is_normalized = True @@ -157,70 +160,78 @@ def limited(self, min_count: Optional[int] = None, max_count: Optional[int] = No bag.bag[key] = value return bag - def sort(self): + def sort(self) -> "BagOfWords": self.bag = { key: self.bag[key] for key in sorted(sorted(self.bag), key=lambda k: -self.bag[k]) } + return self def add_word(self, word: str, count: int = 1): self.bag[word] = self.bag.get(word, 0) + count - def subtract(self, other: WordBagArgument, amount: Optional[Union[str, int, float]] = None) -> "BagOfWords": + def subtract(self, other: WordBagArgument, amount: Optional[Number] = None) -> "BagOfWords": """ Subtract value of other :param other: text, tokens, dict or BagOfWords - :param amount: None to leave values untouched, number to multiply, - "all" to remove all keys that are in 'other' + :param amount: optional number to multiply other's values :return: self """ self.is_normalized = False other_dict = self._as_dict(other) if self.size() > len(other_dict): - for key, value in other_dict.items(): - if key not in self.bag: - continue + if amount is None: + for key, value in other_dict.items(): + if key not in self.bag: + continue - if amount == "all": - value = -1 - elif amount 
is None: value = self.bag[key] - value - else: + if value <= 0: + del self.bag[key] + else: + self.bag[key] = value + else: + for key, value in other_dict.items(): + if key not in self.bag: + continue + value = self.bag[key] - amount * value + if value <= 0: + del self.bag[key] + else: + self.bag[key] = value + else: + new_dict = dict() + self._subtract_dict(new_dict, other_dict, amount) + self.bag = new_dict + return self + + def subtracted(self, other: WordBagArgument, amount: Optional[Number] = None) -> "BagOfWords": + other_dict = self._as_dict(other) + bag = BagOfWords() + self._subtract_dict(bag.bag, other_dict, amount) + return bag - if value <= 0: - del self.bag[key] + def _subtract_dict(self, new_bag: dict, other: dict, amount: Optional[Number]): + if amount is None: + for key, value in self.items(): + if key not in other: + new_bag[key] = value else: - self.bag[key] = value + value -= other[key] + + if value > 0: + new_bag[key] = value else: - has_zeros = False for key, value in self.items(): - if key in other_dict: - - if amount == "all": - value = -1 - elif amount is None: - value = value - other_dict[key] - else: - value = value - amount * other_dict[key] - - self.bag[key] = value - if value <= 0: - has_zeros = True - - if has_zeros: - self.bag = { - key: value - for key, value in self.items() - if value > 0 - } - return self + if key not in other: + new_bag[key] = value + else: + value -= other[key] * amount - def subtracted(self, other: WordBagArgument, amount: Optional[float] = None) -> "BagOfWords": - new_bag = self.__copy__() - new_bag.subtract(other, amount=amount) - return new_bag + if value > 0: + new_bag[key] = value def union(self, other: WordBagArgument): bag = self.__copy__() diff --git a/src/tests/test_bags.py b/src/tests/test_bags.py index cbe70e7..8ce6b0f 100644 --- a/src/tests/test_bags.py +++ b/src/tests/test_bags.py @@ -47,35 +47,77 @@ def test_bag_subtract(self): self.assertEqual( {"b": 2, "c": 2, "d": 4}, - bag.subtracted({"a": 1, 
"c": 1}).bag + (bag - {"a": 1, "c": 1}).bag ) self.assertEqual( {"c": 2, "d": 2}, - bag.subtracted({"a": 1, "b": 5, "c": 1, "d": 2, "e": 1}).bag + (bag - {"a": 1, "b": 5, "c": 1, "d": 2, "e": 1}).bag ) - def test_speed_subtract(self): + def test_bag_isubtract(self): + bag = BagOfWords({"a": 1, "b": 2, "c": 3, "d": 4}) + bag -= {"a": 1, "c": 3, "d": 1} + self.assertEqual( + {"b": 2, "d": 3}, + bag.bag + ) + + bag = BagOfWords({"a": 1, "b": 2, "c": 3, "d": 4}) + bag -= {"a": 1, "b": 5, "c": 1, "d": 2, "e": 1} + self.assertEqual( + {"c": 2, "d": 2}, + bag.bag + ) + + def test_speed(self): iterations = 1000 - print() - for bag_size in [10, 100, 10_000]: - bag1 = BagOfWords({str(i): i for i in range(bag_size)}) - bag2 = bag1.copy() - bag1["extra"] *= 1 - start_time = time.time() + def func_normalized(bag1, bag2): for i in range(iterations): bag1.normalized() - fps = iterations / (time.time() - start_time) - print(f"normalized with bag size {bag_size:7} @ {fps:12,.0f} fps") - start_time = time.time() + def func_big_minus_small(bag1, bag2): for i in range(iterations): bag1 - bag2 - fps = iterations / (time.time() - start_time) - print(f"big - small with bag size {bag_size:7} @ {fps:12,.0f} fps") - start_time = time.time() + def func_small_minus_big(bag1, bag2): for i in range(iterations): bag2 - bag1 - fps = iterations / (time.time() - start_time) - print(f"small - big with bag size {bag_size:7} @ {fps:12,.0f} fps") + + def func_big_minus_small_inplace(bag1, bag2): + for i in range(iterations): + bag1 -= bag2 + + def func_small_minus_big_inplace(bag1, bag2): + for i in range(iterations): + bag2 -= bag1 + + def func_big_union_small(bag1, bag2): + for i in range(iterations): + bag1.union(bag2) + + def func_small_union_big(bag1, bag2): + for i in range(iterations): + bag2.union(bag1) + + functions = ( + func_normalized, + func_big_minus_small, + func_small_minus_big, + func_big_minus_small_inplace, + func_small_minus_big_inplace, + func_big_union_small, + 
func_small_union_big, + ) + + print() + for bag_size in [10, 100, 10_000]: + for func in functions: + + bag1 = BagOfWords({str(i): i for i in range(bag_size*2)}) + bag2 = BagOfWords({str(i): i for i in range(bag_size)}) + + start_time = time.time() + func(bag1, bag2) + fps = iterations / (time.time() - start_time) + print(f"{func.__name__:30}: bag size {bag_size:7} @ {fps:12,.0f} fps") diff --git a/src/words.py b/src/words.py index ee1e54d..0f1ba70 100644 --- a/src/words.py +++ b/src/words.py @@ -18,73 +18,88 @@ " i m ", " i am ", " i ve ", " i have", " i d ", " i had ", " i ll ", " i will ", " i won't ", " i wont ", - " today", + " today ", " fairly ", + ], + .3: [ + " wrong ", " extremely ", " hope ", " why ", + " enough ", " free ", " good ", " wouldn t ", ], 1: [ - " i want ", - " my ", " myself", " he ", " she ", " his ", " her ", " you ", " you d ", " you ll ", - " we ", " we ll ", " we d ", + " my ", " myself", " he ", " she ", " his ", " her ", + " you ", " you d ", " you ll ", " you re ", + " we ", " we ll ", " we d ", " we re ", " we should ", " i should", " he should", " she should", - " think", " thinking", " feeling", " beautiful", " happy", " sad ", + " think", " thinking", " feeling", " opinion ", " beautiful", + " happy", " sad ", " sadly ", "paradox", " worry", " wise", " sane ", " weird ", " wierd " # !sic " days ", " hours ", " yesterday", " tomorrow", " year", + " summer", " winter", " autumn", "music", " personal", " cool ", " interesting", " offend", " frankly", " fortunate", " however ", "didn't help", "thanks", "thank you", "anyway", " enjoy ", re.compile(" a+h+ "), re.compile(" o+h+ "), re.compile(" oo+ps "), " do the tricks ", " garbage", " depressing ", - " love", "love ", " appease ", + " love", "love ", " appease ", " suspicion ", " deeply ", + " buttload ", " dunno ", " quirk ", + " literally ", " giant ", " illusion ", " brute ", + " turf ", " turfs ", " outright ", " repetitious ", " adware ", + " bugged ", " heart ", " sleep ", " 
poor ", " juicy ", " facts ", ], 2: [ re.compile(" a+rr+g+h* "), re.compile(" mm+h+ "), re.compile(" ba+h+ "), re.compile(" ha\s*ha "),re.compile(" ha\s*ha\s*ha "), re.compile(" har\s*har "), - " my brain", + " i want ", " my brain", " shot ", "pandemic", " war ", "science", "scientist", " hack", " hacks", " wild ", " claw", " crazy ", " beast ", - "drinking", " exhaustive", " headache", + "drinking", " exhaustive", " headache", " shocked ", " puzzled ", " poorly", " unreadable", " theory", " theorize", " diary", " wonder ", " depression ", " revenge ", - " intelligent machine", "psychedelic", " deeply ", + " intelligent machine", "psychedelic", " exciting ", " angel ", + " quirky ", " mistake ", " fluctuation", " twinkle ", " twinkling ", " hookin ", #! sic + " enjoy ", " artists ", ], 3: [ " yeah", re.compile(" yah+ "), " my opinion", " not that", "kinda ", "to be honest", "honestly", " doesn't make sense", " confess", " remember", " funny", " hilarious", " ridiculous", " amazing", " wonky", - " stupid", " awful", " silly", " ugly", " clunk", " creep", + " stupid", " awful", " silly", " ugly", " ugliness ", " clunk", " creep", " shut up", " suck", " sucking", " sick ", " screw you", " idiot", " idiotic", " sadness", " emotion", " pain", " miracle", " despair", " despar", # !sic " insane", " insanit", " rage ", " die ", " lynch", + " commercialized ", " ideology ", " suicid", " revolution ", + " joke ", " joking ", " jokes ", " laugh", " lousy ", " guru", " lovely", " hate ", " hateful", " hating", " curse", "damn", " joy ", " frustrati", "humbug", " gosh ", " blood", " annoy", " trouble", " evil", " god ", " devil ", " god s ", "praise", " holy ", "church", " verse ", " faith ", - re.compile(" omg+ "), + re.compile(" omg+ "), " salvation ", " terror", " bless", "ascetic", " spirit", "demonic", " demon ", " heaven", " hell ", " hellish", "fantasy", " mystery", " magic", " moral", " immoral", " boy", " girl", "friend", "acquaintance", - " shame", "thoughts", 
"sorry", + " shame", "thoughts", " sorry", " female", " male ", " cock", " ass ", " arse ", "bitch", - - " life ", " dream", " grief", " tears", + " smoker ", " violent", + " life ", " dream", " grief", " tears", " angels ", "paranoia", "paranoid", " society", " social", " forgive", " forgiving", - " cheat", "singularity", " hot potato", + " cheat", "singularity", " hot potato", " crisis ", " espionage ", " breakfast", " dinner", " lunch", " morning", " evening", "night ", - + " my parents" " sister", " brother", " mama ", " papa ", " stunned", " stunning", " horribl", " humiliate", " inspiration", "experience", " darkness", " misery", " suffering", " struggling", " obsess", "ignorant", - " please understand", + " please understand", " armageddon", ], 4: [ - "fuck", " shit", "bullshit", " sin ", "facepalm", - "cthulhu", + "fuck", " shit", "bullshit", " sinner ", "facepalm", + "cthulhu", " junkie", re.compile("notes? to .*self"), " hate myself", - "dear diary", + "dear diary", " tinfoil ", + " should be sleep", " my parents", " why me ", ], 10: [ " hate my life", "fuck you",