Skip to content

Commit

Permalink
apply normalization to all strings / configurable escape char
Browse files Browse the repository at this point in the history
  • Loading branch information
git2samus committed Feb 29, 2020
1 parent dff4ce5 commit 4e37b40
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 19 deletions.
2 changes: 1 addition & 1 deletion award_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
if database_url is None:
raise DocoptExit("Missing DATABASE_URL variable.\n")

# we'll use the lowercase variant of PRAW_SITE as the reference since that's what praw uses
# we use uppercase for consistency but convert it to lowercase since that's what PRAW uses
praw_site = os.getenv('PRAW_SITE')
if praw_site is not None:
os.environ['praw_site'] = praw_site
Expand Down
42 changes: 25 additions & 17 deletions modules/awards.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
import re, json, random, string
from unidecode import unidecode
from modules.shared.base import APIProcess
from modules.shared.utils import normalize_str


class AwardBotProcess(APIProcess):
def __init__(self, source_version, subreddit_name):
    """Set up the bot and compile the keyword-matching regex.

    Args:
        source_version: forwarded unchanged to ``APIProcess.__init__``.
        subreddit_name: name of the subreddit whose comments are scanned.

    Raises:
        KeyError: if ``bot_negate_char`` is missing from the PRAW custom
            config.
        OSError: if ``keyword_mapping.json`` cannot be opened.
        json.JSONDecodeError: if the mapping file is not valid JSON.
    """
    # setup PRAW, db and http session
    super().__init__(source_version)

    # internal variables
    self.subreddit_name = subreddit_name

    # Prefix that lets a commenter mention a keyword without triggering it.
    negate_char_config = self.reddit.config.custom['bot_negate_char']
    if negate_char_config:
        negated_keyword_pattern = '(?P<negated>{})'.format(
            re.escape(negate_char_config)
        )
    else:
        # An empty prefix would make the group match the empty string, so
        # match['negated'] would be '' (never None) and every keyword would
        # look negated.  Use a never-matching group so negation is simply
        # disabled instead.
        negated_keyword_pattern = '(?P<negated>(?!))'

    # JSON text is UTF-8; do not depend on the platform default encoding.
    # The file is read exactly once: the handle is exhausted afterwards.
    with open('keyword_mapping.json', encoding='utf-8') as json_file:
        raw_mapping = json.load(json_file)

    # Keys go through normalize_str so they compare equal to comment tokens
    # that are normalized the same way before matching.
    # NOTE(review): normalize_str lives in modules.shared.utils and is not
    # visible here - presumably it folds case/accents; confirm.
    self.keyword_mapping = {
        normalize_str(key): value for key, value in raw_mapping.items()
    }

    # Escape each keyword so regex metacharacters in it match literally.
    keyword_join = '|'.join(
        re.escape(keyword) for keyword in self.keyword_mapping
    )
    keyword_join_pattern = f'(?P<keyword>{keyword_join})'

    # A single optional trailing punctuation mark (e.g. "keyword!").
    punctuation_pattern = '(?:[{}])'.format(re.escape(string.punctuation))

    keyword_pattern = '{negated}?{keyword}{punctuation}?'.format(
        negated=negated_keyword_pattern,
        keyword=keyword_join_pattern,
        punctuation=punctuation_pattern,
    )
    # No re.IGNORECASE: both sides are expected to be normalized already.
    self.keyword_re = re.compile(keyword_pattern)

def add_reply(self, comment, matched_keywords):
print(f'@@@ {matched_keywords}')
Expand All @@ -37,18 +46,17 @@ def run(self):
# ignored users (bots and such)
blacklist_config = self.reddit.config.custom['bot_comments_blacklist']
blacklist = {
name.strip().lower() for name in blacklist_config.split(',')
name.strip() for name in normalize_str(blacklist_config).split(',')
}

# process submissions
subreddit = self.reddit.subreddit(self.subreddit_name)
for comment in subreddit.stream.comments(skip_existing=True):
print([comment.author, comment.body])
# Use .name to avoid another API call
if comment.author.name.lower() not in blacklist:
tokens = comment.body.split()
matches = map(self.keyword_re.fullmatch, tokens)
# Use author.name to avoid another API call
if normalize_str(comment.author.name) not in blacklist:
tokens = normalize_str(comment.body).split()

matches = map(self.keyword_re.fullmatch, tokens)
matched_keywords = {
match['keyword'] for match in matches
if match is not None and match['negated'] is None
Expand Down
2 changes: 1 addition & 1 deletion modules/shared
Submodule shared updated 1 files
+5 −0 utils.py
2 changes: 2 additions & 0 deletions praw.ini.example
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ bot_version=<bot-version>
bot_author=<bot-author>
user_agent=script:%(bot_name)s:v%(bot_version)s (by /u/%(bot_author)s)

# Character prefix used to ignore keywords on comments, e.g. \
bot_negate_char=<negate-char>
# Reset day: Monday == 0 ... Sunday == 6
bot_reset_day=<reset-day-number>
# Comma-separated list of users to ignore when checking keywords
Expand Down
7 changes: 7 additions & 0 deletions scripts/json_encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
import sys, json

# Read all of stdin and print it back as a single JSON string literal
# (quotes, backslashes, newlines and non-ASCII characters escaped).
print(json.dumps(sys.stdin.read()))

0 comments on commit 4e37b40

Please sign in to comment.