Modularize tokenization; add markdown tokens to redditron tokenizer
chromakode committed Feb 7, 2009
1 parent 4503239 commit c64240b
Showing 3 changed files with 72 additions and 44 deletions.
66 changes: 36 additions & 30 deletions src/markov.py
@@ -6,7 +6,7 @@
 import bisect
 from zlib import crc32

-from tokenizer import Tokenizer, RegexTokenType, SpecialTokenType
+from tokenizer import Tokenizer, RegexTokenType, CharacterTokenType

 def weighted_choice(weight_dict):
     accum = 0
@@ -42,38 +42,44 @@ def merge_countlists(countlists):

     return acc

+def simple_english_tokenizer(tokenizer=None):
+    if not tokenizer:
+        tokenizer = Tokenizer()
+
+    word = tokenizer.type['Word'] = RegexTokenType(r'(\w+)')
+    punctuation = tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)')
+
+    tokenizer.joins = {
+        (punctuation,word,'\'') : '',
+        (punctuation,word) : ' ',
+        (punctuation,punctuation) : '',
+        (word,word) : ' ',
+        None : ''
+    }
+
+    return tokenizer

-SPECIAL_CHARACTERS = {'start':'\x02', 'end':'\x03'}
-
-tokenizer = Tokenizer()
-tokenizer.type['Special'] = SpecialTokenType(**SPECIAL_CHARACTERS)
-tokenizer.type['Word'] = RegexTokenType(r'(\w+)')
-tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)' % ''.join(SPECIAL_CHARACTERS.values()))
-
-SpecialToken = tokenizer.type['Special']
-WordToken = tokenizer.type['Word']
-PunctuationToken = tokenizer.type['Punctuation']
-
-tokenizer.joins = {
-    (PunctuationToken,WordToken,'\'') : '',
-    (PunctuationToken,WordToken) : ' ',
-    (PunctuationToken,PunctuationToken) : '',
-    (WordToken,WordToken) : ' ',
-    None : ''
-}
+MarkovMarkerToken = CharacterTokenType(priority=-10, start='\x02', end='\x03')

 class Chain(dict):
-    def __init__(self, basis=None, N=1, mhash=crc32):
+    def __init__(self, tokenizer=None, basis=None, N=1, mhash=crc32):
+        if tokenizer:
+            self.tokenizer = tokenizer
+        else:
+            self.tokenizer = simple_english_tokenizer()
+
+        self.tokenizer.type['MarkovMarker'] = MarkovMarkerToken
+
         if isinstance(basis, dict):
             self.update(basis)

         self.N = N
         self.mhash = mhash

     def train(self, text):
-        tokens = tokenizer.tokenize(text.lower())
-        tokens.insert(0, SpecialToken['start'])
-        tokens.append(SpecialToken['end'])
+        tokens = self.tokenizer.tokenize(text.lower())
+        tokens.insert(0, MarkovMarkerToken['start'])
+        tokens.append(MarkovMarkerToken['end'])

         for x in range(len(tokens)):
             for y in range(1, self.N+1):
@@ -87,7 +93,7 @@ def train(self, text):
                 else:
                     self[before_hash] = {me: 1}

-    def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
+    def generate(self, tokens=[MarkovMarkerToken['start']], N=None, maxlength=None):
         tokens = list(tokens)

         if N:
@@ -96,7 +102,7 @@ def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
             N = self.N

         picked = None
-        while picked != SpecialToken['end'] and (len(tokens) < maxlength or maxlength is None):
+        while picked != MarkovMarkerToken['end'] and (len(tokens) < maxlength or maxlength is None):
             # Truncate previous token list to the length of the max association distance (N)
             if len(tokens) > N:
                 tokens = tokens[-N:]
@@ -114,15 +120,15 @@ def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
             candidates = merge_countlists(weights)

             if candidates:
-                picked = tokenizer.token(weighted_choice(candidates))
+                picked = self.tokenizer.token(weighted_choice(candidates))
             else:
                 raise ValueError('No candidate tokens available.')

-            if picked.token_type is not SpecialToken:
+            if picked.token_type is not MarkovMarkerToken:
                 yield picked

             tokens.append(picked)

-    def generate_text(self, text=SpecialToken['start'], N=None, maxlength=None):
-        tokens = tokenizer.tokenize(text)
-        return tokenizer.join(self.generate(tokens, N, maxlength))
+    def generate_text(self, text=MarkovMarkerToken['start'], N=None, maxlength=None):
+        tokens = self.tokenizer.tokenize(text)
+        return self.tokenizer.join(self.generate(tokens, N, maxlength))
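A minimal usage sketch of the modularized Chain API after this change (not part of the commit; the training strings and N value are invented). It drives the chain the same way main() in redditron.py does:

    from markov import Chain, simple_english_tokenizer

    # Chain now owns its tokenizer: pass one in, or let it fall back to
    # simple_english_tokenizer() by default.
    chain = Chain(tokenizer=simple_english_tokenizer(), N=2)

    chain.train('the cat sat on the mat.')
    chain.train('the dog sat on the rug.')

    # generate_text() joins the generated tokens using the chain's own tokenizer.
    print chain.generate_text()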
40 changes: 34 additions & 6 deletions src/redditron.py
@@ -1,16 +1,43 @@
 from couchdb.client import Server, Database, ResourceConflict

-from markov import Chain
+from markov import Chain, simple_english_tokenizer
+from tokenizer import Tokenizer, RegexTokenType
 from utils import set_line
+from lib.markdown import _Markdown as Markdown

-def build_chain(db):
-    c = Chain(N=20)
+import hashlib

+def markdown_tokenizer(tokenizer=None):
+    if not tokenizer:
+        tokenizer = Tokenizer()
+
+    simple_english_tokenizer(tokenizer)
+
+    tokenizer.type['MarkdownAutoLink'] = RegexTokenType(Markdown.r_link, priority=-6)
+    tokenizer.type['MarkdownLink'] = RegexTokenType(Markdown.r_DoAnchors2, priority=-6)
+    tokenizer.type['MarkdownBold'] = RegexTokenType(Markdown.r_DoBold, priority=-5)
+    tokenizer.type['MarkdownItalic'] = RegexTokenType(Markdown.r_DoItalics, priority=-4)
+
+    # From markdown.py, but without the surrounding angle brackets
+    tokenizer.type['Link'] = RegexTokenType(r"((https?|ftp):[^\'\">\s]+)", priority=-2)


+    return tokenizer

+def build_chain(db, max_comments=None):
+    c = Chain(tokenizer=markdown_tokenizer(), N=10)

     comment_count = len(db)
+    if max_comments:
+        comment_count = min(comment_count, max_comments)

     for index, comment_id in enumerate(db):
         body = db[comment_id]['body']
         c.train(body)
         set_line('Loaded %s/%s comments...' % (index+1, comment_count))
+        if max_comments and index+1 >= max_comments:
+            break

     set_line('')

     return c
@@ -23,10 +50,11 @@ def main():
     db = server[db_name]

     c = build_chain(db)
+    outputs = []
     for i in range(0, 100):
-        print "---"
-        print c.generate_text()
-        print "---"
+        outputs.append(c.generate_text())

+    print "\n---\n".join(outputs)

 if __name__ == '__main__':
     main()
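A small sketch of what the markdown-aware tokenizer above is set up to do (not part of the commit; it assumes redditron.py and its couchdb/lib.markdown imports resolve, and the sample comment text is invented). Constructs matched by the Markdown*/Link patterns should come through as single tokens:

    from redditron import markdown_tokenizer

    tok = markdown_tokenizer()
    tokens = tok.tokenize('check out **this** link: http://example.com/page')

    # Inspect which token type claimed each piece of the comment.
    for t in tokens:
        print repr(t), t.token_type

    # The joins table installed by simple_english_tokenizer() is reused
    # when reassembling tokens into text.
    print tok.join(tokens)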
10 changes: 2 additions & 8 deletions src/tokenizer.py
@@ -35,19 +35,13 @@ def find_all(self, text):
         return [(Token(m.group(0),self), m.start())
                 for m in self.regex.finditer(text)]

-class SpecialTokenType(TokenType):
+class CharacterTokenType(RegexTokenType):
     def __init__(self, priority=0, **characters):
-        TokenType.__init__(self, priority)
+        RegexTokenType.__init__(self, '|'.join(characters.values()), priority)
         self.characters = characters

     def __getitem__(self, key):
         return self.characters[key]

-    def is_match(self, text):
-        return text in self.characters.values()
-
-    def find_all(self, text):
-        return []

 class Tokenizer(object):
     def __init__(self, types={}, joins={}):
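A brief sketch of the reworked CharacterTokenType (not part of the commit); the start/end characters mirror the MarkovMarkerToken defined in markov.py above:

    from tokenizer import CharacterTokenType

    marker = CharacterTokenType(priority=-10, start='\x02', end='\x03')

    # Dict-style lookup of the named characters survives from SpecialTokenType...
    assert marker['start'] == '\x02'

    # ...but matching is now inherited from RegexTokenType (the pattern is the
    # '|'-join of the character values), so find_all() reports real matches
    # instead of always returning [].
    print marker.find_all('\x02some text\x03')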
