Modularize tokenization; add markdown tokens to redditron tokenizer
chromakode committed Feb 7, 2009
1 parent 4503239 commit c64240b
Showing 3 changed files with 72 additions and 44 deletions.
66 changes: 36 additions & 30 deletions src/markov.py
@@ -6,7 +6,7 @@
 import bisect
 from zlib import crc32

-from tokenizer import Tokenizer, RegexTokenType, SpecialTokenType
+from tokenizer import Tokenizer, RegexTokenType, CharacterTokenType

 def weighted_choice(weight_dict):
     accum = 0
@@ -42,38 +42,44 @@ def merge_countlists(countlists):

     return acc

+def simple_english_tokenizer(tokenizer=None):
+    if not tokenizer:
+        tokenizer = Tokenizer()
+
+    word = tokenizer.type['Word'] = RegexTokenType(r'(\w+)')
+    punctuation = tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)')
+
+    tokenizer.joins = {
+        (punctuation,word,'\'') : '',
+        (punctuation,word) : ' ',
+        (punctuation,punctuation) : '',
+        (word,word) : ' ',
+        None : ''
+    }
+
+    return tokenizer

-SPECIAL_CHARACTERS = {'start':'\x02', 'end':'\x03'}
-
-tokenizer = Tokenizer()
-tokenizer.type['Special'] = SpecialTokenType(**SPECIAL_CHARACTERS)
-tokenizer.type['Word'] = RegexTokenType(r'(\w+)')
-tokenizer.type['Punctuation'] = RegexTokenType(r'([^\w\s%s]+)' % ''.join(SPECIAL_CHARACTERS.values()))
-
-SpecialToken = tokenizer.type['Special']
-WordToken = tokenizer.type['Word']
-PunctuationToken = tokenizer.type['Punctuation']
-
-tokenizer.joins = {
-    (PunctuationToken,WordToken,'\'') : '',
-    (PunctuationToken,WordToken) : ' ',
-    (PunctuationToken,PunctuationToken) : '',
-    (WordToken,WordToken) : ' ',
-    None : ''
-}
+MarkovMarkerToken = CharacterTokenType(priority=-10, start='\x02', end='\x03')

 class Chain(dict):
-    def __init__(self, basis=None, N=1, mhash=crc32):
+    def __init__(self, tokenizer=None, basis=None, N=1, mhash=crc32):
+        if tokenizer:
+            self.tokenizer = tokenizer
+        else:
+            self.tokenizer = simple_english_tokenizer()
+
+        self.tokenizer.type['MarkovMarker'] = MarkovMarkerToken
+
         if isinstance(basis, dict):
             self.update(basis)

         self.N = N
         self.mhash = mhash

     def train(self, text):
-        tokens = tokenizer.tokenize(text.lower())
-        tokens.insert(0, SpecialToken['start'])
-        tokens.append(SpecialToken['end'])
+        tokens = self.tokenizer.tokenize(text.lower())
+        tokens.insert(0, MarkovMarkerToken['start'])
+        tokens.append(MarkovMarkerToken['end'])

         for x in range(len(tokens)):
             for y in range(1, self.N+1):
@@ -87,7 +93,7 @@ def train(self, text):
                 else:
                     self[before_hash] = {me: 1}

-    def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
+    def generate(self, tokens=[MarkovMarkerToken['start']], N=None, maxlength=None):
         tokens = list(tokens)

         if N:
@@ -96,7 +102,7 @@ def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
             N = self.N

         picked = None
-        while picked != SpecialToken['end'] and (len(tokens) < maxlength or maxlength is None):
+        while picked != MarkovMarkerToken['end'] and (len(tokens) < maxlength or maxlength is None):
             # Truncate previous token list to the length of the max association distance (N)
             if len(tokens) > N:
                 tokens = tokens[-N:]
@@ -114,15 +120,15 @@ def generate(self, tokens=[SpecialToken['start']], N=None, maxlength=None):
             candidates = merge_countlists(weights)

             if candidates:
-                picked = tokenizer.token(weighted_choice(candidates))
+                picked = self.tokenizer.token(weighted_choice(candidates))
             else:
                 raise ValueError('No candidate tokens available.')

-            if picked.token_type is not SpecialToken:
+            if picked.token_type is not MarkovMarkerToken:
                 yield picked

             tokens.append(picked)

-    def generate_text(self, text=SpecialToken['start'], N=None, maxlength=None):
-        tokens = tokenizer.tokenize(text)
-        return tokenizer.join(self.generate(tokens, N, maxlength))
+    def generate_text(self, text=MarkovMarkerToken['start'], N=None, maxlength=None):
+        tokens = self.tokenizer.tokenize(text)
+        return self.tokenizer.join(self.generate(tokens, N, maxlength))
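A minimal usage sketch of the modularized Chain API after this change (not part of the commit; the training strings and N value are invented). It drives the chain the same way main() in redditron.py does:

    from markov import Chain, simple_english_tokenizer

    # Chain now owns its tokenizer: pass one in, or let it fall back to
    # simple_english_tokenizer() by default.
    chain = Chain(tokenizer=simple_english_tokenizer(), N=2)

    chain.train('the cat sat on the mat.')
    chain.train('the dog sat on the rug.')

    # generate_text() joins the generated tokens using the chain's own tokenizer.
    print chain.generate_text()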
40 changes: 34 additions & 6 deletions src/redditron.py
@@ -1,16 +1,43 @@
 from couchdb.client import Server, Database, ResourceConflict

-from markov import Chain
+from markov import Chain, simple_english_tokenizer
+from tokenizer import Tokenizer, RegexTokenType
 from utils import set_line
+from lib.markdown import _Markdown as Markdown

-def build_chain(db):
-    c = Chain(N=20)
+import hashlib

+def markdown_tokenizer(tokenizer=None):
+    if not tokenizer:
+        tokenizer = Tokenizer()
+
+    simple_english_tokenizer(tokenizer)
+
+    tokenizer.type['MarkdownAutoLink'] = RegexTokenType(Markdown.r_link, priority=-6)
+    tokenizer.type['MarkdownLink'] = RegexTokenType(Markdown.r_DoAnchors2, priority=-6)
+    tokenizer.type['MarkdownBold'] = RegexTokenType(Markdown.r_DoBold, priority=-5)
+    tokenizer.type['MarkdownItalic'] = RegexTokenType(Markdown.r_DoItalics, priority=-4)
+
+    # From markdown.py, but without the surrounding angle brackets
+    tokenizer.type['Link'] = RegexTokenType(r"((https?|ftp):[^\'\">\s]+)", priority=-2)


+    return tokenizer

+def build_chain(db, max_comments=None):
+    c = Chain(tokenizer=markdown_tokenizer(), N=10)

     comment_count = len(db)
+    if max_comments:
+        comment_count = min(comment_count, max_comments)

     for index, comment_id in enumerate(db):
         body = db[comment_id]['body']
         c.train(body)
         set_line('Loaded %s/%s comments...' % (index+1, comment_count))
+        if max_comments and index+1 >= max_comments:
+            break

     set_line('')

     return c
@@ -23,10 +50,11 @@ def main():
     db = server[db_name]

     c = build_chain(db)
+    outputs = []
     for i in range(0, 100):
-        print "---"
-        print c.generate_text()
-        print "---"
+        outputs.append(c.generate_text())

+    print "\n---\n".join(outputs)

 if __name__ == '__main__':
     main()
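A small sketch of what the markdown-aware tokenizer above is set up to do (not part of the commit; it assumes redditron.py and its couchdb/lib.markdown imports resolve, and the sample comment text is invented). Constructs matched by the Markdown*/Link patterns should come through as single tokens:

    from redditron import markdown_tokenizer

    tok = markdown_tokenizer()
    tokens = tok.tokenize('check out **this** link: http://example.com/page')

    # Inspect which token type claimed each piece of the comment.
    for t in tokens:
        print repr(t), t.token_type

    # The joins table installed by simple_english_tokenizer() is reused
    # when reassembling tokens into text.
    print tok.join(tokens)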
10 changes: 2 additions & 8 deletions src/tokenizer.py
@@ -35,19 +35,13 @@ def find_all(self, text):
         return [(Token(m.group(0),self), m.start())
                 for m in self.regex.finditer(text)]

-class SpecialTokenType(TokenType):
+class CharacterTokenType(RegexTokenType):
     def __init__(self, priority=0, **characters):
-        TokenType.__init__(self, priority)
+        RegexTokenType.__init__(self, '|'.join(characters.values()), priority)
         self.characters = characters

     def __getitem__(self, key):
         return self.characters[key]

-    def is_match(self, text):
-        return text in self.characters.values()
-
-    def find_all(self, text):
-        return []

 class Tokenizer(object):
     def __init__(self, types={}, joins={}):
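A brief sketch of the reworked CharacterTokenType (not part of the commit); the start/end characters mirror the MarkovMarkerToken defined in markov.py above:

    from tokenizer import CharacterTokenType

    marker = CharacterTokenType(priority=-10, start='\x02', end='\x03')

    # Dict-style lookup of the named characters survives from SpecialTokenType...
    assert marker['start'] == '\x02'

    # ...but matching is now inherited from RegexTokenType (the pattern is the
    # '|'-join of the character values), so find_all() reports real matches
    # instead of always returning [].
    print marker.find_all('\x02some text\x03')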
