Skip to content

Commit

Permalink
recognises years
Browse files Browse the repository at this point in the history
  • Loading branch information
Gareth Lloyd committed Jan 22, 2011
1 parent 28316c7 commit e9c5826
Showing 1 changed file with 31 additions and 12 deletions.
43 changes: 31 additions & 12 deletions wikipedia_processor/page_parser.py
Expand Up @@ -5,6 +5,7 @@
from xml.sax.handler import feature_namespaces
from xml.sax.saxutils import escape, XMLFilterBase
from unicodedata import normalize
import re

class Coords(object):
def __init__(self, lat=None, long=None):
Expand All @@ -17,19 +18,42 @@ def __init__(self):
self.id = u''
self.text = u''
self.coords = []
self.isYearPattern = re.compile(r"^\d{1,4}(_BC)?$")

def containsCoords(self):
def processForCoords(self):
    """
    Intended to bail out early for irrelevant pages (e.g. redirects) and
    otherwise collect every coordinate, reporting True if any were found.

    Detection is not implemented yet, so the result is always False.
    """
    found = False
    return found

def isYear(self):
    """
    Is this a year page?

    Returns True when self.title matches self.isYearPattern (one to four
    digits, optionally followed by "_BC"), and False otherwise.
    """
    # A leftover unconditional `return False` used to precede this check,
    # which made the pattern test unreachable.
    return self.isYearPattern.match(self.title) is not None

def __str__(self):
    """
    Describe the page as 'ID <id> TITLE <title>', with both fields
    encoded to UTF-8 before they are interpolated.
    """
    encoded_id = self.id.encode('utf_8')
    encoded_title = self.title.encode('utf_8')
    return 'ID %s TITLE %s' % (encoded_id, encoded_title)

def __unicode(self):
def __unicode__(self):
    """
    Describe the page as 'ID <id> TITLE <title>' using the id and title
    attributes as they are, without encoding them.
    """
    fields = (self.id, self.title)
    return 'ID %s TITLE %s' % fields


def processAndSaveEvents(page):
    """
    Handle a Wikipedia page that analysePage identified as a year page.

    For now this only prints the page title; no events are extracted
    or saved yet.
    """
    # The parenthesised form prints the same text as the Python 2 print
    # statement and also parses under Python 3.
    print(page.title)

def savePage(page):
    """
    Placeholder for adding a Wikipedia page to the Pages collection.

    Nothing is stored yet: the argument is ignored and None is returned.
    """
    return None

class text_normalize_filter(XMLFilterBase):
"""
SAX filter to ensure that contiguous texts nodes are merged into one
Expand Down Expand Up @@ -61,12 +85,6 @@ def method(self, *a, **k):
for n in '''startElement endElement endDocument'''.split():
_wrap_complete(n)

def processYear(text):
    """
    Placeholder hook: the argument is ignored and None is returned.
    """
    return None

def processPage(text):
    """
    Placeholder hook: the argument is ignored and None is returned.
    """
    return None

class WikipediaHandler(handler.ContentHandler):
def __init__(self, out=sys.stdout):
handler.ContentHandler.__init__(self)
Expand Down Expand Up @@ -101,10 +119,11 @@ def characters(self, content):
self.currentPage.text += content

def analysePage(self):
    """
    Route the current page to the matching processing step.

    A page for which isYear() is true is handed to processAndSaveEvents;
    any other page is handed to savePage when processForCoords() is true.
    Pages matching neither test are ignored.
    """
    page = self.currentPage
    # Calls to containsCoords (which has no body in this file) and to the
    # empty processPage/processYear stubs are dropped here.
    if page.isYear():
        processAndSaveEvents(page)
    elif page.processForCoords():
        savePage(page)


if __name__ == '__main__':
parser = make_parser()
Expand Down

0 comments on commit e9c5826

Please sign in to comment.