Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Update to use the observer pattern.

As print can only get you so far.
  • Loading branch information...
commit 2085e4e152064d23cc2cdf524283af2b4448f77b 1 parent 7f06b6d
@gpeterson2 authored
Showing with 252 additions and 179 deletions.
  1. +9 −30 JMdictParser.py
  2. +156 −142 data.py
  3. +13 −7 main.py
  4. +28 −0 observer.py
  5. +46 −0 readme.txt
View
39 JMdictParser.py
@@ -5,7 +5,7 @@
from lxml import etree
-from data import write_list_to_database
+from observer import Subject
__all__ = ['Entry', 'Gloss', 'Parser']
@@ -77,36 +77,18 @@ def __eq__(self, other):
def __hash__(self):
return hash(unicode(self.gloss) + unicode(self.pos) + unicode(self.lang))
-class Parser(object):
+class Parser(Subject):
- def __init__(self, infile=None, message_out=None):
+ def __init__(self, *args, **kwargs):
''' Reads a JMDict file.
:params infile: The JMDict input file.
:params message_out: An output stream for parsing messages,
defaults to none.
'''
+ super(Parser, self).__init__(*args, **kwargs)
- # TODO - need to change the default output, maybe once I'm done it wont
- # be necessary?
-
- if infile:
- self.infile = infile
-
- self.kana_dict = set()
-
- self.message_out = message_out
-
- # TODO - should move this somewhere else.
- # At the very least it should follow an observer pattern, rather than writing and flushing now.
- def __write_output(self, msg):
- ''' Writes and flushes a message to message_output. '''
-
- if not self.message_out:
- return
-
- self.message_out.write(u'{0}\n'.format(msg).encode('utf-8'))
- self.message_out.flush()
+ # TODO - call super class.
def parse_from_file(self, path=None):
''' Parse a JMDict file from the given a filepath.
@@ -114,9 +96,6 @@ def parse_from_file(self, path=None):
:param path: Path to a file to read.
'''
- if not path:
- path = self.infile
-
xml = open(path, 'r')
return self.parse(xml)
@@ -135,11 +114,11 @@ def parse(self, xml):
''' Performs the parsing of the file. '''
events = ('start', 'end')
- context = etree.iterparse(xml, events=events)
+ context = etree.iterparse(xml, events=events, encoding='utf-8')
entries = []
- self.__write_output(u'start reading')
+ self.notify(u'start reading')
pos = None
@@ -181,7 +160,7 @@ def parse(self, xml):
# Shouldn't happen, of course...
# but write an error message if the text isn't found.
if not pos:
- self.__write_output(u'Error: Can\'t find: {0} {1}'.format(ent_seq, pos_text))
+ self.notify(u'Error: Can\'t find: {0} {1}'.format(ent_seq, pos_text))
if tag == 'gloss' and action == 'start':
gloss = elem.text
@@ -203,7 +182,7 @@ def parse(self, xml):
if tag == 'entry' and action == 'end':
entries.append(entry)
- self.__write_output(u'done reading')
+ self.notify(u'done reading')
return entries
View
298 data.py
@@ -4,6 +4,8 @@
import os
import sqlite3
+from observer import Subject
+
def write_from_list(conn, items, sql, table_sql=None):
''' General function to write lists of items to table.
@@ -56,150 +58,162 @@ def create_dict_from_sql(conn, sql):
return d
+class Writer(Subject):
+ def __init__(self, *args, **kwargs):
+ super(Writer, self).__init__(*args, **kwargs)
+
+ def write(self):
+ pass
+
# TODO - this should probably be an object which can be sub classed.
# So that it can more easily be switched between database types.
# Or it should use an ORM, which was the original plan before running
# it meant waiting three hours...
-def write_list_to_database(entries):
- ''' Writes the various lists to a database.
-
- :param entries: A list of entries objects.
- '''
-
- connection_string = 'test.db'
-
- if os.path.exists(connection_string):
- os.remove(connection_string)
-
- conn = sqlite3.connect(connection_string)
-
- cur = conn.cursor()
-
- # Unique tables
-
- # Need to pull this infromation out of the main list. Should be in a generic class,
- # or done while reading it - although I imaging that some writeable formats
- # won't need it.
- pos = set()
- kanas = set()
- kanjis = set()
- glosses = set()
-
- # TODO - this is taking far too much time, proably go back to getting this when reading.
- print('start reading unique values')
- length = float(len(entries))
- for i, entry in enumerate(entries):
- print('{0}: {1:.0%}'.format(entry.entry_seq, i / length))
- for kana in entry.kanas:
- kanas.add(kana)
-
- for kanji in entry.kanjis:
- kanjis.add(kanji)
-
- for gloss in entry.glosses:
- pos.add(gloss.pos)
- glosses.add(gloss)
-
- # TODO - need to figure this out from what I'm passing in now.
- #table = "create table part_of_speach ( code varchar, text varchar);"
- #sql = 'insert into part_of_speach(code, text) values(?, ?);'
- #poss = [(v, k.replace("'", "\'")) for k, v in pos_dict.items()]
- #write_from_list(conn, poss, sql, table)
-
- print('start writing entries')
- table = "create table entry ( id integer primary key, entry );"
- sql = "insert into entry(entry) values(?);"
- items = [(i.entry_seq,) for i in entries]
- write_from_list(conn, items, sql, table)
-
- print('start writing kana')
- table = "create table kana ( id integer primary key, kana varchar );"
- sql = "insert into kana(kana) values(?)"
- items = convert_to_tuple_list(kanas)
- write_from_list(conn, items, sql, table)
-
- print('start writing kanji')
- table = "create table kanji ( id integer primary key, kanji varchar );"
- sql = "insert into kanji(kanji) values(?);"
- items = convert_to_tuple_list(kanjis)
- write_from_list(conn, items, sql, table)
-
- print('start writing glosses')
- table = "create table gloss ( id integer primary key, gloss varchar, pos, lang varchar );"
- sql = "insert into gloss(gloss, pos, lang) values(?, ?, ?);"
- items = [(g.gloss, g.pos, g.lang) for g in glosses]
- write_from_list(conn, items, sql, table)
-
- print('start writing warehouse')
- table = "create table warehouse (id integer primary key, entry int, kana varchar, kanji varchar, gloss varchar); "
- sql = "insert into warehouse(entry, kana, kanji, gloss) values(?, ?, ?, ?)"
- items = [
- (
- entry.entry_seq,
- u','.join(entry.kanas),
- u','.join(entry.kanjis),
- u','.join([g.gloss for g in entry.glosses])
- )
- for entry in entries]
- write_from_list(conn, items, sql, table)
-
- # Join tables
- sql = "select id, entry from entry"
- entry_dict = create_dict_from_sql(conn, sql)
-
- sql = "select id, kana from kana"
- kana_dict = create_dict_from_sql(conn, sql)
-
- items = []
- for entry in entries:
- for kana in entry.kanas:
- items.append((entry_dict.get(entry.entry_seq, 0), kana_dict.get(kana, 0)))
-
- print('start writing kana entry join table')
- table = "create table kana_entry ( entry_id varchar, kana_id varchar );"
- sql = "insert into kana_entry(entry_id, kana_id) values(?, ?);"
- write_from_list(conn, items, sql, table)
-
- sql = "select id, kanji from kanji"
- kanji_dict = create_dict_from_sql(conn, sql)
-
- items = []
- for entry in entries:
- for kana in entry.kanjis:
- items.append((entry_dict.get(entry.entry_seq, 0), kanji_dict.get(kana, 0)))
-
- print('start writing kanji entry join table')
- table = "create table kanji_entry (entry_id varchar, kanji_id varchar );"
- sql = "insert into kanji_entry(entry_id, kanji_id) values(?, ?);"
- write_from_list(conn, items, sql, table)
-
- print('start writing gloss entry join table')
- #table = "create table gloss_entry ( entry_id, gloss_id );"
- #sql = """insert into gloss_entry(entry_id, gloss_id) values(?, ?);"""
- #write_from_list(conn, gloss_entries, sql, table)
-
- cur = conn.cursor()
- sql = """
- create view list_all as
- select
- entry.entry,
- kana.id as kana_id,
- kana.kana,
- kanji.id as kanji_id,
- kanji.kanji
- from
- entry
- join kana_entry
- on entry.id = kana_entry.entry_id
- join kanji_entry
- on entry.id = kanji_entry.entry_id
- left join kana
- on kana_entry.kana_id = kana.id
- left join kanji
- on kanji_entry.kanji_id = kanji.id;
- """
- cur.execute(sql)
- conn.commit()
-
- conn.close()
+class SqliteWriter(Writer):
+
+ def write(self, entries):
+ ''' Writes the various lists to a database.
+
+ :param entries: A list of entries objects.
+ '''
+
+ self.notify('start saving')
+ connection_string = 'test.db'
+
+ if os.path.exists(connection_string):
+ os.remove(connection_string)
+
+ conn = sqlite3.connect(connection_string)
+
+ cur = conn.cursor()
+
+ # Unique tables
+
+ # Need to pull this infromation out of the main list. Should be in a generic class,
+ # or done while reading it - although I imaging that some writeable formats
+ # won't need it.
+ pos = set()
+ kanas = set()
+ kanjis = set()
+ glosses = set()
+
+ # TODO - this is taking far too much time, proably go back to getting this when reading.
+ self.notify('start reading unique values')
+ length = float(len(entries))
+ for i, entry in enumerate(entries):
+ #self.notify('{0}: {1:.0%}'.format(entry.entry_seq, i / length))
+ for kana in entry.kanas:
+ kanas.add(kana)
+
+ for kanji in entry.kanjis:
+ kanjis.add(kanji)
+
+ for gloss in entry.glosses:
+ pos.add(gloss.pos)
+ glosses.add(gloss)
+
+ # TODO - need to figure this out from what I'm passing in now.
+ #table = "create table part_of_speach ( code varchar, text varchar);"
+ #sql = 'insert into part_of_speach(code, text) values(?, ?);'
+ #poss = [(v, k.replace("'", "\'")) for k, v in pos_dict.items()]
+ #write_from_list(conn, poss, sql, table)
+
+ self.notify('start writing entries')
+ table = "create table entry ( id integer primary key, entry );"
+ sql = "insert into entry(entry) values(?);"
+ items = [(i.entry_seq,) for i in entries]
+ write_from_list(conn, items, sql, table)
+
+ self.notify('start writing kana')
+ table = "create table kana ( id integer primary key, kana varchar );"
+ sql = "insert into kana(kana) values(?)"
+ items = convert_to_tuple_list(kanas)
+ write_from_list(conn, items, sql, table)
+
+ self.notify('start writing kanji')
+ table = "create table kanji ( id integer primary key, kanji varchar );"
+ sql = "insert into kanji(kanji) values(?);"
+ items = convert_to_tuple_list(kanjis)
+ write_from_list(conn, items, sql, table)
+
+ self.notify('start writing glosses')
+ table = "create table gloss ( id integer primary key, gloss varchar, pos, lang varchar );"
+ sql = "insert into gloss(gloss, pos, lang) values(?, ?, ?);"
+ items = [(g.gloss, g.pos, g.lang) for g in glosses]
+ write_from_list(conn, items, sql, table)
+
+ self.notify('start writing warehouse')
+ table = "create table warehouse (id integer primary key, entry int, kana varchar, kanji varchar, gloss varchar); "
+ sql = "insert into warehouse(entry, kana, kanji, gloss) values(?, ?, ?, ?)"
+ items = [
+ (
+ entry.entry_seq,
+ u','.join(entry.kanas),
+ u','.join(entry.kanjis),
+ u','.join([g.gloss for g in entry.glosses])
+ )
+ for entry in entries]
+ write_from_list(conn, items, sql, table)
+
+ # Join tables
+ sql = "select id, entry from entry"
+ entry_dict = create_dict_from_sql(conn, sql)
+
+ sql = "select id, kana from kana"
+ kana_dict = create_dict_from_sql(conn, sql)
+
+ items = []
+ for entry in entries:
+ for kana in entry.kanas:
+ items.append((entry_dict.get(entry.entry_seq, 0), kana_dict.get(kana, 0)))
+
+ self.notify('start writing kana entry join table')
+ table = "create table kana_entry ( entry_id varchar, kana_id varchar );"
+ sql = "insert into kana_entry(entry_id, kana_id) values(?, ?);"
+ write_from_list(conn, items, sql, table)
+
+ sql = "select id, kanji from kanji"
+ kanji_dict = create_dict_from_sql(conn, sql)
+
+ items = []
+ for entry in entries:
+ for kana in entry.kanjis:
+ items.append((entry_dict.get(entry.entry_seq, 0), kanji_dict.get(kana, 0)))
+
+ self.notify('start writing kanji entry join table')
+ table = "create table kanji_entry (entry_id varchar, kanji_id varchar );"
+ sql = "insert into kanji_entry(entry_id, kanji_id) values(?, ?);"
+ write_from_list(conn, items, sql, table)
+
+ self.notify('start writing gloss entry join table')
+ #table = "create table gloss_entry ( entry_id, gloss_id );"
+ #sql = """insert into gloss_entry(entry_id, gloss_id) values(?, ?);"""
+ #write_from_list(conn, gloss_entries, sql, table)
+
+ cur = conn.cursor()
+ sql = """
+ create view list_all as
+ select
+ entry.entry,
+ kana.id as kana_id,
+ kana.kana,
+ kanji.id as kanji_id,
+ kanji.kanji
+ from
+ entry
+ join kana_entry
+ on entry.id = kana_entry.entry_id
+ join kanji_entry
+ on entry.id = kanji_entry.entry_id
+ left join kana
+ on kana_entry.kana_id = kana.id
+ left join kanji
+ on kanji_entry.kanji_id = kanji.id;
+ """
+ cur.execute(sql)
+ conn.commit()
+
+ conn.close()
+
+ self.notify('done saving')
View
20 main.py
@@ -6,7 +6,9 @@
import sys
from JMdictParser import Parser
-from data import write_list_to_database
+from data import SqliteWriter
+
+from observer import ConsoleViewer
def main():
parser = argparse.ArgumentParser(description='Import edict xml')
@@ -28,12 +30,16 @@ def main():
print('Invalid file name')
exit(1)
- print('start reading')
- entries = Parser(filename, sys.stdout).parse_from_file()
- print('done reading')
- print('start saving')
- write_list_to_database(entries)
- print('done saving')
+ viewer = ConsoleViewer()
+ parser = Parser()
+ parser.attach(viewer)
+
+ entries = parser.parse_from_file(filename)
+
+
+ writer = SqliteWriter()
+ writer.attach(viewer)
+ writer.write(entries)
if list_values:
print('Not implemented')
View
28 observer.py
@@ -0,0 +1,28 @@
+# Grabbed from http://code.activestate.com/recipes/131499-observer-pattern/
+# Even though it's simple enough to have figured out to begin with.
+
+__all__ = ['Subject', 'ConsoleViewer']
+
+class Subject(object):
+ def __init__(self):
+ self._observers = []
+
+ def attach(self, observer):
+ if not observer in self._observers:
+ self._observers.append(observer)
+
+ def detach(self, observer):
+ try:
+ self._observers.remove(observer)
+ except ValueError:
+ pass
+
+ def notify(self, message='', modifier=None):
+ for observer in self._observers:
+ if modifier != observer:
+ observer.update(message)
+
+class ConsoleViewer(object):
+ def update(self, message):
+ print('{0}'.format(message))
+
View
46 readme.txt
@@ -0,0 +1,46 @@
+The ultimate goal of this project is to feed a list of Japanese words into a
+program and get a list of translations back out.
+
+The first step was to read a Japanese translation dictionary into a format
+that could then be queried. Then create something to break up Japanese text,
+feed the words into this, and print out the results.
+
+My original approach was to insert the contents of the JMdict
+Japanese translation file into a sqlite database. I was hoping that I could
+then use sql syntax to make searching easier.
+
+Inserting the data into a sqlite database was relatively easy, despite
+initially running into issues using SqlAlchemy. It may eventually be useful
+but the insert queries it ran would take hours to complete. I've now managed
+to get it down to a couple minutes, but the join required on fully normalized
+data meant that it was slower than reading the file directly from xml. I was
+in the process of creating a single warehouse table before getting distracted
+by other things. That would still allow the sqlite file to be a
+cross-platform data file, but it would lose the benefits of a normalized schema.
+
+I don't necessarily want to entirely scrap that idea, but for any data analysis
+I may try other database backends instead.
+
+The current goal is still to read the dictionary file and convert it into some
+kind of non-xml store that can be quickly read in or queried. I haven't gotten
+into any other specifics yet.
+
+The current project setup is a little cluttered. At some point it will have to
+be cleaned up.
+
+Required packages:
+- lxml
+- SqlAlchemy - for database setup (Need to eventually remove, or at least move,
+ this requirement, as not all stores are going to need it).
+
+TODO:
+- Create a means of querying a data store.
+- Develop companion readers/writers for each existing type - ideally you will
+be able to read in anything that has been written out, and write out anything
+that has been read in.
+- Figure out why sqlite on windows isn't saving the data as unicode, or if it
+is just a console issue.
+- Perhaps move some of the sqlite normalized table information into the reader
+as it is currently an extra step. Although it may only be useful for sql
+stores.
+
Please sign in to comment.
Something went wrong with that request. Please try again.