From 24471a052de1df233811f830c606850abf710445 Mon Sep 17 00:00:00 2001
From: Derek Eder
Date: Wed, 12 Dec 2012 17:34:16 -0600
Subject: [PATCH] uncommented rest of sqlite_example script, still bugs to fix
 with sqlite3.Row factory change

---
 examples/sqlite_example/sqlite_example.py | 89 +++++++++++------------
 1 file changed, 41 insertions(+), 48 deletions(-)

diff --git a/examples/sqlite_example/sqlite_example.py b/examples/sqlite_example/sqlite_example.py
index 38fa11f35..58d2a1493 100644
--- a/examples/sqlite_example/sqlite_example.py
+++ b/examples/sqlite_example/sqlite_example.py
@@ -116,7 +116,7 @@ def selector(doc_ids) :
 
 print 'creating inverted index'
 
-full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors"))
+full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors LIMIT 1000"))
 blocker.invertIndex(full_data)
 
 print 'creating canopies'
@@ -129,60 +129,53 @@ def selector(doc_ids) :
     blocker.canopies[threshold.__name__ + field] = canopy
     counter += 1
 
-# print 'writing blocking map'
-# def block_data() :
-#     for donor_id, record in con.execute("SELECT * FROM donors LIMIT 1000") :
-#         if donor_id % 10000 == 0 :
-#             print donor_id
-#         for key in blocker((donor_id, record)):
-#             yield (str(key), donor_id)
+print 'writing blocking map'
+def block_data() :
+    full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors LIMIT 1000"))
+    for donor_id, record in full_data :
+        if donor_id % 10000 == 0 :
+            print donor_id
+        for key in blocker((donor_id, record)):
+            yield (str(key), donor_id)
 
-# con.executemany("INSERT OR IGNORE INTO blocking_map VALUES (?, ?)",
-#                 block_data())
+con.executemany("INSERT OR IGNORE INTO blocking_map VALUES (?, ?)",
+                block_data())
+con.commit()
 
+print 'writing largest blocks to file'
 
-# con.commit()
 
+with open('sqlite_example_block_sizes.txt', 'a') as f:
+    con.row_factory = None
+    f.write(time.asctime())
+    f.write('\n')
+    for row in con.execute("SELECT key, COUNT(donor_id) AS block_size "
+                           "FROM blocking_map GROUP BY key "
+                           "ORDER BY block_size DESC LIMIT 10") :
+        print row
+        f.write(str(row))
+        f.write('\n')
+    con.row_factory = dict_factory
 
 
-
-
-# print 'writing largest blocks to file'
-
-
-
-# with open('sqlite_example_block_sizes.txt', 'a') as f:
-#     con.row_factory = None
-#     f.write(time.asctime())
-#     f.write('\n')
-#     for row in con.execute("SELECT key, COUNT(donor_id) AS block_size "
-#                            "FROM blocking_map GROUP BY key "
-#                            "ORDER BY block_size DESC LIMIT 10") :
-
-#         print row
-#         f.write(str(row))
-#         f.write('\n')
-#     con.row_factory = dict_factory
-
-
-# print 'reading blocked data'
-# con.row_factory = blocking_factory
-# cur = con.cursor()
-# cur.execute('select * from donors join '
-#             '(select key, donor_id from blocking_map '
-#             'join (select key, count(donor_id) num_candidates from blocking_map '
-#             'group by key having num_candidates > 1) '
-#             'as bucket using (key)) as candidates using (donor_id)')
-# blocked_data = defaultdict(list)
-# for k, v in cur :
-#     blocked_data[k].append(v)
-
-# print 'clustering...'
-# clustered_dupes = deduper.duplicateClusters(blocked_data)
-
-# print '# duplicate sets'
-# print len(clustered_dupes)
+print 'reading blocked data'
+con.row_factory = blocking_factory
+cur = con.cursor()
+cur.execute('select * from donors join '
+            '(select key, donor_id from blocking_map '
+            'join (select key, count(donor_id) num_candidates from blocking_map '
+            'group by key having num_candidates > 1) '
+            'as bucket using (key)) as candidates using (donor_id)')
+blocked_data = defaultdict(list)
+for k, v in cur :
+    blocked_data[k].append(v)
+
+print 'clustering...'
+clustered_dupes = deduper.duplicateClusters(blocked_data)
+
+print '# duplicate sets'
+print len(clustered_dupes)
 
 
 cur.close()
 con.close()
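
Note on the row factory bugs flagged in the subject line: the script swaps con.row_factory between None (plain tuples, used when logging block sizes), dict_factory (so rows support row['donor_id'] lookups), and blocking_factory (defined earlier in the script and not shown in this hunk). A minimal sketch of that swapping pattern, assuming dict_factory follows the standard recipe from the sqlite3 docs; the table schema and values here are stand-ins for illustration, not the script's actual data:

    import sqlite3

    def dict_factory(cursor, row) :
        # standard sqlite3 recipe: expose each row as a plain dict keyed
        # by column name, so callers can write row['donor_id']
        d = {}
        for idx, col in enumerate(cursor.description) :
            d[col[0]] = row[idx]
        return d

    # stand-in schema and data for illustration only
    con = sqlite3.connect(':memory:')
    con.execute("CREATE TABLE donors (donor_id INTEGER, name TEXT)")
    con.execute("INSERT INTO donors VALUES (1, 'example donor')")

    con.row_factory = dict_factory       # rows come back as dicts
    row = con.execute("SELECT * FROM donors").fetchone()
    print row['donor_id']                # 1

    con.row_factory = None               # back to plain tuples, as when
    row = con.execute("SELECT * FROM donors").fetchone()
    print row                            # (1, u'example donor')

    con.close()

If the remaining bugs come from the move to sqlite3.Row, a likely culprit is that sqlite3.Row supports row['col'] lookups but is not an actual dict, so code that expects dict methods on the rows can break.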