Skip to content

Commit

Permalink
uncommented rest of sqlite_example script, still bugs to fix with sqlite3.Row factory change
Browse files Browse the repository at this point in the history
  • Loading branch information
derekeder committed Dec 12, 2012
1 parent 4de1a52 commit 24471a0
Showing 1 changed file with 41 additions and 48 deletions.
89 changes: 41 additions & 48 deletions examples/sqlite_example/sqlite_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def selector(doc_ids) :


print 'creating inverted index'
# NOTE(review): the next two lines are the old/new sides of a diff hunk for the
# SAME assignment; in the committed file only the LIMIT 1000 version exists (the
# reassignment means the second line wins either way).
# Lazy generator of (donor_id, row) pairs; assumes the connection's row factory
# returns rows indexable by column name (sqlite3.Row or a dict factory) — TODO
# confirm against the row_factory set earlier in the script.
full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors"))
full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors LIMIT 1000"))
blocker.invertIndex(full_data)

print 'creating canopies'
Expand All @@ -129,60 +129,53 @@ def selector(doc_ids) :
blocker.canopies[threshold.__name__ + field] = canopy
counter += 1

# print 'writing blocking map'
# def block_data() :
# for donor_id, record in con.execute("SELECT * FROM donors LIMIT 1000") :
# if donor_id % 10000 == 0 :
# print donor_id
# for key in blocker((donor_id, record)):
# yield (str(key), donor_id)
print 'writing blocking map'
def block_data() :
    """Yield (block_key, donor_id) pairs for insertion into blocking_map.

    Re-reads the first 1000 donor rows and runs each through the
    module-level `blocker`, which maps a record to zero or more block
    keys.  Keys are stringified so they bind cleanly as TEXT parameters.
    """
    # Assumes rows are indexable by column name (sqlite3.Row / dict
    # factory) — the commit message says this factory change still has
    # bugs, so verify before relying on it.
    full_data = ((row['donor_id'], row) for row in con.execute("SELECT * FROM donors LIMIT 1000"))
    for donor_id, record in full_data :
        # Progress indicator only; no functional effect.
        if donor_id % 10000 == 0 :
            print donor_id
        for key in blocker((donor_id, record)):
            yield (str(key), donor_id)


# con.executemany("INSERT OR IGNORE INTO blocking_map VALUES (?, ?)",
# block_data())
# Stream block_data()'s (key, donor_id) pairs into blocking_map.
# "OR IGNORE" silently drops rows that violate a uniqueness constraint,
# so duplicate (key, donor_id) pairs from the blocker are tolerated.
con.executemany("INSERT OR IGNORE INTO blocking_map VALUES (?, ?)",
                block_data())

con.commit()

print 'writing largest blocks to file'

# con.commit()
# Append a timestamped report of the 10 largest blocks to a log file,
# both printing each row and writing it out for later inspection.
with open('sqlite_example_block_sizes.txt', 'a') as f:
    # Plain-tuple rows are wanted here, so the custom factory is
    # temporarily disabled.  NOTE(review): sqlite3 applies row_factory
    # when a cursor is created, so this relies on con.execute() below
    # making a fresh cursor — confirm this interacts correctly with the
    # factory change mentioned in the commit message.
    con.row_factory = None
    f.write(time.asctime())
    f.write('\n')
    for row in con.execute("SELECT key, COUNT(donor_id) AS block_size "
                           "FROM blocking_map GROUP BY key "
                           "ORDER BY block_size DESC LIMIT 10") :

        print row
        f.write(str(row))
        f.write('\n')
# Restore the dict-producing factory for the rest of the script.
con.row_factory = dict_factory



# print 'writing largest blocks to file'



# with open('sqlite_example_block_sizes.txt', 'a') as f:
# con.row_factory = None
# f.write(time.asctime())
# f.write('\n')
# for row in con.execute("SELECT key, COUNT(donor_id) AS block_size "
# "FROM blocking_map GROUP BY key "
# "ORDER BY block_size DESC LIMIT 10") :

# print row
# f.write(str(row))
# f.write('\n')
# con.row_factory = dict_factory


# print 'reading blocked data'
# con.row_factory = blocking_factory
# cur = con.cursor()
# cur.execute('select * from donors join '
# '(select key, donor_id from blocking_map '
# 'join (select key, count(donor_id) num_candidates from blocking_map '
# 'group by key having num_candidates > 1) '
# 'as bucket using (key)) as candidates using (donor_id)')
# blocked_data = defaultdict(list)
# for k, v in cur :
# blocked_data[k].append(v)

# print 'clustering...'
# clustered_dupes = deduper.duplicateClusters(blocked_data)

# print '# duplicate sets'
# print len(clustered_dupes)
print 'reading blocked data'
# Switch to blocking_factory for the join below; presumably it maps each
# joined row to a (block_key, record) pair — verify against its
# definition earlier in the file.
con.row_factory = blocking_factory
cur = con.cursor()
# Join donors to blocking_map, keeping only keys shared by more than one
# donor (singleton blocks can contain no duplicate pairs).
cur.execute('select * from donors join '
            '(select key, donor_id from blocking_map '
            'join (select key, count(donor_id) num_candidates from blocking_map '
            'group by key having num_candidates > 1) '
            'as bucket using (key)) as candidates using (donor_id)')
# Group candidate records by block key for the deduper.
blocked_data = defaultdict(list)
for k, v in cur :
    blocked_data[k].append(v)

print 'clustering...'
clustered_dupes = deduper.duplicateClusters(blocked_data)

print '# duplicate sets'
print len(clustered_dupes)

# Release the cursor and connection now that clustering is done.
cur.close()
con.close()
Expand Down

0 comments on commit 24471a0

Please sign in to comment.