Merge pull request #33 from open-city/packaging
bundle into a python package, resolves #19
fgregg committed Jul 20, 2012
2 parents f2935b3 + 331f93f commit 31e2a7f
Showing 22 changed files with 68 additions and 263 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ logfile
examples/output/*.*
kernprof.py
possible_classifiers
.DS_Store
24 changes: 16 additions & 8 deletions README.rdoc
@@ -1,4 +1,4 @@
= Deduplication Library
= Dedupe Python Library
A free Python library for accurate and scalable deduplication and entity resolution.

Based on Mikhail Yuryevich Bilenko's Ph.D. dissertation: Learnable Similarity Functions and their Application to Record Linkage and Clustering
@@ -11,20 +11,28 @@ Current solutions break easily, don’t scale, and require significant developer

== Team

* Forest Gregg
* Forest Gregg mailto:fgregg@gmail.com
* Derek Eder mailto:derek.eder@opencityapps.org

== Usage
> python setup.py build_ext --inplace
> python dedupe.py
> python setup.py install
> cd examples
> python active_canonical_example.py
(use 'y', 'n' and 'u' keys to flag duplicates for active learning)

== Other Executable Modules
== Example datasets

As we continue to refine this library, we have added several datasets to test against. These can all be executed from the examples/ directory.

The following use human input to flag duplicates:

* active_canonical_example.py - 864 rows. canonical restaurant dataset from Bilenko's research
* early_childhood.py - 3,720 rows. compilation of 9 datasets containing locations for early childhood education in Chicago
* tech_locator.py - 852 rows. compilation of 2 lists of locations of technology resources in the City of Chicago.

The following do not use human input:

* blocking.py - loads in test data and finds optimum blocking predicates
* canonical_example.py - loads in canonical restaurant test data, trains on provided known duplicates, and outputs precision and recall values
* predicates.py - tests the functionality of defined predicates
* training_sample.py - tests active learning with user input

== Errors / Bugs

9 changes: 9 additions & 0 deletions dedupe/__init__.py
@@ -0,0 +1,9 @@
__all__ = ["affinegap", "blocking", "clustering", "core", "lr", "predicates", "training_sample"]

import affinegap
import blocking
import clustering
import core
import lr
import predicates
import training_sample
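
Once the package is installed, the submodules declared here become importable under the dedupe namespace. A minimal sketch of what that enables (the two-string call mirrors the renamed test file below; leaving any optional weight parameters at their defaults is an assumption):

  import dedupe

  # compare two strings with the normalized affine gap distance
  # (assumes a plain two-string call, as exercised in dedupe/test/testaffine.py)
  score = dedupe.affinegap.normalizedAffineGapDistance('derek eder', 'derek e. eder')
  print(score)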
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added dedupe/test/__init__.py
3 changes: 2 additions & 1 deletion testaffine.py → dedupe/test/testaffine.py
@@ -1,5 +1,6 @@
import cProfile
from affinegap import affineGapDistance, normalizedAffineGapDistance

from ..affinegap import affineGapDistance, normalizedAffineGapDistance

def performanceTest() :
for i in xrange(100000) :
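Note that the relative import (from ..affinegap import ...) only resolves when the test runs as part of the package; from the repository root, something like the following would be needed (assuming the file keeps a runnable entry point):

> python -m dedupe.test.testaffine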
2 changes: 1 addition & 1 deletion testclustering.py → dedupe/test/testclustering.py
@@ -1,4 +1,4 @@
from clustering import *
from ..clustering import *


dupes = (((1,2), .95),
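The dupes structure pairs record ids with a confidence score. Purely as an illustration of consuming that structure (this is not dedupe.clustering.cluster), pairs above a threshold can be merged into groups with a small union-find:

  from collections import defaultdict

  dupes = (((1, 2), .95), ((2, 3), .90), ((4, 5), .10))

  parent = {}

  def find(x):
      # walk up to the root representative of x's group
      parent.setdefault(x, x)
      while parent[x] != x:
          x = parent[x]
      return x

  for (a, b), score in dupes:
      if score > 0.5:                 # threshold is an arbitrary choice here
          parent[find(a)] = find(b)   # union the two groups

  groups = defaultdict(list)
  for x in list(parent):
      groups[find(x)].append(x)
  print(list(groups.values()))        # [[1, 2, 3]]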
File renamed without changes.
11 changes: 6 additions & 5 deletions examples/active_canonical_example.py
@@ -1,9 +1,10 @@
from dedupe import *
from canonical_example import init
from training_sample import activeLearning, consoleLabel
from blocking import trainBlocking, blockingIndex, mergeBlocks
from predicates import *
from core import scorePairs
from clustering import cluster
from dedupe.training_sample import activeLearning, consoleLabel
from dedupe.blocking import trainBlocking, blockingIndex, mergeBlocks
from dedupe.predicates import *
from dedupe.core import scorePairs
from dedupe.clustering import cluster

num_training_dupes = 200
num_training_distinct = 16000
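The rewritten imports spell out the pipeline's stages: train blocking predicates, build and merge blocks into candidate pairs, score the pairs, and cluster. A toy, self-contained sketch of that flow follows; none of these names or signatures come from the library, and the similarity function is a crude stand-in for the learned affine-gap distance:

  from itertools import combinations

  records = {
      1: {'name': 'city tavern',  'city': 'chicago'},
      2: {'name': 'city tavern ', 'city': 'chicago'},
      3: {'name': "lou mitchell's", 'city': 'chicago'},
  }

  # blocking: only compare records that share a cheap key (first token of name)
  blocks = {}
  for rid, rec in records.items():
      blocks.setdefault(rec['name'].split()[0], []).append(rid)

  candidate_pairs = set()
  for ids in blocks.values():
      candidate_pairs.update(combinations(sorted(ids), 2))

  # scoring: stand-in for the learned similarity
  def similarity(a, b):
      return 1.0 if a['name'].strip() == b['name'].strip() else 0.0

  scored = [((i, j), similarity(records[i], records[j]))
            for i, j in candidate_pairs]

  # "clustering": keep pairs above a threshold
  print([pair for pair, s in scored if s > 0.5])   # [(1, 2)]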
15 changes: 7 additions & 8 deletions examples/canonical_example.py
@@ -1,8 +1,11 @@
from itertools import combinations
import csv
import re
from core import frozendict
from clustering import cluster

#dedupe modules
from dedupe import *
from dedupe.core import frozendict
from dedupe.clustering import cluster

def canonicalImport(filename) :

@@ -67,13 +70,9 @@ def init() :
# main execution
if __name__ == '__main__':

from predicates import *
import core
import training_sample
import blocking
import clustering
from dedupe import *
from dedupe.predicates import *
import os


import time
t0 = time.time()
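The frozendict imported from dedupe.core exists so that rows can serve as hashable records. A minimal sketch of the idea (the library's actual implementation may differ):

  class frozendict(dict):
      # hashable by freezing the items; callers simply never mutate it
      def __hash__(self):
          return hash(frozenset(self.items()))

  row = frozendict(name='city tavern', city='chicago')
  seen = {row}            # usable in a set ...
  labels = {row: 'dupe'}  # ... and as a dict key, unlike a plain dict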
18 changes: 10 additions & 8 deletions examples/early_childhood.py
@@ -1,13 +1,15 @@
from training_sample import activeLearning, consoleLabel
from blocking import trainBlocking, blockingIndex, mergeBlocks
from predicates import *
import core
from random import sample
import clustering
import csv
import re
import os

#dedupe modules
from dedupe.training_sample import activeLearning, consoleLabel
from dedupe.blocking import trainBlocking, blockingIndex, mergeBlocks
from dedupe.predicates import *
import dedupe.core
import dedupe.clustering

def earlyChildhoodImport(filename) :
data_d = {}
duplicates_d = {}
@@ -21,7 +23,7 @@ def earlyChildhoodImport(filename) :
col = re.sub('\n', ' ', col)
instance[header[j]] = col.strip().strip('"').strip("'").lower()

data_d[i] = core.frozendict(instance)
data_d[i] = dedupe.core.frozendict(instance)

return(data_d, header)

@@ -46,7 +48,7 @@ def init(inputFile) :
def dictSubset(d, keys) :
return dict((k,d[k]) for k in keys if k in d)

inputFile = "examples/datasets/ECP_all_raw_input.csv"
inputFile = "datasets/ECP_all_raw_input.csv"
num_training_dupes = 200
num_training_distinct = 16000
numIterations = 100
@@ -121,7 +123,7 @@ def dictSubset(d, keys) :
orig_data[row_id] = row


with open("examples/output/ECP_dupes_list_" + str(time.time()) + ".csv","w") as f :
with open("output/ECP_dupes_list_" + str(time.time()) + ".csv","w") as f :
writer = csv.writer(f)
heading_row = header
heading_row.insert(0, "Group_ID")
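The import loop above normalises every column the same way: newlines become spaces, surrounding quotes are stripped, and the text is lower-cased. The same cleaning, pulled out on its own:

  import re

  def clean(col):
      col = re.sub('\n', ' ', col)
      return col.strip().strip('"').strip("'").lower()

  print(clean(' "Near West\nSide" '))   # near west side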
19 changes: 10 additions & 9 deletions examples/tech_locator.py
@@ -1,13 +1,14 @@
from training_sample import activeLearning, consoleLabel
from blocking import trainBlocking, blockingIndex, mergeBlocks
from predicates import *
import core
from random import sample
import clustering
import csv
import re
import os

#dedupe modules
from dedupe.training_sample import activeLearning, consoleLabel
from dedupe.blocking import trainBlocking, blockingIndex, mergeBlocks
from dedupe.predicates import *
import dedupe.core
import dedupe.clustering

def techLocatorImport(filename) :
data_d = {}
duplicates_d = {}
@@ -21,7 +22,7 @@ def techLocatorImport(filename) :
col = re.sub('\n', ' ', col)
instance[header[j]] = col.strip().strip('"').strip("'").lower()

data_d[i] = core.frozendict(instance)
data_d[i] = dedupe.core.frozendict(instance)

return(data_d, header)

@@ -47,7 +48,7 @@ def dictSubset(d, keys) :
return dict((k,d[k]) for k in keys if k in d)


inputFile = "examples/datasets/Tech Locator Master List.csv"
inputFile = "datasets/Tech Locator Master List.csv"
num_training_dupes = 200
num_training_distinct = 16000
numIterations = 100
@@ -120,7 +121,7 @@ def dictSubset(d, keys) :
orig_data[row_id] = row


with open("examples/output/TL_dupes_list_" + str(time.time()) + ".csv","w") as f :
with open("output/TL_dupes_list_" + str(time.time()) + ".csv","w") as f :
writer = csv.writer(f)
heading_row = header
heading_row.insert(0, "Group_ID")
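Both example scripts define the same dictSubset helper; it selects a subset of keys while silently skipping any that are absent:

  def dictSubset(d, keys):
      return dict((k, d[k]) for k in keys if k in d)

  print(dictSubset({'name': 'city tavern', 'city': 'chicago'}, ['name', 'zip']))
  # {'name': 'city tavern'}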
172 changes: 0 additions & 172 deletions lib/log_train.py

This file was deleted.
