Skip to content

Commit

Permalink
Merge pull request #344 from Fideln8/master
Browse files (browse the repository at this point in the history)
Update tfidf.py
  • Loading branch information
fgregg committed Jan 27, 2015
2 parents a7ccddd + 14de4eb commit 16fd040
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions dedupe/tfidf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,8 @@
import logging
from index import CanopyIndex
import math
import collections
import itertools

logger = logging.getLogger(__name__)

Expand All @@ -13,15 +15,13 @@ def __init__(self, field, stop_words=[]) :
self._index = CanopyIndex(stop_words)

self._i_to_id = {}
self._parseTerms = self._index.lexicon.parseTerms

def _hash32(self, x) :
i = hash(x)
return int(math.copysign(i % (2**31), i))

self._id_to_i = collections.defaultdict(itertools.count(-2**31).next)

self._parseTerms = self._index.lexicon.parseTerms

def index(self, record_id, doc) :
i = self._hash32(record_id)
i = self._id_to_i[record_id]
self._i_to_id[i] = record_id

try :
Expand All @@ -31,7 +31,7 @@ def index(self, record_id, doc) :
raise

def unindex(self, record_id) :
i = self._hash32(record_id)
i = self._id_to_i.pop(record_id)
del self._i_to_id[i]
self._index.unindex_doc(i)

Expand Down

0 comments on commit 16fd040

Please sign in to comment.