Permalink
Browse files

better script handling

  • Loading branch information...
1 parent dddc784 commit 4584ac6b9511123a8741f7afef663907b370f527 @grobie committed Oct 8, 2009
Showing with 42 additions and 29 deletions.
  1. +14 −2 workshop.rb → datenreinigung.rb
  2. +28 −27 detector.rb
View
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
# dependencies
require "rubygems"
require "activerecord"
@@ -19,5 +21,15 @@
# ActiveRecord::Base.logger = Logger.new(STDOUT)
ActiveRecord::Base.establish_connection(Datenreinigung::Config['database'][Datenreinigung::Config['database']['adapter']].merge(:adapter => Datenreinigung::Config['database']['adapter']))
-# Clean.process
-# Detector.new("nachvorstrasse", true).process
# Command-line entry point; runs only when this file is executed directly
# (not when it is required). Expects exactly one argument:
#   all    - run Clean.process, then Detector.process
#   clean  - run Clean.process only
#   detect - run Detector.process only
# Any other invocation prints the usage line.
if __FILE__ == $0
  command = ARGV.first
  if ARGV.size == 1 && %w(all clean detect).include?(command)
    # Cleaning always happens before detection when both are requested.
    Clean.process if %w(all clean).include?(command)
    Detector.process if %w(all detect).include?(command)
  else
    puts "usage: datenreinigung {all|clean|detect}"
  end
end
View
@@ -4,55 +4,56 @@ class Detector
THRESHOLD = 0.3
STEPSIZE = 10000
- def initialize(key, save = false)
- @key = key
- @save = save
- end
-
# Reports the detected matches for one key, ordered by ascending :distance.
#
# results - Array of Hashes with the keys :distance, :k1 and :k2
#           (:k1 / :k2 must respond to #origid when saving is enabled).
# key     - name of the key the matches were found under; stored in the
#           "key" column of the matches table.
#
# When Datenreinigung::Config["detector"]["save"] is truthy, all matches are
# written to the matches table with a single multi-row INSERT; otherwise each
# match is printed to stdout.
#
# NOTE(review): key is interpolated into the SQL text between single quotes
# without escaping - presumably safe because it comes from the configuration
# file; confirm.
- def output(results)
+ def self.output(results, key)
values = []
results.sort_by { |r| r[:distance] }.each do |result|
- if @save
+ if Datenreinigung::Config["detector"]["save"]
# Normalise the pair so the smaller origid is always stored first.
minid, maxid = result[:k1].origid > result[:k2].origid ? [result[:k2].origid, result[:k1].origid] : [result[:k1].origid, result[:k2].origid]
- values << [minid, maxid, "'#{@key}'", result[:distance]].join(",")
+ values << [minid, maxid, "'#{key}'", result[:distance]].join(",")
else
puts "#{result[:distance]}: #{result[:k1]} ---- #{result[:k2]}"
end
end
# Skip the INSERT when nothing was collected; an empty list would produce
# the statement "... VALUES ()".
- if @save
+ if Datenreinigung::Config["detector"]["save"] && !values.empty?
query = 'INSERT INTO matches (kunde1_id,kunde2_id,key,distance) VALUES (' + values.join('),(') + ')'
ActiveRecord::Base.connection.execute(query)
end
end
# Runs the duplicate search once for every key listed in
# Datenreinigung::Config["detector"]["keys"].
#
# For each key the Kunde records joined with that key are loaded in batches,
# ordered by value. Inside a batch every record is compared (via
# #distance_to) with up to WINDOWSIZE records that precede it, and every pair
# whose distance is below THRESHOLD is collected and handed to output. A
# timing summary is printed per batch.
#
# NOTE(review): WINDOWSIZE is not defined in the visible part of this file -
# presumably a class constant declared above THRESHOLD; confirm.
# NOTE(review): count is taken over all Kunde rows, not only those that have
# the current key, so the number of steps may be larger than needed.
# NOTE(review): key is interpolated into the :conditions SQL fragment without
# escaping - presumably safe because it comes from the configuration; confirm.
- def process
+ def self.process
count = Kunde.count
steps = (count / STEPSIZE) + 1
- (0..steps).each do |step|
- retrieve = Time.now
- kunden = Kunde.all(:conditions => "kunden_keys.keyname = '#{@key}'", :joins => :key, :order => 'value ASC', :offset => [step*STEPSIZE - WINDOWSIZE, 0].max, :limit => (step+1)*STEPSIZE)
+ Datenreinigung::Config["detector"]["keys"].each do |key|
+ puts "Search by #{key}"
- detect = Time.now
- result = []
- kunden.each_with_index do |kunde, i|
- (0..([i, WINDOWSIZE].min-1)).each do |k|
- other = kunden[i-k-1]
- distance = kunde.distance_to(other)
- result << {:distance => distance, :k1 => kunde, :k2 => other} if distance < THRESHOLD
# NOTE(review): (0..steps) is inclusive, so this runs steps + 1 times; batches
# that come back empty are skipped by the "next if kunden.empty?" guard below.
+ (0..steps).each do |step|
+ retrieve = Time.now
# The offset starts WINDOWSIZE rows before the batch boundary (clamped at 0)
# so records at the start of a batch are still compared with their
# predecessors from the previous batch.
# NOTE(review): :limit is (step+1)*STEPSIZE and therefore grows with every
# step while the offset only advances by STEPSIZE; from the second step on
# consecutive batches overlap by more than WINDOWSIZE rows, so the same pairs
# can be compared and reported again. Looks like STEPSIZE + WINDOWSIZE was
# intended - confirm.
+ kunden = Kunde.all(:conditions => "kunden_keys.keyname = '#{key}'", :joins => :key, :order => 'value ASC', :offset => [step*STEPSIZE - WINDOWSIZE, 0].max, :limit => (step+1)*STEPSIZE)
+
+ next if kunden.empty?
+
+ detect = Time.now
+ result = []
+ kunden.each_with_index do |kunde, i|
# Compare with kunden[i-1] down to kunden[i-min(i, WINDOWSIZE)]; for i == 0
# the range is empty and nothing is compared.
+ (0..([i, WINDOWSIZE].min-1)).each do |k|
+ other = kunden[i-k-1]
+ distance = kunde.distance_to(other)
+ result << {:distance => distance, :k1 => kunde, :k2 => other} if distance < THRESHOLD
+ end
end
+
+ insert = Time.now
+ output(result, key)
+
+ finish = Time.now
+ puts " processed #{kunden.size} objects and found #{result.size} duplicates in #{time_format(finish-retrieve)} minutes (select: #{time_format(detect-retrieve)}, detect: #{time_format(insert-detect)}, insert: #{time_format(finish-insert)})"
end
-
- insert = Time.now
- output(result)
-
- finish = Time.now
- puts "processed #{kunden.size} objects and found #{result.size} duplicates in #{time_format(finish-retrieve)} minutes (select: #{time_format(detect-retrieve)}, detect: #{time_format(insert-detect)}, insert: #{time_format(finish-insert)})"
end
end
- def time_format(seconds)
# Formats a duration as "M:SS.s" (whole minutes, seconds with one decimal).
#
# seconds - duration in seconds (Integer or Float), expected to be >= 0.
#
# Returns a String such as "1:05.0" for 65 seconds.
#
# The value is rounded to tenths of a second *before* it is split into
# minutes and seconds, so 59.96 becomes "1:00.0" instead of "0:60.0", and the
# seconds part is zero-padded so "1:05.0" cannot be misread as "1:50".
def self.time_format(seconds)
  tenths = (seconds * 10).round
  minutes, rest = tenths.divmod(600) # 600 tenths of a second per minute
  format("%d:%04.1f", minutes, rest / 10.0)
end

0 comments on commit 4584ac6

Please sign in to comment.