Permalink
Browse files

init

  • Loading branch information...
0 parents commit 280b251b1bb9830798d021171eb6f8f4fa0bffff @grobie committed Oct 8, 2009
Showing with 647 additions and 0 deletions.
  1. +2 −0 .gitignore
  2. +5 −0 TODO
  3. +39 −0 clean.rb
  4. +244 −0 cleaner.rb
  5. +16 −0 config.example.yml
  6. +15 −0 config.rb
  7. +59 −0 detector.rb
  8. +22 −0 distance.rb
  9. +34 −0 export.rb
  10. +3 −0 key.rb
  11. +146 −0 kunde.rb
  12. +17 −0 marketing.rb
  13. +2 −0 match.rb
  14. +15 −0 versand.rb
  15. +5 −0 webshop.rb
  16. +23 −0 workshop.rb
@@ -0,0 +1,2 @@
+.DS_Store
+config.yml
5 TODO
@@ -0,0 +1,5 @@
+* Datenreinigung Herr und Frau ...
+
+* Ähnlichkeit Vergleich Referenzdaten
+* vornamen matching (Ä. <=> Ämil)
+* hausnummern range
@@ -0,0 +1,39 @@
# Copies records from the source tables into the "kunden" table using
# multi-row INSERT statements. The source objects are expected to expose the
# cleaned attribute readers called in row_for (anrede, titel, vorname, ...).
class Clean

  # Number of rows collected before one INSERT statement is sent.
  BATCH_SIZE = 5000

  # Executes one multi-row INSERT into "kunden".
  #
  # values - Array of Strings; each element is the already quoted,
  #          comma-separated value list of one row.
  #
  # Does nothing for an empty array, because "VALUES ()" is not valid SQL.
  def self.insert(values)
    return if values.empty?

    query = 'INSERT INTO "kunden" ("origid","origtable","anrede","titel","vorname","mittelname","nachname","strasse","hausnummervon","hausnummerbis","postfach","postleitzahl","ort","ortzusatz","vorwahl","telefonnummer","geburtsdatum") VALUES (' + values.join('),(') + ")\n\n"
    ActiveRecord::Base.connection.execute(query)
  end

  # Loads every record of each listed source class and writes it to "kunden"
  # in batches of BATCH_SIZE rows, printing progress after each full batch.
  def self.process
    %w(Versand).each do |klass|
      puts klass

      processed = 0
      klass.constantize.all.each_slice(BATCH_SIZE) do |batch|
        rows = batch.map { |object| row_for(object, klass) }
        processed += batch.size
        puts "Processed #{processed} tupel" if batch.size == BATCH_SIZE
        insert(rows)
      end
    end
  end

  # Builds the SQL value list for one source object.
  # The readers are called in exactly this order; keep it stable in case the
  # readers of the source object have side effects on one another.
  def self.row_for(object, klass)
    attributes = [object.id, klass, object.anrede, object.titel, object.vorname, object.mittelname, object.nachname, object.strasse, object.hausnummervon, object.hausnummerbis, object.postfach, object.postleitzahl, object.ort, object.ortzusatz, object.vorwahl, object.telefonnummer, object.geburtsdatum]
    attributes.map { |attribute| sql_literal(attribute) }.join(", ")
  end
  private_class_method :row_for

  # Converts a Ruby value into its SQL literal.
  # Matching on the class itself (instead of its name) also covers subclasses
  # such as DateTime, which would otherwise be emitted unquoted.
  def self.sql_literal(attribute)
    case attribute
    when String then "'#{ActiveRecord::Base.connection.quote_string(attribute)}'"
    when Date then "'#{attribute}'"
    when nil then "null"
    else attribute
    end
  end
  private_class_method :sql_literal

end
@@ -0,0 +1,244 @@
# Mixin with cleaned-up readers for customer records.
#
# The including class must provide self[attr] / self[attr] = value for the raw
# columns (:id, :nachname, :vorname, :strasse, :hausnummer, :postleitzahl,
# :postfach, :ort, :telefon, :geburtsdatum), a telefon reader and a
# geburtsdatum= writer, and blank? must be available on all values.
#
# Some readers feed others, so call order matters:
# * strasse stores a house number found in the street in self[:hausnummer] and
#   a PO box number in @postfach; call it before hausnummervon/hausnummerbis
#   and postfach.
# * vorwahl/telefonnummer move a date-looking phone value to geburtsdatum;
#   call them before geburtsdatum.
module Cleaner

  # Salutation at the very start of a name field, followed by the rest.
  SALUTATION_PATTERN = /\A(Frau und Herr|Herr und Frau|Herr|Frau)(?:\z| +)(.*)/
  # One or more academic titles at the very start of a name field.
  TITLE_PATTERN = /\A((?:(?:Dipl\. Ing\.|Dr\.|Prof\.) *)+)(.*)/
  # Salutations that address two people; their first names are not split.
  COUPLE_SALUTATIONS = ["Herr und Frau", "Frau und Herr"].freeze

  # Raw id as Integer.
  def id
    self[:id].to_i
  end

  # Salutation extracted from the name columns, or nil.
  def anrede
    process_name unless @processed_name
    @anrede
  end

  # Academic title(s) extracted from the name columns, or nil.
  def titel
    process_name unless @processed_name
    @titel
  end

  # First name without salutation, title and middle names, or nil.
  def vorname
    process_name unless @processed_name
    @vorname
  end

  # Everything after the first word of the first name, or nil.
  def mittelname
    process_name unless @processed_name
    @mittelname
  end

  # Last name without salutation and title, or nil.
  def nachname
    process_name unless @processed_name
    @nachname
  end

  # Street name without house number, or nil.
  # Side effects: a PO box number is remembered for postfach, a trailing
  # house number is written to self[:hausnummer].
  def strasse
    return nil if self[:strasse].blank?

    @strasse = self[:strasse].strip
    case @strasse
    when /Postfach (\d+)/
      @postfach = $1
      @strasse = nil
    when /(.*?) *(\d+[a-zA-Z]*)/
      street, number = $1, $2
      @strasse = street
      self[:hausnummer] = number
    end

    # The dot is escaped so that only a literal " Str." is abbreviated.
    @strasse = @strasse.sub(/ Str\.$/, 'str.') unless @strasse.blank?
    @strasse = nil if @strasse.blank?
    @strasse
  end

  # First house number of the (possibly ranged) house number, or nil.
  def hausnummervon
    process_hausnummer unless @processed_hausnummer
    @hausnummervon
  end

  # Last house number of the (possibly ranged) house number, or nil.
  def hausnummerbis
    process_hausnummer unless @processed_hausnummer
    @hausnummerbis
  end

  # Postal code with the first "D-" removed, or nil when blank.
  def postleitzahl
    return nil if self[:postleitzahl].blank?

    self[:postleitzahl].sub("D-", "")
  end

  # PO box number as Integer, or nil when absent, zero or without digits.
  def postfach
    result = @postfach || self[:postfach]
    return nil if result.blank?

    number = result.to_s.gsub(/[^0-9]/, '').to_i
    number == 0 ? nil : number
  end

  # City without the part after the first comma, or nil.
  def ort
    process_ort unless @processed_ort
    @ort
  end

  # Part of the city column after the first comma, or nil.
  def ortzusatz
    process_ort unless @processed_ort
    @ortzusatz
  end

  # Area code (String) when the phone value has the form "prefix/number".
  def vorwahl
    parse_telefon unless @parsed_telefon
    @vorwahl
  end

  # Phone number as Integer, or nil.
  def telefonnummer
    parse_telefon unless @parsed_telefon
    @telefonnummer
  end

  # Birth date as Date, or nil when the column is blank, has an unknown
  # format or does not describe a real date.
  #
  # Understood formats: DD.MM.YYYY, YYYYMMDD and DD/MM/YY (year 19YY).
  # Day and month are swapped when the month is above 12, a day that is one
  # past the end of the month is pulled back by one, and dates in the future
  # are moved back by 100 years.
  def geburtsdatum
    raw = self[:geburtsdatum]
    return nil if raw.blank?

    day, month, year =
      case raw
      when /(\d{2})\.(\d{2})\.(\d{4})/ then [$1.to_i, $2.to_i, $3.to_i]
      when /(\d{4})(\d{2})(\d{2})/ then [$3.to_i, $2.to_i, $1.to_i]
      when /(\d{2})\/(\d{2})\/(\d{2})/ then [$1.to_i, $2.to_i, "19#{$3}".to_i]
      end
    return nil unless year

    day, month = month, day if month > 12

    date = lenient_date(year, month, day)
    # Date#<< shifts by months: 1200 months are 100 years.
    date = date << 1200 if date && date > Date.today
    date
  end

  private

  # Builds a Date; a day exactly one past the end of the month (e.g. 31.04.)
  # is corrected to the day before. Returns nil for anything else invalid.
  def lenient_date(year, month, day)
    return nil unless (1..12).include?(month) && day >= 1

    day -= 1 unless Date.valid_civil?(year, month, day)
    Date.valid_civil?(year, month, day) ? Date.new(year, month, day) : nil
  end

  # Splits self[:telefon] into @vorwahl and @telefonnummer. A value that
  # looks like a date is handed over to geburtsdatum instead.
  def parse_telefon
    unless self[:telefon].blank?
      tel = self[:telefon].strip.gsub(/\+49|\(|\)/, '')
      if tel =~ /\d+(\.|\/)\d+(\.|\/)\d+/
        self.geburtsdatum = self.telefon
        self[:telefon] = nil
      else
        parts = tel.split("/")
        if parts.size == 2
          @vorwahl = parts[0].strip
          @telefonnummer = digits_to_i(parts[1])
        else
          @telefonnummer = digits_to_i(parts.join)
        end
      end
    end
    @telefonnummer = nil if @telefonnummer == 0
    @parsed_telefon = true
  end

  # Integer made of all digits in text. Plain to_i would stop at the first
  # space or dash and silently cut the number short.
  def digits_to_i(text)
    text.gsub(/\D/, '').to_i
  end

  # Fills @anrede, @titel, @vorname, @mittelname and @nachname once.
  def process_name
    process_nachname
    process_vorname
    process_anrede
    process_titel
    @processed_name = true
  end

  # Returns [salutation, title, remaining name] for a raw name column.
  # Each element is nil when not present; nil input is tolerated.
  def split_name_prefixes(raw)
    rest = raw.to_s.strip
    salutation = title = nil

    if rest =~ SALUTATION_PATTERN
      salutation, rest = $1, $2
    end
    if rest =~ TITLE_PATTERN
      # Copy both groups before gsub below replaces the match data.
      title, rest = $1, $2
      title = title.strip.gsub(/ +/, ' ')
    end

    rest = rest.strip
    rest = nil if rest.empty?
    [salutation, title, rest]
  end

  def process_nachname
    salutation, title, name = split_name_prefixes(self[:nachname])
    @anrede = salutation if salutation
    @titel = title if title
    @nachname = name && name.gsub(/ +/, ' ')
  end

  def process_vorname
    salutation, title, name = split_name_prefixes(self[:vorname])
    # Only overwrite what process_nachname found when this column really
    # carries a salutation or title of its own.
    @anrede = salutation if salutation
    @titel = title if title
    @vorname = name

    if @vorname && !COUPLE_SALUTATIONS.include?(@anrede)
      parts = @vorname.split(" ")
      if parts.size > 1 && !%w(und u. +).include?(parts[1])
        @vorname = parts.shift
        @mittelname = parts.join(" ")
      end
    end

    # Undo transposed leading characters ("eHinz" -> "Heinz"). Anchored so a
    # capital inside the name ("LeAnn") is left alone.
    if @vorname && @vorname =~ /\A([a-zäöü])([A-ZÄÖÜ])(.*)/u
      @vorname = $2 + $1 + $3
    end
  end

  # Splits self[:hausnummer] ("12", "12-14", "12A-C") into from/to parts.
  # The normalised (stripped, upper-cased) value is written back.
  def process_hausnummer
    unless self[:hausnummer].blank?
      number = self[:hausnummer].to_s.strip.upcase
      self[:hausnummer] = number
      parts = number.split('-')
      if parts.size == 2
        @hausnummervon = parts[0]
        # "12A-C" means 12A to 12C: reuse the numeric part of the start.
        @hausnummerbis = parts[1] =~ /^[A-Z]+$/ ? "#{parts[0].to_i}#{parts[1]}" : parts[1]
      else
        @hausnummervon = number
        @hausnummerbis = number
      end
    end
    @processed_hausnummer = true
  end

  # No further processing yet; kept as a hook called by process_name.
  def process_anrede
    @anrede
  end

  # No further processing yet; kept as a hook called by process_name.
  def process_titel
    @titel
  end

  # Splits self[:ort] at the first comma into @ort and @ortzusatz.
  # The stripped value is written back.
  def process_ort
    unless self[:ort].blank?
      self[:ort] = self[:ort].strip
      if self[:ort] =~ /(.*?) *, *(.*)/
        @ort = $1
        @ortzusatz = $2
      else
        @ort = self[:ort]
      end
    end
    @processed_ort = true
  end

end
@@ -0,0 +1,16 @@
+database:
+ adapter: "postgresql"
+
+ postgresql:
+ host: "localhost"
+ username: "postgres"
+ database: "datenreinigung"
+
+ mysql:
+ host: "localhost"
+ username: "root"
+ database: "datenreinigung"
+
+detector:
+ save: true
+ keys: ["nachvorstrasse"]
@@ -0,0 +1,15 @@
module Datenreinigung
  # Read access to the settings in config.yml, which is looked up in the
  # directory of this file. The file is loaded once when the class is defined.
  class Config
    @@config = nil

    # (Re)reads config.yml and returns the parsed content.
    # YAML.load_file opens and closes the file itself, so no file handle is
    # left open as with YAML.load(File.open(...)).
    def self.reload
      @@config = YAML.load_file(File.join(File.dirname(__FILE__), "config.yml"))
    end

    # Returns the top-level setting stored under attribute.
    # attribute may be a String or a Symbol; it is looked up as a String.
    def self.[](attribute)
      @@config[attribute.to_s]
    end

    reload
  end
end
@@ -0,0 +1,59 @@
# Finds probable duplicate customers with a sliding window over the customers
# ordered by the value of one key: every customer is compared with the
# WINDOWSIZE customers before it and pairs closer than THRESHOLD are reported.
class Detector

  # Number of preceding customers each customer is compared with.
  WINDOWSIZE = 20
  # Pairs with a distance below this value count as duplicates.
  THRESHOLD = 0.3
  # Number of customers loaded per batch.
  STEPSIZE = 10000

  # key  - name of the key used for filtering (kunden_keys.keyname) and
  #        stored with each match.
  # save - true to insert matches into "matches", false to print them.
  def initialize(key, save = false)
    @key = key
    @save = save
  end

  # Reports the given matches, closest pairs first.
  #
  # results - Array of Hashes with :distance, :k1 and :k2.
  #
  # When saving, all matches go into one INSERT with the smaller origid as
  # kunde1_id. Nothing is executed for an empty result list, because an
  # INSERT without rows is not valid SQL.
  def output(results)
    sorted = results.sort_by { |r| r[:distance] }

    unless @save
      sorted.each do |result|
        puts "#{result[:distance]}: #{result[:k1]} ---- #{result[:k2]}"
      end
      return nil
    end

    return nil if sorted.empty?

    # Quote the key through the connection instead of pasting it between
    # single quotes, so a quote character inside the key cannot break the SQL.
    key = ActiveRecord::Base.connection.quote(@key)
    values = sorted.map do |result|
      minid, maxid = [result[:k1].origid, result[:k2].origid].sort
      [minid, maxid, key, result[:distance]].join(",")
    end

    query = 'INSERT INTO "matches" ("kunde1_id","kunde2_id","key","distance") VALUES (' + values.join('),(') + ")\n\n"
    ActiveRecord::Base.connection.execute(query)
  end

  # Runs the detection over all customers in batches of STEPSIZE and passes
  # the matches of each batch to output, printing timings per batch.
  def process
    count = Kunde.count
    steps = (count + STEPSIZE - 1) / STEPSIZE

    steps.times do |step|
      retrieve = Time.now
      first_new = step * STEPSIZE
      # Start WINDOWSIZE rows early so the first customers of this batch can
      # be compared with the last customers of the previous one.
      offset = [first_new - WINDOWSIZE, 0].max
      overlap = first_new - offset
      # LIMIT is a row count, not an end position.
      kunden = Kunde.all(:conditions => ["kunden_keys.keyname = ?", @key], :joins => :key, :order => 'value ASC', :offset => offset, :limit => STEPSIZE + overlap)

      detect = Time.now
      result = []
      kunden.each_with_index do |kunde, i|
        # Overlap rows were already compared in the previous batch; they only
        # serve as comparison partners here.
        next if i < overlap

        (1..[i, WINDOWSIZE].min).each do |k|
          other = kunden[i - k]
          distance = kunde.distance_to(other)
          result << {:distance => distance, :k1 => kunde, :k2 => other} if distance < THRESHOLD
        end
      end

      insert = Time.now
      output(result)

      finish = Time.now
      puts "processed #{kunden.size} objects and found #{result.size} duplicates in #{time_format(finish-retrieve)} minutes (select: #{time_format(detect-retrieve)}, detect: #{time_format(insert-detect)}, insert: #{time_format(finish-insert)})"
    end
  end

  # Formats a duration in seconds as "minutes:seconds" with one decimal.
  def time_format(seconds)
    "#{(seconds / 60).to_i}:#{"%.1f" % (seconds % 60)}"
  end

end
Oops, something went wrong.

0 comments on commit 280b251

Please sign in to comment.