Permalink
Browse files

some more bug fixes

  • Loading branch information...
1 parent 3532477 commit bb51935e18a8e1673f1fcede9061982fe81d7d4b @dsisnero committed Jan 30, 2012
Showing with 133 additions and 74 deletions.
  1. +20 −11 lib/deduper/app.rb
  2. +5 −1 lib/deduper/file_info.rb
  3. +7 −6 lib/deduper/hasher.rb
  4. +75 −29 lib/deduper/possible_match.rb
  5. +26 −27 lib/deduper/scanner.rb
View
@@ -10,12 +10,13 @@ module Deduper
class App
- attr_accessor :scanned_files,:possible_dups,:possible_matches,:matches
+ attr_accessor :scanned_files,:possible_dups,:possible_matches,:matches, :zero_sized_files
def initialize(*args)
@scanner = Scanner.new(*args)
@original_files = []
+ @zero_sized_files = []
end
@@ -29,9 +30,15 @@ def run
def run_scan
results = []
@scanner.scan do |path|
- results << FileInfo.new(path)
+ file = FileInfo.new(path)
+ if file.size == 0
+ @zero_sized_files << file
+ else
+ results << file
+ end
end
@scanned_files = results
+ results
end
def scanned_files
@@ -44,15 +51,13 @@ def possible_dups
end
def run_match_by_size
- index_by_size(scanned_files).select{|size,array| array.size > 1}
+ indexed_by_size.select{|key,values| values.size > 1}.values
end
def run_possible_matches
- possible_dups.values.map{|same| PossibleMatch.new(same)}
- end
-
-
+ possible_dups.map{|same| PossibleMatch.new(same)}
+ end
def possible_matches
@possible_matches ||= run_possible_matches
@@ -64,14 +69,18 @@ def matches
def run_matches
possible_matches.map do |possibles|
- possibles.find_matched_files
+ possibles.matched_files
end
end
- def index_by_size(array)
- result = Hash.new{|h,k| h[k] = []}
+ def indexed_by_size(array = scanned_files)
+ @index_by_size ||= run_index_by_size(array)
+ end
+
+ def run_index_by_size(array = scanned_files)
+ result = Hash.new{|h,k| h[k] = []}
array.each do |file|
- result[size] << file
+ result[file.size] << file
end
result
end
View
@@ -4,7 +4,7 @@ module Deduper
class FileInfo
- attr_accessor :md5_head, :md5_tail, :md5
+ attr_accessor :md5_head, :md5_tail, :md5, :path
def initialize(path)
@path = path
@@ -15,5 +15,9 @@ def size
@stat.size rescue 0
end
+ def to_path
+ @path
+ end
+
end
end
View
@@ -1,4 +1,4 @@
-require 'celluloid'
+#require 'celluloid'
require 'digest/md5'
# Ask the hasher to perform a complex computation. However, since we're using
@@ -15,7 +15,7 @@
class Hasher
- include Celluloid
+ # include Celluloid
class << self
attr_accessor :blksize
@@ -32,20 +32,21 @@ def digest_head(file, blksize = @blksize)
Digest::MD5.hexdigest(file.read(blksize))
end
- def digest_tail(file,blksize = @blksize)
- file = File.new(file)
- file.seek(-blksize, SEEK::END)
+ # Returns the hex MD5 of the last +blksize+ bytes of +filename+, or of the
+ # bytes read from the start when the file is no larger than +blksize+.
+ # The file is opened in binary mode (as md5_file does) and through the block
+ # form of File.open so the handle is closed when the block exits.
+ def digest_tail(filename,blksize = @blksize)
+ File.open(filename, 'rb') do |file|
+ # Seek only when the file is larger than the block size.
+ file.seek(-blksize, IO::SEEK_END) if file.stat.size > blksize
Digest::MD5.hexdigest(file.read(blksize))
+ end
end
def md5_file(file, blksize = @blksize)
digest = Digest::MD5.new
File.open(file,'rb') do |io|
buffer = ''
- while buffer = file.read(blksize)
+ while buffer = io.read(blksize)
digest.update(buffer)
end
end
+ digest.digest
end
end
@@ -26,50 +26,96 @@ def files
@files.to_a
end
- def update_array(array,method,values)
- #debugger
- array.each_with_index do |item,index|
- method = method.chop if method =~ /=$/
- item.send("#{method}=", values[index])
+ # Computes the head digest for every entry of +array+ and stores it in the
+ # entry's md5_head attribute.
+ # Returns the entries themselves (not the digest strings) so that same_head
+ # can group them by md5_head.
+ # NOTE(review): pmap is defined outside this view; assumed map-like -- confirm.
+ def md5_head(array = @files)
+ array.pmap do |f|
+ # Use the block variable +f+; +file+ is not the entry being processed.
+ f.md5_head = digester.digest_head(f.path)
+ f
end
end
- def find_matches_for_array(array,attrib,&block)
- debugger
- attrib_values = array.pmap(&block)
- result = update_array(array,attrib,attrib_values)
- array.non_unique_by{|i| i.send(attrib)}
+ # Computes the tail digest for every entry of +array+ (by default the entries
+ # returned by same_head) and stores it in the entry's md5_tail attribute.
+ # Returns the entries themselves so that same_tail can group them by md5_tail.
+ # NOTE(review): pmap is defined outside this view; assumed map-like -- confirm.
+ def md5_tail(array = same_head)
+ array.pmap do |f|
+ # Use the block variable +f+; +file+ is not the entry being processed.
+ f.md5_tail = digester.digest_tail(f.path)
+ f
+ end
end
-
-
+ # Computes the whole-file digest for every entry of +array+ (by default the
+ # entries returned by same_tail) and stores it in the entry's md5 attribute.
+ # Returns the entries themselves so that same_md5 can group them.
+ # NOTE(review): assumes digester responds to md5_file (the whole-file digest
+ # method of the hasher) and that pmap is map-like -- confirm both.
+ def md5(array = same_tail)
+ array.pmap do |f|
+ # Use the block variable +f+; +file+ is not the entry being processed.
+ f.md5 = digester.md5_file(f.path)
+ f
+ end
+ end
- def find_matches(attrib,&block)
- find_matches_for_array(@files,attrib,&block)
- end
- def find_same_head(array)
- find_matches_for_array('md5_head'){|file| digester.digest_head(file)}
- end
- def find_same_tail(array)
- find_matches_for_array(array,'md5_tail'){|file| digester.digest_tail(file)}
+ def same_head
+ @same_head ||= md5_head(@files).non_unique_by{|i| i.md5_head}
end
- def find_same_md5(array)
- find_matches_for_array(array,'md5'){|file| digester.md5_file(file)}
+ def same_tail
+ @same_tail ||= md5_tail(same_head).non_unique_by{|i| i.md5_tail}
end
- def find_matched_files
- @same_head = find_same_head(@files)
- @same_tail = find_same_tail(@same_head)
- @same_md5 = find_same_md5(@same_tail)
+ # Memoized result of digesting the same_tail entries in full and keeping the
+ # ones whose md5 is shared. The grouping key is the full md5, not md5_tail,
+ # which same_tail has already filtered on.
+ # NOTE(review): non_unique_by is defined outside this view -- confirm it
+ # keeps the entries whose key occurs more than once.
+ def same_md5
+ @same_md5 ||= md5(same_tail).non_unique_by{|i| i.md5}
end
- def matched_files
- @matched_files ||= find_matched_files
- end
+
+
+ # def update_array(array,method,values)
+ # #debugger
+ # array.each_with_index do |item,index|
+ # method = method.chop if method =~ /=$/
+ # item.send("#{method}=", values[index])
+ # end
+ # end
+
+ # def find_matches_for_array(array,attrib,&block)
+ # # debugger
+ # attrib_values = array.pmap(&block)
+ # result = update_array(array,attrib,attrib_values)
+ # array.non_unique_by{|i| i.send(attrib)}
+ # end
+
+ # def same_head(array = @files)
+ # @same_head ||= find_same_head(array)
+ # end
+
+ # def same_tail(array = same_head)
+ # @same_tail ||= find_same_tail(array)
+ # end
+
+ # def same_md5(array = same_tail)
+ # @same_md5 ||= find_same_md5(array)
+ # end
+
+ def matched_files
+ same_md5.select{|m| m.size > 1 }
+ end
+
+ # def size
+ # matched_files.size
+ # end
+
+
+
+ # def find_matches(attrib,&block)
+ # find_matches_for_array(@files,attrib,&block)
+ # end
+
+ # def find_same_head(array)
+ # find_matches_for_array(array, 'md5_head'){|file|}
+ # end
+
+ # def find_same_tail(array)
+ # find_matches_for_array(array,'md5_tail'){|file| digester.digest_tail(file.path)}
+ # end
+
+ # def find_same_md5(array)
+ # find_matches_for_array(array,'md5'){|file| digester.md5_file(file.path)}
+ # end
end
View
@@ -34,40 +34,39 @@ def exclude_proc(&block)
def scan
files = []
begin
- Find.find(*scan_dirs) do |path|
-
- if File.directory? path
- Find.prune if should_exclude?(path)
- Find.prune if path =~ /\.VirtualBox/
- next
- else
- next if should_exclude?(path)
- next unless File.exist?(path)
- if block_given?
- yield path
+ Find.find(*scan_dirs) do |path|
+
+ if File.directory? path
+ Find.prune if should_exclude?(path)
+ Find.prune if path =~ /\.VirtualBox/
+ next
else
- files << path
+ next if should_exclude?(path)
+ next unless File.exist?(path)
+ if block_given?
+ yield path
+ else
+ files << path
+ end
end
end
+
+ return files unless block_given?
end
-
- return files unless block_given?
- end
- rescue
- nil
- end
- end
-
- def scanned_files
- @scanned_files ||= scan
+ rescue
+ nil
end
+ end
-
+ def scanned_files
+ @scanned_files ||= scan
+ end
- def add_path(path)
- file =
- @files << file
- end
+
+ def add_path(path)
+ file =
+ @files << file
end
+
end

0 comments on commit bb51935

Please sign in to comment.