dupes.py

#!/usr/bin/env python
"""
dupes.py
Find duplicate files

 Copyright 2007 Justin Azoff

  License:
      GPL
"""

import os

BLOCKSIZE = 1024 * 8

class Files:
    def __init__(self):
        self._files = {}
        self._open_files = 0
        self._max_open_files = 512
    
    def _open(self, fn, pos=0):
        """Open or re-open a file"""
        if self._open_files == self._max_open_files:
            self._close_a_file()
        fh = open(fn)
        if pos:
            fh.seek(pos)
        self._files[fn] = {'file': fh, 'pos': pos}

        self._open_files += 1
        return fh

    def open(self, fn):
        if fn in self._files:
            f = self._files[fn]
            if f['file']:
                return f['file']
            pos = f['pos']
        else :
            pos = 0

        return self._open(fn, pos)

    def _close_a_file(self):
        for k,v in self._files.items():
            if v['file']:
                self.close(k)
                return

    def close(self, fn, remove=False):
        f = self._files[fn]
        fh = f['file']
        if fh and not fh.closed:
            pos = fh.tell()
            fh.close()
            self._open_files -= 1
        else :
            remove = True

        if remove or pos == 0:
            del self._files[fn]
        else :
            f['pos'] = pos
            f['file'] = None

class dupfinder:
    def __init__(self,dirs=None):
        self._files = {}
        self._inodes = set()
        self._file_handles = Files()
        self._dirs = []
        self._bytes_read = 0
        self._group = 1
        self._num_files = 0
        if dirs:
            self.add_dirs(dirs)

    def add_dirs(self, dirs):
        self._dirs.extend(dirs)

    def _add_file(self, fn):
        if os.path.islink(fn):
            return
        try :
            stat = os.stat(fn)
            size = stat.st_size
            #i really only need to cache the inodes for files which
            #have size collisions, but this way I only have to make
            #one pass and 1 stat call...
            tup = (stat.st_dev, stat.st_ino)
            if tup in self._inodes:
                return
            self._inodes.add(tup)
            self._files.setdefault(size,[]).append(fn)
        except OSError:
            return

    def _walk_dir(self, dir):
        for path, dirs, files in os.walk(dir):
            for f in files:
                fn = os.path.join(path, f)
                self._add_file(fn)
                self._num_files += 1

    def _build_flist(self):
        for dir in self._dirs:
            self._walk_dir(dir)
        chain = [len(v) for v in self._files.values()]
        if chain:
            long = max(chain)
        else :
            long = 0
        self._longest_chain = long

    def _clear_singles(self):
        """Delete files from the list that do not even have a 
           size collision"""
        singles = []
        for size, files in self._files.iteritems():
            if len(files) == 1:
                singles.append(size)
        for size in singles:
            del self._files[size]

        self._groups = len(self._files)

    def _get_fh(self, path):
        return self._file_handles.open(path)

    def _close_fh(self, path):
        #print "Closing", path
        self._file_handles.close(path, remove=True)

    def _identical(self, files, split=False):
        
        while 1:
            chunks = {}
            chains = []
            for p in files:
                f = self._get_fh(p)
                chunk = f.read(BLOCKSIZE)
                self._bytes_read += BLOCKSIZE
                chunks.setdefault(chunk,[]).append(p)
            
            #test for EOF on all files - means I found duplicate files
            if len(chunks) == 1 and '' in chunks:
                for fn in files:
                    self._close_fh(fn)
                if not split:
                    return [files]
                else :
                    return files

            for matches in chunks.values():
                if len(matches) != 1:
                    #matches - a list of filenames, are possibly duplicates
                    chains.append(matches)
                else :
                    #for a chunk that had no dupes, I can stop reading this file
                    fn = matches[0]
                    self._close_fh(fn)
                    files.remove(fn) 
            #say I have 4 files, and 1 == 3 and 2 == 4, but 1 != 2, I need to
            #re-call this function with [1,3] and [2,4] as files
            #otherwise, all the files were the same so far, and I can stay in this loop
            if len(chains) != 1:
                break

        same = []
        for chain in chains:
            same.append(self._identical(chain, split=True))

        return same

    def _is_dup(self, size, files):
        if size == 0:
            return [files]

        return self._identical(files)

    def find_dups(self):
        self._build_flist()
        self._clear_singles()

        dupes = []
        for size, files in self._files.iteritems():
            same = self._is_dup(size, files)
            if same:
                dupes.extend(same)
            #self._progress()
            
        assert self._file_handles._open_files == 0
        return dupes
        

    def _progress(self):
        sys.stderr.write("\rProgress %d/%d" % (self._group, self._groups))
        if self._group == self._groups:
            sys.stderr.write("\n")
        sys.stderr.flush()
        self._group += 1
        

    def print_stats(self):
        print >> sys.stderr, "%d total files" % self._num_files
        print >> sys.stderr, "%d size collisions, max of length %d" % (self._groups, self._longest_chain)

        print >> sys.stderr, "bytes read %d" % self._bytes_read

if __name__ == "__main__":
    import sys
    d = dupfinder()
    d.add_dirs(sys.argv[1:])
    for dups in d.find_dups():
        print '\n'.join(dups)
        print

    d.print_stats()