# mdgrep.py

#!/usr/bin/env python
import sys
import os
import re
from optparse import OptionParser


# Debug flag.  main() rebinds this to the value of the -v/--verbose option;
# while it is true, the commands being run are printed.  (A "global"
# statement has no effect at module level, so none is needed here.)
debug = None

# How the text of a file is obtained; used as the third field of every
# association tuple below.
FILE = 0    # open the path directly and read it
PIPE = 1    # run a shell command and read what it prints

# File associations for extracting text from a variety of file types,
# keyed by file extension.  Each entry is a tuple of
#     (everything before the filepath, everything after the filepath, mode)
# where mode is either FILE or PIPE.
#
# Using cat for plain text and html is probably unnecessary, but you might
# want to use fmt for text or tidy for html instead.  For example, to run
# html through tidy:
#     "html": ("tidy -iw", "", PIPE),
fileAssoc = {
    "txt": ("cat", "", FILE),
    "html": ("cat", "", FILE),
    "htm": ("cat", "", FILE),
}

# The default file action for every other extension: use mdimport and pull
# the information from stderr (hence the 2>&1).  This takes advantage of
# file indexing plugins.
defaultFileAction = ("mdimport -nfd2", "2>&1", PIPE)


# Command-line options are parsed inside main().  Nothing is parsed at
# module level, so importing this module never reads sys.argv or exits.


class textExtractor:
    """Search the text of one file, line by line, for a phrase or a regex.

    The text is read through buildReader(filepath).  After a search,
    ``status`` is True if some line matched; it is never reset, so it keeps
    its previous value (initially None) when nothing matches.
    """

    def __init__(self, filepath):
        # Path of the file whose text is searched.
        self.filepath = filepath
        # None until a search finds a match, then True.
        self.status = None

    def _scanLines(self, lineMatches):
        """Set status to True if lineMatches(line) is true for some line.

        Reading stops at the first matching line.  The reader is closed
        even when reading or matching raises.
        """
        if debug:
            # Shown for information only: for FILE associations the file is
            # opened directly and this command is not actually executed.
            print("Running: " + buildCommand(self.filepath))
        reader = buildReader(self.filepath)
        try:
            # readline() returns "" only at end of input.
            for line in iter(reader.readline, ""):
                if lineMatches(line):
                    self.status = True
                    break
        finally:
            reader.close()

    def searchString(self, searchPhrase, ignoreCase=True):
        """Convert the file to text and look for searchPhrase in it.

        If a line contains searchPhrase, status is set to True.  When
        ignoreCase is true, both the phrase and each line are lower-cased
        before comparing, so 'Foo' matches 'foo' and vice versa.
        """
        if ignoreCase:
            # Lower-case the phrase once; lowering only the line would make
            # any phrase containing a capital letter unmatchable.
            loweredPhrase = searchPhrase.lower()
            self._scanLines(lambda line: loweredPhrase in line.lower())
        else:
            self._scanLines(lambda line: searchPhrase in line)

    def searchRegex(self, expression):
        """Search the file's text using a regular expression object.

        expression must be a compiled re object.  If expression.search()
        matches any line, status is set to True.
        """
        self._scanLines(expression.search)

        
def spacesToAnd(text):
    """Turn a phrase query into an "and" query.

    Every single space character in text becomes an ampersand.
    """
    words = text.split(" ")
    return "&".join(words)
    
def _shellQuote(text):
    """Quote text so the shell hands it to a command as one literal word."""
    try:
        from shlex import quote
    except ImportError:
        # Older Pythons provide the same helper in the pipes module.
        from pipes import quote
    return quote(text)


def _fileAction(filepath):
    """Return the association tuple (before, after, mode) for filepath.

    The lookup uses the text after the last dot, lower-cased so that
    "NOTES.TXT" is handled like "notes.txt"; anything not listed in
    fileAssoc gets defaultFileAction.
    """
    extension = filepath.split(".")[-1].lower()
    return fileAssoc.get(extension, defaultFileAction)


def buildCommand(filepath):
    """Build a shell command for parsing filepath.

    The path is shell-quoted, so quotes, spaces, "$" or backticks in a file
    name cannot break the command or be interpreted by the shell.
    """
    action = _fileAction(filepath)
    return "".join([action[0], " ", _shellQuote(filepath), " ", action[1]])


def buildReader(filepath):
    """Return the proper reader for filepath.

    This is a basic file handle for FILE associations, and a popen handle
    running buildCommand(filepath) for everything else.
    """
    if _fileAction(filepath)[2] == FILE:
        return open(filepath, "r")
    return os.popen(buildCommand(filepath), "r")
        

def searchSpotlight(searchPhrase,spotlightQuery=None,onlyin=None,regexObject=None,
                    case=False):
    """Run an mdfind query and print every result whose text matches.

    searchPhrase   -- phrase to look for inside each file; may be None.
    spotlightQuery -- query handed to mdfind; when empty it is derived from
                      searchPhrase with spacesToAnd().
    onlyin         -- if given, restrict mdfind to this directory.
    regexObject    -- compiled re object; when given it is used instead of
                      searchPhrase to search the file contents.
    case           -- passed to textExtractor.searchString as ignoreCase
                      (true means the phrase search ignores case).

    When neither searchPhrase nor regexObject is given there is nothing to
    look for inside the files, so every path mdfind returns is printed.
    """
    try:
        from shlex import quote
    except ImportError:
        # Older Pythons provide the same helper in the pipes module.
        from pipes import quote

    if not spotlightQuery:
        spotlightQuery = spacesToAnd(searchPhrase)

    # Quote the user-supplied pieces so that quotes, "$" or backticks in
    # them reach mdfind literally instead of being interpreted by the shell.
    commandParts = ["mdfind"]
    if onlyin:
        commandParts.extend(["-onlyin", quote(onlyin)])
    commandParts.append(quote(spotlightQuery))
    queryCommand = " ".join(commandParts)
    if debug:
        print(queryCommand)

    # Open mdfind as a pipe.  mdfind can be a bit slow, so handle each
    # result as soon as it arrives rather than waiting for all the output.
    mdQueryObj = os.popen(queryCommand)
    try:
        # readline() returns "" only at end of output.
        for line in iter(mdQueryObj.readline, ""):
            # Drop only the line terminator; spaces can be part of a path.
            path = line.rstrip("\r\n")
            if not path:
                continue
            if searchPhrase is None and not regexObject:
                # Spotlight-only query: no content search to perform.
                print(path)
                continue
            current = textExtractor(path)
            if regexObject:
                current.searchRegex(regexObject)
            else:
                current.searchString(searchPhrase,ignoreCase=case)
            if current.status:
                print(current.filepath)
    finally:
        mdQueryObj.close()

def main():
    """Process the command line and pass control to searchSpotlight.

    Exits with status 2 (after printing a message and the help text) when
    neither a search phrase nor a spotlight expression is supplied, and via
    parser.error() when the -r pattern is not a valid regular expression.
    """
    global debug

    #parse options
    parser = OptionParser()
    parser.add_option("-i", "--ignore-case", dest="ignoreCase", action="store_true",
        help="Use case-insensitive searches.  ('Foo' matches 'foo')")
    parser.add_option("-r", "--regex", dest="regex", action="store",
        help="Use regular expression instead of phrase.")
    parser.add_option("-s", "--spotlight-expression", dest="spotlightExpression",
        action="store",
        help='An extra expression used to search the spotlight index '
             'for example mdgrep "Frank Zappa" -s "utopia" finds all items '
             'in the spotlight index matching "utopia" with the phrase '
             '"Frank Zappa".')
    parser.add_option("-o", "--only-in", dest="directory",
        action="store", help="Search only in this directory.")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
        help="Print out some debug messages.")
    parser.set_usage("%prog searchPhrase [options]")

    (options, args) = parser.parse_args()

    #set our debug level
    debug = options.verbose
    if debug:
        print("Options:")
        print(options)
        print("Args:")
        print(args)

    #get the search phrase from the args
    searchPhrase = None
    if args:
        searchPhrase = args[0]

    #error checking. We need either a search phrase or a spotlight
    #expression for any query.
    if not (searchPhrase or options.spotlightExpression):
        print("You must supply either a search phrase, or a spotlight expression.")
        parser.print_help()
        # A usage error must not report success to the calling shell.
        sys.exit(2)

    ignoreCase = bool(options.ignoreCase)

    #if there is a regular expression specified, do the "phrase" search
    #using that; otherwise reObject stays None and a normal text search runs.
    reObject = None
    if options.regex:
        regexFlags = 0
        if ignoreCase:
            regexFlags = re.I
        try:
            reObject = re.compile(options.regex, regexFlags)
        except re.error:
            # Report a bad pattern as a usage error instead of a traceback.
            parser.error("invalid regular expression %r: %s"
                         % (options.regex, sys.exc_info()[1]))

    searchSpotlight(searchPhrase,
        spotlightQuery=options.spotlightExpression,
        onlyin=options.directory,
        regexObject=reObject,
        case=ignoreCase)


if __name__ == "__main__":
    main()