src/minoe/SearchFiles.java

package minoe;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;
import javax.swing.JOptionPane;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.openide.util.Exceptions;

   /**
    *
    * To calculate filled cells / number of search operations:
    *
    * y = ((x^2 - x) / 2) + x
    *
    * Where
    * x = number of terms
    * y = number of operations
    *
    * For 10 terms there would be ((10^2 - 10) / 2) + 10 = 55 search operations.
    * For 11 terms there would be ((11^2 - 11) / 2) + 11 = 66 search operations.
    * For 50 terms there would be ((50^2 - 50) / 2) + 50 = 1,275 search operations.
    * For 100 terms there would be ((100^2 - 100) / 2) + 100 = 5050 search operations.
    *
   */

public class SearchFiles {

    private MetaDataController metadata;

    String index = "indexes";
    String field = "contents"; // the field that Lucene searches, i.e. the document contents.
    RAMDirectory ramDir;
    Directory directory;
    IndexReader reader;

    public SearchFiles(MetaDataController mdc) {
        this.metadata = mdc;
        try {
            // try loading index into memory
            directory = FSDirectory.getDirectory(index);
            ramDir = new RAMDirectory(directory);
            reader = IndexReader.open(ramDir);
        } catch (Exception ex) {
            System.out.println("Error loading index: " + ex.toString());
        } catch (java.lang.OutOfMemoryError err){
            // if there isn't memory available, then just read index from disk.
            try {
                directory = FSDirectory.getDirectory(index);
                reader = IndexReader.open(directory);
            } catch (IOException ex) {
                System.out.println("Error loading index: " + err.toString());
            }
        }

    }

    /**
     * Accepts two strings to search within a distance of each other.
     * Strings are divided into query components and matched with each component.
     * 
     * Ex:
     * termA:  "fishing licence"
     * termB:  "lobster trap" crab*
     *
     * Results in:
     * Comparing 
     * "fishing license" and "lobster trap" within distance x
     * and
     * "fishing license" and crab* within distance x
     *
     * Multiphrase query = "foo ba*".  Not supported.
     *
     * Returns all of the documents that match these distance comparisons.
     *
     * @param termA
     * @param termB
     * @param criteria
     * @return
     * @throws org.apache.lucene.index.CorruptIndexException
     * @throws java.io.IOException
     * @throws org.apache.lucene.queryParser.ParseException
     */
    public Hashtable<String, Integer> returnResults(String termA, String termB, SearchCriteria criteria) throws CorruptIndexException, IOException, ParseException{

//      System.out.println(termA + " - " + termB);

      int slop = criteria.getSlop();
      boolean inOrder = false;
      
      Analyzer analyzer = new StandardAnalyzer();

      if (termA == null || termB == null) {
          return null;
      }

      QueryParser queryParser = new QueryParser(field, analyzer);

      Vector<Spans> spansVec = new Vector<Spans>();

      // If both termA and termB are the same we want to run a different type of search...
      if(termA.equalsIgnoreCase(termB)){
          // Divide the term into query components.
          // Can use termA or termB since they're both the same.
          Query query = queryParser.parse(termA);
          ArrayList<SpanQuery> termList = new ArrayList<SpanQuery>();
          this.buildClauses(query, termList, reader);

          for (int i = 0; i < termList.size(); i++) {
              SpanQuery singleSpan = termList.get(i);
              Spans spans;
              if(singleSpan instanceof SpanTermQuery){
                  SpanTermQuery stq = (SpanTermQuery) singleSpan;
                 // Now run the search
                 spans = stq.getSpans(reader);
              }else{
                  SpanNearQuery snq = (SpanNearQuery)singleSpan;
                  // Now run the search
                  spans = snq.getSpans(reader);
              }

              // store the results in a vector of spans
              spansVec.add(spans);
          }
      } else {
          // ...termA and termB are different.
          // Divide the first term into query components.
          Query queryA = queryParser.parse(termA);
          ArrayList<SpanQuery> termAList = new ArrayList<SpanQuery>();
          this.buildClauses(queryA, termAList, reader);

          // Divide the second term into query components.
          Query queryB = queryParser.parse(termB);
          ArrayList<SpanQuery> termBList = new ArrayList<SpanQuery>();
          this.buildClauses(queryB, termBList, reader);

          // Search each query type from each term.
          int termalistsize = termAList.size();
          int termblistsize = termBList.size();
          for (int i = 0; i < termalistsize; i++) {
               SpanQuery aSpan = termAList.get(i);
               for (int j = 0; j < termblistsize; j++) {
                   SpanQuery bSpan = termBList.get(j);

                   // Now run the search
                   SpanNearQuery snq = new SpanNearQuery(new SpanQuery[]{aSpan, bSpan}, slop, inOrder);
                   Spans spans = snq.getSpans(reader);

                   // store the results in a vector of spans
                   spansVec.add(spans);
               }
          }
      }

      // Filename => matches.
      Hashtable<String, Integer> counts = new Hashtable<String, Integer>();

      // Get the term counts (span counts) for all documents.
      for (Spans spans : spansVec) {
          while(spans.next()){
              int id = spans.doc();
              Document doc = reader.document(id);
              String docname = doc.get("file name");
              if(counts.containsKey(docname)){
                 int count = counts.get(docname).intValue();
                 count++;
                 counts.put(docname, count);
              } else{
                 counts.put(docname, 1);
              }
          }
      }

      // The documents matching the search criteria.
      List<String> docList = new ArrayList<String>();

      // If user specified certain documents to search in.
      List<String> criterialist = criteria.getDocumentList();

      // Build a list of documents that the search is limited to.
      if(criterialist.size() > 0){
          // document search
          docList = criterialist;
      } else{
          // metadata search
          docList = this.metadata.getDocumentsBySearchCriteria(criteria);
      }
      // Now filter the documents based upon the criteria.
      Hashtable<String, Integer> results = new Hashtable<String, Integer>();
      Enumeration<String> e = counts.keys();
      while(e.hasMoreElements()){
          String doc = e.nextElement();
          // document search
          if(docList.size() > 0 && docList.contains(doc)){
              // More cleanup before we add to our final output list -
              // Spans keep track of a beginning and end position it appears,
              // so we must divide the total count by 2.
              int finalcount = counts.get(doc);
              if(finalcount > 1){
                  // Note:  This isn't true now?  Keep testing.
                  //finalcount = finalcount / 2;
              }
              results.put(doc, finalcount);
          }
      }

      return results;
    }

    /**
     * Separates a query into spanqueries.
     * @param query
     * @param termList
     * @param reader
     * @return
     */
    public ArrayList buildClauses(Query query, ArrayList<SpanQuery> termList, IndexReader reader){
        try {
            if(query instanceof BooleanQuery){
              // this is a boolean query OR this is a boolean query
                BooleanQuery bq = (BooleanQuery) query;
                BooleanClause[] bclauses = bq.getClauses();
                for (int i = 0; i < bclauses.length; i++){
                    Query childQuery = bclauses[i].getQuery();
                    // Rewrite this clause e.g one* becomes (one OR onerous)
                    childQuery.rewrite(reader);
                    buildClauses(childQuery, termList, reader);
                }
            } else if(query instanceof PhraseQuery){
               // phrasequery ex: "this is a phrase query"
               // convert phrase queries to SpanNearQuery because
               // phrase queries will do the phrase out of order.
               PhraseQuery pq = (PhraseQuery)query;
               Term[] termArr = pq.getTerms();
               SpanTermQuery[] thisSpan = new SpanTermQuery[termArr.length];
               for (int i = 0; i < termArr.length; i++) {
                   Term term = termArr[i];
                   SpanTermQuery termSpan = new SpanTermQuery(term);
                   thisSpan[i] = termSpan;
               }
               SpanNearQuery snq = new SpanNearQuery(thisSpan, 0, true);
               termList.add(snq);
            } else if(query instanceof TermQuery){
               // add to queryList
               TermQuery tq = (TermQuery) query;
               Term term = tq.getTerm();
               SpanQuery stq = new SpanTermQuery(term);
               termList.add(stq);
            } else if(query instanceof WildcardQuery){
                // wildcard query can be like:  "?ild*"
                WildcardQuery wq = (WildcardQuery) query;
                // Rewrite this clause e.g one* becomes (one OR onerous)
                Query q = wq.rewrite(reader);
                buildClauses(q, termList, reader);
            } else if(query instanceof PrefixQuery){
                // prefix query ex:  "fish*"
                PrefixQuery pq = (PrefixQuery) query;
                Query q = pq.rewrite(reader);
                buildClauses(q, termList, reader);
            }
        } catch (Exception ex) {
            JOptionPane.showMessageDialog(null, "Unknown query type: " + ex.toString());
        }
        return termList;
    }

    /**
     * Returns the document names and search scores for a given search string.
     * @param searchString
     * @param criteria
     * @return
     * @throws org.apache.lucene.index.CorruptIndexException
     * @throws java.io.IOException
     * @throws org.apache.lucene.queryParser.ParseException
     */
    public Hashtable<String, Float> returnResults(String searchString, SearchCriteria criteria) throws CorruptIndexException, IOException, ParseException{
      IndexSearcher searcher = new IndexSearcher(reader);
      Analyzer analyzer = new StandardAnalyzer();

      if (searchString == null) {
          return null;
      }

      QueryParser parser = new QueryParser(field, analyzer);
      Query query = parser.parse(searchString);

      // Search the collection
      CustomHitCollector collector = new CustomHitCollector(searcher, CustomHitCollector.ID_TYPE);
      searcher.search(query, collector);

      // Output each file that the term was found in.
      Hashtable<String, Float> counts = collector.getDocumentsList();

      List<String> docList = new ArrayList<String>();
      List<String> criterialist = criteria.getDocumentList();

      // Build a list of documents that the search is limited to.
      if(criterialist.size() > 0){
          // document search
          docList = criterialist;
      } else{
          // metadata search
          docList = this.metadata.getDocumentsBySearchCriteria(criteria);
      }

      // Now filter the documents based upon the criteria.
      Hashtable<String, Float> results = new Hashtable<String, Float>();
      Enumeration<String> e = counts.keys();
      while(e.hasMoreElements()){
          String doc = e.nextElement();
          // document search
          if(docList.size() > 0 && docList.contains(doc)){
              results.put(doc, counts.get(doc));
          }
      }

      return results;
    }

    /**
     * Returns the absolute path that the file was indexed with.
     * Used for opening the contents of the file.
     * @param inFileName
     * @return
     * @throws org.apache.lucene.index.CorruptIndexException
     * @throws java.io.IOException
     * @throws org.apache.lucene.queryParser.ParseException
     */
    public String getPath(String inFileName) throws CorruptIndexException, IOException{

        IndexSearcher searcher = new IndexSearcher(reader);

        String retVal = null;

        Term t = new Term("file name", inFileName);
        Query query = new TermQuery(t);
        CustomHitCollector collector = new CustomHitCollector(searcher, CustomHitCollector.PATH_TYPE);
        searcher.search(query, collector);

        Hashtable<String, Float> results = collector.getDocumentsList();
        Enumeration<String> keys = results.keys();
        while(keys.hasMoreElements()){
            String path = keys.nextElement();
            return path;
        }

        return retVal;
    }

    /**
     * Returns all of the file names in the index.
     * @return
     */
    public Vector<String> getAllFileNames() throws CorruptIndexException, IOException{
        Vector<String> files = null;

        int numdocs = this.reader.numDocs();

        files = new Vector<String>();

        for (int i = 0; i < numdocs; i++) {
           Document doc = this.reader.document(i);
           String thisFileName = doc.get("file name");
           files.add(thisFileName);
        }

        return files;
    }

    class CustomHitCollector extends HitCollector{

        private IndexSearcher searcher;
        // list of documents and search scores
        private Hashtable<String, Float> documentsList = new Hashtable<String, Float>();
        // count of hits per document
        private Hashtable<String, Integer> documentHits = new Hashtable<String, Integer>();
        public static final String PATH_TYPE = "path";
        public static final String ID_TYPE = "id";
        private String type = ID_TYPE; // default

        public CustomHitCollector(IndexSearcher searcher, String type){
            this.searcher = searcher;
            this.type = type;
        }

        @Override
        public void collect(int doc, float score) {
            Document document;
            try {
                document = searcher.doc(doc);
                String fileName = null;
                if(this.type.equalsIgnoreCase(PATH_TYPE)){
                   fileName = document.get("path");
                   String separator = "\\" + java.io.File.separator;
                   fileName = fileName.replaceAll("::", separator);
                } else{
                   fileName = document.get("file name");
                }
                // track score per document
                this.documentsList.put(fileName, score);
                // track hits per document.
                if(this.documentHits.containsKey(fileName)){
                   int count = this.documentHits.get(fileName);
                   count++;
                   this.documentHits.put(fileName, count);
                } else{
                    this.documentHits.put(fileName, 1);
                }
            } catch (CorruptIndexException ex) {
            } catch (IOException ex) {
            }
        }

        public Hashtable<String, Float> getDocumentsList(){
            return this.documentsList;
        }
        public Hashtable<String, Integer> getDocumentHits(){
            return this.documentHits;
        }
    }//end class CustomHitCollector

}//end class