/
SearchFiles.java
451 lines (395 loc) · 16.4 KB
/
SearchFiles.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
package minoe;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import javax.swing.JOptionPane;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.openide.util.Exceptions;
/**
*
* To calculate filled cells / number of search operations:
*
* y = ((x^2 - x) / 2) + x
*
* Where
* x = number of terms
* y = number of operations
*
* For 10 terms there would be ((10^2 - 10) / 2) + 10 = 55 search operations.
* For 11 terms there would be ((11^2 - 11) / 2) + 11 = 66 search operations.
* For 50 terms there would be ((50^2 - 50) / 2) + 50 = 1,275 search operations.
* For 100 terms there would be ((100^2 - 100) / 2) + 100 = 5050 search operations.
*
*/
public class SearchFiles {
private MetaDataController metadata;
String index = "indexes";
String field = "contents"; // the field that Lucene searches, i.e. the document contents.
RAMDirectory ramDir;
Directory directory;
IndexReader reader;
/**
 * Creates a searcher over the index stored in the {@code index} directory.
 *
 * The index is first copied into memory. If that runs out of memory the
 * index is read straight from disk instead. Any other failure is reported
 * on standard output and leaves {@code reader} unset (null).
 *
 * @param mdc metadata controller used later to resolve search criteria
 *            into a list of documents
 */
public SearchFiles(MetaDataController mdc) {
    this.metadata = mdc;
    try {
        // try loading index into memory
        directory = FSDirectory.getDirectory(index);
        ramDir = new RAMDirectory(directory);
        reader = IndexReader.open(ramDir);
    } catch (Exception ex) {
        System.out.println("Error loading index: " + ex.toString());
    } catch (java.lang.OutOfMemoryError err) {
        // Not enough memory for the in-memory copy: drop whatever part of it
        // was built so it can be reclaimed, then read the index from disk.
        ramDir = null;
        try {
            if (directory == null) {
                directory = FSDirectory.getDirectory(index);
            }
            reader = IndexReader.open(directory);
        } catch (IOException ex) {
            // Report the failure of the disk fallback itself, not the
            // OutOfMemoryError that led here.
            System.out.println("Error loading index: " + ex.toString());
        }
    }
}
/**
 * Counts, per document, how often two search strings occur near each other.
 *
 * Each string is parsed and divided into query components by buildClauses.
 * When the strings differ, every component of termA is paired with every
 * component of termB and searched as an unordered span-near query whose
 * slop comes from criteria.getSlop().
 *
 * Ex:
 * termA: "fishing license"
 * termB: "lobster trap" crab*
 *
 * Results in:
 * Comparing
 * "fishing license" and "lobster trap" within distance x
 * and
 * "fishing license" and crab* within distance x
 *
 * When the two strings are equal (ignoring case) no pairing is done: each
 * component is searched on its own and its occurrences are counted.
 *
 * Multiphrase query = "foo ba*". Not supported.
 *
 * The counts are finally restricted to the documents named in
 * criteria.getDocumentList() or, when that list is empty, to the documents
 * the metadata controller returns for the criteria.
 *
 * @param termA first search string
 * @param termB second search string
 * @param criteria supplies the slop and the document restriction
 * @return file name => number of matching spans for every permitted document
 *         with at least one match; null when either search string is null
 * @throws org.apache.lucene.index.CorruptIndexException
 * @throws java.io.IOException if the index cannot be read or was never loaded
 * @throws org.apache.lucene.queryParser.ParseException if a search string cannot be parsed
 */
public Hashtable<String, Integer> returnResults(String termA, String termB, SearchCriteria criteria) throws CorruptIndexException, IOException, ParseException{
    if (termA == null || termB == null) {
        return null;
    }
    // The constructor only reports a failed index load, so the reader may be missing.
    if (reader == null) {
        throw new IOException("Index is not loaded; cannot search.");
    }

    int slop = criteria.getSlop();
    boolean inOrder = false;
    QueryParser queryParser = new QueryParser(field, new StandardAnalyzer());
    List<Spans> spansList = new ArrayList<Spans>();

    if (termA.equalsIgnoreCase(termB)) {
        // Same term on both sides: search each of its components by itself.
        ArrayList<SpanQuery> termList = new ArrayList<SpanQuery>();
        this.buildClauses(queryParser.parse(termA), termList, reader);
        for (SpanQuery single : termList) {
            spansList.add(single.getSpans(reader));
        }
    } else {
        // Different terms: search every component of A near every component of B.
        ArrayList<SpanQuery> termAList = new ArrayList<SpanQuery>();
        this.buildClauses(queryParser.parse(termA), termAList, reader);
        ArrayList<SpanQuery> termBList = new ArrayList<SpanQuery>();
        this.buildClauses(queryParser.parse(termB), termBList, reader);
        for (SpanQuery aSpan : termAList) {
            for (SpanQuery bSpan : termBList) {
                SpanNearQuery near = new SpanNearQuery(new SpanQuery[]{aSpan, bSpan}, slop, inOrder);
                spansList.add(near.getSpans(reader));
            }
        }
    }

    // Filename => matches.
    Hashtable<String, Integer> counts = new Hashtable<String, Integer>();
    for (Spans spans : spansList) {
        // Remember the last document looked up so consecutive spans in the
        // same document do not reload its stored fields.
        int cachedDocId = -1;
        String cachedDocName = null;
        while (spans.next()) {
            int id = spans.doc();
            if (id != cachedDocId) {
                cachedDocName = reader.document(id).get("file name");
                cachedDocId = id;
            }
            if (cachedDocName == null) {
                // No stored file name: cannot be reported (Hashtable rejects null keys).
                continue;
            }
            Integer previous = counts.get(cachedDocName);
            counts.put(cachedDocName, previous == null ? 1 : previous + 1);
        }
    }

    // Documents the search is limited to: an explicit list wins over a metadata search.
    List<String> docList = criteria.getDocumentList();
    if (docList == null || docList.isEmpty()) {
        docList = this.metadata.getDocumentsBySearchCriteria(criteria);
    }

    // Keep only the permitted documents. Counts are reported as collected;
    // an earlier halving step had already been disabled by the original author.
    Hashtable<String, Integer> results = new Hashtable<String, Integer>();
    if (docList == null || docList.isEmpty()) {
        return results;
    }
    Set<String> allowed = new HashSet<String>(docList);
    for (String doc : counts.keySet()) {
        if (allowed.contains(doc)) {
            results.put(doc, counts.get(doc));
        }
    }
    return results;
}
/**
 * Separates a query into span queries, appending them to termList.
 *
 * Boolean queries are walked clause by clause, phrase queries become an
 * in-order SpanNearQuery with slop 0, term queries become a SpanTermQuery,
 * and wildcard / prefix queries are rewritten against the index and the
 * rewritten query is processed in turn. Any other query type adds nothing.
 *
 * Failures (for example while expanding a wildcard) are shown in a dialog
 * and do not propagate; termList keeps whatever was added before the failure.
 *
 * @param query the parsed query to break apart
 * @param termList receives the span queries; also returned
 * @param reader index used to expand wildcard and prefix queries
 * @return termList
 */
public ArrayList<SpanQuery> buildClauses(Query query, ArrayList<SpanQuery> termList, IndexReader reader){
    try {
        if (query instanceof BooleanQuery) {
            // this is a boolean query OR this is a boolean query
            BooleanClause[] clauses = ((BooleanQuery) query).getClauses();
            for (int i = 0; i < clauses.length; i++) {
                if (clauses[i].isProhibited()) {
                    // A "-term" clause must not contribute positive matches.
                    // (The exclusion itself is not applied to the results.)
                    continue;
                }
                // No rewrite here: rewrite() returns a new query instead of
                // changing the clause, and the wildcard / prefix branches
                // below expand themselves when the recursion reaches them.
                buildClauses(clauses[i].getQuery(), termList, reader);
            }
        } else if (query instanceof PhraseQuery) {
            // phrasequery ex: "this is a phrase query"
            // Converted to an in-order SpanNearQuery with no slop so the
            // words must appear adjacent and in the order written.
            Term[] phraseTerms = ((PhraseQuery) query).getTerms();
            SpanTermQuery[] phraseSpans = new SpanTermQuery[phraseTerms.length];
            for (int i = 0; i < phraseTerms.length; i++) {
                phraseSpans[i] = new SpanTermQuery(phraseTerms[i]);
            }
            termList.add(new SpanNearQuery(phraseSpans, 0, true));
        } else if (query instanceof TermQuery) {
            termList.add(new SpanTermQuery(((TermQuery) query).getTerm()));
        } else if (query instanceof WildcardQuery) {
            // wildcard query can be like: "?ild*"
            // Rewrite this clause e.g one* becomes (one OR onerous)
            buildClauses(((WildcardQuery) query).rewrite(reader), termList, reader);
        } else if (query instanceof PrefixQuery) {
            // prefix query ex: "fish*"
            buildClauses(((PrefixQuery) query).rewrite(reader), termList, reader);
        }
    } catch (Exception ex) {
        JOptionPane.showMessageDialog(null, "Unknown query type: " + ex.toString());
    }
    return termList;
}
/**
 * Returns the document names and search scores for a given search string.
 *
 * The string is parsed against the contents field and run against the
 * index. The hits are then restricted to the documents named in
 * criteria.getDocumentList() or, when that list is empty, to the documents
 * the metadata controller returns for the criteria.
 *
 * @param searchString the query text
 * @param criteria supplies the document restriction
 * @return file name => score for every permitted document that matched;
 *         null when searchString is null
 * @throws org.apache.lucene.index.CorruptIndexException
 * @throws java.io.IOException if the index cannot be read or was never loaded
 * @throws org.apache.lucene.queryParser.ParseException if searchString cannot be parsed
 */
public Hashtable<String, Float> returnResults(String searchString, SearchCriteria criteria) throws CorruptIndexException, IOException, ParseException{
    if (searchString == null) {
        return null;
    }
    // The constructor only reports a failed index load, so the reader may be missing.
    if (reader == null) {
        throw new IOException("Index is not loaded; cannot search.");
    }

    QueryParser parser = new QueryParser(field, new StandardAnalyzer());
    Query query = parser.parse(searchString);

    // Search the collection
    IndexSearcher searcher = new IndexSearcher(reader);
    CustomHitCollector collector = new CustomHitCollector(searcher, CustomHitCollector.ID_TYPE);
    searcher.search(query, collector);
    Hashtable<String, Float> scores = collector.getDocumentsList();

    // Documents the search is limited to: an explicit list wins over a metadata search.
    List<String> docList = criteria.getDocumentList();
    if (docList == null || docList.isEmpty()) {
        docList = this.metadata.getDocumentsBySearchCriteria(criteria);
    }

    // Now filter the documents based upon the criteria.
    Hashtable<String, Float> results = new Hashtable<String, Float>();
    if (docList == null || docList.isEmpty()) {
        return results;
    }
    Set<String> allowed = new HashSet<String>(docList);
    for (String doc : scores.keySet()) {
        if (allowed.contains(doc)) {
            results.put(doc, scores.get(doc));
        }
    }
    return results;
}
/**
 * Looks up the path a file was indexed with, for opening the file's contents.
 *
 * Runs an exact term search on the "file name" field and returns one of
 * the paths collected for the hits.
 *
 * @param inFileName the file name as stored in the index
 * @return a path of a matching document, or null when nothing matched
 * @throws org.apache.lucene.index.CorruptIndexException
 * @throws java.io.IOException
 */
public String getPath(String inFileName) throws CorruptIndexException, IOException{
    IndexSearcher pathSearcher = new IndexSearcher(reader);
    CustomHitCollector pathCollector = new CustomHitCollector(pathSearcher, CustomHitCollector.PATH_TYPE);
    pathSearcher.search(new TermQuery(new Term("file name", inFileName)), pathCollector);

    // Only the first collected path is of interest.
    Enumeration<String> paths = pathCollector.getDocumentsList().keys();
    if (paths.hasMoreElements()) {
        return paths.nextElement();
    }
    return null;
}
/**
 * Returns all of the file names in the index.
 *
 * Document ids run from 0 up to (but not including) maxDoc() and may
 * include deleted documents, so every id is visited and deleted ones are
 * skipped. Counting up to numDocs() instead would both hit deleted
 * documents and miss live ones with higher ids once anything is deleted.
 *
 * @return the stored "file name" value of every live document, in id order;
 *         an entry is null when a document has no such field
 * @throws org.apache.lucene.index.CorruptIndexException
 * @throws java.io.IOException
 */
public Vector<String> getAllFileNames() throws CorruptIndexException, IOException{
    Vector<String> files = new Vector<String>();
    int maxDoc = this.reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
        if (this.reader.isDeleted(i)) {
            continue;
        }
        Document doc = this.reader.document(i);
        files.add(doc.get("file name"));
    }
    return files;
}
/**
 * Hit collector that records, for every hit, a name for the document
 * together with its score and a running hit count.
 *
 * The name is either the stored "file name" field (ID_TYPE) or the stored
 * "path" field with each "::" replaced by the platform file separator
 * (PATH_TYPE).
 */
class CustomHitCollector extends HitCollector{
    public static final String PATH_TYPE = "path";
    public static final String ID_TYPE = "id";

    private IndexSearcher searcher;
    // list of documents and search scores
    private Hashtable<String, Float> documentsList = new Hashtable<String, Float>();
    // count of hits per document
    private Hashtable<String, Integer> documentHits = new Hashtable<String, Integer>();
    private String type = ID_TYPE; // default

    /**
     * @param searcher searcher used to load the stored fields of each hit
     * @param type PATH_TYPE to key results by path; any other value keys
     *             them by file name
     */
    public CustomHitCollector(IndexSearcher searcher, String type){
        this.searcher = searcher;
        this.type = type;
    }

    /**
     * Records one hit. Documents without the needed stored field are
     * skipped, and a document that cannot be read is reported on standard
     * output and skipped, so one bad document does not abort the search.
     *
     * @param doc id of the matching document
     * @param score score of the match
     */
    @Override
    public void collect(int doc, float score) {
        try {
            Document document = searcher.doc(doc);
            String fileName;
            // Constant first, so a null type simply falls back to file names.
            if (PATH_TYPE.equalsIgnoreCase(this.type)) {
                fileName = document.get("path");
                if (fileName != null) {
                    // Literal replacement; no regex escaping of the separator needed.
                    fileName = fileName.replace("::", java.io.File.separator);
                }
            } else {
                fileName = document.get("file name");
            }
            if (fileName == null) {
                // Field missing: nothing to key the hit by (Hashtable rejects null keys).
                return;
            }
            // track score per document
            this.documentsList.put(fileName, score);
            // track hits per document.
            Integer previous = this.documentHits.get(fileName);
            this.documentHits.put(fileName, previous == null ? 1 : previous + 1);
        } catch (IOException ex) {
            // Also covers CorruptIndexException, which is an IOException.
            System.out.println("Error reading document " + doc + ": " + ex.toString());
        }
    }

    /** @return name => score of the most recent hit recorded under that name */
    public Hashtable<String, Float> getDocumentsList(){
        return this.documentsList;
    }

    /** @return name => number of hits recorded under that name */
    public Hashtable<String, Integer> getDocumentHits(){
        return this.documentHits;
    }
}//end class CustomHitCollector
}//end class