Skip to content

Commit

Permalink
Updates to the spike exploring how to write the analyzer, and updates to the analyzer resulting from the spike explorations.
Browse files Browse the repository at this point in the history
  • Loading branch information
gtarcea committed Nov 15, 2011
1 parent 1f39397 commit 3fb0e96
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 3 deletions.
@@ -0,0 +1,26 @@
package org.fdl.lucene.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 * Analyzer for congressional bill text. Analysis chain:
 * MappingCharFilter ("/" -> " ") -> StandardTokenizer -> StandardFilter
 * -> LowerCaseFilter -> CongressionalBillFilter.
 *
 * The char filter folds "/" to a space so joined bill references such as
 * "HR 1293/S. 1338" are split into separate tokens before tokenization.
 */
public class CongressionalBillAnalyzer extends Analyzer
{
    // Built once and shared: the map is populated at class-init time and only
    // read afterwards, so rebuilding it on every tokenStream() call (as the
    // original did) is unnecessary work.
    private static final NormalizeCharMap CHAR_MAP = new NormalizeCharMap();
    static
    {
        CHAR_MAP.add("/", " ");
    }

    /**
     * Builds the token stream for the given field text.
     *
     * @param fieldName ignored; the same chain is used for every field
     * @param reader    source of the raw text to analyze
     * @return the fully filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        CharStream cstream = new MappingCharFilter(CHAR_MAP, reader);
        TokenStream tokenStream = new StandardFilter(Version.LUCENE_30, new StandardTokenizer(Version.LUCENE_30, cstream));
        return new CongressionalBillFilter(new LowerCaseFilter(Version.LUCENE_30, tokenStream));
    }
}
18 changes: 15 additions & 3 deletions src/test/groovy/lia/analysis/AnalyzerAndFilterSpikeTest.java
Expand Up @@ -4,12 +4,15 @@
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
import org.junit.Test;

Expand Down Expand Up @@ -77,8 +80,14 @@ private static class CongressionalTextAnalyzer extends Analyzer
{
/**
 * Builds the analysis chain under test:
 * MappingCharFilter ("/" -> " ") -> StandardTokenizer -> StandardFilter
 * -> LowerCaseFilter -> CongressionalBillFilter.
 *
 * @param fieldName ignored; the same chain is used for every field
 * @param reader    source of the raw text to analyze
 * @return the fully filtered token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader)
{
    // Fold "/" to a space before tokenizing so joined bill references such
    // as "HR 1293/S. 1338" split into separate tokens.
    NormalizeCharMap charMap = new NormalizeCharMap();
    charMap.add("/", " ");
    CharStream cstream = new MappingCharFilter(charMap, reader);
    TokenStream tokenized = new StandardTokenizer(Version.LUCENE_30, cstream);
    TokenStream standardized = new StandardFilter(Version.LUCENE_30, tokenized);
    return new CongressionalBillFilter(new LowerCaseFilter(Version.LUCENE_30, standardized));
}
}
Expand All @@ -89,6 +98,9 @@ public void test() throws IOException
String sentence1 = "oppose HR 503, H.R. 504, (HR 505), H.R.506, HR507, S 727, S. 728, S.729, (S.730), S731 HR 1293/S. 1338 criminalizing transport and export of equines for slaughter for human consumption";
Analyzer a = new CongressionalTextAnalyzer();
AnalyzerUtils.displayTokensWithFullDetails(a, sentence1);

//a = new StandardAnalyzer(Version.LUCENE_30);
//AnalyzerUtils.displayTokensWithFullDetails(a, sentence1);
}

}

0 comments on commit 3fb0e96

Please sign in to comment.