few minor changes

ferhanture committed Mar 27, 2012
1 parent 8805c92 commit 70372d143747ffb3fc62ebe62cb3622ca3353f22
@@ -0,0 +1,77 @@
+package ivory.core.tokenize;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+public class LuceneArabicAnalyzer extends Tokenizer {
+  // Lucene's ArabicAnalyzer chain handles orthographic normalization, light stemming, and stopword removal.
+  ArabicAnalyzer a;
+  TokenStream ts;
+
+  @Override
+  public void configure(Configuration conf) {
+    a = new ArabicAnalyzer(Version.LUCENE_40);
+  }
+
+  @Override
+  public void configure(Configuration mJobConf, FileSystem fs) {
+    a = new ArabicAnalyzer(Version.LUCENE_40);
+  }
+
+  @Override
+  public String[] processContent(String text) {
+    List<String> tokens = new ArrayList<String>();
+    try {
+      // The field name is irrelevant here; the analyzer is only used to tokenize free text.
+      ts = a.tokenStream("dummy", new StringReader(text));
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      ts.reset();
+      while (ts.incrementToken()) {
+        tokens.add(termAtt.toString());
+      }
+      ts.end();
+      ts.close();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return tokens.toArray(new String[tokens.size()]);
+  }
+
+  public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
+    if (args.length < 3) {
+      System.err.println("usage: [input] [language] [output-file]");
+      System.exit(-1);
+    }
+//    ivory.core.tokenize.Tokenizer tokenizer = TokenizerFactory.createTokenizer(args[1], args[2], null);
+    ivory.core.tokenize.Tokenizer tokenizer = new LuceneArabicAnalyzer();
+    tokenizer.configure(null);
+    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[2]), "UTF8"));
+    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF8"));
+
+    String line = null;
+    while ((line = in.readLine()) != null) {
+      String[] tokens = tokenizer.processContent(line);
+      System.out.println("Found " + tokens.length + " tokens:");
+      // Join tokens with "||" so the segmentation is easy to inspect in the output file.
+      StringBuilder s = new StringBuilder();
+      for (String token : tokens) {
+        s.append(token).append("||");
+      }
+      out.write(s.toString() + "\n");
+    }
+    in.close();
+    out.close();
+  }
+}
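
For reference, a minimal sketch of driving the new class programmatically rather than through the command-line main above; the demo class name and sample string are illustrative only, and the ivory and Lucene jars are assumed to be on the classpath:

// Usage sketch (illustrative): tokenize a single string with the new analyzer.
public class LuceneArabicAnalyzerDemo {
  public static void main(String[] args) {
    ivory.core.tokenize.Tokenizer tokenizer = new LuceneArabicAnalyzer();
    tokenizer.configure(null);  // instantiates the underlying ArabicAnalyzer
    for (String token : tokenizer.processContent("القاهرة عاصمة مصر")) {
      System.out.println(token);
    }
  }
}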
@@ -867,7 +867,7 @@ public void setVocab(VocabularyWritable v){
for(String token : tokens){
token = removeNonUnicodeChars(token);
if(isDiscard(token)){
- sLogger.warn("Discarded stopword "+token);
+// sLogger.warn("Discarded stopword "+token);
continue;
}
@@ -881,7 +881,7 @@ public void setVocab(VocabularyWritable v){
//skip if out of vocab
if(vocab!=null && vocab.get(stemmed)<=0){
- sLogger.warn("Discarded OOV "+token);
+// sLogger.warn("Discarded OOV "+token);
continue;
}
stemmedTokens.add(stemmed);
@@ -935,6 +935,7 @@ public boolean isStopWord(String token) {
public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
if(args.length < 4){
System.err.println("usage: [input] [language] [tokenizer-model-path] [output-file]");
+ System.exit(-1);
}
ivory.core.tokenize.Tokenizer tokenizer = TokenizerFactory.createTokenizer(args[1], args[2], null);
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[3]), "UTF8"));
@@ -92,6 +92,7 @@ public void configure(Configuration conf, FileSystem fs) {
public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
if(args.length < 4){
System.err.println("usage: [input] [language] [tokenizer-model-path] [output-file]");
+ System.exit(-1);
}
ivory.core.tokenize.Tokenizer tokenizer = TokenizerFactory.createTokenizer(args[1], args[2], null);
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[3]), "UTF8"));
@@ -45,9 +45,9 @@ public int getNumberTokens(String text){
*/
public String removeNonUnicodeChars(String token) {
StringBuffer fixedToken = new StringBuffer();
- for(int i=0; i<token.length(); i++){
+ for (int i = 0; i < token.length(); i++) {
char c = token.charAt(i);
- if(Character.getNumericValue(c)>=0){
+ if (Character.getNumericValue(c) >= -1) {
fixedToken.append(c);
}
}
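
A note on the threshold change above: Character.getNumericValue returns a nonnegative value only for digits, the Latin letters A-Z/a-z (mapped to 10 through 35), and characters with a Unicode numeric property, so the old test (>= 0) silently discarded Arabic letters, which have no numeric value and yield -1. The new test (>= -1) strips only characters whose numeric value is fractional or negative, which return -2. A small sketch of the relevant cases:

// Return values of Character.getNumericValue for the cases the filter distinguishes.
public class NumericValueDemo {
  public static void main(String[] args) {
    System.out.println(Character.getNumericValue('7'));      // 7  -- digit
    System.out.println(Character.getNumericValue('a'));      // 10 -- Latin letters map to 10-35
    System.out.println(Character.getNumericValue('\u0628')); // -1 -- Arabic letter beh: no numeric value
    System.out.println(Character.getNumericValue('!'));      // -1 -- punctuation: no numeric value
    System.out.println(Character.getNumericValue('\u00BD')); // -2 -- vulgar fraction 1/2: not a nonnegative integer
  }
}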
@@ -172,16 +172,11 @@ public static float cosineNormalized(HMapSFW vectorA, HMapSFW vectorB) {
return sum;
}
- /**
- ************
- */
public static float cosineNormalized2(HMapSFW vectorA, HMapSFW vectorB) {
-// logger.setLevel(Level.DEBUG);
float sum = 0;
for(edu.umd.cloud9.util.map.MapKF.Entry<String> e : vectorA.entrySet()){
float value = e.getValue();
if(vectorB.containsKey(e.getKey())){
-// logger.debug("Matched "+ e.getKey()+"="+value+" x "+vectorB.get(e.getKey()));
sum+= value*vectorB.get(e.getKey());
}
}
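
Since cosineNormalized2 assumes its arguments are already L2-normalized, the dot product over the keys the two maps share is the cosine similarity itself; no division by the vector norms is needed. A self-contained sketch of the same computation over plain HashMaps (class name and sample values are illustrative):

import java.util.HashMap;
import java.util.Map;

// Sketch: cosine similarity of two pre-normalized sparse vectors, mirroring cosineNormalized2.
public class CosineDemo {
  static float dotProduct(Map<String, Float> a, Map<String, Float> b) {
    float sum = 0f;
    for (Map.Entry<String, Float> e : a.entrySet()) {
      Float other = b.get(e.getKey());
      if (other != null) {  // only keys present in both vectors contribute
        sum += e.getValue() * other;
      }
    }
    return sum;
  }

  public static void main(String[] args) {
    Map<String, Float> a = new HashMap<String, Float>();
    a.put("kitab", 0.8f);
    a.put("qalam", 0.6f);   // ||a|| = sqrt(0.64 + 0.36) = 1
    Map<String, Float> b = new HashMap<String, Float>();
    b.put("kitab", 0.6f);
    b.put("daftar", 0.8f);  // ||b|| = 1 as well
    System.out.println(dotProduct(a, b));  // 0.8f * 0.6f, approximately 0.48
  }
}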
