diff --git a/IRProject/configure.properties b/IRProject/configure.properties index 3a93fa6..36714aa 100644 --- a/IRProject/configure.properties +++ b/IRProject/configure.properties @@ -1,5 +1,6 @@ irp.input.weights=0.2,0.3,0.1,0.1,0.3 +irp.input.file.iterator=input.CSVFileContentIterator #check https://lucene.apache.org/core/5_0_0/analyzers-common/overview-summary.html for various analyzer options irp.processing.algorigthm.class=textanalysis.TypicalAlgorithm @@ -11,10 +12,9 @@ irp.processing.analyzer=textanalysis.CustomAnalyzer irp.processing.hashing=hashing.SimHash32 irp.processing.distance=hashing.HammingDistance +irp.processing.processor=processing.Processor -irp.processing.processor=processing.ProcessorClass(changeme) - -irp.output.class=output.OutputImplementation(change me) +irp.output.class=output.SimpleOutput #logging information #log.file.path=/path/to/store/LOGS/ diff --git a/IRProject/src/main/java/configuration/Configuration.java b/IRProject/src/main/java/configuration/Configuration.java index 0012482..2397155 100644 --- a/IRProject/src/main/java/configuration/Configuration.java +++ b/IRProject/src/main/java/configuration/Configuration.java @@ -21,6 +21,7 @@ public class Configuration { private String hashingAlgorithm = null; private String hashDistance = null; private String weights; + private String contentIterator; private File inputFile; private Properties props = null; @@ -41,6 +42,7 @@ public Configuration(String filePath) { hashingAlgorithm = props.getProperty(Constants.hashingAlgorithm); hashDistance = props.getProperty(Constants.hashDistance); weights = props.getProperty(Constants.weights); + contentIterator = props.getProperty(Constants.contentIterator); if (!checkPropertiesIntegrity()) System.exit(-1); @@ -66,6 +68,10 @@ public List getWeights() { return floatWeights; } + public String getContentIterator() { + return contentIterator; + } + public String getHashDistance() { return hashDistance; } @@ -127,6 +133,11 @@ protected Boolean checkPropertiesIntegrity() { logger.error("no valid irp.processing.algorigthm.class value"); confHealth = false; } + if (contentIterator == null || contentIterator.trim().equals("")) { + + logger.error("no valid irp.input.file.iterator value"); + confHealth = false; + } if (getWeights() == null) { logger.error("no valid irp.input.weights values"); diff --git a/IRProject/src/main/java/configuration/Constants.java b/IRProject/src/main/java/configuration/Constants.java index 3e4cbae..4d1ba53 100644 --- a/IRProject/src/main/java/configuration/Constants.java +++ b/IRProject/src/main/java/configuration/Constants.java @@ -7,17 +7,15 @@ public final class Constants { public static final String analyzerClass = "irp.processing.analyzer"; public static final String outputClass = "irp.output.class"; - + public static final String processorClass = "irp.processing.processor"; public static final String hashingAlgorithm = "irp.processing.hashing"; - - public static final String hashDistance = "irp.processing.distance"; - - public static final String weights = "irp.input.weights"; - + public static final String hashDistance = "irp.processing.distance"; + public static final String weights = "irp.input.weights"; + public static final String contentIterator = "irp.input.file.iterator"; } diff --git a/IRProject/src/main/java/configuration/PropertiesSingleton.java b/IRProject/src/main/java/configuration/PropertiesSingleton.java index f0279c1..568cd40 100644 --- a/IRProject/src/main/java/configuration/PropertiesSingleton.java +++ b/IRProject/src/main/java/configuration/PropertiesSingleton.java @@ -31,7 +31,7 @@ private PropertiesSingleton() { } - protected static synchronized Properties getInstance() { + public static synchronized Properties getInstance() { if (instance == null) { instance = new PropertiesSingleton(); } diff --git a/IRProject/src/main/java/entry/IRP.java b/IRProject/src/main/java/entry/IRP.java index 378f3b0..20c1004 100644 --- a/IRProject/src/main/java/entry/IRP.java +++ b/IRProject/src/main/java/entry/IRP.java @@ -1,5 +1,10 @@ package entry; +import java.lang.reflect.InvocationTargetException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + import org.apache.lucene.analysis.Analyzer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,6 +30,8 @@ public static void main(String[] args) String processorClassString = configuration.getProcessorClass(); String hashingAlgorithm = configuration.getHashingAlgorithm(); String hashingDistance = configuration.getHashDistance(); + String contentIteratorName = configuration.getContentIterator(); + String outputClassName = configuration.getOutputClass(); Analyzer analyzer = (Analyzer) ClassObjectCreation.createClassObject(analyzerClassString); logger.info("Analyzer class is:" + analyzerClassString); @@ -39,11 +46,26 @@ public static void main(String[] args) HashDistance hashDistance = (HashDistance) ClassObjectCreation.createClassObject(hashingDistance); logger.info("Hashing distance algorithm class is:" + hashingDistance); + Iterator contentIterator = null; + try { + contentIterator = (Iterator) ClassObjectCreation.createClassObjectWithFileParam(contentIteratorName, + configuration.getInputFile()); + } catch (IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.exit(-1); + } + logger.info("Content iterator class:" + contentIteratorName); + Processor processor = (Processor) ClassObjectCreation.createClassObject(processorClassString); - Output result = processor.process(configuration.getInputFile(), textAnalysisAlgorithm, analyzer, ha, - hashDistance); - logger.info(result.toString()); + HashMap> result = processor.process(configuration.getInputFile(), textAnalysisAlgorithm, + analyzer, ha, hashDistance, contentIterator, configuration.getWeights()); + + Output outputResult = (Output) ClassObjectCreation.createClassObject(outputClassName); + logger.info("Output class name:" + outputClassName); + + logger.info("Final results:" + outputResult.presentData(result)); } diff --git a/IRProject/src/main/java/input/CSVFileContentIterator.java b/IRProject/src/main/java/input/CSVFileContentIterator.java new file mode 100644 index 0000000..4de8410 --- /dev/null +++ b/IRProject/src/main/java/input/CSVFileContentIterator.java @@ -0,0 +1,40 @@ +package input; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.LineIterator; + +public class CSVFileContentIterator implements Iterator { + private File inputFile; + int position = 0; + LineIterator it; + + + public CSVFileContentIterator(File inputFile) throws IOException { + // TODO Auto-generated constructor stub + this.inputFile = inputFile; + it = FileUtils.lineIterator(this.inputFile, "UTF-8"); + } + + @Override + public boolean hasNext() { + // TODO Auto-generated method stub + return it.hasNext(); + } + + @Override + public String next() { + // TODO Auto-generated method stub + return it.next(); + } + + @Override + public void remove() { + // TODO Auto-generated method stub + + } + +} diff --git a/IRProject/src/main/java/output/Output.java b/IRProject/src/main/java/output/Output.java index 9a2bc3f..4af1855 100644 --- a/IRProject/src/main/java/output/Output.java +++ b/IRProject/src/main/java/output/Output.java @@ -1,10 +1,11 @@ package output; import java.util.HashMap; +import java.util.List; public interface Output { - public Object presentData(HashMap finalMatrix); + public Object presentData(HashMap> fingerprints); } diff --git a/IRProject/src/main/java/output/SimpleOutput.java b/IRProject/src/main/java/output/SimpleOutput.java new file mode 100644 index 0000000..4f69bc5 --- /dev/null +++ b/IRProject/src/main/java/output/SimpleOutput.java @@ -0,0 +1,14 @@ +package output; + +import java.util.HashMap; +import java.util.List; + +public class SimpleOutput implements Output { + + @Override + public Object presentData(HashMap> fingerprints) { + // TODO Auto-generated method stub + return fingerprints; + } + +} diff --git a/IRProject/src/main/java/processing/Processor.java b/IRProject/src/main/java/processing/Processor.java index 64a5f3f..80b9095 100644 --- a/IRProject/src/main/java/processing/Processor.java +++ b/IRProject/src/main/java/processing/Processor.java @@ -1,7 +1,10 @@ package processing; import java.io.File; +import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; +import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.slf4j.Logger; @@ -10,10 +13,9 @@ import entry.IRP; import hashing.HashDistance; import hashing.SimHash; -import output.Output; import textanalysis.IRTextAnalysisAlgorithm; -public abstract class Processor { +public class Processor { private static final Logger logger = LoggerFactory.getLogger(IRP.class); File inputFile; @@ -21,37 +23,87 @@ public abstract class Processor { Analyzer textAnalyzer; SimHash hashingAlgorithm; HashDistance hashDistanceAlgorithm; + Iterator contentIterator; + List weights; + List> vMatrix; + List> fingerprintMatrix; private void setup(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer, - SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) { + SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm, Iterator contentIterator, + List weights) { this.inputFile = inputFile; this.textAnalysisAlgorithm = textAnalysisAlgorithm; this.textAnalyzer = textAnalyzer; this.hashingAlgorithm = hashingAlgorithm; this.hashDistanceAlgorithm = hashDistanceAlgorithm; + this.contentIterator = contentIterator; + this.weights = weights; + vMatrix = new ArrayList>(); + fingerprintMatrix = new ArrayList>(); + } private boolean checkInput(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer, - SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) { + SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm, Iterator contentIterator, + List weights) { + + if (inputFile != null && textAnalysisAlgorithm != null && textAnalyzer != null && hashingAlgorithm != null + && hashDistanceAlgorithm != null && contentIterator != null && weights != null) + return true; + + return false; + } + + protected List simhashTokens(List tokens) { + + return null; + } + + protected List calculateVMatrix(List simhashedTokens) { + + return null; + } + + protected List createFingerprint(List vMatrix) { + return null; + } - return true; + protected HashMap> fingerprintComparison(List> fingerprintMatrix) { + return null; } - public final Output process(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer, - SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) { - if (this.checkInput(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm)) - this.setup(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm); + public final HashMap> process(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, + Analyzer textAnalyzer, SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm, + Iterator contentIterator, List weights) { + if (this.checkInput(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm, + contentIterator, weights)) + this.setup(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm, + contentIterator, weights); else { logger.error("Input parameters are wrong, fix and re-run"); System.exit(-1); } - this.createFingerprintMatrix(); - ///etc - return null; + while (contentIterator.hasNext()) { + String fileLine = contentIterator.next(); + List lineTokens = textAnalysisAlgorithm.analyzeTextInput(textAnalyzer, fileLine); + logger.info("Tokens extracted:"+lineTokens.toString()); + List simhashedTokens = simhashTokens(lineTokens); + logger.info("Simhashed Tokens extracted:"+simhashedTokens); + List vMatrixRecord = calculateVMatrix(simhashedTokens); + logger.info("VMatrix record created:"+vMatrixRecord); + vMatrix.add(vMatrixRecord); + List fingerprint = createFingerprint(vMatrixRecord); + logger.info("Fingerpring record created:"+fingerprint); + fingerprintMatrix.add(fingerprint); + } + + return fingerprintComparison(fingerprintMatrix); } - abstract HashMap createFingerprintMatrix(); + public Iterator getContentIterator() { + return contentIterator; + } public HashDistance getHashDistanceAlgorithm() { return hashDistanceAlgorithm; diff --git a/IRProject/src/main/java/textanalysis/TypicalAlgorithm.java b/IRProject/src/main/java/textanalysis/TypicalAlgorithm.java index a409a3d..ad8e8ba 100644 --- a/IRProject/src/main/java/textanalysis/TypicalAlgorithm.java +++ b/IRProject/src/main/java/textanalysis/TypicalAlgorithm.java @@ -11,7 +11,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class TypicalAlgorithm implements IRTextAnalysisAlgorithm { - @Override public List analyzeTextInput(Analyzer analyzer, String inputText) { @@ -22,7 +21,8 @@ public List analyzeTextInput(Analyzer analyzer, String inputText) { stream.reset(); while (stream.incrementToken()) { - result.add(stream.getAttribute(CharTermAttribute.class).toString()); + String token = stream.getAttribute(CharTermAttribute.class).toString(); + result.add(token.trim()); } return result; } catch (IOException e) { diff --git a/IRProject/src/main/java/textanalysis/TypicalAlgotrithmTest.java b/IRProject/src/main/java/textanalysis/TypicalAlgotrithmTest.java index 136a0f5..f1949c6 100644 --- a/IRProject/src/main/java/textanalysis/TypicalAlgotrithmTest.java +++ b/IRProject/src/main/java/textanalysis/TypicalAlgotrithmTest.java @@ -16,7 +16,7 @@ public void testTokenizeInputAnalyzerString() { Analyzer analyzer = new CustomAnalyzer(); - List processInput = algotrithm.analyzeTextInput(analyzer, "(terminal?terminator),terminate"); + List processInput = algotrithm.analyzeTextInput(analyzer, "shit, shit, shit"); logger.info(processInput.toString()); diff --git a/IRProject/src/main/java/utilities/ClassObjectCreation.java b/IRProject/src/main/java/utilities/ClassObjectCreation.java index a308823..e489eef 100644 --- a/IRProject/src/main/java/utilities/ClassObjectCreation.java +++ b/IRProject/src/main/java/utilities/ClassObjectCreation.java @@ -1,5 +1,8 @@ package utilities; +import java.io.File; +import java.lang.reflect.InvocationTargetException; + public class ClassObjectCreation { private ClassObjectCreation() { @@ -17,4 +20,21 @@ public static Object createClassObject(String classname) } + public static Object createClassObjectWithFileParam(String classname, File inputFile) + throws ClassNotFoundException, InstantiationException, IllegalAccessException, IllegalArgumentException, + InvocationTargetException, NoSuchMethodException, SecurityException { + + ClassLoader classLoader = ClassLoader.getSystemClassLoader(); + Class class2Load; + + class2Load = classLoader.loadClass(classname); + + @SuppressWarnings("rawtypes") + Class[] cArg = new Class[1]; // Our constructor has 1 argument + cArg[0] = File.class; // First argument is of *object* type File + + return class2Load.getDeclaredConstructor(cArg).newInstance(inputFile); + + } + }