Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions IRProject/configure.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

irp.input.weights=0.2,0.3,0.1,0.1,0.3
irp.input.file.iterator=input.CSVFileContentIterator

#check https://lucene.apache.org/core/5_0_0/analyzers-common/overview-summary.html for various analyzer options
irp.processing.algorigthm.class=textanalysis.TypicalAlgorithm
Expand All @@ -11,10 +12,9 @@ irp.processing.analyzer=textanalysis.CustomAnalyzer
irp.processing.hashing=hashing.SimHash32
irp.processing.distance=hashing.HammingDistance

irp.processing.processor=processing.Processor

irp.processing.processor=processing.ProcessorClass(changeme)

irp.output.class=output.OutputImplementation(change me)
irp.output.class=output.SimpleOutput

#logging information
#log.file.path=/path/to/store/LOGS/
Expand Down
11 changes: 11 additions & 0 deletions IRProject/src/main/java/configuration/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public class Configuration {
private String hashingAlgorithm = null;
private String hashDistance = null;
private String weights;
private String contentIterator;

private File inputFile;
private Properties props = null;
Expand All @@ -41,6 +42,7 @@ public Configuration(String filePath) {
hashingAlgorithm = props.getProperty(Constants.hashingAlgorithm);
hashDistance = props.getProperty(Constants.hashDistance);
weights = props.getProperty(Constants.weights);
contentIterator = props.getProperty(Constants.contentIterator);

if (!checkPropertiesIntegrity())
System.exit(-1);
Expand All @@ -66,6 +68,10 @@ public List<Float> getWeights() {
return floatWeights;
}

/**
 * Returns the content-iterator setting held by this configuration.
 *
 * @return the value read from the {@code Constants.contentIterator} property
 *         when this configuration was constructed
 */
public String getContentIterator() {
    return this.contentIterator;
}

public String getHashDistance() {
return hashDistance;
}
Expand Down Expand Up @@ -127,6 +133,11 @@ protected Boolean checkPropertiesIntegrity() {
logger.error("no valid irp.processing.algorigthm.class value");
confHealth = false;
}
if (contentIterator == null || contentIterator.trim().equals("")) {

logger.error("no valid irp.input.file.iterator value");
confHealth = false;
}
if (getWeights() == null) {

logger.error("no valid irp.input.weights values");
Expand Down
10 changes: 4 additions & 6 deletions IRProject/src/main/java/configuration/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,15 @@ public final class Constants {
public static final String analyzerClass = "irp.processing.analyzer";

public static final String outputClass = "irp.output.class";

public static final String processorClass = "irp.processing.processor";

public static final String hashingAlgorithm = "irp.processing.hashing";

public static final String hashDistance = "irp.processing.distance";

public static final String weights = "irp.input.weights";


public static final String hashDistance = "irp.processing.distance";

public static final String weights = "irp.input.weights";

public static final String contentIterator = "irp.input.file.iterator";

}
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ private PropertiesSingleton() {

}

protected static synchronized Properties getInstance() {
public static synchronized Properties getInstance() {
if (instance == null) {
instance = new PropertiesSingleton();
}
Expand Down
28 changes: 25 additions & 3 deletions IRProject/src/main/java/entry/IRP.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
package entry;

import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -25,6 +30,8 @@ public static void main(String[] args)
String processorClassString = configuration.getProcessorClass();
String hashingAlgorithm = configuration.getHashingAlgorithm();
String hashingDistance = configuration.getHashDistance();
String contentIteratorName = configuration.getContentIterator();
String outputClassName = configuration.getOutputClass();

Analyzer analyzer = (Analyzer) ClassObjectCreation.createClassObject(analyzerClassString);
logger.info("Analyzer class is:" + analyzerClassString);
Expand All @@ -39,11 +46,26 @@ public static void main(String[] args)
HashDistance hashDistance = (HashDistance) ClassObjectCreation.createClassObject(hashingDistance);
logger.info("Hashing distance algorithm class is:" + hashingDistance);

Iterator<String> contentIterator = null;
try {
contentIterator = (Iterator<String>) ClassObjectCreation.createClassObjectWithFileParam(contentIteratorName,
configuration.getInputFile());
} catch (IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.exit(-1);
}
logger.info("Content iterator class:" + contentIteratorName);

Processor processor = (Processor) ClassObjectCreation.createClassObject(processorClassString);

Output result = processor.process(configuration.getInputFile(), textAnalysisAlgorithm, analyzer, ha,
hashDistance);
logger.info(result.toString());
HashMap<String, List<String>> result = processor.process(configuration.getInputFile(), textAnalysisAlgorithm,
analyzer, ha, hashDistance, contentIterator, configuration.getWeights());

Output outputResult = (Output) ClassObjectCreation.createClassObject(outputClassName);
logger.info("Output class name:" + outputClassName);

logger.info("Final results:" + outputResult.presentData(result));

}

Expand Down
40 changes: 40 additions & 0 deletions IRProject/src/main/java/input/CSVFileContentIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package input;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 * Read-only iterator that yields the lines of an input file one at a time.
 *
 * <p>The file is opened in the constructor and decoded as UTF-8. The underlying
 * {@link LineIterator} is closed as soon as {@link #hasNext()} reports that no
 * more lines are available, so a fully consumed iterator does not leak the
 * file handle.
 */
public class CSVFileContentIterator implements Iterator<String> {
    private File inputFile;
    // Not read or updated anywhere in this class; kept so the set of
    // package-visible fields is unchanged.
    int position = 0;
    LineIterator it;

    /**
     * Opens the given file for line-by-line reading.
     *
     * @param inputFile the file whose lines will be iterated
     * @throws IOException if the file cannot be opened
     */
    public CSVFileContentIterator(File inputFile) throws IOException {
        this.inputFile = inputFile;
        it = FileUtils.lineIterator(this.inputFile, "UTF-8");
    }

    /**
     * Reports whether another line is available.
     *
     * @return {@code true} if {@link #next()} would return a line
     */
    @Override
    public boolean hasNext() {
        boolean more = it.hasNext();
        if (!more) {
            // End of input: release the file handle. Closing is safe to repeat,
            // so calling hasNext() again after exhaustion is harmless.
            LineIterator.closeQuietly(it);
        }
        return more;
    }

    /**
     * Returns the next line of the file.
     *
     * @return the next line as produced by the underlying line iterator
     */
    @Override
    public String next() {
        return it.next();
    }

    /**
     * Removal is not supported because the iterator only reads from a file.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        // Fail loudly instead of silently ignoring the request, as the
        // Iterator contract prescribes for iterators without removal support.
        throw new UnsupportedOperationException("remove is not supported by CSVFileContentIterator");
    }

}
3 changes: 2 additions & 1 deletion IRProject/src/main/java/output/Output.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package output;

import java.util.HashMap;
import java.util.List;

public interface Output {

public Object presentData(HashMap<String, Integer> finalMatrix);
public Object presentData(HashMap<String, List<String>> fingerprints);


}
14 changes: 14 additions & 0 deletions IRProject/src/main/java/output/SimpleOutput.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package output;

import java.util.HashMap;
import java.util.List;

/**
 * {@link Output} implementation that applies no formatting to the results.
 */
public class SimpleOutput implements Output {

/**
 * Hands back the map it was given, unchanged.
 *
 * @param fingerprints the result map to present
 * @return the same {@code fingerprints} reference that was passed in
 */
@Override
public Object presentData(HashMap<String, List<String>> fingerprints) {
return fingerprints;
}

}
78 changes: 65 additions & 13 deletions IRProject/src/main/java/processing/Processor.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package processing;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.slf4j.Logger;
Expand All @@ -10,48 +13,97 @@
import entry.IRP;
import hashing.HashDistance;
import hashing.SimHash;
import output.Output;
import textanalysis.IRTextAnalysisAlgorithm;

public abstract class Processor {
public class Processor {
private static final Logger logger = LoggerFactory.getLogger(IRP.class);

File inputFile;
IRTextAnalysisAlgorithm textAnalysisAlgorithm;
Analyzer textAnalyzer;
SimHash hashingAlgorithm;
HashDistance hashDistanceAlgorithm;
Iterator<String> contentIterator;
List<Float> weights;
List<List<Float>> vMatrix;
List<List<Short>> fingerprintMatrix;

private void setup(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer,
SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) {
SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm, Iterator<String> contentIterator,
List<Float> weights) {
this.inputFile = inputFile;
this.textAnalysisAlgorithm = textAnalysisAlgorithm;
this.textAnalyzer = textAnalyzer;
this.hashingAlgorithm = hashingAlgorithm;
this.hashDistanceAlgorithm = hashDistanceAlgorithm;
this.contentIterator = contentIterator;
this.weights = weights;
vMatrix = new ArrayList<List<Float>>();
fingerprintMatrix = new ArrayList<List<Short>>();

}

private boolean checkInput(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer,
SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) {
SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm, Iterator<String> contentIterator,
List<Float> weights) {

if (inputFile != null && textAnalysisAlgorithm != null && textAnalyzer != null && hashingAlgorithm != null
&& hashDistanceAlgorithm != null && contentIterator != null && weights != null)
return true;

return false;
}

/**
 * Placeholder for hashing a line's tokens; not implemented yet.
 *
 * @param tokens the tokens of one input line (currently ignored)
 * @return always {@code null} in the current implementation
 */
protected List<Long> simhashTokens(List<String> tokens) {

return null;
}

/**
 * Placeholder for deriving one V-matrix record from hashed tokens; not
 * implemented yet.
 *
 * @param simhashedTokens the hashed tokens of one input line (currently ignored)
 * @return always {@code null} in the current implementation
 */
protected List<Float> calculateVMatrix(List<Long> simhashedTokens) {

return null;
}

/**
 * Placeholder for turning a V-matrix record into a fingerprint; not
 * implemented yet.
 *
 * @param vMatrix one V-matrix record (currently ignored)
 * @return always {@code null} in the current implementation
 */
protected List<Short> createFingerprint(List<Float> vMatrix) {
return null;
}

return true;
/**
 * Placeholder for comparing the collected fingerprints; not implemented yet.
 *
 * @param fingerprintMatrix the fingerprints gathered for all input lines
 *        (currently ignored)
 * @return always {@code null} in the current implementation
 */
protected HashMap<String, List<String>> fingerprintComparison(List<List<Short>> fingerprintMatrix) {
return null;
}

public final Output process(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm, Analyzer textAnalyzer,
SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm) {
if (this.checkInput(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm))
this.setup(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm);
public final HashMap<String, List<String>> process(File inputFile, IRTextAnalysisAlgorithm textAnalysisAlgorithm,
Analyzer textAnalyzer, SimHash hashingAlgorithm, HashDistance hashDistanceAlgorithm,
Iterator<String> contentIterator, List<Float> weights) {
if (this.checkInput(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm,
contentIterator, weights))
this.setup(inputFile, textAnalysisAlgorithm, textAnalyzer, hashingAlgorithm, hashDistanceAlgorithm,
contentIterator, weights);
else {
logger.error("Input parameters are wrong, fix and re-run");
System.exit(-1);
}
this.createFingerprintMatrix();
///etc

return null;
while (contentIterator.hasNext()) {
String fileLine = contentIterator.next();
List<String> lineTokens = textAnalysisAlgorithm.analyzeTextInput(textAnalyzer, fileLine);
logger.info("Tokens extracted:"+lineTokens.toString());
List<Long> simhashedTokens = simhashTokens(lineTokens);
logger.info("Simhashed Tokens extracted:"+simhashedTokens);
List<Float> vMatrixRecord = calculateVMatrix(simhashedTokens);
logger.info("VMatrix record created:"+vMatrixRecord);
vMatrix.add(vMatrixRecord);
List<Short> fingerprint = createFingerprint(vMatrixRecord);
logger.info("Fingerpring record created:"+fingerprint);
fingerprintMatrix.add(fingerprint);
}

return fingerprintComparison(fingerprintMatrix);
}

abstract HashMap<String, String> createFingerprintMatrix();
/**
 * Returns the content iterator currently stored on this processor.
 *
 * @return the value of the {@code contentIterator} field
 */
public Iterator<String> getContentIterator() {
    return this.contentIterator;
}

public HashDistance getHashDistanceAlgorithm() {
return hashDistanceAlgorithm;
Expand Down
4 changes: 2 additions & 2 deletions IRProject/src/main/java/textanalysis/TypicalAlgorithm.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TypicalAlgorithm implements IRTextAnalysisAlgorithm {


@Override
public List<String> analyzeTextInput(Analyzer analyzer, String inputText) {
Expand All @@ -22,7 +21,8 @@ public List<String> analyzeTextInput(Analyzer analyzer, String inputText) {

stream.reset();
while (stream.incrementToken()) {
result.add(stream.getAttribute(CharTermAttribute.class).toString());
String token = stream.getAttribute(CharTermAttribute.class).toString();
result.add(token.trim());
}
return result;
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public void testTokenizeInputAnalyzerString() {

Analyzer analyzer = new CustomAnalyzer();

List<String> processInput = algotrithm.analyzeTextInput(analyzer, "(terminal?terminator),terminate");
List<String> processInput = algotrithm.analyzeTextInput(analyzer, "shit, shit, shit");

logger.info(processInput.toString());

Expand Down
20 changes: 20 additions & 0 deletions IRProject/src/main/java/utilities/ClassObjectCreation.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package utilities;

import java.io.File;
import java.lang.reflect.InvocationTargetException;

public class ClassObjectCreation {

private ClassObjectCreation() {
Expand All @@ -17,4 +20,21 @@ public static Object createClassObject(String classname)

}

/**
 * Creates an instance of the named class through its constructor that takes
 * a single {@link File} argument.
 *
 * <p>The class is loaded with the system class loader, and the constructor is
 * looked up among the class's declared constructors.
 *
 * @param classname fully qualified name of the class to instantiate
 * @param inputFile the file passed to the constructor
 * @return the newly created instance
 * @throws ClassNotFoundException if the class cannot be loaded
 * @throws NoSuchMethodException if the class declares no constructor taking a {@code File}
 * @throws InvocationTargetException if the constructor itself throws
 * @throws InstantiationException if the class cannot be instantiated
 * @throws IllegalAccessException if the constructor is not accessible
 */
public static Object createClassObjectWithFileParam(String classname, File inputFile)
        throws ClassNotFoundException, InstantiationException, IllegalAccessException, IllegalArgumentException,
        InvocationTargetException, NoSuchMethodException, SecurityException {

    final Class<?> target = ClassLoader.getSystemClassLoader().loadClass(classname);

    // Resolve the one-argument (File) constructor directly and invoke it.
    return target.getDeclaredConstructor(File.class).newInstance(inputFile);

}

}