From 7b3aec6044088e7271a53bf11c4983a4f3bf58f3 Mon Sep 17 00:00:00 2001
From: Pierrik Lassalas
Date: Tue, 10 Oct 2017 14:21:51 +0200
Subject: [PATCH] gs-cv: added a comparison based on Levenshtein distance in OCRPlasty

---
 .../org/genericsystem/cv/utils/OCRPlasty.java | 59 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/gs-cv/src/main/java/org/genericsystem/cv/utils/OCRPlasty.java b/gs-cv/src/main/java/org/genericsystem/cv/utils/OCRPlasty.java
index 2b9040214..70f17c15d 100644
--- a/gs-cv/src/main/java/org/genericsystem/cv/utils/OCRPlasty.java
+++ b/gs-cv/src/main/java/org/genericsystem/cv/utils/OCRPlasty.java
@@ -38,17 +38,19 @@ public static List<String> getRansacInliers(List<String> labels) {
 			return Collections.emptyList();
 
 		// int maxLength = getMaxLcsLength(trimmed);
-		double maxSimilarity = getMaxSimilarity(trimmed);
+		// double maxSimilarity = getMaxSimilarity(trimmed);
+		double maxLevenshtein = getMaxLevenshtein(trimmed);
 
 		Map<Integer, String> bestFit = new HashMap<>();
 		for (int i = 1, maxAttempts = 10; bestFit.size() <= 3 && i <= maxAttempts; ++i) {
 			int t = 1;
 			// Ransac ransac = new Ransac<>(trimmed, getModelProviderMaxLcs(maxLength), 3, 50 * i, t, trimmed.size() / 2);
-			Ransac<String> ransac = new Ransac<>(trimmed, getModelProviderSimilarity(maxSimilarity), 3, 50 * i, t, trimmed.size() / 2);
+			// Ransac ransac = new Ransac<>(trimmed, getModelProviderSimilarity(maxSimilarity), 3, 50 * i, t, trimmed.size() / 2);
+			Ransac<String> ransac = new Ransac<>(trimmed, getModelProviderLevenshtein(maxLevenshtein), 3, 50 * i, t, trimmed.size() / 2);
 			try {
 				ransac.compute();
 				bestFit = ransac.getBestDataSet();
 				// bestFit.entrySet().forEach(entry -> System.out.println("key: " + entry.getKey() + " | value: " + entry.getValue()));
 			} catch (Exception e) {
 				// Can't get a good model. Increase the error margin
 				t += 1;
@@ -72,6 +74,27 @@ private static double getMaxSimilarity(List<String> labels) {
 		return max;
 	}
 
+	/**
+	 * Sums, over every ordered pair of labels, their Levenshtein distance divided by the sum of their lengths.
+	 *
+	 * @param labels the labels to compare pairwise
+	 * @return the accumulated normalized distance, 0 for an empty list
+	 */
+	private static double getMaxLevenshtein(List<String> labels) {
+		double total = 0d;
+		for (String base : labels) {
+			// Every ordered pair is visited, a label paired with itself included
+			for (String other : labels) {
+				int combinedLength = base.length() + other.length();
+				// Two empty labels would divide by zero and turn the whole sum into NaN or Infinity
+				if (combinedLength > 0) {
+					total += Levenshtein.distance(base, other) / (double) combinedLength;
+				}
+			}
+		}
+		return total;
+	}
+
 	private static Function<Collection<String>, Model<String>> getModelProviderMaxLcs(int maxLength) {
 		return datas -> {
 			Iterator<String> it = datas.iterator();
@@ -123,6 +146,38 @@ public Object[] getParams() {
 		};
 	}
 
+	/**
+	 * Provides a model that scores a label by the sum of its Levenshtein distances to the sampled labels.
+	 *
+	 * @param maxLevenshtein the reference value the summed distance is compared against
+	 * @return a function building the model from a sample of labels
+	 */
+	// TODO confirm the scales match: getMaxLevenshtein is length-normalized, the sum computed here is not
+	private static Function<Collection<String>, Model<String>> getModelProviderLevenshtein(double maxLevenshtein) {
+		return datas -> new Model<String>() {
+			// Summed distance of the most recently evaluated label, reported by getParams()
+			private double max = 0d;
+
+			@Override
+			public double computeError(String data) {
+				// Restart from zero on each call so that the error does not depend on previously evaluated labels
+				double sum = 0d;
+				for (String s : datas) {
+					if (!s.equals(data)) {
+						sum += Levenshtein.distance(data, s);
+					}
+				}
+				max = sum;
+				return Math.abs(sum - maxLevenshtein);
+			}
+
+			@Override
+			public Object[] getParams() {
+				return new Object[] { max };
+			}
+		};
+	}
+
 	public static String ocrPlasty(List<String> labels) {
 		if (labels == null || labels.isEmpty())
 			throw new IllegalStateException("Attempt to compute the longestCommonSubsequence on an empty list");