diff --git a/pom.xml b/pom.xml
index d840c9c..639b773 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,26 +4,80 @@
de.grundid.maven
java-parent
- 2011.0
+ 2012.1
de.grundid.twiki
wiktionary-parser
- 0.0.1-SNAPSHOT
+ 1.0.0-SNAPSHOT
jar
- wiktionary-parser
- http://maven.apache.org
-
UTF-8
+ 3.1.1.RELEASE
-
- junit
- junit
-
+
+ org.springframework
+ spring-orm
+ ${org.springframework-version}
+
+
+ org.springframework
+ spring-test
+ ${org.springframework-version}
+ test
+
+
+ org.springframework.data
+ spring-data-jpa
+ 1.1.0.RC1
+
+
+ commons-logging
+ commons-logging
+
+
+
+
+ junit
+ junit
+
+
+ org.hibernate
+ hibernate-entitymanager
+ 3.6.4.Final
+
+
+ commons-dbcp
+ commons-dbcp
+ 1.4
+
+
+ org.hibernate.javax.persistence
+ hibernate-jpa-2.0-api
+ 1.0.1.Final
+
+
+ mysql
+ mysql-connector-java
+ 5.1.18
+
+
+
+
+ spring-milestone
+ Spring Maven MILESTONE Repository
+ http://repo.springsource.org/libs-milestone
+
+
+ spring-snapshot
+ Spring Maven SNAPSHOT Repository
+ http://repo.springsource.org/libs-snapshot
+
+
+
diff --git a/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java b/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java
new file mode 100644
index 0000000..4d77bce
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java
@@ -0,0 +1,15 @@
+package de.grundid.twiki.jpa;
+
+import java.util.Map;
+
+import org.hibernate.cfg.ImprovedNamingStrategy;
+
+public class HibernateJpaVendorAdapter extends org.springframework.orm.jpa.vendor.HibernateJpaVendorAdapter {
+    /** Returns the parent adapter's JPA properties with {@link ImprovedNamingStrategy} registered as naming strategy. */
+    @Override
+    public Map<String, Object> getJpaPropertyMap() {
+        Map<String, Object> jpaProperties = super.getJpaPropertyMap();
+        jpaProperties.put("hibernate.ejb.naming_strategy", ImprovedNamingStrategy.class.getName());
+        return jpaProperties;
+    }
+}
diff --git a/src/main/java/de/grundid/twiki/jpa/WikiEntry.java b/src/main/java/de/grundid/twiki/jpa/WikiEntry.java
new file mode 100644
index 0000000..cd7af82
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/jpa/WikiEntry.java
@@ -0,0 +1,58 @@
+package de.grundid.twiki.jpa;
+
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+
+/** JPA entity with a generated Integer id and four String attributes: source, title, category and entry. */
+@Entity
+public class WikiEntry {
+    @Id
+    @GeneratedValue
+    private Integer wikiEntryId;
+    private String source;
+    private String title;
+    private String category;
+    private String entry;
+    // ---- getters ----
+    public Integer getWikiEntryId() {
+        return this.wikiEntryId;
+    }
+
+    public String getSource() {
+        return this.source;
+    }
+
+    public String getTitle() {
+        return this.title;
+    }
+
+    public String getCategory() {
+        return this.category;
+    }
+
+    public String getEntry() {
+        return this.entry;
+    }
+
+    // ---- setters ----
+    public void setWikiEntryId(Integer wikiEntryId) {
+        this.wikiEntryId = wikiEntryId;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public void setCategory(String category) {
+        this.category = category;
+    }
+
+    public void setEntry(String entry) {
+        this.entry = entry;
+    }
+}
diff --git a/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java b/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java
new file mode 100644
index 0000000..11c1b8a
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java
@@ -0,0 +1,7 @@
+package de.grundid.twiki.jpa;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+
+public interface WikiEntryRepository extends JpaRepository<WikiEntry, Integer> {
+    // Spring Data JPA repository for WikiEntry, keyed by its Integer id; no custom query methods are declared.
+}
diff --git a/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java
deleted file mode 100644
index 071af4c..0000000
--- a/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java
+++ /dev/null
@@ -1,46 +0,0 @@
-package de.grundid.twiki.parser;
-
-import java.util.HashSet;
-import java.util.Set;
-
-public class DbWriterConsumer extends Consumer {
-
- private static Set ignorePrefixes = new HashSet();
- static {
- ignorePrefixes.add("MediaWiki:");
- ignorePrefixes.add("Wiktionary:");
-
- ignorePrefixes.add("Datei:");
- ignorePrefixes.add("Hilfe:");
- ignorePrefixes.add("Vorlage:");
- ignorePrefixes.add("Kategorie:");
- ignorePrefixes.add("Thesaurus:");
- ignorePrefixes.add("Verzeichnis:");
-
- ignorePrefixes.add("Appendix:");
- ignorePrefixes.add("Category:");
- ignorePrefixes.add("Help:");
- ignorePrefixes.add("Template:");
- ignorePrefixes.add("Rhymes:");
- ignorePrefixes.add("Rhmyes:");
- ignorePrefixes.add("Sign gloss:");
- ignorePrefixes.add("Summary:");
- ignorePrefixes.add("Thread:");
- ignorePrefixes.add("Transwiki:");
- ignorePrefixes.add("Unsupported titles/");
- ignorePrefixes.add("Wikisaurus:");
- ignorePrefixes.add("Citations:");
- ignorePrefixes.add("Concordance:");
- ignorePrefixes.add("File:");
- ignorePrefixes.add("Glossary:");
- ignorePrefixes.add("Index:");
- ignorePrefixes.add("Wiktionary talk:");
- }
-
- @Override
- protected void consume(WiktionaryEntry element) {
- // TODO Auto-generated method stub
-
- }
-
-}
diff --git a/src/main/java/de/grundid/twiki/parser/ImportHandler.java b/src/main/java/de/grundid/twiki/parser/ImportHandler.java
index faee818..35ba155 100644
--- a/src/main/java/de/grundid/twiki/parser/ImportHandler.java
+++ b/src/main/java/de/grundid/twiki/parser/ImportHandler.java
@@ -9,6 +9,8 @@
import org.xml.sax.helpers.DefaultHandler;
+import de.grundid.twiki.parser.consumer.Consumer;
+
public class ImportHandler implements Runnable {
private int defaultQueueSize = 20000;
diff --git a/src/main/java/de/grundid/twiki/parser/WiktionaryData.java b/src/main/java/de/grundid/twiki/parser/WiktionaryData.java
new file mode 100644
index 0000000..814a4db
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/parser/WiktionaryData.java
@@ -0,0 +1,71 @@
+package de.grundid.twiki.parser;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class WiktionaryData {
+    // Part-of-speech labels; identical to the first 16 entries of partsOfSpeechAll.
+    public static String[] partsOfSpeech = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI",
+            "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung",
+            "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen" };
+    // Extended list of part-of-speech labels.
+    public static String[] partsOfSpeechAll = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI",
+            "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung",
+            "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen", "Interjektion",
+            "Präposition", "Sprichwort", "Konjunktion", "GebundenesLexem", "Präfix", "Gerundium", "Eigenname",
+            "Suffix", "Vorname", "Ortsnamen-Grundwort", "Hiragana", "Indefinitpronomen", "Grußformel", "Pronomen",
+            "Demonstrativpronomen", "Zahlzeichen", "Possessivpronomen", "Artikel", "Partikel", "Buchstabe",
+            "Interrogativpronomen", "Kontraktion", "Subjunktion", "Symbol", "Reflexivpronomen", "Komparativ",
+            "Onomatopoetikum", "Umschrift", "Hilfsverb", "Interrogativadverb", "Relativpronomen", "Zahl",
+            "Gradpartikel", "Pronominaladverb", "Merkspruch", "Affix", "Antwortpartikel", "Konjunktionaladverb",
+            "Superlativ", "Modalpartikel", "Negationspartikel", "Postposition", "Präfixoid",
+            "ReflexivesPersonalpronomen", "Satzzeichen", "Fokuspartikel", "Hanzi", "Katakana",
+            "KontraktionausPräpositionundArtikel", "ReflexivesPossessivpronomen", "Reziprokpronomen",
+            "Singularetantum", "Suffixoid", "Zahladjektiv", "Zahlklassifikator" };
+    // Title prefixes that isPrefixed/getPrefixIfAny test against; typed as Set<String> so String iteration compiles.
+    public static Set<String> ignorePrefixes = new HashSet<String>();
+    static {
+        ignorePrefixes.add("MediaWiki:");
+        ignorePrefixes.add("Wiktionary:");
+        // German-named prefixes
+        ignorePrefixes.add("Datei:");
+        ignorePrefixes.add("Hilfe:");
+        ignorePrefixes.add("Vorlage:");
+        ignorePrefixes.add("Kategorie:");
+        ignorePrefixes.add("Thesaurus:");
+        ignorePrefixes.add("Verzeichnis:");
+        // English-named prefixes
+        ignorePrefixes.add("Appendix:");
+        ignorePrefixes.add("Category:");
+        ignorePrefixes.add("Help:");
+        ignorePrefixes.add("Template:");
+        ignorePrefixes.add("Rhymes:");
+        ignorePrefixes.add("Rhmyes:");
+        ignorePrefixes.add("Sign gloss:");
+        ignorePrefixes.add("Summary:");
+        ignorePrefixes.add("Thread:");
+        ignorePrefixes.add("Transwiki:");
+        ignorePrefixes.add("Unsupported titles/");
+        ignorePrefixes.add("Wikisaurus:");
+        ignorePrefixes.add("Citations:");
+        ignorePrefixes.add("Concordance:");
+        ignorePrefixes.add("File:");
+        ignorePrefixes.add("Glossary:");
+        ignorePrefixes.add("Index:");
+        ignorePrefixes.add("Wiktionary talk:");
+    }
+    /** Returns true when the title starts with any entry of {@link #ignorePrefixes}. */
+    public static boolean isPrefixed(String title) {
+        return getPrefixIfAny(title) != null;
+    }
+    /** Returns the entry of {@link #ignorePrefixes} the title starts with, or null when there is none. */
+    public static String getPrefixIfAny(String title) {
+        for (String prefix : WiktionaryData.ignorePrefixes) {
+            if (title.startsWith(prefix)) {
+                return prefix;
+            }
+        }
+        return null;
+    }
+
+}
diff --git a/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java b/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java
index 0d7a9e9..1c0209a 100644
--- a/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java
+++ b/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java
@@ -11,29 +11,30 @@ public class WiktionaryImporter extends OsmDefaultHandler {
private WiktionaryEntry entry;
private Deque tagQueue = new LinkedList();
+ private StringBuilder content = new StringBuilder();
@Override
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
tagQueue.add(name);
+ content.setLength(0);
if (name.equals("page"))
entry = new WiktionaryEntry();
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
- String currentTag = tagQueue.peekLast();
- String content = new String(ch, start, length).trim();
- if (currentTag.equals("title")) {
- entry.setTitle(content);
- }
- else if (currentTag.equals("text")) {
- entry.setText(content);
- }
+ content.append(ch, start, length);
}
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
- tagQueue.removeLast();
+ String currentTag = tagQueue.removeLast();
+ if (currentTag.equals("title")) {
+ entry.setTitle(content.toString());
+ }
+ else if (currentTag.equals("text")) {
+ entry.setText(content.toString());
+ }
if (entry == null)
return;
diff --git a/src/main/java/de/grundid/twiki/parser/Consumer.java b/src/main/java/de/grundid/twiki/parser/consumer/Consumer.java
similarity index 95%
rename from src/main/java/de/grundid/twiki/parser/Consumer.java
rename to src/main/java/de/grundid/twiki/parser/consumer/Consumer.java
index d2ea89e..f120bb5 100644
--- a/src/main/java/de/grundid/twiki/parser/Consumer.java
+++ b/src/main/java/de/grundid/twiki/parser/consumer/Consumer.java
@@ -1,4 +1,4 @@
-package de.grundid.twiki.parser;
+package de.grundid.twiki.parser.consumer;
import java.util.concurrent.BlockingQueue;
diff --git a/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java
new file mode 100644
index 0000000..2e38132
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java
@@ -0,0 +1,42 @@
+package de.grundid.twiki.parser.consumer;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import de.grundid.twiki.jpa.WikiEntry;
+import de.grundid.twiki.jpa.WikiEntryRepository;
+import de.grundid.twiki.parser.WiktionaryData;
+import de.grundid.twiki.parser.WiktionaryEntry;
+
+/** Consumer that stores every entry it receives as a {@link WikiEntry} through the injected repository. */
+@Component
+public class DbWriterConsumer extends Consumer {
+
+    @Autowired
+    private WikiEntryRepository wikiEntryRepository;
+
+    // Value written to WikiEntry.source for every saved entry; stays null until setCurrentSource is called.
+    private String currentSource;
+
+    /** Copies title and text into a new WikiEntry, categorises it and saves it. */
+    @Override
+    protected void consume(WiktionaryEntry element) {
+        String title = element.getTitle();
+        // Reuse the shared prefix lookup rather than keeping a second copy of the matching loop here.
+        String prefix = WiktionaryData.getPrefixIfAny(title);
+
+        WikiEntry wikiEntry = new WikiEntry();
+        wikiEntry.setSource(currentSource);
+        wikiEntry.setTitle(title);
+        wikiEntry.setEntry(element.getText());
+        // A title with a known prefix is categorised by that prefix; any other title is a plain word.
+        wikiEntry.setCategory(prefix != null ? prefix : "_word_");
+        wikiEntryRepository.save(wikiEntry);
+    }
+
+    /** Sets the source label attached to entries consumed from now on. */
+    public void setCurrentSource(String currentSource) {
+        this.currentSource = currentSource;
+    }
+
+}
diff --git a/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java
new file mode 100644
index 0000000..6e2ec25
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java
@@ -0,0 +1,12 @@
+package de.grundid.twiki.parser.consumer;
+
+import de.grundid.twiki.parser.WiktionaryEntry;
+
+public class DummyConsumer extends Consumer {
+ // Consumer whose consume() does nothing with the entries it is handed.
+ @Override
+ protected void consume(WiktionaryEntry element) {
+ // Intentionally empty: the element is ignored.
+ }
+
+}
diff --git a/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java
new file mode 100644
index 0000000..04748c2
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java
@@ -0,0 +1,63 @@
+package de.grundid.twiki.parser.consumer;
+
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import de.grundid.twiki.parser.WiktionaryData;
+import de.grundid.twiki.parser.WiktionaryEntry;
+
+/** Counts, per registered pattern, the line fragments that begin at the pattern's first occurrence in an entry. */
+public class SimpleFilterConsumer extends Consumer {
+
+    // pattern -> (normalised fragment -> number of occurrences); TreeMaps keep the printed report sorted by key
+    private Map<String, Map<String, Integer>> matches = new TreeMap<String, Map<String, Integer>>();
+    private int total;
+
+    /** Registers a literal substring to search for; registering the same pattern again resets its counts. */
+    public void addPattern(String pattern) {
+        matches.put(pattern, new TreeMap<String, Integer>());
+    }
+
+    /** Scans the entry text unless the title starts with one of the ignored prefixes. */
+    @Override
+    protected void consume(WiktionaryEntry element) {
+        if (!WiktionaryData.isPrefixed(element.getTitle())) {
+            addIfMatches(element.getText());
+        }
+    }
+
+    /** For each pattern found in the text, counts the fragment from its first occurrence to the end of that line. */
+    private void addIfMatches(String text) {
+        for (Entry<String, Map<String, Integer>> entry : matches.entrySet()) {
+            int pos = text.indexOf(entry.getKey());
+            if (pos < 0) {
+                continue;
+            }
+            int endOfLine = text.indexOf("\n", pos);
+            if (endOfLine < 0) {
+                // Match is on the last line and the text has no trailing newline: take the rest of the text.
+                endOfLine = text.length();
+            }
+            String match = text.substring(pos, endOfLine).replaceAll(" |'|,|=|/", "");
+            incMatch(match, entry.getValue());
+        }
+    }
+
+    /** Adds one to the counter kept for the fragment and to the overall total. */
+    private void incMatch(String match, Map<String, Integer> subMatches) {
+        Integer count = subMatches.get(match);
+        subMatches.put(match, count == null ? Integer.valueOf(1) : Integer.valueOf(count + 1));
+        total++;
+    }
+
+    /** Prints one tab-separated line per pattern/fragment pair to stdout, followed by the total. */
+    public void outputMatches() {
+        for (Entry<String, Map<String, Integer>> entry : matches.entrySet()) {
+            for (Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
+                System.out.println(entry.getKey() + "\t" + subEntry.getKey() + "\t" + subEntry.getValue());
+            }
+        }
+        System.out.println("Total: " + total);
+    }
+}
diff --git a/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java
new file mode 100644
index 0000000..00542ac
--- /dev/null
+++ b/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java
@@ -0,0 +1,59 @@
+package de.grundid.twiki.parser.consumer;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Set;
+import java.util.TreeSet;
+
+import de.grundid.twiki.parser.WiktionaryData;
+import de.grundid.twiki.parser.WiktionaryEntry;
+
+/** Collects the titles of non-prefixed entries and writes them, sorted, to a text file when consuming finishes. */
+public class SortedFileWriterConsumer extends Consumer {
+    private Set<String> words = new TreeSet<String>(); // sorted and duplicate-free
+    private String outputFile; // path passed to FileWriter in writeWords()
+
+    public void setOutputFile(String outputFile) {
+        this.outputFile = outputFile;
+    }
+
+    /** Remembers the title unless the title starts with an ignored prefix (title.startsWith(prefix), not the reverse). */
+    @Override
+    protected void consume(WiktionaryEntry element) {
+        String title = element.getTitle();
+        if (!WiktionaryData.isPrefixed(title)) {
+            words.add(title);
+        }
+    }
+
+    @Override
+    protected void finishConsuming() {
+        writeWords(); // the full title set is only known now, so the file is written once at the end
+    }
+
+    /** Writes one collected title per line; I/O failures are reported on stderr instead of being silently dropped. */
+    private void writeWords() {
+        FileWriter fw = null;
+        try {
+            fw = new FileWriter(new File(outputFile));
+            for (String word : words) {
+                fw.write(word);
+                fw.write('\n');
+            }
+        }
+        catch (IOException e) {
+            e.printStackTrace();
+        }
+        finally {
+            if (fw != null) {
+                try {
+                    fw.close(); // close() flushes buffered output first
+                }
+                catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+}
diff --git a/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java b/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java
index 9179fd7..6bba3d3 100644
--- a/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java
+++ b/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java
@@ -1,41 +1,14 @@
package de.grundid.twiki.parser;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Collection;
-
import org.junit.Test;
+import de.grundid.twiki.parser.consumer.SimpleFilterConsumer;
+
public class SimpleImportTest {
private static final String[] files = { "c:\\tmp\\dewiktionary-20120225-pages-articles.xml",
"c:\\tmp\\enwiktionary-20120406-pages-articles.xml" };
- private void writeWords(String file, Collection words) {
- FileWriter fw = null;
- try {
- fw = new FileWriter(new File(file + ".txt"));
- for (String word : words) {
- fw.write(word);
- fw.write('\n');
- }
- }
- catch (IOException e) {
- }
- finally {
- if (fw != null) {
- try {
- fw.flush();
- fw.close();
- }
- catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
@Test
public void testImport() throws Exception {
@@ -43,7 +16,7 @@ public void testImport() throws Exception {
SimpleImporter importer = new SimpleImporter();
ImportHandler handler = new ImportHandler();
- WiktionaryImporter de = processFile(importer, handler, files[0]);
+ WiktionaryImporter de = processFile(importer, handler, files[0], "de");
// WiktionaryImporter en = processFile(importer, handler, files[1]);
// System.out.println("DE before: " + de.getWordsCount());
// de.getWords().removeAll(en.getWords());
@@ -52,20 +25,19 @@ public void testImport() throws Exception {
System.out.println("Time: " + (System.currentTimeMillis() - time) + " ms");
}
- private WiktionaryImporter processFile(SimpleImporter importer, ImportHandler handler, String file) {
+ private WiktionaryImporter processFile(SimpleImporter importer, ImportHandler handler,
+ String file, String source) {
WiktionaryImporter wiktionaryImporter = new WiktionaryImporter();
- handler.setProducer(wiktionaryImporter);
- handler.setConsumer(new Consumer() {
-
- @Override
- protected void consume(WiktionaryEntry element) {
- // System.out.println("[" + element.getTitle() + "]");
- }
- });
+ SimpleFilterConsumer consumer = new SimpleFilterConsumer();
+ for (String part : WiktionaryData.partsOfSpeech)
+ consumer.addPattern("{{Wortart|" + part + "|Deutsch");
+ handler.setProducer(wiktionaryImporter);
+ handler.setConsumer(consumer);
importer.run(file, handler);
+ consumer.outputMatches();
return wiktionaryImporter;
}
}
diff --git a/src/test/java/de/grundid/twiki/parser/TestBase.java b/src/test/java/de/grundid/twiki/parser/TestBase.java
new file mode 100644
index 0000000..ac780a3
--- /dev/null
+++ b/src/test/java/de/grundid/twiki/parser/TestBase.java
@@ -0,0 +1,11 @@
+package de.grundid.twiki.parser;
+
+import org.junit.runner.RunWith;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
+
+@RunWith(SpringJUnit4ClassRunner.class)
+@ContextConfiguration(locations = { "/test-context.xml" })
+public abstract class TestBase {
+ // Base class for tests run by SpringJUnit4ClassRunner with the context configured from /test-context.xml.
+}
diff --git a/src/test/resources/test-context.xml b/src/test/resources/test-context.xml
new file mode 100644
index 0000000..6f612d7
--- /dev/null
+++ b/src/test/resources/test-context.xml
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+