diff --git a/pom.xml b/pom.xml index d840c9c..639b773 100644 --- a/pom.xml +++ b/pom.xml @@ -4,26 +4,80 @@ de.grundid.maven java-parent - 2011.0 + 2012.1 de.grundid.twiki wiktionary-parser - 0.0.1-SNAPSHOT + 1.0.0-SNAPSHOT jar - wiktionary-parser - http://maven.apache.org - UTF-8 + 3.1.1.RELEASE - - junit - junit - + + org.springframework + spring-orm + ${org.springframework-version} + + + org.springframework + spring-test + ${org.springframework-version} + test + + + org.springframework.data + spring-data-jpa + 1.1.0.RC1 + + + commons-logging + commons-logging + + + + + junit + junit + + + org.hibernate + hibernate-entitymanager + 3.6.4.Final + + + commons-dbcp + commons-dbcp + 1.4 + + + org.hibernate.javax.persistence + hibernate-jpa-2.0-api + 1.0.1.Final + + + mysql + mysql-connector-java + 5.1.18 + + + + + spring-milestone + Spring Maven MILESTONE Repository + http://repo.springsource.org/libs-milestone + + + spring-snapshot + Spring Maven SNAPSHOT Repository + http://repo.springsource.org/libs-snapshot + + + diff --git a/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java b/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java new file mode 100644 index 0000000..4d77bce --- /dev/null +++ b/src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java @@ -0,0 +1,15 @@ +package de.grundid.twiki.jpa; + +import java.util.Map; + +import org.hibernate.cfg.ImprovedNamingStrategy; + +public class HibernateJpaVendorAdapter extends org.springframework.orm.jpa.vendor.HibernateJpaVendorAdapter { + + @Override + public Map getJpaPropertyMap() { + Map map = super.getJpaPropertyMap(); + map.put("hibernate.ejb.naming_strategy", ImprovedNamingStrategy.class.getName()); + return map; + } +} diff --git a/src/main/java/de/grundid/twiki/jpa/WikiEntry.java b/src/main/java/de/grundid/twiki/jpa/WikiEntry.java new file mode 100644 index 0000000..cd7af82 --- /dev/null +++ b/src/main/java/de/grundid/twiki/jpa/WikiEntry.java @@ -0,0 +1,58 @@ +package de.grundid.twiki.jpa; + +import javax.persistence.Entity; +import javax.persistence.GeneratedValue; +import javax.persistence.Id; + +@Entity +public class WikiEntry { + + @Id + @GeneratedValue + private Integer wikiEntryId; + private String source; + private String title; + private String category; + private String entry; + + public Integer getWikiEntryId() { + return wikiEntryId; + } + + public void setWikiEntryId(Integer wikiEntryId) { + this.wikiEntryId = wikiEntryId; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getCategory() { + return category; + } + + public void setCategory(String category) { + this.category = category; + } + + public String getEntry() { + return entry; + } + + public void setEntry(String entry) { + this.entry = entry; + } + +} diff --git a/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java b/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java new file mode 100644 index 0000000..11c1b8a --- /dev/null +++ b/src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java @@ -0,0 +1,7 @@ +package de.grundid.twiki.jpa; + +import org.springframework.data.jpa.repository.JpaRepository; + +public interface WikiEntryRepository extends JpaRepository { + +} diff --git a/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java deleted file mode 100644 index 071af4c..0000000 --- a/src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java +++ /dev/null @@ -1,46 +0,0 @@ -package de.grundid.twiki.parser; - -import java.util.HashSet; -import java.util.Set; - -public class DbWriterConsumer extends Consumer { - - private static Set ignorePrefixes = new HashSet(); - static { - ignorePrefixes.add("MediaWiki:"); - ignorePrefixes.add("Wiktionary:"); - - ignorePrefixes.add("Datei:"); - ignorePrefixes.add("Hilfe:"); - ignorePrefixes.add("Vorlage:"); - ignorePrefixes.add("Kategorie:"); - ignorePrefixes.add("Thesaurus:"); - ignorePrefixes.add("Verzeichnis:"); - - ignorePrefixes.add("Appendix:"); - ignorePrefixes.add("Category:"); - ignorePrefixes.add("Help:"); - ignorePrefixes.add("Template:"); - ignorePrefixes.add("Rhymes:"); - ignorePrefixes.add("Rhmyes:"); - ignorePrefixes.add("Sign gloss:"); - ignorePrefixes.add("Summary:"); - ignorePrefixes.add("Thread:"); - ignorePrefixes.add("Transwiki:"); - ignorePrefixes.add("Unsupported titles/"); - ignorePrefixes.add("Wikisaurus:"); - ignorePrefixes.add("Citations:"); - ignorePrefixes.add("Concordance:"); - ignorePrefixes.add("File:"); - ignorePrefixes.add("Glossary:"); - ignorePrefixes.add("Index:"); - ignorePrefixes.add("Wiktionary talk:"); - } - - @Override - protected void consume(WiktionaryEntry element) { - // TODO Auto-generated method stub - - } - -} diff --git a/src/main/java/de/grundid/twiki/parser/ImportHandler.java b/src/main/java/de/grundid/twiki/parser/ImportHandler.java index faee818..35ba155 100644 --- a/src/main/java/de/grundid/twiki/parser/ImportHandler.java +++ b/src/main/java/de/grundid/twiki/parser/ImportHandler.java @@ -9,6 +9,8 @@ import org.xml.sax.helpers.DefaultHandler; +import de.grundid.twiki.parser.consumer.Consumer; + public class ImportHandler implements Runnable { private int defaultQueueSize = 20000; diff --git a/src/main/java/de/grundid/twiki/parser/WiktionaryData.java b/src/main/java/de/grundid/twiki/parser/WiktionaryData.java new file mode 100644 index 0000000..814a4db --- /dev/null +++ b/src/main/java/de/grundid/twiki/parser/WiktionaryData.java @@ -0,0 +1,71 @@ +package de.grundid.twiki.parser; + +import java.util.HashSet; +import java.util.Set; + +public class WiktionaryData { + + public static String[] partsOfSpeech = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI", + "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung", + "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen" }; + + public static String[] partsOfSpeechAll = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI", + "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung", + "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen", "Interjektion", + "Präposition", "Sprichwort", "Konjunktion", "GebundenesLexem", "Präfix", "Gerundium", "Eigenname", + "Suffix", "Vorname", "Ortsnamen-Grundwort", "Hiragana", "Indefinitpronomen", "Grußformel", "Pronomen", + "Demonstrativpronomen", "Zahlzeichen", "Possessivpronomen", "Artikel", "Partikel", "Buchstabe", + "Interrogativpronomen", "Kontraktion", "Subjunktion", "Symbol", "Reflexivpronomen", "Komparativ", + "Onomatopoetikum", "Umschrift", "Hilfsverb", "Interrogativadverb", "Relativpronomen", "Zahl", + "Gradpartikel", "Pronominaladverb", "Merkspruch", "Affix", "Antwortpartikel", "Konjunktionaladverb", + "Superlativ", "Modalpartikel", "Negationspartikel", "Postposition", "Präfixoid", + "ReflexivesPersonalpronomen", "Satzzeichen", "Fokuspartikel", "Hanzi", "Katakana", + "KontraktionausPräpositionundArtikel", "ReflexivesPossessivpronomen", "Reziprokpronomen", + "Singularetantum", "Suffixoid", "Zahladjektiv", "Zahlklassifikator" }; + + public static Set ignorePrefixes = new HashSet(); + static { + ignorePrefixes.add("MediaWiki:"); + ignorePrefixes.add("Wiktionary:"); + + ignorePrefixes.add("Datei:"); + ignorePrefixes.add("Hilfe:"); + ignorePrefixes.add("Vorlage:"); + ignorePrefixes.add("Kategorie:"); + ignorePrefixes.add("Thesaurus:"); + ignorePrefixes.add("Verzeichnis:"); + + ignorePrefixes.add("Appendix:"); + ignorePrefixes.add("Category:"); + ignorePrefixes.add("Help:"); + ignorePrefixes.add("Template:"); + ignorePrefixes.add("Rhymes:"); + ignorePrefixes.add("Rhmyes:"); + ignorePrefixes.add("Sign gloss:"); + ignorePrefixes.add("Summary:"); + ignorePrefixes.add("Thread:"); + ignorePrefixes.add("Transwiki:"); + ignorePrefixes.add("Unsupported titles/"); + ignorePrefixes.add("Wikisaurus:"); + ignorePrefixes.add("Citations:"); + ignorePrefixes.add("Concordance:"); + ignorePrefixes.add("File:"); + ignorePrefixes.add("Glossary:"); + ignorePrefixes.add("Index:"); + ignorePrefixes.add("Wiktionary talk:"); + } + + public static boolean isPrefixed(String title) { + return getPrefixIfAny(title) != null; + } + + public static String getPrefixIfAny(String title) { + for (String prefix : WiktionaryData.ignorePrefixes) { + if (title.startsWith(prefix)) { + return prefix; + } + } + return null; + } + +} diff --git a/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java b/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java index 0d7a9e9..1c0209a 100644 --- a/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java +++ b/src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java @@ -11,29 +11,30 @@ public class WiktionaryImporter extends OsmDefaultHandler { private WiktionaryEntry entry; private Deque tagQueue = new LinkedList(); + private StringBuilder content = new StringBuilder(); @Override public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { tagQueue.add(name); + content.setLength(0); if (name.equals("page")) entry = new WiktionaryEntry(); } @Override public void characters(char[] ch, int start, int length) throws SAXException { - String currentTag = tagQueue.peekLast(); - String content = new String(ch, start, length).trim(); - if (currentTag.equals("title")) { - entry.setTitle(content); - } - else if (currentTag.equals("text")) { - entry.setText(content); - } + content.append(ch, start, length); } @Override public void endElement(String uri, String localName, String name) throws SAXException { - tagQueue.removeLast(); + String currentTag = tagQueue.removeLast(); + if (currentTag.equals("title")) { + entry.setTitle(content.toString()); + } + else if (currentTag.equals("text")) { + entry.setText(content.toString()); + } if (entry == null) return; diff --git a/src/main/java/de/grundid/twiki/parser/Consumer.java b/src/main/java/de/grundid/twiki/parser/consumer/Consumer.java similarity index 95% rename from src/main/java/de/grundid/twiki/parser/Consumer.java rename to src/main/java/de/grundid/twiki/parser/consumer/Consumer.java index d2ea89e..f120bb5 100644 --- a/src/main/java/de/grundid/twiki/parser/Consumer.java +++ b/src/main/java/de/grundid/twiki/parser/consumer/Consumer.java @@ -1,4 +1,4 @@ -package de.grundid.twiki.parser; +package de.grundid.twiki.parser.consumer; import java.util.concurrent.BlockingQueue; diff --git a/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java new file mode 100644 index 0000000..2e38132 --- /dev/null +++ b/src/main/java/de/grundid/twiki/parser/consumer/DbWriterConsumer.java @@ -0,0 +1,42 @@ +package de.grundid.twiki.parser.consumer; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import de.grundid.twiki.jpa.WikiEntry; +import de.grundid.twiki.jpa.WikiEntryRepository; +import de.grundid.twiki.parser.WiktionaryData; +import de.grundid.twiki.parser.WiktionaryEntry; + +@Component +public class DbWriterConsumer extends Consumer { + + @Autowired + private WikiEntryRepository wikiEntryRepository; + + private String currentSource; + + @Override + protected void consume(WiktionaryEntry element) { + + WikiEntry wikiEntry = new WikiEntry(); + wikiEntry.setSource(currentSource); + wikiEntry.setTitle(element.getTitle()); + wikiEntry.setEntry(element.getText()); + wikiEntry.setCategory("_word_"); + + for (String prefix : WiktionaryData.ignorePrefixes) { + if (element.getTitle().startsWith(prefix)) { + wikiEntry.setCategory(prefix); + break; + } + } + + wikiEntryRepository.save(wikiEntry); + + } + + public void setCurrentSource(String currentSource) { + this.currentSource = currentSource; + } +} diff --git a/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java new file mode 100644 index 0000000..6e2ec25 --- /dev/null +++ b/src/main/java/de/grundid/twiki/parser/consumer/DummyConsumer.java @@ -0,0 +1,12 @@ +package de.grundid.twiki.parser.consumer; + +import de.grundid.twiki.parser.WiktionaryEntry; + +public class DummyConsumer extends Consumer { + + @Override + protected void consume(WiktionaryEntry element) { + + } + +} diff --git a/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java new file mode 100644 index 0000000..04748c2 --- /dev/null +++ b/src/main/java/de/grundid/twiki/parser/consumer/SimpleFilterConsumer.java @@ -0,0 +1,63 @@ +package de.grundid.twiki.parser.consumer; + +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; + +import de.grundid.twiki.parser.WiktionaryData; +import de.grundid.twiki.parser.WiktionaryEntry; + +public class SimpleFilterConsumer extends Consumer { + + private Map> matches = new TreeMap>(); + private int total; + + public void addPattern(String pattern) { + matches.put(pattern, new TreeMap()); + } + + @Override + protected void consume(WiktionaryEntry element) { + if (!WiktionaryData.isPrefixed(element.getTitle())) { + addIfMatches(element.getText()); + } + } + + private void addIfMatches(String text) { + for (Entry> entry : matches.entrySet()) { + String pattern = entry.getKey(); + Map subMatches = entry.getValue(); + int pos = text.indexOf(pattern); + if (pos >= 0) { + int endOfLine = text.indexOf("\n", pos); + String match = text.substring(pos, endOfLine).replaceAll(" |'|,|=|/", ""); + incMatch(match, subMatches); + + // String[] split = match.split("\\||}"); + // incMatch(split[1], subMatches); + } + } + } + + private void incMatch(String match, Map subMatches) { + Integer c = subMatches.get(match); + if (c == null) + c = Integer.valueOf(1); + else + c += 1; + subMatches.put(match, c); + total++; + } + + public void outputMatches() { + + for (Entry> entry : matches.entrySet()) { + for (Entry subEntry : entry.getValue().entrySet()) { + System.out.println(entry.getKey() + "\t" + subEntry.getKey() + "\t" + subEntry.getValue()); + } + } + + System.out.println("Total: " + total); + + } +} diff --git a/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java b/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java new file mode 100644 index 0000000..00542ac --- /dev/null +++ b/src/main/java/de/grundid/twiki/parser/consumer/SortedFileWriterConsumer.java @@ -0,0 +1,59 @@ +package de.grundid.twiki.parser.consumer; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Set; +import java.util.TreeSet; + +import de.grundid.twiki.parser.WiktionaryData; +import de.grundid.twiki.parser.WiktionaryEntry; + +public class SortedFileWriterConsumer extends Consumer { + + private Set words = new TreeSet(); + private String outputFile; + + public void setOutputFile(String outputFile) { + this.outputFile = outputFile; + } + + @Override + protected void consume(WiktionaryEntry element) { + for (String prefix : WiktionaryData.ignorePrefixes) { + if (prefix.startsWith(element.getTitle())) + return; + } + words.add(element.getTitle()); + } + + @Override + protected void finishConsuming() { + writeWords(); + } + + private void writeWords() { + FileWriter fw = null; + try { + fw = new FileWriter(new File(outputFile)); + for (String word : words) { + fw.write(word); + fw.write('\n'); + } + } + catch (IOException e) { + } + finally { + if (fw != null) { + try { + fw.flush(); + fw.close(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + } + } + +} diff --git a/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java b/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java index 9179fd7..6bba3d3 100644 --- a/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java +++ b/src/test/java/de/grundid/twiki/parser/SimpleImportTest.java @@ -1,41 +1,14 @@ package de.grundid.twiki.parser; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Collection; - import org.junit.Test; +import de.grundid.twiki.parser.consumer.SimpleFilterConsumer; + public class SimpleImportTest { private static final String[] files = { "c:\\tmp\\dewiktionary-20120225-pages-articles.xml", "c:\\tmp\\enwiktionary-20120406-pages-articles.xml" }; - private void writeWords(String file, Collection words) { - FileWriter fw = null; - try { - fw = new FileWriter(new File(file + ".txt")); - for (String word : words) { - fw.write(word); - fw.write('\n'); - } - } - catch (IOException e) { - } - finally { - if (fw != null) { - try { - fw.flush(); - fw.close(); - } - catch (IOException e) { - e.printStackTrace(); - } - } - } - } - @Test public void testImport() throws Exception { @@ -43,7 +16,7 @@ public void testImport() throws Exception { SimpleImporter importer = new SimpleImporter(); ImportHandler handler = new ImportHandler(); - WiktionaryImporter de = processFile(importer, handler, files[0]); + WiktionaryImporter de = processFile(importer, handler, files[0], "de"); // WiktionaryImporter en = processFile(importer, handler, files[1]); // System.out.println("DE before: " + de.getWordsCount()); // de.getWords().removeAll(en.getWords()); @@ -52,20 +25,19 @@ public void testImport() throws Exception { System.out.println("Time: " + (System.currentTimeMillis() - time) + " ms"); } - private WiktionaryImporter processFile(SimpleImporter importer, ImportHandler handler, String file) { + private WiktionaryImporter processFile(SimpleImporter importer, ImportHandler handler, + String file, String source) { WiktionaryImporter wiktionaryImporter = new WiktionaryImporter(); - handler.setProducer(wiktionaryImporter); - handler.setConsumer(new Consumer() { - - @Override - protected void consume(WiktionaryEntry element) { - // System.out.println("[" + element.getTitle() + "]"); - } - }); + SimpleFilterConsumer consumer = new SimpleFilterConsumer(); + for (String part : WiktionaryData.partsOfSpeech) + consumer.addPattern("{{Wortart|" + part + "|Deutsch"); + handler.setProducer(wiktionaryImporter); + handler.setConsumer(consumer); importer.run(file, handler); + consumer.outputMatches(); return wiktionaryImporter; } } diff --git a/src/test/java/de/grundid/twiki/parser/TestBase.java b/src/test/java/de/grundid/twiki/parser/TestBase.java new file mode 100644 index 0000000..ac780a3 --- /dev/null +++ b/src/test/java/de/grundid/twiki/parser/TestBase.java @@ -0,0 +1,11 @@ +package de.grundid.twiki.parser; + +import org.junit.runner.RunWith; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; + +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "/test-context.xml" }) +public abstract class TestBase { + +} diff --git a/src/test/resources/test-context.xml b/src/test/resources/test-context.xml new file mode 100644 index 0000000..6f612d7 --- /dev/null +++ b/src/test/resources/test-context.xml @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +