Skip to content

Commit

Permalink
filter analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
grundid committed Apr 12, 2012
1 parent 336db52 commit e5a119e
Show file tree
Hide file tree
Showing 16 changed files with 490 additions and 104 deletions.
72 changes: 63 additions & 9 deletions pom.xml
Expand Up @@ -4,26 +4,80 @@
<parent>
<groupId>de.grundid.maven</groupId>
<artifactId>java-parent</artifactId>
<version>2011.0</version>
<version>2012.1</version>
<relativePath />
</parent>

<groupId>de.grundid.twiki</groupId>
<artifactId>wiktionary-parser</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>

<name>wiktionary-parser</name>
<url>http://maven.apache.org</url>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<org.springframework-version>3.1.1.RELEASE</org.springframework-version>
</properties>

<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-orm</artifactId>
<version>${org.springframework-version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${org.springframework-version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-jpa</artifactId>
<version>1.1.0.RC1</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-entitymanager</artifactId>
<version>3.6.4.Final</version>
</dependency>
<dependency>
<groupId>commons-dbcp</groupId>
<artifactId>commons-dbcp</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.hibernate.javax.persistence</groupId>
<artifactId>hibernate-jpa-2.0-api</artifactId>
<version>1.0.1.Final</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.18</version>
</dependency>
</dependencies>

<repositories>
<repository>
<id>spring-milestone</id>
<name>Spring Maven MILESTONE Repository</name>
<url>http://repo.springsource.org/libs-milestone</url>
</repository>
<repository>
<id>spring-snapshot</id>
<name>Spring Maven SNAPSHOT Repository</name>
<url>http://repo.springsource.org/libs-snapshot</url>
</repository>
</repositories>

</project>
15 changes: 15 additions & 0 deletions src/main/java/de/grundid/twiki/jpa/HibernateJpaVendorAdapter.java
@@ -0,0 +1,15 @@
package de.grundid.twiki.jpa;

import java.util.Map;

import org.hibernate.cfg.ImprovedNamingStrategy;

/**
 * Spring {@code HibernateJpaVendorAdapter} variant that additionally registers
 * Hibernate's {@link ImprovedNamingStrategy} in the JPA property map.
 */
public class HibernateJpaVendorAdapter extends org.springframework.orm.jpa.vendor.HibernateJpaVendorAdapter {

    /** Property key under which the naming strategy class name is stored. */
    private static final String NAMING_STRATEGY_KEY = "hibernate.ejb.naming_strategy";

    /**
     * Returns the property map supplied by the parent adapter, extended with the
     * naming-strategy entry.
     *
     * @return the parent's JPA property map with the naming strategy added
     */
    @Override
    public Map<String, Object> getJpaPropertyMap() {
        final Map<String, Object> jpaProperties = super.getJpaPropertyMap();
        final String strategyClassName = ImprovedNamingStrategy.class.getName();
        jpaProperties.put(NAMING_STRATEGY_KEY, strategyClassName);
        return jpaProperties;
    }
}
58 changes: 58 additions & 0 deletions src/main/java/de/grundid/twiki/jpa/WikiEntry.java
@@ -0,0 +1,58 @@
package de.grundid.twiki.jpa;

import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;

/**
 * JPA entity holding one imported wiki page: where it came from, its title,
 * a category label and the page text.
 */
@Entity
public class WikiEntry {

    /** Primary key; the value is generated by the persistence provider. */
    @Id
    @GeneratedValue
    private Integer wikiEntryId;

    /** Label of the import source this entry belongs to. */
    private String source;

    /** Page title. */
    private String title;

    /** Category label assigned to the entry. */
    private String category;

    /** Page text. */
    private String entry;

    // --- identifier -------------------------------------------------------

    /** @return the generated identifier, or {@code null} if none has been assigned */
    public Integer getWikiEntryId() {
        return this.wikiEntryId;
    }

    /** @param wikiEntryId the identifier to set */
    public void setWikiEntryId(Integer wikiEntryId) {
        this.wikiEntryId = wikiEntryId;
    }

    // --- payload ----------------------------------------------------------

    /** @return the source label */
    public String getSource() {
        return this.source;
    }

    /** @param source the source label to set */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the page title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title to set */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the category label */
    public String getCategory() {
        return this.category;
    }

    /** @param category the category label to set */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @return the page text */
    public String getEntry() {
        return this.entry;
    }

    /** @param entry the page text to set */
    public void setEntry(String entry) {
        this.entry = entry;
    }

}
7 changes: 7 additions & 0 deletions src/main/java/de/grundid/twiki/jpa/WikiEntryRepository.java
@@ -0,0 +1,7 @@
package de.grundid.twiki.jpa;

import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link WikiEntry} entities keyed by their
 * {@link Integer} identifier. Declares no query methods of its own; everything
 * available comes from {@link JpaRepository}.
 */
public interface WikiEntryRepository extends JpaRepository<WikiEntry, Integer> {

}
46 changes: 0 additions & 46 deletions src/main/java/de/grundid/twiki/parser/DbWriterConsumer.java

This file was deleted.

2 changes: 2 additions & 0 deletions src/main/java/de/grundid/twiki/parser/ImportHandler.java
Expand Up @@ -9,6 +9,8 @@

import org.xml.sax.helpers.DefaultHandler;

import de.grundid.twiki.parser.consumer.Consumer;

public class ImportHandler<T> implements Runnable {

private int defaultQueueSize = 20000;
Expand Down
71 changes: 71 additions & 0 deletions src/main/java/de/grundid/twiki/parser/WiktionaryData.java
@@ -0,0 +1,71 @@
package de.grundid.twiki.parser;

import java.util.HashSet;
import java.util.Set;

/**
 * Static lookup data for Wiktionary imports: part-of-speech names and the
 * title prefixes that mark a page as belonging to a special namespace.
 */
public class WiktionaryData {

    /** Short list of part-of-speech names (the first 16 entries of {@link #partsOfSpeechAll}). */
    public static final String[] partsOfSpeech = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI",
            "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung",
            "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen" };

    /** Extended list of part-of-speech names. */
    public static final String[] partsOfSpeechAll = { "Substantiv", "KonjugierteForm", "DeklinierteForm", "PartizipI",
            "Adjektiv", "Verb", "Abkürzung", "Redewendung", "Adverb", "PartizipII", "Wortverbindung",
            "ErweiterterInfinitiv", "Numerale", "Toponym", "Nachname", "Personalpronomen", "Interjektion",
            "Präposition", "Sprichwort", "Konjunktion", "GebundenesLexem", "Präfix", "Gerundium", "Eigenname",
            "Suffix", "Vorname", "Ortsnamen-Grundwort", "Hiragana", "Indefinitpronomen", "Grußformel", "Pronomen",
            "Demonstrativpronomen", "Zahlzeichen", "Possessivpronomen", "Artikel", "Partikel", "Buchstabe",
            "Interrogativpronomen", "Kontraktion", "Subjunktion", "Symbol", "Reflexivpronomen", "Komparativ",
            "Onomatopoetikum", "Umschrift", "Hilfsverb", "Interrogativadverb", "Relativpronomen", "Zahl",
            "Gradpartikel", "Pronominaladverb", "Merkspruch", "Affix", "Antwortpartikel", "Konjunktionaladverb",
            "Superlativ", "Modalpartikel", "Negationspartikel", "Postposition", "Präfixoid",
            "ReflexivesPersonalpronomen", "Satzzeichen", "Fokuspartikel", "Hanzi", "Katakana",
            "KontraktionausPräpositionundArtikel", "ReflexivesPossessivpronomen", "Reziprokpronomen",
            "Singularetantum", "Suffixoid", "Zahladjektiv", "Zahlklassifikator" };

    /** Title prefixes recognised by {@link #getPrefixIfAny(String)}. */
    public static final Set<String> ignorePrefixes = new HashSet<String>();
    static {
        ignorePrefixes.add("MediaWiki:");
        ignorePrefixes.add("Wiktionary:");

        ignorePrefixes.add("Datei:");
        ignorePrefixes.add("Hilfe:");
        ignorePrefixes.add("Vorlage:");
        ignorePrefixes.add("Kategorie:");
        ignorePrefixes.add("Thesaurus:");
        ignorePrefixes.add("Verzeichnis:");

        ignorePrefixes.add("Appendix:");
        ignorePrefixes.add("Category:");
        ignorePrefixes.add("Help:");
        ignorePrefixes.add("Template:");
        ignorePrefixes.add("Rhymes:");
        // NOTE(review): looks like a misspelling of "Rhymes:"; kept as-is in case
        // it intentionally matches misspelled titles in the data - confirm.
        ignorePrefixes.add("Rhmyes:");
        ignorePrefixes.add("Sign gloss:");
        ignorePrefixes.add("Summary:");
        ignorePrefixes.add("Thread:");
        ignorePrefixes.add("Transwiki:");
        ignorePrefixes.add("Unsupported titles/");
        ignorePrefixes.add("Wikisaurus:");
        ignorePrefixes.add("Citations:");
        ignorePrefixes.add("Concordance:");
        ignorePrefixes.add("File:");
        ignorePrefixes.add("Glossary:");
        ignorePrefixes.add("Index:");
        ignorePrefixes.add("Wiktionary talk:");
    }

    /**
     * Tells whether the title starts with one of the {@link #ignorePrefixes}.
     *
     * @param title page title; may be {@code null}
     * @return {@code true} if a known prefix matches, {@code false} otherwise
     *         (including for a {@code null} title)
     */
    public static boolean isPrefixed(String title) {
        return getPrefixIfAny(title) != null;
    }

    /**
     * Finds the entry of {@link #ignorePrefixes} that the title starts with.
     *
     * @param title page title; may be {@code null}
     * @return the matching prefix, or {@code null} if none matches or the title
     *         is {@code null}
     */
    public static String getPrefixIfAny(String title) {
        // A missing title cannot carry a prefix; avoid the NullPointerException.
        if (title == null) {
            return null;
        }
        for (String prefix : ignorePrefixes) {
            if (title.startsWith(prefix)) {
                return prefix;
            }
        }
        return null;
    }

}
19 changes: 10 additions & 9 deletions src/main/java/de/grundid/twiki/parser/WiktionaryImporter.java
Expand Up @@ -11,29 +11,30 @@ public class WiktionaryImporter extends OsmDefaultHandler<WiktionaryEntry> {
private WiktionaryEntry entry;

private Deque<String> tagQueue = new LinkedList<String>();
private StringBuilder content = new StringBuilder();

@Override
public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
tagQueue.add(name);
content.setLength(0);
if (name.equals("page"))
entry = new WiktionaryEntry();
}

@Override
public void characters(char[] ch, int start, int length) throws SAXException {
String currentTag = tagQueue.peekLast();
String content = new String(ch, start, length).trim();
if (currentTag.equals("title")) {
entry.setTitle(content);
}
else if (currentTag.equals("text")) {
entry.setText(content);
}
content.append(ch, start, length);
}

@Override
public void endElement(String uri, String localName, String name) throws SAXException {
tagQueue.removeLast();
String currentTag = tagQueue.removeLast();
if (currentTag.equals("title")) {
entry.setTitle(content.toString());
}
else if (currentTag.equals("text")) {
entry.setText(content.toString());
}

if (entry == null)
return;
Expand Down
@@ -1,4 +1,4 @@
package de.grundid.twiki.parser;
package de.grundid.twiki.parser.consumer;

import java.util.concurrent.BlockingQueue;

Expand Down
@@ -0,0 +1,42 @@
package de.grundid.twiki.parser.consumer;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import de.grundid.twiki.jpa.WikiEntry;
import de.grundid.twiki.jpa.WikiEntryRepository;
import de.grundid.twiki.parser.WiktionaryData;
import de.grundid.twiki.parser.WiktionaryEntry;

/**
 * Consumer that turns each {@link WiktionaryEntry} into a {@link WikiEntry}
 * and stores it through the {@link WikiEntryRepository}.
 */
@Component
public class DbWriterConsumer extends Consumer<WiktionaryEntry> {

    /** Category stored for entries whose title carries no known prefix. */
    private static final String WORD_CATEGORY = "_word_";

    @Autowired
    private WikiEntryRepository wikiEntryRepository;

    /** Source label copied onto every entry that is saved. */
    private String currentSource;

    /**
     * Copies title and text of the element into a new {@link WikiEntry}, tags it
     * with the current source and a category, and saves it.
     * <p>
     * The category is the matching prefix from
     * {@link WiktionaryData#getPrefixIfAny(String)} when the title has one, and
     * {@code "_word_"} otherwise.
     *
     * @param element the parsed entry to persist
     */
    @Override
    protected void consume(WiktionaryEntry element) {
        String title = element.getTitle();

        WikiEntry wikiEntry = new WikiEntry();
        wikiEntry.setSource(currentSource);
        wikiEntry.setTitle(title);
        wikiEntry.setEntry(element.getText());

        // Reuse the shared prefix lookup instead of re-implementing the scan here.
        String prefix = WiktionaryData.getPrefixIfAny(title);
        wikiEntry.setCategory(prefix != null ? prefix : WORD_CATEGORY);

        wikiEntryRepository.save(wikiEntry);
    }

    /**
     * Sets the source label applied to subsequently consumed entries.
     *
     * @param currentSource the source label
     */
    public void setCurrentSource(String currentSource) {
        this.currentSource = currentSource;
    }
}

0 comments on commit e5a119e

Please sign in to comment.