slim version of module live and topical. removed everything that was not necessary anymore
commit 48c53d5cbbe357319b8a56bc2f371909005f00d1 1 parent 65bff5e
dirkweissenborn authored
1  conf/indexing.properties
@@ -65,5 +65,6 @@ org.dbpedia.spotlight.yahoo.region = us
# Topical configuration
org.dbpedia.spotlight.data.sortedArticlesCategories=/media/dirk/Data/Wikipedia/sorted.article_categories_en.nt
+#only NaiveBayesTopicalClassifier up to now
org.dbpedia.spotlight.topic.classifier.type=NaiveBayesTopicalClassifier
org.dbpedia.spotlight.topic.description=conf/topic_descriptions.xml
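For context, a minimal sketch of how the classifier type configured here is dispatched after this commit (object and package names as they appear in the diffs further down). TopicalClassifierTrainer.byType now falls through to NaiveBayesTopicalClassifier for every value, which is what the new comment means by "only NaiveBayesTopicalClassifier up to now":

    import org.dbpedia.spotlight.topical.TopicalClassifierTrainer

    // Dispatch on the configured type; after this commit every value resolves to
    // the NaiveBayesTopicalClassifier trainer (the match has only a default case).
    val trainer = TopicalClassifierTrainer.byType("NaiveBayesTopicalClassifier")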
44 conf/topic_descriptions.xml
@@ -6,26 +6,22 @@
<iptc mediatopic="20000003"/>
<iptc mediatopic="20000004"/>
<categories>Animation,Cartooning</categories>
- <keywords></keywords>
</topic>
<topic name="cinema">
<iptc mediatopic="20000005"/>
<categories>Film</categories>
- <keywords></keywords>
</topic>
<topic name="literature">
<iptc mediatopic="20000013"/>
<categories>Literature</categories>
- <keywords></keywords>
</topic>
<topic name="music">
<iptc mediatopic="20000018"/>
- <categories>Music</categories>
- <keywords></keywords>
+ <categories>Music,Music_genres,20th-century_music_genres</categories>
</topic>
<topic name="performing_arts"> <!--"theatre_dance_opera" renamed because many things from all performing arts occured-->
@@ -33,7 +29,6 @@
<iptc mediatopic="20000029"/>
<iptc mediatopic="20000007"/>
<categories>Opera,Opera_genres,Dance,Theatre</categories>
- <keywords></keywords>
</topic>
<!-- Visual arts -->
@@ -41,28 +36,19 @@
<topic name="architecture">
<iptc mediatopic="20000032"/>
<categories>Architecture</categories>
- <keywords></keywords>
<!--feed url="http://topics.nytimes.com/top/reference/timestopics/subjects/a/architecture/index.html?rss=1"/>
<feed url="http://www.architectsjournal.co.uk/XmlServers/navsectionRSS.aspx?navsectioncode=3"/-->
</topic>
- <!--topic name="fashion">
- <iptc mediatopic="20000011"/>
- <categories>Fashion,Clothing</categories>
- <keywords></keywords>
- </topic DID NOT WORK WELL-->
-
<topic name="painting_drawing">
<iptc mediatopic="20000035"/>
<iptc mediatopic="20000034"/>
<categories>Painting,Drawing</categories>
- <keywords></keywords>
</topic>
<topic name="sculpture">
<iptc mediatopic="20000037"/>
<categories>Sculpture</categories>
- <keywords></keywords>
</topic>
<!-- economy, business, finance-->
@@ -70,7 +56,6 @@
<topic name="economy_business_finance">
<iptc mediatopic="20000344"/>
<categories>Business,Finance</categories>
- <keywords></keywords>
</topic>
<!-- natural science -->
@@ -78,26 +63,22 @@
<topic name="biology">
<iptc mediatopic="20000719"/>
<categories>Biology</categories>
- <keywords></keywords>
<!--feed url="http://feeds.biologynews.net/biologynews/headlines?format=xml"/-->
</topic>
<topic name="chemistry">
<iptc mediatopic="20000725"/>
<categories>Chemistry</categories>
- <keywords></keywords>
</topic>
<topic name="geology_prehistoriclife"><!--"geology"-->
<iptc mediatopic="20000727"/>
<categories>Geology</categories>
- <keywords></keywords>
</topic>
<topic name="physics">
<iptc mediatopic="20000731"/>
<categories>Physics</categories>
- <keywords></keywords>
</topic>
@@ -106,7 +87,6 @@
<topic name="technology_engineering">
<iptc mediatopic="20000756"/>
<categories>Technology,Engineering</categories>
- <keywords></keywords>
</topic>
<!-- Crime, Law, Justice -->
@@ -114,20 +94,17 @@
<topic name="crime">
<iptc mediatopic="20000082"/>
<categories>Criminology,Crime</categories>
- <keywords></keywords>
</topic>
<topic name="law">
<iptc mediatopic="20000121"/>
<categories>Law</categories>
- <keywords></keywords>
</topic>
<!-- Education -->
<topic name="education">
<iptc mediatopic="05000000"/>
<categories>Education</categories>
- <keywords></keywords>
</topic>
@@ -136,7 +113,6 @@
<iptc mediatopic="20000248"/>
<iptc mediatopic="20000244"/>
<categories>Food_and_drink,Cuisine</categories>
- <keywords></keywords>
</topic>
<!-- Social Sciences -->
@@ -144,43 +120,36 @@
<topic name="anthropology">
<iptc mediatopic="20000743"/>
<categories>Anthropology</categories>
- <keywords></keywords>
</topic>
<topic name="archaeology">
<iptc mediatopic="20000744"/>
<categories>Archaeology</categories>
- <keywords></keywords>
</topic>
<topic name="economics">
<iptc mediatopic="20000745"/>
<categories>Economics</categories>
- <keywords></keywords>
</topic>
<topic name="geography">
<iptc mediatopic="20000746"/>
<categories>Geography,Places</categories>
- <keywords></keywords>
</topic>
<topic name="history">
<iptc mediatopic="20000747"/>
<categories>History,Chronology</categories>
- <keywords></keywords>
</topic>
<topic name="philosophy">
<iptc mediatopic="20000751"/>
<categories>Philosophy</categories>
- <keywords></keywords>
</topic>
<topic name="psychology">
<iptc mediatopic="20000753"/>
<categories>Psychology</categories>
- <keywords></keywords>
</topic>
<!-- politics -->
@@ -189,14 +158,12 @@
<iptc mediatopic="11000000"/>
<iptc mediatopic="20000752"/>
<categories>Politics,Political_science</categories>
- <keywords></keywords>
</topic>
<!-- Health -->
<topic name="health">
<iptc mediatopic="07000000"/>
<categories>Health,Diseases_and_disorders,Health_sciences</categories>
- <keywords></keywords>
</topic>
<!-- Structural Science -->
@@ -204,21 +171,18 @@
<topic name="mathematics">
<iptc mediatopic="20000715"/>
<categories>Mathematics</categories>
- <keywords></keywords>
</topic>
<topic name="computer_science">
<iptc mediatopic="20000763"/>
        <categories>Computer_science,Computing</categories>
- <keywords></keywords>
</topic>
<!-- Sport -->
<topic name="sport">
<iptc mediatopic="15000000"/>
<categories>Sports,Sports_by_year</categories>
- <keywords></keywords>
</topic>
<!-- Mass media -->
@@ -226,7 +190,6 @@
<topic name="mass_media">
<iptc mediatopic="20000045"/>
<categories>Television,Radio,Mass_media,News,Journalism</categories>
- <keywords></keywords>
</topic>
<!-- Culture -->
@@ -234,33 +197,28 @@
<topic name="religion_belief">
<iptc mediatopic="12000000"/>
<categories>Religion,Belief</categories>
- <keywords></keywords>
</topic>
<topic name="transport">
<iptc mediatopic="20000337"/>
<categories>Transport</categories>
- <keywords></keywords>
</topic>
<!--topic name="agriculture">
<iptc mediatopic="20000210"/>
<categories>Agriculture</categories>
- <keywords></keywords>
</topic DID NOT WORK WELL-->
<topic name="video_game">
<iptc mediatopic="20000548"/>
<categories>Video_game_culture,Video_games</categories>
- <keywords></keywords>
</topic>
<topic name="war">
<iptc mediatopic="20000056"/>
<categories>War,Military</categories>
- <keywords></keywords>
</topic>
</topics>
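The slimmed-down schema is straightforward to consume with scala.xml; a hedged sketch that mirrors the parsing in TopicDescription.scala further down (the file path comes from the indexing.properties entry above):

    import scala.xml.XML

    // Read each <topic>: its name attribute, its IPTC media-topic codes, and the
    // comma-separated Wikipedia category list; <keywords> is gone from the schema.
    val xml = XML.loadFile("conf/topic_descriptions.xml")
    for (topicItem <- xml \\ "topic") {
      val name       = (topicItem \ "@name").text
      val categories = (topicItem \\ "categories").head.text.split(",").map(_.trim)
      val iptcTopics = (topicItem \\ "iptc").map(i => (i \ "@mediatopic").text).toSet
      println(name + " -> " + categories.mkString(",") + " [iptc: " + iptcTopics.mkString(",") + "]")
    }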
6 pom.xml
@@ -50,6 +50,8 @@
<module>eval</module>
<module>uima</module>
<module>dist</module>
+ <module>live</module>
+ <module>topical</module>
</modules>
<build>
@@ -288,6 +290,8 @@
</configuration>
</execution>
+ <!--Dependencies for dbpedia spotlight live-->
+
<execution>
<id>install-hunposchain0.6_mod-jar</id>
<phase>install</phase>
@@ -363,6 +367,8 @@
</configuration>
</execution>
+ <!--Spotlight live dependencies-->
+
</executions>
</plugin>
2  topical/pom.xml
@@ -169,7 +169,7 @@
<dependency>
<groupId>cc.factorie</groupId>
<artifactId>factorie</artifactId>
- <version>1.0.0-M3</version>
+ <version>1.0.0-M4</version>
</dependency>
</dependencies>
6 topical/src/main/scala/org/dbpedia/spotlight/model/TopicDescription.scala
@@ -24,7 +24,7 @@ object TopicDescription {
for (topicItem <- xml \\ "topic") yield {
val topic = new Topic((topicItem \\ "@name").head.text) // HACK: bug fix Computer_science got read with more than 16 characters
val categories = (topicItem \\ "categories").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)
- val keywords = (topicItem \\ "keywords").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)
+ //val keywords = (topicItem \\ "keywords").head.text.split(",").map(category => category.toCharArray.subSequence(0, category.length).toString.trim)
var iptcTopics = Set[String]()
for (iptcItem <- topicItem \\ "iptc")
@@ -34,9 +34,9 @@ object TopicDescription {
for (feedItem <- topicItem \\ "feed")
feeds += new URL((feedItem \\ "@url").head.text)
- new TopicDescription(topic, categories, keywords, iptcTopics, feeds)
+ TopicDescription(topic, categories, iptcTopics, feeds)
}
}
}
-class TopicDescription(val topic: Topic, val categories: Seq[String], val keywords: Seq[String], val iptcTopics: Set[String], val rssFeeds: Set[URL])
+case class TopicDescription(topic: Topic, categories: Seq[String],iptcTopics: Set[String], rssFeeds: Set[URL])
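Turning TopicDescription into a case class gives structural equality and pattern matching for free now that the keywords field is gone; a sketch with illustrative values (the topic name and IPTC code are taken from topic_descriptions.xml above, the rest is invented):

    import java.net.URL
    import org.dbpedia.spotlight.model.{Topic, TopicDescription}

    // Construct and deconstruct the four-field case class introduced above.
    val description = TopicDescription(new Topic("music"),
      Seq("Music", "Music_genres"), Set("20000018"), Set.empty[URL])

    description match {
      case TopicDescription(topic, categories, iptc, _) =>
        println(topic.getName + ": " + categories.mkString(",") + " / " + iptc.mkString(","))
    }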
174 topical/src/main/scala/org/dbpedia/spotlight/topical/NaiveBayesTopicalClassifier.scala
@@ -4,6 +4,7 @@ import cc.factorie._
import app.classify.{Trial, LabelList, ModelBasedClassifier, LogLinearModel}
import java.io.{StringReader, File}
import io.Source
+import la.DenseTensor1
import org.dbpedia.spotlight.model.{Topic, Text}
import org.apache.commons.logging.LogFactory
import org.apache.lucene.analysis.{Analyzer}
@@ -12,6 +13,8 @@ import org.apache.lucene.util.Version
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import scala.Predef._
import collection.mutable.ArrayBuffer
+import cc.factorie.util.{TensorCubbie, BinarySerializer}
+import cc.factorie.app.classify
/**
* @author dirk
@@ -19,42 +22,62 @@ import collection.mutable.ArrayBuffer
* Time: 10:58 AM
*/
+/**
+ * Complementary naive bayes as explained here (http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_RennieSTK03.pdf)
+ */
protected class NaiveBayesTopicalClassifier extends TopicalClassifier {
private val analyzer: Analyzer = new EnglishAnalyzer(Version.LUCENE_36)
- class Document(text:String, labelName:String = "",training:Boolean = false) extends FeatureVectorVariable[String] {
+ class Document(text:String, labelName:String = labelDomain.category(0),training:Boolean = false) extends FeatureVectorVariable[String] {
def domain = documentDomain
+ final val docCountId = "###DOCCOUNT###"
- labelDomain.gatherCounts = training
domain.dimensionDomain.gatherCounts = training
-
+ //also count number of documents in document domain - this is hacky but works :)
domain.dimensionDomain.index(docCountId)
- var label = new Label(labelName, this)
- final val docCountId = "###DOCCOUNT###"
+ override def skipNonCategories = !training
+ var label =
+ if(training || labelDomain.categories.contains(labelName))
+ new Label(labelName, this)
+ else
+ new Label(labelDomain.category(0), this)
+ //I found that tf*idf and length normalization is very useful for high confidence scores (above 0.5 for one topic),
+ //compared to just using unnormalized BOW vectors,
+ //while overall prediction accuracy is just slightly improved over simple BOW
{
var group = Map[String,Double]()
val tokenStream = analyzer.reusableTokenStream(null, new StringReader(text))
val charTermAttribute = tokenStream.addAttribute(classOf[CharTermAttribute])
while (tokenStream.incrementToken()) {
try {
- group += (charTermAttribute.toString() -> (group.getOrElse(charTermAttribute.toString(),0.0)+1))
+ val token = charTermAttribute.toString().toLowerCase
+ if(token.toLowerCase.matches("[a-z]{3,}"))
+ group += (token -> (group.getOrElse(charTermAttribute.toString(),0.0)+1))
} catch {
case e => //domain size exceeded max size. No problem just don't use that feature
}
}
- group.foreach(token => this += (token._1,token._2))
+ group.foreach(token => {
+ this += (token._1,token._2)
+ })
}
def normalize {
var features = List[(Int,Double)]()
this.tensor.foreachActiveElement{
case (idx,tf) => {
- features ::= (idx, math.log(1+tf) * math.log( domain.dimensionDomain.count(docCountId) / (domain.dimensionDomain.count(idx))))
+ try {
+ val df = domain.dimensionDomain.count(idx)
+ features ::= (idx, math.log(1+tf) * math.log( domain.dimensionDomain.count(docCountId) / df))
+ } catch {
+ case e =>
+ idx;tf
+ }
}
}
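For reference, the transform that normalize implements above, and the complement estimate from the Rennie et al. paper cited in the new class comment (notation is ours, not the code's: $f_i$ is the raw term count, $N$ the document count gathered through the ###DOCCOUNT### pseudo-feature, $\mathrm{df}_i$ the document frequency of term $i$, and $\alpha = \sum_i \alpha_i$ the smoothing mass):

$$ f'_i = \log(1 + f_i) \cdot \log\frac{N}{\mathrm{df}_i} $$

$$ \hat{\theta}_{\tilde{c}i} = \frac{\alpha_i + \sum_{j:\,y_j \neq c} f_{ji}}{\alpha + \sum_{j:\,y_j \neq c}\sum_k f_{jk}}, \qquad l(d) = \arg\max_c \Big[\log p(c) - \sum_i f_i \log \hat{\theta}_{\tilde{c}i}\Big] $$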
@@ -69,17 +92,22 @@ protected class NaiveBayesTopicalClassifier extends TopicalClassifier {
class Label(name:String, val document:Document) extends LabeledCategoricalVariable(name) {
def domain = labelDomain
}
- protected var documentDomain = new CategoricalDimensionTensorDomain[String]{}
+ protected var documentDomain = new CategoricalTensorDomain[String]{ }
protected var labelDomain = new CategoricalDomain[String]
+ //labelIndex -> sum of all feature values for each label
+ protected var totalMasses = new DenseTensor1(0)
+
+
protected var _model:LogLinearModel[Label,Document] = null
- private var classifier = new ModelBasedClassifier[Label](_model, labelDomain)
+ private var classifier = new ModelBasedClassifier[Label,LogLinearModel[Label,Document]](_model, labelDomain)
/**
* @param text
* @return predicted probabilities of topics given the text
*/
def getPredictions(text: Text) = {
val doc = new Document(text.text)
+ doc.normalize
val classification = classifier.classify(doc.label)
getTopics().zip(classification.proportions.asSeq).toArray
}
@@ -103,23 +131,22 @@ protected class NaiveBayesTopicalClassifier extends TopicalClassifier {
def serialize(modelFile: File) {
if (modelFile.getParentFile ne null)
modelFile.getParentFile.mkdirs()
- BinaryFileSerializer.serialize(_model, modelFile)
- val labelDomainFile = new File(modelFile.getAbsolutePath + "-labelDomain")
- BinaryFileSerializer.serialize(labelDomain, labelDomainFile)
- val featuresDomainFile = new File(modelFile.getAbsolutePath + "-documentDomain")
- BinaryFileSerializer.serialize(documentDomain.dimensionDomain, featuresDomainFile)
+
+ val tensorCubbie = new TensorCubbie[DenseTensor1]
+ tensorCubbie.store(totalMasses)
+ BinarySerializer.serialize(documentDomain.dimensionDomain,tensorCubbie,labelDomain,_model, modelFile,true)
}
- //mainly copied from NaiveBayesTrainer;
- // TODO: adding new labels that are not in the initial batch should be possible, but is because of some stupid bug (no idea which) not possible
+ //mainly copied from NaiveBayesTrainer; Complementary naive bayes as explained here (http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_RennieSTK03.pdf)
private val biasSmoothingMass = 1.0
private val evidenceSmoothingMass = 1.0
private def trainIncremental(il: LabelList[Label, Document]) {
il.foreach(_.document.normalize)
+ //make model growable
val cmodel =
if(_model == null ||
- _model.evidenceTemplate.weights.dim2 < documentDomain.dimensionDomain.size ||
- _model.evidenceTemplate.weights.dim1 < labelDomain.size)
+ _model.evidenceTemplate.weights.value.dim2 < documentDomain.dimensionDomain.size ||
+ _model.evidenceTemplate.weights.value.dim1 < labelDomain.size)
new LogLinearModel[Label,Document](_.document, labelDomain, documentDomain)
else
_model
@@ -127,97 +154,114 @@ protected class NaiveBayesTopicalClassifier extends TopicalClassifier {
val numLabels = labelDomain.size
val numFeatures = documentDomain.dimensionDomain.size
val bias = new DenseProportions1(numLabels)
- val evid = Seq.tabulate(numLabels)(i => new DenseProportions1(numFeatures))
+ val cEvid = Seq.tabulate(numLabels)(i => new DenseProportions1(numFeatures))
// Note: this doesn't actually build the graphical model, it just gathers smoothed counts, never creating factors
// Incorporate smoothing, with simple +m smoothing
for (li <- 0 until numLabels)
- bias.masses += (li, labelDomain.count(li)+biasSmoothingMass)
+ bias.masses += (li, 1)//No biases, labelDomain.count(li)+biasSmoothingMass)
val batchLabelCounts = Array.fill(numLabels){0.0}
il.foreach(label => batchLabelCounts(label.intValue) += 1)
if (_model == null) {
- for (li <- 0 until numLabels; fi <- 0 until numFeatures) evid(li).masses += (fi, evidenceSmoothingMass)
+ for (li <- 0 until numLabels; fi <- 0 until numFeatures) cEvid(li).masses += (fi, evidenceSmoothingMass)
} else {
for (li <- 0 until numLabels ; fi <- 0 until numFeatures)
- if( _model.evidenceTemplate.weights.dim1 > li && _model.evidenceTemplate.weights.dim2 > fi)
- evid(li).masses += (fi, (labelDomain.count(li)/2 - batchLabelCounts(li) ) * math.exp(_model.evidenceTemplate.weights(li,fi)) )
+ if( _model.evidenceTemplate.weights.value.dim1 > li && _model.evidenceTemplate.weights.value.dim2 > fi)
+ //initialize with prior statistics. prior#docs * p(token|label)
+ cEvid(li).masses += (fi, totalMasses(li) * math.exp(-_model.evidenceTemplate.weights.value(li,fi)) ) //labelDomain does double counting
else
- evid(li).masses += (fi, evidenceSmoothingMass)
+ cEvid(li).masses += (fi, evidenceSmoothingMass)
}
// Incorporate evidence
for (label <- il) {
val targetIndex = label.intValue
val features = il.labelToFeatures(label)
- val activeElements = features.tensor.activeElements
- while (activeElements.hasNext) {
- val (featureIndex, featureValue) = activeElements.next()
- evid(targetIndex).masses += (featureIndex, featureValue)
- }
+
+ features.value.foreachActiveElement((featureIndex, featureValue) => {
+ cEvid(targetIndex).masses += (featureIndex, featureValue)
+ })
}
// Put results into the model templates
- (0 until numLabels).foreach(i =>
- cmodel.biasTemplate.weights(i) = math.log(bias(i)))
-
- for (li <- 0 until numLabels; fi <- 0 until numFeatures)
- cmodel.evidenceTemplate.weights(li, fi) = math.log(evid(li)(fi))
+ totalMasses = new DenseTensor1(numLabels)
+ (0 until numLabels).foreach(li => {
+ totalMasses(li) = cEvid(li).masses.massTotal
+ cmodel.biasTemplate.weights.value(li) = math.log(bias(li))
+ for (fi <- 0 until numFeatures) {
+ cmodel.evidenceTemplate.weights.value(li, fi) = math.log(cEvid(li)(fi))
+ }
+ })
_model = cmodel
- classifier = new ModelBasedClassifier[Label](_model, labelDomain)
+ classifier = new ModelBasedClassifier[Label,LogLinearModel[Label,Document]](_model, labelDomain)
}
}
object NaiveBayesTopicalClassifier extends TopicalClassifierTrainer{
private val LOG = LogFactory.getLog(getClass())
- var batchSize = 500000
-
- def main(args:Array[String]) {
- val m1 = trainModel(new File("/media/dirk/Data/Wikipedia/corpus.tsv")).asInstanceOf[NaiveBayesTopicalClassifier]
- }
+ var batchSize = 600000
/**
* @param corpus of following format: each line refers to a document with the following structure: topic\ttext
*/
def trainModel(corpus:File, iterations:Int):TopicalClassifier = {
+ val classifier = new NaiveBayesTopicalClassifier()
+ trainModelIncremental(corpus,iterations,classifier)
+ classifier
+ }
+
+ def trainModelIncremental(corpus: File, iterations: Int, classifier: TopicalClassifier) {
LOG.info("Training model on dataset " + corpus.getAbsolutePath)
+
if (! corpus.exists) throw new IllegalArgumentException("Directory "+corpus+" does not exist.")
- trainModel(Source.fromFile(corpus).getLines().map(line => {
+ trainModelIncremental(Source.fromFile(corpus).getLines().map(line => {
val Array(topic,text) = line.split("\t",2)
(new Topic(topic),new Text(text))
- }), iterations).asInstanceOf[NaiveBayesTopicalClassifier]
+ }), iterations,classifier)
+
+ classifier
}
def trainModel(corpus:Iterator[(Topic,Text)],iterations:Int):TopicalClassifier = {
val classifier = new NaiveBayesTopicalClassifier()
- var documents = new ArrayBuffer[classifier.Document]()
+ trainModelIncremental(corpus,iterations,classifier)
+ classifier
+ }
+
+ def trainModelIncremental(corpus: Iterator[(Topic, Text)], iterations: Int, classifier: TopicalClassifier) {
+ val cl = classifier.asInstanceOf[NaiveBayesTopicalClassifier]
+ cl.documentDomain.dimensionDomain.unfreeze()
+ var documents = new ArrayBuffer[cl.Document]()
var count = 0
def doTrain {
LOG.info("Training on batch "+count+" with "+documents.size+" documents")
count += 1
- val ll = new LabelList[classifier.Label, classifier.Document](documents.map(_.label), _.document)
- classifier.trainIncremental(ll)
+ val ll = new LabelList[cl.Label, cl.Document](documents.map(_.label), _.document)
+ cl.trainIncremental(ll)
+ /*
+ val testTrial = new classify.Trial[cl.Label](cl.classifier)
+ testTrial ++= ll
- /*documents.foreach(_.label.setRandomly())
- val trainTrial = new Trial[classifier.Label](classifier.classifier)
- println(documents.size)
- trainTrial ++= documents.map(_.label)
+ println("acc="+testTrial.accuracy)
+
+ def objective = new HammingTemplate[cl.Label]
(0.1 until(1,0.1)).foreach(cutoff => {
- val cut = trainTrial.filter(_.proportions.max >= cutoff)
- println("accuracy for cutoff: "+cutoff +" = "+cut.size.toDouble/trainTrial.size+" - " +HammingObjective.accuracy(cut.map(_.label)))
- }) */
+ val cut = testTrial.filter(_.proportions.max >= cutoff)
+ println("accuracy for cutoff: "+cutoff +" = "+cut.size.toDouble/testTrial.size+" - " +objective.accuracy(cut.map(_.label)))
+ })*/
}
corpus.foreach {
case (topic,text) => {
- documents += new classifier.Document(text.text,topic.getName, true)
+ documents += new cl.Document(text.text,topic.getName, true)
if(documents.size >= batchSize) {
doTrain
documents.clear()
@@ -226,22 +270,20 @@ object NaiveBayesTopicalClassifier extends TopicalClassifierTrainer{
if(documents.size < batchSize) {
doTrain
}
-
- classifier
+ cl.documentDomain.dimensionDomain.freeze()
}
def deSerialize(file:File):TopicalClassifier = {
val classifier = new NaiveBayesTopicalClassifier()
- val prefix = file.getAbsolutePath
- val labelDomainFile = new File(prefix + "-labelDomain")
- assert(labelDomainFile.exists(), "Trying to load inexistent label domain file: '" + prefix + "-labelDomain'")
- BinaryFileSerializer.deserialize(classifier.labelDomain, labelDomainFile)
- val featuresDomainFile = new File(prefix + "-documentDomain")
- assert(featuresDomainFile.exists(), "Trying to load inexistent label domain file: '" + prefix + "-featuresDomain'")
- BinaryFileSerializer.deserialize(classifier.documentDomain.dimensionDomain, featuresDomainFile)
- val modelFile = file
- assert(modelFile.exists(), "Trying to load inexisting model file: '" + prefix + "-model'")
- BinaryFileSerializer.deserialize(classifier._model, modelFile)
+
+ classifier._model = new LogLinearModel[classifier.Label,classifier.Document](_.document, classifier.labelDomain,classifier.documentDomain)
+
+ val tensorCubbie = new TensorCubbie[DenseTensor1]
+
+ BinarySerializer.deserialize(classifier.documentDomain.dimensionDomain,tensorCubbie,classifier.labelDomain,classifier._model, file,true)
+
+ classifier.totalMasses = tensorCubbie.fetch()
+
classifier
}
}
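A hedged round-trip sketch of the new single-file persistence (paths are invented; trainModel, serialize, and deSerialize as they appear in this diff):

    import java.io.File
    import org.dbpedia.spotlight.topical.NaiveBayesTopicalClassifier

    // Train on a topic\ttext corpus, persist domains, totalMasses and model in
    // one file via BinarySerializer, then reload the classifier from that file.
    val classifier = NaiveBayesTopicalClassifier.trainModel(new File("/data/corpus.tsv"), 10)
    classifier.serialize(new File("/data/nb-topical.model"))
    val reloaded = NaiveBayesTopicalClassifier.deSerialize(new File("/data/nb-topical.model"))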
32 topical/src/main/scala/org/dbpedia/spotlight/topical/TopicalClassifier.scala
@@ -48,21 +48,41 @@ object MultiLabelClassifier {
trait TopicalClassifierTrainer {
def trainModel(corpus:File, iterations:Int):TopicalClassifier
+ def trainModelIncremental(corpus:File, iterations:Int, classifier:TopicalClassifier)
def trainModel(corpus:Iterator[(Topic,Text)], iterations:Int):TopicalClassifier
+ def trainModelIncremental(corpus:Iterator[(Topic,Text)], iterations:Int, classifier:TopicalClassifier)
def trainModel(corpus:File):TopicalClassifier = trainModel(corpus,1)
+ def trainModelIncremental(corpus:File,classifier:TopicalClassifier) {
+ trainModelIncremental(corpus,1,classifier)
+ }
def trainModel(corpus:Iterator[(Topic,Text)]):TopicalClassifier = trainModel(corpus,1)
+ def trainModelIncremental(corpus:Iterator[(Topic,Text)],classifier:TopicalClassifier) {
+ trainModelIncremental(corpus,1,classifier)
+ }
+
+ def needsShuffled:Boolean = false
+
+
+ def main(args:Array[String]) {
+ if(args.size < 2) {
+ throw new IllegalArgumentException("You have to provide at least: path to corpus, model output path [, optional:iterations]")
+ }
+
+ val iterations = if(args.length>2) args(2).toInt else 1
+
+ val m1 = trainModel(new File(args(0)),iterations)
+ m1.serialize(new File(args(1)))
+ }
}
object TopicalClassifierTrainer {
def byType(classifierType:String):TopicalClassifierTrainer =
classifierType match {
- case "FactorieTopicalClassifier" => FactorieTopicalClassifier
case _ => NaiveBayesTopicalClassifier // default
}
-
}
object TopicalClassifierFactory {
@@ -71,10 +91,6 @@ object TopicalClassifierFactory {
def fromFile(file:File, classifierType: String): Option[TopicalClassifier] = {
LOG.info("Loading topical classifier...")
- if (classifierType.endsWith("FactorieTopicalClassifier")) {
- return Some(FactorieTopicalClassifier.deSerialize(file))
- }
-
if (classifierType.endsWith("NaiveBayesTopicalClassifier")) {
return Some(NaiveBayesTopicalClassifier.deSerialize(file))
}
@@ -83,8 +99,8 @@ object TopicalClassifierFactory {
}
def fromType(classifierType:String): Option[TopicalClassifier] = {
- if (classifierType.endsWith("FactorieTopicalClassifier")) {
- val classifier = new FactorieTopicalClassifier
+ if (classifierType.endsWith("NaiveBayesTopicalClassifier")) {
+ val classifier = new NaiveBayesTopicalClassifier
return Some(classifier)
}
2  topical/src/main/scala/org/dbpedia/spotlight/topical/TopicalMultiLabelClassifier.scala
@@ -26,6 +26,7 @@ object TopicalMultiLabelClassifier {
}
def trainModel(corpus: File, modelOut: File, classifierType:String = "") {
+ LOG.info("Training multilabel classifier on corpus: %s, saving model to: %s".format(corpus.getAbsolutePath, modelOut.getAbsolutePath))
val trainer = TopicalClassifierTrainer.byType(classifierType)
modelOut.mkdirs()
val topics = mutable.Map[Topic,(File,PrintWriter)]()
@@ -38,6 +39,7 @@ object TopicalMultiLabelClassifier {
(f,new PrintWriter(f))
})._2
+ //Add this example to the topical corpus
pw.println(line)
topics.foreach {
6 topical/src/main/scala/org/dbpedia/spotlight/topical/index/AssignTopicsToOccs.scala
@@ -24,14 +24,14 @@ object AssignTopicsToOccs {
/**
*
* @param args 1st: path to input occs, 2nd: path to topical classification configuration file,
- * 3rd: min confidence of assigning, 4th: output, 5th: append (true|false)
+     *             3rd: min confidence of assigning (see SplitOccsSemiSupervised for more information), 4th: output, 5th: append (true|false)
*/
def main(args: Array[String]) {
val config = new TopicalClassificationConfiguration(args(1))
assignTopics(new File(args(0)), TopicalClassifierFactory.fromFile(config.getModelFile, config.getClassifierType).get, args(2).toDouble, new File(args(3)), args(4).toBoolean)
}
- def assignTopics(occsFile: File, model: TopicalClassifier, minimalConfidence: Double, output: File, append: Boolean) {
+ def assignTopics(occsFile: File, model: TopicalClassifier, minimalConfidence: Double, output: File, append: Boolean = false) {
val writers = Map[Topic, PrintWriter]()
model.getTopics.foreach(topic => writers += (topic -> new PrintWriter(new FileWriter(new File(output, topic.getName + ".tsv"), append))))
@@ -50,7 +50,7 @@ object AssignTopicsToOccs {
writers(topic).println(occ.toTsvString)
written = true
assignments += 1
- if (assignments % 10000 == 0)
+ if (assignments % 1000 == 0)
LOG.info(assignments + "-th assignment: " + occ.id + ", " + occ.resource.uri + "->" + topic.getName)
}
}
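A hedged sketch of the programmatic path through assignTopics, which the new default lets you call without the append flag (imports for the configuration and factory classes omitted; the 0.5 threshold follows the discussion in SplitOccsSemiSupervised below):

    import java.io.File

    // Load the configured model and split an occs file into per-topic .tsv files.
    val config = new TopicalClassificationConfiguration("conf/indexing.properties")
    val model  = TopicalClassifierFactory.fromFile(config.getModelFile, config.getClassifierType).get
    AssignTopicsToOccs.assignTopics(new File("/data/occs.tsv"), model, 0.5, new File("/data/occs-by-topic"))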
63 topical/src/main/scala/org/dbpedia/spotlight/topical/index/GenerateOccTopicCorpus.scala
@@ -2,8 +2,6 @@ package org.dbpedia.spotlight.topical.index
import java.io.{FileWriter, PrintWriter, File}
import scala._
-import org.dbpedia.spotlight.topical.wikipedia.util.{WikipediaFlattenedHierarchyLoader, WikipediaHierarchyLoader}
-import org.dbpedia.spotlight.util.IndexingConfiguration
import org.dbpedia.spotlight.model.{DBpediaCategory, Topic}
import org.dbpedia.spotlight.topical.util.TopicUtil
import org.dbpedia.spotlight.io.{FileOccurrenceSource, FileOccsCategoriesSource}
@@ -13,11 +11,8 @@ import scala.util.control.Breaks._
/**
- * This object takes the splitted occs directory extracted by SplitOccsByCategories$ or SplitOccsSemiSupervised$,
- * dbpedias sorted file article_categories (http://downloads.dbpedia.org/3.7/en/article_categories_en.nt.bz2),
- * wikipedias hierarchy (http://downloads.dbpedia.org/3.7/en/skos_categories_en.nt),
- * the output directory from FlattenWikipediaHierarchy, the number of examples each corpus should contain and finally
- * the output file, where the corpus will be written. Note that this corpus should be shuffled afterwards
+ * This object takes the split occs directory extracted by SplitOccsSemiSupervised and
+ * the output file to which the corpus will be written.
*
* @author dirk
*/
@@ -27,29 +22,57 @@ object GenerateOccTopicCorpus {
/**
*
- * @param args 1st: path to splitted occs, 2nd: number of examples to be written for each topic (if <= 0, maximum number will be written),
- * 3rd: output corpus file, 4th: indexing.properties
+ * @param args 1st: path to splitted occs,
+ * 2nd: output corpus file,
+ * 3rd: optional: number of examples to be written for each topic (if <= 0, maximum number will be written),
*/
def main(args: Array[String]) {
-
- if (args.length >= 4) {
- val config = new IndexingConfiguration(args(3))
- generateCorpusFromTopics(new File(args(0)), new File(config.get("org.dbpedia.spotlight.data.sortedArticlesCategories")),
- new File(config.get("org.dbpedia.spotlight.topic.flattenedHierarchy")),
- args(1).toInt, new File(args(2)))
- }
- else
- generateCorpus(new File(args(0)), args(1).toInt, new File(args(2)))
+ generateCorpusWithEqualCount(new File(args(0)), new File(args(1)),if(args.length > 2) args(2).toInt else -1)
}
/**
 * Simplest way of generating a topic corpus from split occs, which takes examples from the split occs at random,
* until nrOfExamples is reached
* @param splittedOccsDir
- * @param nrOfExamples -1 means write maximum number of examples to corpus
* @param output
+ * @param nrOfExamples <=0 means write maximum number of examples to corpus
+ */
+ def generateCorpus(splittedOccsDir: File, output: File,nrOfExamples: Int = -1) {
+ output.getParentFile.mkdirs()
+ val outputWriter = new PrintWriter(new FileWriter(output))
+
+ splittedOccsDir.listFiles().foreach(topicFile => {
+ val topic = new Topic(topicFile.getName.substring(0, topicFile.getName.length - 4))
+
+ LOG.info("======================= Processing " + topicFile.getName + " =======================")
+
+ if (!topic.getName.equals(TopicUtil.CATCH_TOPIC)) {
+ var counter = 0
+ FileOccurrenceSource.fromFile(topicFile).takeWhile(occ => {
+ outputWriter.println(topic.getName + "\t" + occ.context.text)
+ counter += 1
+ if (counter % 10000 == 0)
+ LOG.info(counter + " examples written")
+ nrOfExamples <= 0 || counter < nrOfExamples
+ })
+
+ outputWriter.flush()
+ }
+ })
+
+ outputWriter.close()
+ LOG.info("Done")
+ }
+
+ /**
+ * Simplest way of generating a topic corpus from split occs, which takes examples from the split occs,
+ * until nrOfExamples is reached
+ * @param splittedOccsDir
+ * @param nrOfExamples <=0 means write maximum number of examples to corpus
+ * @param output
+ * @deprecated
*/
- def generateCorpus(splittedOccsDir: File, nrOfExamples: Int, output: File) {
+ def generateCorpusWithEqualCount(splittedOccsDir: File, output: File,nrOfExamples: Int = -1) {
output.getParentFile.mkdirs()
val outputWriter = new PrintWriter(new FileWriter(output))
132 topical/src/main/scala/org/dbpedia/spotlight/topical/index/SplitOccsSemiSupervised.scala
@@ -3,22 +3,35 @@ package org.dbpedia.spotlight.topical.index
import java.io.{FileWriter, PrintWriter, File}
import org.dbpedia.spotlight.model._
import org.dbpedia.spotlight.util.IndexingConfiguration
-import org.dbpedia.spotlight.topical.{TopicalClassifierTrainer}
+import org.dbpedia.spotlight.topical.{TopicalClassifier, TopicalClassifierTrainer}
import org.dbpedia.spotlight.topical.util.TopicUtil
import org.dbpedia.spotlight.io.FileOccurrenceSource
import org.apache.commons.logging.LogFactory
import collection.mutable._
+import com.sun.grizzly.util.FileUtil
+import org.apache.commons.io.FileUtils
+import io.Source
/**
* This object splits the occs file into several topical occs files, by first creating an initial split, which is done
 * by defining main categories for each topic in the topic.description file (whose location is specified
 * in the indexing.properties, usually found in the conf/ folder) and assigning each resource that is a member of one of these main categories
- * to the specific topics. After that, all occs of these assigned resources are assigned to the specific topics as well.
- * The initial split is afterwards used to train an initial topical classifier which is then used to assign occs to the topics.
- * This step can be repeated several times (splitting, training a model on the new split, splitting again ...)
- *
+ * to the specific topics or, if the output directory is not empty, taking its content as the initial split (allowing
+ * this procedure to be run several times with different thresholds, e.g. first run with 0.8 threshold and 1 iteration and second run with 0.5 and 1 iteration).
+ * The initial split is afterwards used to train an initial topical classifier which is then used to assign unassigned occs to the topics.
+ * This step can be repeated several times (splitting, training a model on the new split, splitting again ...) by defining a number of iterations.
+ * </br></br>
+ * Threshold example statistics on initial split (after training on initial split): </br>
+ * cutoff - fraction of examples within this cutoff - accuracy </br>
+ * 0.01 - 1.0 - 0.66 </br>
+ * 0.1 - 0.97 - 0.67 </br>
+ * 0.2 - 0.73 - 0.74 </br>
+ * 0.3 - 0.52 - 0.8 </br>
+ * 0.4 - 0.38 - 0.85 </br>
+ * 0.5 - 0.29 - 0.89 </br>
* @author dirk
- */
+**/
+
//TODO just allow concept uris
object SplitOccsSemiSupervised {
private val LOG = LogFactory.getLog(getClass)
@@ -26,7 +39,7 @@ object SplitOccsSemiSupervised {
/**
*
* @param args 1st: indexing.properties 2nd: path to occs file, 3rd: temporary path (same partition as output)
- * , 4th: minimal confidence of assigning an occ to a topic (0.5 has experimentally shown to be be good for naive bayes with 0.9 accuracy on the initial split)
+     * , 4th: minimal confidence of assigning an occ to a topic (see the threshold statistics in the class comment above)
* , 5th: nr of iterations, 6th: path to output directory
*
*/
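A hypothetical invocation matching the argument list documented above (all paths invented; 0.5 follows the threshold statistics in the class comment):

    SplitOccsSemiSupervised.main(Array(
      "conf/indexing.properties", // indexing configuration
      "/data/occs.tsv",           // occurrence file to split
      "/data/tmp",                // temporary dir, same partition as the output
      "0.5",                      // minimal assignment confidence
      "2",                        // iterations
      "/data/occs-split"))        // output directory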
@@ -56,57 +69,107 @@ object SplitOccsSemiSupervised {
outputDir: File) {
tmpDir.mkdirs()
val tmpCorpus = new File(tmpDir, "corpus.tsv")
- val tmpOther = new File(tmpDir, "toSplit.tsv")
+ val toSplit = new File(tmpDir, "toSplit.tsv")
+ val trainingDir = new File(outputDir.getAbsolutePath+"-training")
outputDir.mkdirs()
+ trainingDir.mkdirs()
if (outputDir.listFiles().size > 0) {
LOG.info("Output directory was not empty. Taking split in this directory as initial split.")
- new File(outputDir, TopicUtil.CATCH_TOPIC.getName + ".tsv").renameTo(tmpOther)
+ new File(outputDir, TopicUtil.CATCH_TOPIC.getName + ".tsv").renameTo(toSplit)
+ outputDir.renameTo(trainingDir)
}
else {
LOG.info("Creating initial split for training an initial model for splitting!")
- initialSplit(topicDescriptionFile, articleCatsFile, occsFile, outputDir)
+ initialSplit(topicDescriptionFile, articleCatsFile, occsFile, trainingDir)
}
val trainer = TopicalClassifierTrainer.byType(classifierType)
+ var classifier:TopicalClassifier = null
for (i <- 0 until iterations) {
- GenerateOccTopicCorpus.generateCorpus(outputDir, -1, new File(tmpCorpus.getAbsolutePath + ".tmp"))
- new ProcessBuilder("sort", "-R", "-o", tmpCorpus.getAbsolutePath, tmpCorpus.getAbsolutePath + ".tmp").start().waitFor()
- new File(tmpCorpus + ".tmp").delete()
+ GenerateOccTopicCorpus.generateCorpusWithEqualCount(trainingDir, tmpCorpus)
+
+ /*val corpus = trainingDir.listFiles().iterator.flatMap(topicFile => {
+ val topic = new Topic(topicFile.getName.substring(0, topicFile.getName.length - 4))
+ if (!topic.getName.equals(TopicUtil.CATCH_TOPIC)) {
+ FileOccurrenceSource.fromFile(topicFile).map(occ => {
+ (topic, occ.context)
+ })
+ }
+ else
+ List[(Topic,Text)]().iterator
+ }) */
+
+ if (trainer.needsShuffled) {
+ LOG.info("Shuffling corpus!")
+ FileUtils.moveFile(tmpCorpus, new File(tmpCorpus.getAbsolutePath+ ".tmp"))
+ new ProcessBuilder("sort", "-R", "-o", tmpCorpus.getAbsolutePath, tmpCorpus.getAbsolutePath + ".tmp").start().waitFor()
+ new File(tmpCorpus.getAbsolutePath + ".tmp").delete()
+ }
+ if (i == 0)
+ classifier = trainer.trainModel(tmpCorpus,10)
+ else
+ trainer.trainModelIncremental(tmpCorpus,10,classifier)
- val classifier = trainer.trainModel(tmpCorpus,10)
+ // After first split only take assigned occs for training, otherwise merge new training examples with old training examples
+ if(i == 1){
+ outputDir.listFiles().foreach(_.delete())
+ outputDir.delete()
+ trainingDir.renameTo(outputDir)
+ }
+ else
+ mergeDirectories(trainingDir,outputDir)
+
+ val f = if(i==0) occsFile else toSplit
- if (i == 0) {
- AssignTopicsToOccs.assignTopics(occsFile, classifier, threshold, outputDir, false)
+ LOG.info("Start splitting occs into topics, iteration: "+i)
+ if (i < iterations-1) {
+ trainingDir.mkdirs()
+ AssignTopicsToOccs.assignTopics(f, classifier, threshold, trainingDir, false)
+
+ new File(trainingDir, TopicUtil.CATCH_TOPIC.getName + ".tsv").renameTo(toSplit)
}
else
- AssignTopicsToOccs.assignTopics(tmpOther, classifier, threshold, outputDir, true)
+ AssignTopicsToOccs.assignTopics(f, classifier, threshold, outputDir, true)
- tmpOther.delete()
- if (i < iterations - 1)
- new File(outputDir, TopicUtil.CATCH_TOPIC.getName + ".tsv").renameTo(tmpOther)
+ toSplit.delete()
}
tmpDir.listFiles().foreach(_.delete())
}
- def loadArticleCategories(articleCats: File, descriptions: collection.Seq[TopicDescription]): Map[DBpediaResource, Set[Topic]] = {
- val assignments = Map[DBpediaResource, Set[Topic]]()
+ private def mergeDirectories(deleteDir:File, keepDir:File) {
+ keepDir.mkdirs()
+ deleteDir.mkdirs()
+ deleteDir.listFiles().foreach(f => {
+ keepDir.listFiles().find(_.getName == f.getName) match {
+ case Some(otherF) => {
+ val pw = new PrintWriter(new FileWriter(otherF,true))
+ Source.fromFile(f).getLines().foreach(l => pw.println(l))
+ pw.close()
+ }
+ case None => {
+ val pw = new PrintWriter(new FileWriter(new File(keepDir,f.getName)))
+ Source.fromFile(f).getLines().foreach(l => pw.println(l))
+ pw.close()
+ }
+ }
- val categoryAssignments = descriptions.foldLeft(Map[DBpediaCategory, Set[Topic]]())((acc, description) =>
- acc ++ (description.categories.foldLeft(Map[DBpediaCategory, Set[Topic]]())((acc2, category) => {
- val cat = new DBpediaCategory(category)
- var result = Map() ++ acc2
+ })
- if (!result.contains(cat))
- result += (cat -> Set(description.topic))
- else
- result(cat) += (description.topic)
+ deleteDir.listFiles().foreach(_.delete())
+ deleteDir.delete()
+ }
+
+ private def loadArticleCategories(articleCats: File, descriptions: collection.Seq[TopicDescription]): Map[DBpediaResource, Set[Topic]] = {
+ val assignments = Map[DBpediaResource, Set[Topic]]()
- result
- })))
+ val categoryAssignments =
+ descriptions.
+ flatMap(d => d.categories.map(c => (new DBpediaCategory(c),d.topic))).
+ groupBy(_._1).map(tuple => (tuple._1,tuple._2.map(_._2).toSet)).toMap
scala.io.Source.fromFile(articleCats).getLines().foreach(line => {
val split = line.split(" ")
@@ -116,7 +179,6 @@ object SplitOccsSemiSupervised {
assignments.getOrElseUpdate(resource, Set[Topic]()) ++= (categoryAssignments(category))
}
-
})
assignments
@@ -151,7 +213,7 @@ object SplitOccsSemiSupervised {
writers(topic).println(occ.toTsvString)
})
assignedResourcesCtr += 1
- if (assignedResourcesCtr % 10000 == 0) {
+ if (assignedResourcesCtr % 100000 == 0) {
LOG.info("Assigned " + assignedResourcesCtr + " occs to topics")
LOG.info("Latest assignment: " + lastResource.uri + " -> " + selectedTopics.foldLeft("")(_ + " " + _.getName))
}
@@ -164,4 +226,4 @@ object SplitOccsSemiSupervised {
writers.foreach(_._2.close())
}
-}
+}
2  topical/src/main/scala/org/dbpedia/spotlight/web/rest/TopicalOutputSerializer.scala
@@ -22,7 +22,7 @@ object TopicalOutputSerializer {
<Topics>
{for ((topic,score) <- tags) yield <Topic score={score.toString} mediatopics={
if (descriptions!=null)
- descriptions.find(_.topic.equals(topic)).getOrElse(new TopicDescription(null,null,null,Set[String](),null)).iptcTopics.reduceLeft(_ +","+_)
+ descriptions.find(_.topic.equals(topic)).getOrElse(TopicDescription(null,null,Set[String](),null)).iptcTopics.reduceLeft(_ +","+_)
else
"No iptc mediatopics found"
}>{topic.getName}</Topic> }