Skip to content

Commit

Permalink
Merge branch 'add-converters' of github.com:ag-gipp/MathMLTools into …
Browse files Browse the repository at this point in the history
…add-converters
  • Loading branch information
AndreG-P committed Aug 14, 2018
2 parents 3677552 + 94dfda5 commit a1eccec
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ public List<CSymbol> getCSymbols() {
if (cSymbols == null) {
final IterableNodeList nodeList = new IterableNodeList(dom.getElementsByTagName("csymbol"));
cSymbols = new ArrayList<>();
nodeList.forEach(n -> cSymbols.add(new CSymbol((Element) n, false)));
nodeList.forEach(n -> cSymbols.add(new CSymbol((Element) n)));
}
return cSymbols;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,7 @@ public void init() {
}

public int initLocally() {
props = ConfigLoader.CONFIG;
max = Integer.parseInt(props.getProperty(ConfigLoader.GOULDI_MAXIMUM_NUM));

String goldPath = props.getProperty(ConfigLoader.GOULDI_LOCAL_PATH);
Path path = Paths.get(goldPath);
Path path = getGoldPath();
gouldi = new JsonGouldiBean[max];

ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);
Expand All @@ -97,6 +93,14 @@ public int initLocally() {
return max;
}

/**
 * Resolves the local gold standard directory from the loaded configuration.
 *
 * <p>As a side effect this (re)assigns {@code props} from {@code ConfigLoader.CONFIG} and
 * {@code max} from the {@code GOULDI_MAXIMUM_NUM} property.
 *
 * @return the path configured under {@code GOULDI_LOCAL_PATH}
 */
public Path getGoldPath() {
    props = ConfigLoader.CONFIG;

    final String maximumEntries = props.getProperty(ConfigLoader.GOULDI_MAXIMUM_NUM);
    max = Integer.parseInt(maximumEntries);

    return Paths.get(props.getProperty(ConfigLoader.GOULDI_LOCAL_PATH));
}

public GitHubFileResponse getResponseFromGouldiRequest(int number) {
String file = number + ".json";
return rest.getForObject(
Expand Down
2 changes: 1 addition & 1 deletion mathml-utils/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@
<version>${commons.io.version}</version>
</dependency>
</dependencies>
</project>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
import org.w3c.dom.Element;

public class CSymbol implements Comparable<CSymbol> {
private boolean strict;
private Element n;
private static final String SERIALIZATION_SEPARATOR = ":";

public CSymbol(Element n, boolean strict) {
this.strict = strict;
public CSymbol(Element n) {
this.n = n;
}

Expand All @@ -26,7 +25,7 @@ public void setCd(String cd) {

@Override
public String toString() {
return getCd() + ":" + getCName();
return getCd() + SERIALIZATION_SEPARATOR + getCName();
}

@Override
Expand Down
2 changes: 1 addition & 1 deletion xamples/MathMLben
16 changes: 15 additions & 1 deletion xamples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
</parent>
<modelVersion>4.0.0</modelVersion>

<groupId>com.formulasearchengine.mathmltools</groupId>
<artifactId>xamples</artifactId>
<dependencies>
<dependency>
Expand All @@ -18,12 +17,27 @@
<version>2.0.2-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.wikidata.wdtk</groupId>
<artifactId>wdtk-wikibaseapi</artifactId>
<version>0.9.0</version>
</dependency>
<dependency>
<groupId>com.formulasearchengine.mathmltools</groupId>
<artifactId>mathml-core</artifactId>
<version>2.0.2-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.1</version>
</dependency>
</dependencies>


Expand Down
85 changes: 79 additions & 6 deletions xamples/src/main/java/SymbolListExample.java
Original file line number Diff line number Diff line change
@@ -1,36 +1,109 @@
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.formulasearchengine.mathmltools.gold.GoldStandardLoader;
import com.formulasearchengine.mathmltools.gold.GoldUtils;
import com.formulasearchengine.mathmltools.gold.pojo.JsonGouldiBean;
import com.formulasearchengine.mathmltools.io.XmlDocumentReader;
import com.formulasearchengine.mathmltools.mml.CMMLInfo;
import com.formulasearchengine.mathmltools.mml.MathDoc;
import com.formulasearchengine.mathmltools.utils.mml.CSymbol;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

public class SymbolListExample {
private SymbolListExample() {
}

/**
 * Entry point of the symbol list example.
 *
 * <p>Initializes the gold standard from the local path, collects content symbols from the gold
 * entries, reads the OpenMath-to-Wikidata map and prints the resulting symbol multisets.
 *
 * <p>NOTE(review): this span is taken from a rendered diff, so superseded and replacement lines
 * (for example both method signatures) appear next to each other - confirm against the real file.
 *
 * @param args command line arguments (not read by the visible code)
 */
public static void main(String[] args) throws IOException, SAXException {
public static void main(String[] args) throws Exception {
final GoldStandardLoader gold = GoldStandardLoader.getInstance();
gold.initLocally();
final HashMultiset<CSymbol> allSymbols = HashMultiset.create();
// Maps a serialized symbol (column 0 of the CSV) to its numeric Wikidata id.
final Map<String, Integer> omcdMap = new HashMap<>();
final TreeMultiset<CSymbol> allSymbols = getcSymbols(gold);
// Receives the symbols after known ones were rewritten to "wikidata:Q..." names.
final TreeMultiset<String> normalizedSymbols = TreeMultiset.create();

System.out.println(allSymbols);
readOmCdMap(omcdMap);
normalizeSymbols(omcdMap, allSymbols, normalizedSymbols);

System.out.println(normalizedSymbols);


}

/**
 * Rewrites every collected symbol to a normalized name and accumulates the counts.
 *
 * <p>A symbol whose serialized form is a key of {@code omcdMap} becomes
 * {@code "wikidata:Q" + id}; a symbol from the {@code latexml} content dictionary whose name
 * starts with {@code Q} becomes {@code "wikidata:" + name}; everything else keeps its
 * serialized form.
 *
 * @param omcdMap           serialized symbol to numeric Wikidata id
 * @param allSymbols        symbols to normalize, with their occurrence counts
 * @param normalizedSymbols multiset that receives the normalized names
 */
private static void normalizeSymbols(Map<String, Integer> omcdMap, TreeMultiset<CSymbol> allSymbols, TreeMultiset<String> normalizedSymbols) {
    for (Multiset.Entry<CSymbol> entry : allSymbols.entrySet()) {
        final CSymbol symbol = entry.getElement();
        final String serialized = symbol.toString();

        final String normalized;
        if (omcdMap.containsKey(serialized)) {
            normalized = "wikidata:Q" + omcdMap.get(serialized);
        } else if (symbol.getCd().equals("latexml") && symbol.getCName().startsWith("Q")) {
            normalized = "wikidata:" + symbol.getCName();
        } else {
            normalized = serialized;
        }
        normalizedSymbols.add(normalized, entry.getCount());
    }
}

/**
 * Fills {@code omcdMap} from {@code ../doc/openMathSymbols.csv}, resolved against the gold path.
 *
 * <p>For each CSV record, column 0 is used as the key and column 2 as the Wikidata id; a
 * leading {@code Q} followed by digits is reduced to the digits before parsing the integer.
 *
 * @param omcdMap map that receives one entry per CSV record
 * @throws IOException           if the CSV file cannot be opened or read
 * @throws NumberFormatException if column 2 of a record is not a (Q-prefixed) integer
 */
public static void readOmCdMap(Map<String, Integer> omcdMap) throws IOException {
    final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();

    // try-with-resources closes the reader even when parsing a record fails.
    try (FileReader in = new FileReader(goldPath.resolve("../doc/openMathSymbols.csv").toFile())) {
        final Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in);
        for (CSVRecord record : records) {
            final String omcd = record.get(0);
            final String wikidata = record.get(2);
            final int wikidataInt = Integer.parseInt(wikidata.replaceAll("Q(\\d+)", "$1"));
            omcdMap.put(omcd, wikidataInt);
        }
    }
}

/**
 * Collects the content symbols of the gold entries into one sorted multiset.
 *
 * <p>For each entry index the symbols are extracted from the entry, printed, and added to the
 * result. If extraction fails with a {@link SAXException}, {@code fixException} is called and
 * its result is used for that entry instead.
 *
 * <p>NOTE(review): this span is taken from a rendered diff, so superseded and replacement lines
 * (two loop headers, two {@code cSymbols} declarations) appear together - confirm against the
 * real file.
 *
 * @param gold loader used to fetch each gold entry by index
 * @return multiset of all symbols found in the visited entries
 * @throws IOException declared by the symbol extraction
 */
public static TreeMultiset<CSymbol> getcSymbols(GoldStandardLoader gold) throws IOException {
final TreeMultiset<CSymbol> allSymbols = TreeMultiset.create();
// TODO file a bug that gold should implement iterable
for (int i = 1; i < 3; i++) {
for (int i = 1; i < 305; i++) {
final JsonGouldiBean gouldiJson = gold.getGouldiJson(i);
final List<CSymbol> cSymbols = getCSymbols(gouldiJson);
List<CSymbol> cSymbols;
try {
cSymbols = getCSymbols(gouldiJson);
} catch (SAXException e) {
// Parsing failed: let fixException try to repair the entry for this index.
cSymbols = fixException(i, e, gouldiJson);
}
TreeMultiset<CSymbol> currentSymbols = TreeMultiset.create(cSymbols);
allSymbols.addAll(currentSymbols);
System.out.println(currentSymbols);
}
return allSymbols;
}

/**
 * Tries to repair a gold entry whose MathML could not be parsed.
 *
 * <p>Only the parser error about an undeclared {@code xmlns:m} attribute on a {@code power}
 * element is handled: the {@code xmlns:m} MathML namespace declaration is removed from the
 * entry's MathML and the entry is written back to {@code <i>.json} in the gold path. The
 * repaired entry is not re-parsed here.
 *
 * @param i          index of the gold entry, used to build the file name
 * @param e          the parser exception raised for this entry
 * @param gouldiJson the entry whose MathML failed to parse; modified in place when repaired
 * @return always an empty, mutable list
 */
private static List<CSymbol> fixException(int i, SAXException e, JsonGouldiBean gouldiJson) {
    final List<CSymbol> cSymbols = new ArrayList<>();

    // Constant-first equals: SAXException#getMessage() may return null.
    if ("Attribute \"xmlns:m\" must be declared for element type \"power\".".equals(e.getMessage())) {
        // Literal replace(): the namespace URI contains '.', which replaceAll() would treat
        // as a regex wildcard.
        final String newMml = gouldiJson.getMml().replace("xmlns:m=\"http://www.w3.org/1998/Math/MathML\"", "");
        gouldiJson.setMml(newMml);
        final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();

        GoldUtils.writeGoldFile(goldPath.resolve(i + ".json"), gouldiJson);
    }
    return cSymbols;
}


private static List<CSymbol> getCSymbols(JsonGouldiBean gouldiJson) throws IOException, SAXException {
final String mmlString = gouldiJson.getMml();
final Document doc = XmlDocumentReader.parse(mmlString);
Expand Down
104 changes: 104 additions & 0 deletions xamples/src/main/java/WikidataExample.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;

import com.formulasearchengine.mathmltools.gold.GoldStandardLoader;
import org.apache.commons.collections4.BidiMap;
import org.apache.commons.collections4.bidimap.DualHashBidiMap;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.wikidata.wdtk.datamodel.interfaces.EntityDocument;
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.SiteLink;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;
import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException;

/**
 * Example that prints a {@code CDDefinition} snippet for every Wikidata item listed in
 * {@code ../doc/wiki-cd-freqs.csv} (resolved against the gold path).
 *
 * <p>The English label, description and Wikipedia link of each item are fetched through the
 * Wikidata Toolkit and combined into a description text. Snippets are collected in a sorted map
 * and printed in the order of their sort keys.
 */
public class WikidataExample {

    private static final WikibaseDataFetcher FETCHER = WikibaseDataFetcher.getWikidataDataFetcher();
    /** Serialized symbol to numeric Wikidata id; filled by {@code SymbolListExample.readOmCdMap}. */
    private static final BidiMap<String, Integer> omCDMap = new DualHashBidiMap<>();
    /** Generated snippets keyed by sort key (last word of the label followed by the numeric id). */
    private static final Map<String, String> allSymbols = new TreeMap<>();
    private static final String LANG = "en";
    /** Prefix that marks a CSV symbol as a Wikidata item reference. */
    private static final String prefix = "wikidata:Q";

    /**
     * Loads the symbol-to-Wikidata map and prints the generated definitions.
     *
     * @param args command line arguments (unused)
     * @throws MediaWikiApiErrorException if the Wikidata API reports an error
     * @throws IOException                if a CSV file cannot be read
     */
    public static void main(String[] args) throws MediaWikiApiErrorException, IOException {
        SymbolListExample.readOmCdMap(omCDMap);
        final WikidataExample example = new WikidataExample();
        //final String omCd = example.getOmCd(example.getItem(1226939));
        //System.out.println(omCd);
        example.processFile();

    }

    /**
     * Fetches a single item by its numeric id.
     *
     * @param qId numeric part of the item id, without the leading {@code Q}
     * @return the fetched document cast to {@link ItemDocument}
     * @throws MediaWikiApiErrorException if the Wikidata API reports an error
     */
    private ItemDocument getItem(long qId) throws MediaWikiApiErrorException {
        return (ItemDocument) FETCHER.getEntityDocument("Q" + qId);
    }

    /**
     * Builds the {@code CDDefinition} snippet for one item and stores it in {@code allSymbols}.
     *
     * @param item the Wikidata item to describe
     * @return the generated snippet
     * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
     */
    private String getOmCd(ItemDocument item) throws UnsupportedEncodingException {

        final long revisionId = item.getRevisionId();
        final MonolingualTextValue label = item.getLabels().get(LANG);
        final MonolingualTextValue description = item.getDescriptions().get(LANG);
        final SiteLink siteLink = item.getSiteLinks().get(LANG + "wiki");
        final String qIdString = item.getEntityId().getId();
        String descr = "";
        String sortkey = "";
        if (label != null) {
            final String labelText = label.getText();
            descr += labelText + "\n";
            final String[] split = labelText.split(" ");
            // split() drops trailing empty strings, so a label made only of spaces yields an
            // empty array; keep the empty sort key in that case instead of failing.
            if (split.length > 0) {
                sortkey = split[split.length - 1];
            }
        }
        if (description != null) {
            descr += description.getText() + "\n";
        }
        if (siteLink != null) {
            descr += "https://" + LANG + ".wikipedia.org/w/index.php?title=" + URLEncoder.encode(siteLink.getPageTitle(), "utf8") + "\n";
        }
        Integer qId = Integer.valueOf(qIdString.substring(1));
        if (omCDMap.containsValue(qId)) {
            descr += "See also " + omCDMap.getKey(qId) + "\n";
        }
        descr += "\n This description was generated from http://www.wikidata.org/w/index.php?oldid=" + revisionId;
        descr = " <CDDefinition>\n"
                + " <Name>" + qIdString + "</Name>\n"
                + " <Role>application</Role>\n"
                + " <Description>\n" + descr
                + " </Description>\n"
                + "</CDDefinition>";
        // Appending the numeric id keeps keys distinct for items that share a last label word.
        sortkey += qId;
        allSymbols.put(sortkey, descr);
        return descr;

    }

    /**
     * Reads the item ids from the frequency CSV, fetches the items and prints all snippets.
     *
     * <p>Only records whose first column starts with {@code wikidata:Q} are used.
     *
     * @throws IOException                if the CSV file cannot be opened or read
     * @throws MediaWikiApiErrorException if the Wikidata API reports an error
     */
    private void processFile() throws IOException, MediaWikiApiErrorException {
        final Path goldPath = GoldStandardLoader.getInstance().getGoldPath();

        final ArrayList<String> qIds = new ArrayList<>();
        // try-with-resources closes the reader even when a record cannot be parsed.
        try (FileReader in = new FileReader(goldPath.resolve("../doc/wiki-cd-freqs.csv").toFile())) {
            final Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in);
            for (CSVRecord record : records) {
                final String symbol = record.get(0);
                if (symbol.startsWith(prefix)) {
                    final int qId = Integer.parseInt(symbol.substring(prefix.length()));
                    qIds.add("Q" + qId);
                }
            }
        }
        final Map<String, EntityDocument> entityDocuments = FETCHER.getEntityDocuments(qIds);
        for (EntityDocument document : entityDocuments.values()) {
            getOmCd((ItemDocument) document);
        }
        // values() of the TreeMap iterates in ascending sort-key order.
        for (String definition : allSymbols.values()) {
            System.out.println(definition);
        }

    }

}

0 comments on commit a1eccec

Please sign in to comment.