/
wiki-import-config.xml
49 lines (38 loc) · 1.87 KB
/
wiki-import-config.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
<dataConfig>
<!-- File-based data source; the input is decoded as UTF-8. -->
<dataSource type="FileDataSource" encoding="UTF-8" />
<script><![CDATA[
/**
 * Script transformer: collects the Wikipedia categories referenced in a
 * page's wikitext and stores them on the row.
 *
 * @param row the current import row; its "text" value is read and a
 *            "categories" value (a java.util.ArrayList of category names)
 *            is written.
 * @return the same row object, so later processing can continue with it.
 */
function AddWikiCategoriesToRow(row) {
    var rawText = row.get("text");
    // A page without text would otherwise be matched as the literal string "null".
    var wikitext = (rawText == null) ? "" : String(rawText);
    var categories = new java.util.ArrayList();

    // Matches [[Category:Name]] and [[Category:Name|sort key]].
    //  - Anything after a | is only a sort key, not part of the category name.
    //  - The namespace prefix is matched case-insensitively and may be
    //    surrounded by whitespace.
    //  - The capture must end on a non-whitespace character, so names come out
    //    trimmed and whitespace-only names are skipped, without relying on
    //    String.prototype.trim (absent from older script engines).
    //  - Plain links written as [[:Category:Name]] are not matched, because the
    //    leading colon means "link to", not "member of".
    var categoryLink = /\[\[\s*Category\s*:\s*([^|\]]*[^|\]\s])/gi;
    var match;
    while ((match = categoryLink.exec(wikitext)) !== null) {
        categories.add(match[1]); // group 1 is the category name
    }

    row.put('categories', categories);
    return row;
}
]]></script>
<document>
  <!--
    One row per /mediawiki/page element of the dump named in "url", read in
    streaming mode. Transformers are applied in the order listed: the regex
    rule computes $skipDoc, then the script adds the "categories" value.
  -->
  <entity name="page"
          processor="XPathEntityProcessor"
          stream="true"
          forEach="/mediawiki/page/"
          url="enwiki-latest-pages-articles.xml"
          transformer="RegexTransformer,DateFormatTransformer,script:AddWikiCategoriesToRow">
    <field column="id" xpath="/mediawiki/page/id" />
    <field column="title" xpath="/mediawiki/page/title" />
    <!-- DDR: we really don't care about the user, revision, or timestamp -->
    <!-- NOTE: if re-enabled, the timestamp format uses HH (24-hour clock), not hh.
    <field column="user" xpath="/mediawiki/page/revision/contributor/username" />
    <field column="userId" xpath="/mediawiki/page/revision/contributor/id" />
    <field column="revision" xpath="/mediawiki/page/revision/id" />
    <field column="timestamp" xpath="/mediawiki/page/revision/timestamp" dateTimeFormat="yyyy-MM-dd'T'HH:mm:ss'Z'" />
    -->
    <field column="text" xpath="/mediawiki/page/revision/text" />
    <!--
      Skip redirect pages: they carry no category information of their own.
      (?i) accepts any capitalisation of the keyword; (?s) lets .* span line
      breaks so the whole text is replaced by exactly "true". A first-line-only
      replacement would leave trailing text and the value would no longer
      parse as true. Disambiguation pages are NOT skipped by this rule.
    -->
    <field column="$skipDoc" regex="(?is)^\s*#REDIRECT\b.*" replaceWith="true" sourceColName="text" />
  </entity>
</document>
</dataConfig>