Skip to content
Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
666 lines (589 sloc) 21.8 KB
<?php
/************************
AUTHOR: Ethan Gruber
MODIFIED: September, 2012
DESCRIPTION: Generate network of EAC-CPF stubs strictly by crawling through dbpedia links
REQUIRED LIBRARIES: php5, php5-curl (if putting to eXist or other web service)
************************/
/************************
* DEFINITION OF VARIABLES
* $start = the starting point for the process
* $end = the end point for the process, hypothetically.
* $lang = the language of labels to extract from dbpedia. if the $lang does not exist, the script will default to 'en'
* $options = conditional options array for categories of data to import
* $identityArray = ever-increasingly large array of dbpedia URIs and default labels
* $processed = list of completed resources
************************/
/*****
* DEFINITION OF $options
* internal: CPF relations link internally (short URI for ID, lowercase version of dbpedia id,
* e.g., "antoninus_pius") or link externally to dbpedia resources, e.g., http://dbpedia.org/resource/Antoninus_Pius
* occupations: occupations uncommon in dbpedia; placed in cpfDescription/description
* subjects: placed in cpfDescription/description as <localDescription @localType='subject'>
* children/parents/dynasties/successors/predecessors/spouses/influences: cpfRelations
* resourceRelations: dbpedia-owl:wikiPageExternalLink in dbpedia RDF
* thumbnail: dbpedia-owl:thumbnail stored as a resourceRelation with @xlink:role='portrait'
*****/
$identityArray = array();
$processed = array();
$start = 'http://dbpedia.org/resource/Augustus';
$end = '';
$lang = 'en';
$options = array(
'internal'=>true,
'occupations'=>true,
'subjects'=>false,
'birth/death places'=>true,
'children/parents'=>true,
'dynasties'=>true,
'successors/predecessors'=>true,
'spouses'=>true,
'influences'=>false,
'resourceRelations'=>true,
'thumbnail'=>true
);
//generate EAC-CPF record
$xml = generate_eac($start, $end, $lang, $options);
//save file
$id = strtolower(substr(strstr($start, 'resource/'), strpos(strstr($start, 'resource/'), '/') + 1));
$fileName = 'temp/' . $id . '.xml';
save_file($xml, $id, $fileName);
//process $identityArray
while (list($k, $v) = each($identityArray)){
if (!in_array($k, $processed)){
$id = strtolower(substr(strstr($k, 'resource/'), strpos(strstr($k, 'resource/'), '/') + 1));
$fileName = 'temp/' . $id . '.xml';
$xml = generate_eac($k, $end, $lang, $options);
save_file($xml, $id, $fileName);
}
}
function generate_eac($resource, $end, $lang, $options){
GLOBAL $identityArray;
GLOBAL $processed;
$processed[] = $resource;
$id = substr(strstr($resource, 'resource/'), strpos(strstr($resource, 'resource/'), '/') + 1);
//load dbpedia RDF
$dbRDF = new DOMDocument();
$dbRDF->load('http://dbpedia.org/data/' . $id . '.rdf');
$dxpath = new DOMXpath($dbRDF);
$dxpath->registerNamespace('rdf', "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
$dxpath->registerNamespace('dbpedia-owl', "http://dbpedia.org/ontology/");
$dxpath->registerNamespace('rdfs', "http://www.w3.org/2000/01/rdf-schema#");
$dxpath->registerNamespace('ns7', "http://live.dbpedia.org/ontology/");
$xml = '<?xml version="1.0" encoding="utf-8"?><eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">';
/************ CONTROL ************/
$xml .= '<control>';
$xml .= '<recordId>' . strtolower($id) . '</recordId>';
//get viaf RDF, if it exists
$viafId = '';
$viafIds = $dxpath->query('//dbpprop:viaf');
foreach ($viafIds as $node){
$viafId = $node->nodeValue;
}
if (strlen($viafId) > 0){
$viafRDF = new DOMDocument();
$viafRDF->load('http://viaf.org/viaf/' . $viafId . '/rdf.xml');
$vxpath = new DOMXPath($viafRDF);
$vxpath->registerNamespace('rdf', "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
$vxpath->registerNamespace('owl', "http://www.w3.org/2002/07/owl#");
$vxpath->registerNamespace('schema', "http://schema.org/");
}
$xml .= '<maintenanceAgency><agencyName>Agency Name</agencyName></maintenanceAgency>';
$xml .= '<maintenanceHistory><maintenanceEvent><eventType>created</eventType><eventDateTime standardDateTime="' . date(DATE_W3C) . '"/><agentType>machine</agentType><agent>xEAC dbpedia PHP</agent></maintenanceEvent></maintenanceHistory>';
$xml .= '<conventionDeclaration><abbreviation>WIKIPEDIA</abbreviation><citation>Wikipedia/DBpedia</citation></conventionDeclaration>';
$xml .= '<sources><source xlink:type="simple" xlink:href="' . $resource . '"/>';
if (strlen($viafId) > 0){
$xml .= '<source xlink:type="simple" xlink:href="http://viaf.org/viaf/' . $viafId . '/"/>';
}
$xml .= '</sources>';
$xml .= '</control><cpfDescription>';
/************ IDENTITY ************/
$xml .= '<identity>';
//gather entityType
$types = $dxpath->query('//rdf:type[rdf:resource="http://xmlns.com/foaf/0.1/Person"]');
if (count($types) > 0){
$xml .= '<entityType>person</entityType>';
} else {
$xml .= '<entityType>family</entityType>';
}
//other entityIDs
$xml .= '<entityId>' . $resource . '</entityId>';
//get other records from VIAF
if (strlen($viafId) > 0){
foreach ($vxpath->query("//rdf:Description[rdf:type/@rdf:resource='http://xmlns.com/foaf/0.1/Person']/schema:sameAs") as $ele){
if (!strstr($ele->getAttribute('rdf:resource'), 'dbpedia')){
$xml .= '<entityId>' . $ele->getAttribute('rdf:resource') . '</entityId>';
}
}
}
foreach ($dxpath->query('//rdfs:label') as $ele){
$xml .= '<nameEntry xml:lang="' . $ele->getAttribute('xml:lang') . '"><part>' . $ele->nodeValue . '</part>';
//set English as preferred label, otherwise alternative
if ($ele->getAttribute('xml:lang') == $lang){
$xml .= '<preferredForm>WIKIPEDIA</preferredForm>';
} else {
$xml .= '<alternativeForm>WIKIPEDIA</alternativeForm>';
}
$xml .= '</nameEntry>';
}
$xml .= '</identity>';
/************ DESCRIPTION ************/
$xml .= '<description>';
//get dbpedia abstract -> eac:biogHist
$abstracts = $dxpath->query("//dbpedia-owl:abstract");
$abstract = getLabel($abstracts, $lang);
$xml .= '<biogHist><abstract xml:lang="' . $abstract['lang'] . '" localType="wikipedia">' . $abstract['label'] . '</abstract></biogHist>';
//existDates, get from VIAF by default, if available
if (strlen($viafId) > 0){
$xml .= getExistDates($vxpath->query('//schema:birthDate')->item(0)->nodeValue, $vxpath->query('//schema:deathDate')->item(0)->nodeValue);
} else {
//else get from dbpedia (inconsistent)
$startDates = $dxpath->query('//*[local-name()="birthDate"][@rdf:datatype="http://www.w3.org/2001/XMLSchema#date"]');
$endDates = $dxpath->query('//*[local-name()="deathDate"][@rdf:datatype="http://www.w3.org/2001/XMLSchema#date"]');
if (count($startDates) > 0 && count($endDates) > 0){
if (strlen($startDates->item(0)->nodeValue) > 0 && strlen($endDates->item(0)->nodeValue) > 0){
$gStart = $startDates->item(0)->nodeValue;
$gEnd = $endDates->item(0)->nodeValue;
$xml .= '<existDates><dateRange>';
$xml .= '<fromDate standardDate="' . $gStart . '">' . getDateTextual($gStart) . '</fromDate>';
$xml .= '<toDate standardDate="' . $gEnd . '">' . getDateTextual($gEnd) . '</toDate>';
$xml .= '</dateRange></existDates>';
}
}
}
//get birth and death places
if ($options['birth/death places'] == true){
$bdPlaces = $dxpath->query('descendant::dbpedia-owl:birthPlace|descendant::dbpedia-owl:deathPlace');
foreach ($bdPlaces as $place){
$localname = $place->localName;
$url = $place->getAttribute('rdf:resource');
//get label
$tempId = substr(strstr($url, 'resource/'), strpos(strstr($url, 'resource/'), '/') + 1);
if (!array_key_exists($url, $identityArray)) {
$labels = loadTempRDF($tempId);
$label = getLabel($labels, $lang);
$name = $label['label'];
} else {
$name = $identityArray[$url];
}
$xml .= '<place><placeEntry localVocabulary="' . $url . '">' . $name . '</placeEntry>';
//set placeRole
if ($localname == 'birthPlace'){
$xml .= '<placeRole>Place of Birth</placeRole>';
} else {
$xml .= '<placeRole>Place of Death</placeRole>';
}
//add date for birth or death, if available
if (strlen($viafId) > 0){
$query = ($localname == 'birthPlace') ? '//schema:birthDate' : '//schema:deathDate';
$gDate = normalizeDate($vxpath->query($query)->item(0)->nodeValue);
$xml .= '<date standardDate="' . $gDate . '">' . getDateTextual($gDate) . '</date>';
} else {
if (count($startDates) > 0 && count($endDates) > 0){
if (strlen($startDates->item(0)->nodeValue) > 0 && strlen($endDates->item(0)->nodeValue) > 0){
$gDate = ($localname == 'birthPlace') ? $startDates->item(0)->nodeValue : $endDates->item(0)->nodeValue;
$xml .= '<date standardDate="' . $gDate . '">' . getDateTextual($gDate) . '</date>';
}
}
}
$xml .= '</place>';
}
}
//get occupations
if ($options['occupations'] == true){
$occupations = $dxpath->query('descendant::rdf:Description[@rdf:about="' . $resource . '"]/dbpedia-owl:occupation');
foreach ($occupations as $occupation){
$url = $occupation->getAttribute('rdf:resource');
$tempId = substr(strstr($url, 'resource/'), strpos(strstr($url, 'resource/'), '/') + 1);
if (!array_key_exists($url, $identityArray)) {
$labels = loadTempRDF($tempId);
$label = getLabel($labels, $lang);
$name = $label['label'];
} else {
$name = $identityArray[$url];
}
$xml .= '<occupation>';
$xml .= '<term vocabularySource="' . $url . '">' . $name . '</term>';
$xml .= '</occupation>';
}
}
//get subjects
if ($options['subjects'] == true){
$subjects = $dxpath->query('descendant::rdf:Description[@rdf:about="' . $resource . '"]/dcterms:subject');
foreach ($subjects as $subject){
$url = $subject->getAttribute('rdf:resource');
$tempId = substr(strstr($url, 'resource/'), strpos(strstr($url, 'resource/'), '/') + 1);
if (!array_key_exists($url, $identityArray)) {
$labels = loadTempRDF($tempId);
$label = getLabel($labels, $lang);
$name = $label['label'];
} else {
$name = $identityArray[$url];
}
$xml .= '<localDescription localType="subject">';
$xml .= '<term vocabularySource="' . $url . '">' . $name . '</term>';
$xml .= '</localDescription>';
}
}
$xml .= '</description>';
/************ RELATIONS ************/
$xml .= getRelations($dxpath, $resource, $end, $lang, $options);
//close EAC-CPF
$xml .= '</cpfDescription></eac-cpf>';
return $xml;
}
function loadTempRDF ($tempId){
$tempRDF = new DOMDocument();
$tempRDF->load('http://dbpedia.org/data/' . $tempId . '.rdf');
$txpath = new DOMXpath($tempRDF);
$txpath->registerNamespace('rdfs', "http://www.w3.org/2000/01/rdf-schema#");
$labels = $txpath->query('//rdfs:label');
return $labels;
}
function getLabel($elements, $lang){
$array = array();
foreach ($elements as $element){
$array[$element->getAttribute('xml:lang')] = $element->nodeValue;
}
if (array_key_exists($lang, $array)) {
return array('lang'=>$lang, 'label'=>$array[$lang]);
} else {
return array('lang'=>'en', 'label'=>$array['en']);
}
}
function getRelations($dxpath, $resource, $end, $lang, $options){
GLOBAL $identityArray;
$xml = '<relations>';
/*********** CPF RELATIONS *************/
$nodeSets = array();
//populate $nodeSets for xquery with $options
if ($options['dynasties'] == true){
$nodeSets[] = 'descendant::dbpprop:dynasty[@rdf:resource]';
$nodeSets[] = 'descendant::dbpprop:house[@rdf:resource]';
$nodeSets[] = 'descendant::dbpprop:royalHouse[@rdf:resource]';
}
if ($options['children/parents'] == true){
//$nodeSets[] = 'descendant::dbpedia-owl:parent';
$nodeSets[] = 'descendant::dbpprop:mother';
$nodeSets[] = 'descendant::dbpprop:father';
}
if ($options['successors/predecessors'] == true){
$nodeSets[] = 'descendant::dbpedia-owl:successor';
$nodeSets[] = 'descendant::dbpedia-owl:predecessor';
}
if ($options['spouses'] == true){
//get spouses from dbpprop:spouse and dbpedia-owl:spouse, but avoid duplication
$nodeSets[] = 'descendant::*[local-name()="spouse"][not(@rdf:resource = preceding-sibling::*[local-name()="spouse"]/@rdf:resource)]';
}
if ($options['influences'] == true){
$nodeSets[] = 'descendant::dbpedia-owl:influenced';
$nodeSets[] = 'descendant::dbpedia-owl:influencedBy';
}
//attributes within the the resources main rdf:Description
$mainAttr = array('house', 'dynasty', 'royalHouse','mother','father','parent','spouse','predecessor','successor','influenced');
//execute xquery if $nodeSets has at least one item
if (count($nodeSets) > 0){
$query = implode('|', $nodeSets);
$relations = $dxpath->query($query);
//populate array for cpfRelations
$relationArray = array();
foreach ($relations as $relation){
$localname = $relation->localName;
if ($relation->parentNode->getAttribute('rdf:about') == $resource && in_array($localname, $mainAttr)){
switch ($localname){
case 'house':
$xlink = array('arcrole'=>'belongsToDynasty', 'role'=>'arch:Family');
break;
case 'dynasty':
$xlink = array('arcrole'=>'belongsToDynasty', 'role'=>'arch:Family');
break;
case 'royalHouse':
$xlink = array('arcrole'=>'belongsToDynasty', 'role'=>'arch:Family');
break;
case 'parent':
$xlink = array('arcrole'=>'childOf', 'role'=>'foaf:Person');
break;
case 'mother':
$xlink = array('arcrole'=>'childOf', 'role'=>'foaf:Person');
break;
case 'father':
$xlink = array('arcrole'=>'childOf', 'role'=>'foaf:Person');
break;
case 'spouse':
$xlink = array('arcrole'=>'spouseOf', 'role'=>'foaf:Person');
break;
case 'successor':
$xlink = array('arcrole'=>'predecessorOf', 'role'=>'foaf:Person');
break;
case 'predecessor':
$xlink = array('arcrole'=>'successorOf', 'role'=>'foaf:Person');
break;
case 'influenced':
$xlink = array('arcrole'=>'influenced', 'role'=>'foaf:Person');
break;
case 'influencedBy':
$xlink = array('arcrole'=>'influencedBy', 'role'=>'foaf:Person');
break;
default:
$xlink = array('arcrole'=>'NULL1', 'role'=>'foaf:Person');
break;
}
$url = $relation->getAttribute('rdf:resource');
if ($xlink['arcrole'] != 'NULL1'){
//get label
$tempId = substr(strstr($url, 'resource/'), strpos(strstr($url, 'resource/'), '/') + 1);
if (strlen($url) > 0){
if (!array_key_exists($url, $identityArray)) {
$labels = loadTempRDF($tempId);
$label = getLabel($labels, $lang);
$name = $label['label'];
//don't add the $end dbpedia resource into $identityArray
if ($resource != $end){
$identityArray[$url] = $name;
}
} else {
$name = $identityArray[$url];
}
} else {
$name = $relation->nodeValue;
}
//get href, whether internal or external
if ($options['internal'] == true){
$href = strtolower($tempId);
} else {
$href = $url;
}
$relationArray[] = array('href'=>$href, 'label'=>$name, 'arcrole'=>$xlink['arcrole'], 'role'=>$xlink['role']);
}
} elseif ($relation->parentNode->getAttribute('rdf:about') != $resource) {
switch ($localname){
case 'parent':
$xlink = array('arcrole'=>'parentOf', 'role'=>'foaf:Person');
break;
case 'house':
$xlink = array('arcrole'=>'dynastyOf', 'role'=>'foaf:Person');
break;
case 'dynasty':
$xlink = array('arcrole'=>'dynastyOf', 'role'=>'foaf:Person');
break;
case 'royalHouse':
$xlink = array('arcrole'=>'dynastyOf', 'role'=>'foaf:Person');
break;
case 'influenced':
$xlink = array('arcrole'=>'influencedBy', 'role'=>'foaf:Person');
break;
default:
$xlink = array('arcrole'=>'NULL2', 'role'=>'foaf:Person');
break;
}
$url = $relation->parentNode->getAttribute('rdf:about');
//get label
if ($xlink['arcrole'] != 'NULL2'){
$tempId = substr(strstr($url, 'resource/'), strpos(strstr($url, 'resource/'), '/') + 1);
if (strlen($url) > 0){
if (!array_key_exists($url, $identityArray)) {
$labels = loadTempRDF($tempId);
$label = getLabel($labels, $lang);
$name = $label['label'];
//don't add the $end dbpedia resource into $identityArray
if ($resource != $end){
$identityArray[$url] = $name;
}
} else {
$name = $identityArray[$url];
}
} else {
$name = $relation->nodeValue;
}
//get href, whether internal or external
if ($options['internal'] == true){
$href = strtolower($tempId);
} else {
$href = $url;
}
$relationArray[] = array('href'=>$href, 'label'=>$name, 'arcrole'=>$xlink['arcrole'], 'role'=>$xlink['role']);
}
}
}
}
/*** GENERATE CPFRELATION ELEMENTS ***/
foreach ($relationArray as $rel){
$xml .= '<cpfRelation xlink:type="simple"' . (strlen($rel['href']) > 0 ? ' xlink:href="' . $rel['href'] . '"' : '') . ' xlink:role="' . $rel['role'] . '" xlink:arcrole="' . $rel['arcrole'] . '">';
$xml .= '<relationEntry>' . $rel['label'] . '</relationEntry>';
$xml .= '</cpfRelation>';
}
/*********** END CPF RELATIONS *************/
//get thumbnail resource relation
if ($options['thumbnail'] == true){
$thumbnails = $dxpath->query("//*[local-name()='thumbnail']");
foreach ($thumbnails as $thumbnail){
if ($thumbnail->hasAttribute('rdf:resource') === TRUE){
$xml .= '<resourceRelation xlink:type="simple" xlink:href="' . $thumbnail->getAttribute('rdf:resource') . '" xlink:role="portrait">';
$xml .= '<relationEntry>Thumbnail</relationEntry>';
$xml .= '</resourceRelation>';
}
}
}
//get resource relatons from dbpedia
if ($options['resourceRelations'] == true){
$resources = $dxpath->query('descendant::dbpedia-owl:wikiPageExternalLink');
foreach($resources as $resource){
$xml .= '<resourceRelation xlink:type="simple" xlink:href="' . str_replace('&', '&amp;', $resource->getAttribute('rdf:resource')) . '" xlink:role="subjectOf">';
$xml .= '<relationEntry>' . str_replace('&', '&amp;', $resource->getAttribute('rdf:resource')) . '</relationEntry>';
$xml .= '</resourceRelation>';
}
}
$xml .= '</relations>';
//return $xml
return $xml;
}
function getExistDates($startDate, $endDate){
$gDateStart = normalizeDate($startDate);
$gDateEnd = normalizeDate($endDate);
$xml = '<existDates><dateRange>';
$xml .= '<fromDate standardDate="' . $gDateStart . '">' . getDateTextual($gDateStart) . '</fromDate>';
$xml .= '<toDate standardDate="' . $gDateEnd . '">' . getDateTextual($gDateEnd) . '</toDate>';
$xml .= '</dateRange></existDates>';
return $xml;
}
function normalizeDate($date){
$gDate = '';
if (strpos($date, '-') !== FALSE){
$segs = explode('-', $date);
if (strpos($date, '-') == 0){
//if BC
$gDate .= '-';
foreach ($segs as $k=>$v){
if ($k == 1){
$gDate .= str_pad((int) $v,4,"0",STR_PAD_LEFT);
} elseif ($k > 1) {
$gDate .= '-' . $v;
}
}
} else {
//if AD
foreach ($segs as $k=>$v){
if ($k == 0){
$gDate .= str_pad((int) $v,4,"0",STR_PAD_LEFT);
} else {
$gDate .= '-' . $v;
}
}
}
} else {
$gDate .= str_pad((int) $date,4,"0",STR_PAD_LEFT);
}
return $gDate;
}
function getDateTextual ($date){
$textDate = '';
if (strpos($date, '-') !== FALSE){
$segs = explode('-', $date);
if (strpos($date, '-') == 0){
//if BC
//day
if (strlen($segs[3]) > 0){
$textDate .= $segs[3] . ' ';
}
//month
if (strlen($segs[2]) > 0){
$textDate .= normalizeMonth($segs[2]) . ' ';
}
//year
$textDate .= (int) $segs[1] . ' B.C.';
} else {
//if AD
//day
if (strlen($segs[2]) > 0){
$textDate .= $segs[2] . ' ';
}
//month
if (strlen($segs[1]) > 0){
$textDate .= normalizeMonth($segs[1]) . ' ';
}
//year
$textDate .= ((int) $segs[0] < 400 ? 'A.D. ' : '') . (int) $segs[0];
}
} else {
$textDate = ((int) $date < 400 ? 'A.D. ' : '') . (int) $date;
}
return $textDate;
}
function normalizeMonth($month){
$monthName = '';
switch ((int) $month){
case 1:
$monthName = 'January';
break;
case 2:
$monthName = 'February';
break;
case 3:
$monthName = 'March';
break;
case 4:
$monthName = 'April';
break;
case 5:
$monthName = 'May';
break;
case 6:
$monthName = 'June';
break;
case 7:
$monthName = 'July';
break;
case 8:
$monthName = 'August';
break;
case 9:
$monthName = 'September';
break;
case 10:
$monthName = 'October';
break;
case 11:
$monthName = 'November';
break;
case 12:
$monthName = 'December';
break;
}
return $monthName;
}
function save_file($xml, $id, $fileName){
$dom = new DOMDocument;
$dom->preserveWhiteSpace = FALSE;
$dom->loadXML($xml);
if ($dom->loadXML($xml) === FALSE){
echo "Failed to validate in DOMDocument: {$id}.\n";
} else {
$dom->formatOutput = TRUE;
echo "Wrote {$id}.\n";
//echo $dom->saveXML();
//comment out the line below and uncomment the block below to enable saving to eXist
$dom->save($fileName);
//save to eXist
/*if (($readFile = fopen($fileName, 'r')) === FALSE){
echo "Failed to read file: {$id}.\n";
} else {
echo "Wrote {$id}.\n";
//PUT xml to eXist
$putToExist=curl_init();
//set curl opts
curl_setopt($putToExist,CURLOPT_URL,'http://localhost:8080/exist/rest/db/xeac/records/' . $id . '.xml');
curl_setopt($putToExist,CURLOPT_HTTPHEADER, array("Content-Type: text/xml; charset=utf-8"));
curl_setopt($putToExist,CURLOPT_CONNECTTIMEOUT,2);
curl_setopt($putToExist,CURLOPT_RETURNTRANSFER,1);
curl_setopt($putToExist,CURLOPT_PUT,1);
curl_setopt($putToExist,CURLOPT_INFILESIZE,filesize($fileName));
curl_setopt($putToExist,CURLOPT_INFILE,$readFile);
curl_setopt($putToExist,CURLOPT_USERPWD,"admin:");
$response = curl_exec($putToExist);
$http_code = curl_getinfo($putToExist,CURLINFO_HTTP_CODE);
//close eXist curl
curl_close($putToExist);
}*/
}
}
?>
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.