In [5]:
%%python

import pyspark

from tools.datasets import *

# Base URL of Wikidata's Special:EntityData endpoint. The "www." host is used
# so that every URL below is derived from one consistent prefix.
wikidata_prefix = "https://www.wikidata.org/wiki/Special:EntityData/"
# Query string this notebook uses to request the "truthy" variant of an entity.
truthy_suffix = "?flavor=simple"

# Q42: full entity dump.
douglasadams_full_url = wikidata_prefix + "Q42.json"
douglasadams_full_filename = "Q42.json"

# Q42: truthy variant of the same entity.
douglasadams_truthy_url = douglasadams_full_url + truthy_suffix
douglasadams_truthy_filename = "Q42_truthy.json"

# Q4093: truthy variant (the suffix was previously missing, so despite its
# name this URL pointed at the full dump).
glasgow_truthy_url = wikidata_prefix + "Q4093.json" + truthy_suffix

# fetch_dataset comes from tools.datasets; judging by the recorded output it
# skips files that were already downloaded.
fetch_dataset(douglasadams_full_url, douglasadams_full_filename)
fetch_dataset(douglasadams_truthy_url, douglasadams_truthy_filename)

File Q42.json was already downloaded. Skipping...
Dataset not available, downloading from https://www.wikidata.org/wiki/Special:EntityData/Q42.json?flavor=simple
Done


In [3]:
// An experiment with GraphX to generate N-triples
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// Namespace prepended to every vertex name, edge label and predicate to form a URI.
val baseURI = "http://snee.com/xpropgraph#"
val sc = spark.sparkContext

// Vertices: (vertex id, (name, role)).
val users: RDD[(VertexId, (String, String))] =
    sc.parallelize(Array(
        (3L, ("rxin", "student")),
        (7L, ("jgonzal", "postdoc"))
    ))

// Edges: Edge(source id, destination id, label).
val relationships: RDD[Edge[String]] =
    sc.parallelize(Array(
        Edge(3L, 7L, "collab")
    ))

val graph = Graph(users, relationships)

// One N-Triples line per edge: <source name> <edge label> <destination name> .
// The lines are formatted inside map() and then collect()-ed, so println runs
// on the driver. RDD.foreach(println) would print on the executors, and the
// output would only reach the notebook when Spark runs in local mode.
graph.triplets
    .map(t => s"<$baseURI${t.srcAttr._1}> <$baseURI${t.attr}> <$baseURI${t.dstAttr._1}> .")
    .collect()
    .foreach(println)

// One N-Triples line per vertex: <name> <role> "role literal" .
users
    .map { case (_, (name, role)) => s"""<$baseURI${name}> <${baseURI}role> "${role}" .""" }
    .collect()
    .foreach(println)

<http://snee.com/xpropgraph#rxin> <http://snee.com/xpropgraph#collab> <http://snee.com/xpropgraph#jgonzal> .
<http://snee.com/xpropgraph#rxin> <http://snee.com/xpropgraph#role> "student" .
<http://snee.com/xpropgraph#jgonzal> <http://snee.com/xpropgraph#role> "postdoc" .


import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
baseURI: String = http://snee.com/xpropgraph#
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@5684fa80
users: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, (String, String))] = ParallelCollectionRDD[0] at parallelize at <console>:35
relationships: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = ParallelCollectionRDD[1] at parallelize at <console>:41
graph: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@7db307b4


In [19]:
import spark.implicits._

// Read the truthy Q42 entity file into a DataFrame; Spark infers the (deeply
// nested) schema from the JSON content.
val wikidata_df = spark.read.format("json").load("data/Q42_truthy.json")

import spark.implicits._
wikidata_df: org.apache.spark.sql.DataFrame = [entities: struct<Q42: struct<aliases: struct<ar: array<struct<language:string,value:string>>, be-tarask: array<struct<language:string,value:string>> ... 38 more fields>, claims: struct<P1005: array<struct<id:string,mainsnak:struct<datatype:string,datavalue:struct<type:string,value:string>,property:string,snaktype:string>,rank:string,references:array<struct<hash:string,snaks:struct<P214:array<struct<datatype:string,datavalue:struct<type:string,value:string>,property:string,snaktype:string>>,P248:array<struct<datatype:string,datavalue:struct<type:string,value:struct<entity-type:string,id:string,numeric-id:bigint>>,property:string,snaktype:string>>,P813:array<struct<datatype:string,datavalue:struct<type:string,value:st...

## Wikidata schema structure

(For JSON only)

entities -> Map of entity ID (e.g. Q42) -> Entity

Entity -> labels (the "proper" name of the entity, per language), aliases (alternative names, e.g. a pen name or synonym, per language), descriptions (abstract, definition etc., per language), claims (dictionary of properties) and sitelinks. Sitelinks are references to other websites (mainly within the Wikimedia/Wikipedia universe, prefixed by a locale identifier).

Statements are denoted as: String -> Array of properties. The String is usually a Wikidata property ID (starting with P). The "object" part of a triple is denoted by the "mainsnak". A property can contain zero or more snaks. The value of a snak goes under "datavalue", whose format may vary quite wildly. See https://www.mediawiki.org/wiki/Wikibase/DataModel/JSON#Snaks for an overview.


In [18]:
import org.apache.spark.sql.types._

// Print the inferred schema as a tree. The original `df.select()` referenced
// `df`, which no visible cell defines (the frame loaded earlier is
// `wikidata_df`), and an empty select() does not print a schema.
wikidata_df.printSchema()

root
 |-- entities: struct (nullable = true)
 |    |-- Q42: struct (nullable = true)
 |    |    |-- aliases: struct (nullable = true)
 |    |    |    |-- ar: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- language: string (nullable = true)
 |    |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- be-tarask: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- language: string (nullable = true)
 |    |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- bho: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- language: string (nullable = true)
 |    |    |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- ca: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- lan