# Explore MongoDB

In [24]:
import pymongo
from pymongo import MongoClient
from bson.son import SON
from bson.objectid import ObjectId
import rdflib

In [3]:
# Open a client with the driver's default connection settings and select
# the "lobbyradar" database by name.
client = MongoClient()
db = client['lobbyradar']

# Handles for the two collections explored in this notebook.
Entities = db['entities']
Relations = db['relations']

# Query documents that select by the "type" field, and one cursor per filter.
person_filter = dict(type='person')
entity_filter = dict(type='entity')
persons = Entities.find(person_filter)
entities = Entities.find(entity_filter)

__Entity-Typen__
werden alle zu einer `rdfs:Class`

In [4]:
# List every distinct value of the "type" field in the entities collection.
Entities.distinct('type')

[u'Bundesdatenschutzbeauftragte',
 u'Hausausweise',
 u'Mitglied',
 u'Position',
 u'Tochterfirma',
 u'Vorsitzender',
 u'activity',
 u'association',
 u'business',
 u'committee',
 u'consulting',
 u'donation',
 u'ececutive',
 u'entity',
 u'executive',
 u'government',
 u'lobbyist',
 u'member',
 u'mitglied',
 u'person',
 u'position',
 u'publication',
 u'sponsoring',
 u'subisdiary',
 u'subsidiary']

__Relationship-Typen__
werden alle zu einem `rdf:Property`

In [5]:
# Print each distinct value of the "type" field in the relations collection,
# one per line. The collection is queried exactly once.
# print() with a single argument behaves the same on Python 2 and Python 3.
for row in Relations.distinct('type'):
    print(row)

Bundesdatenschutzbeauftragte
Hausausweise
Mitglied
Position
Tochterfirma
Vorsitzender
activity
association
business
committee
consulting
donation
ececutive
executive
government
lobbyist
member
mitglied
position
publication
sponsoring
subisdiary
subsidiary


__Importer__

In [6]:
# Count entity documents per "importer" value, most frequent first.
pipeline = [
    {"$group": {"_id": "$importer", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
]
for entity in Entities.aggregate(pipeline):
    # Each result row carries the grouped value in "_id" and its tally in
    # "count". print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (entity['_id'], entity['count']))

lobbyliste (32668)
bundestag (7398)
parteispenden (6372)
pr (4001)
None (2132)
dax (1213)
laender (905)
parteispenden13 (670)
db.clean (629)
seitenwechsler (569)
kabinette (518)
thinktanks (485)
parteispenden14 (15)
parteien (1)


# Organisationen
__Beispiel Organisation__

In [7]:
# Query all documents of type "entity" and show the first one the cursor
# yields as an example organisation.
entities = Entities.find(dict(type='entity'))
next(entities)

{u'_id': ObjectId('54d50d2fd2257f9a581e2c32'),
 u'aliases': [],
 u'created': datetime.datetime(2015, 2, 6, 18, 51, 27, 527000),
 u'data': [],
 u'importer': None,
 u'name': u'SPD Vorpommern',
 u'search': [u'spd vorpommern'],
 u'slug': u'spd vorpommern',
 u'tags': [u'spd'],
 u'type': u'entity',
 u'updated': datetime.datetime(2015, 2, 19, 17, 43, 54, 49000)}

__Datentypen für Organisationen__

- source/Quelle => dc:source
- address/Adresse => vcard:ADR
- description/Beschreibung => dc:description
* members/Anzahl der Mitglieder (2118)
* organisations/Anzahl der Mitgliedsorganisationen (953)
- topic/Thema => foaf:topic
* url/URL (199)
* finance/Finanzierung (151)
* staff/Anzahl der Mitarbeiter (126)
* legalform/Rechtsform (28)
- link/Link => vcard:RELATED
* partei/Partei => org:Organization => own:Party
* begin/Von (5) => dc:date => own:from
* end/Bis (4) => dc:date => own:to
- www/Webseite => foaf:homepage

In [8]:
# For documents of type "entity": flatten the "data" array and count how
# often each "<key>/<desc>" combination occurs, most frequent first.
pipeline = [
    {"$match": {"type": "entity"}},
    {"$unwind": "$data"},
    {"$group": {
        "_id": {"$concat": ["$data.key", "/", "$data.desc"]},
        "count": {"$sum": 1},
    }},
    {"$sort": {"count": -1}},
]
for entity in Entities.aggregate(pipeline):
    # print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (entity['_id'], entity['count']))

source/Quelle (7495)
address/Adresse (6547)
description/Beschreibungstext (2370)
members/Anzahl der Mitglieder (2118)
organisations/Anzahl der Mitgliedsorganisationen (953)
topic/Thema (684)
url/URL (199)
finance/Finanzierung (151)
staff/Anzahl der Mitarbeiter (126)
legalform/Rechtsform (28)
link/Link (28)
verified/Verifiziert (26)
displayname/Anzeigename (24)
partei/Partei (20)
begin/Von (5)
end/Bis (4)
description/Beschreibung (3)
www/Webseite (2)


# Personen
__Beispiel Person__

In [26]:
# Look up a single document by its ObjectId and show it.
# NOTE(review): the recorded output for this id has type u'entity'
# (name u'Bundeskanzleramt'), not 'person' - confirm that this is the
# intended example for the "Personen" section.
example_id = ObjectId('552ff9cdaf9ee96e1c1df7c5')
persons = Entities.find({'_id': example_id})
next(persons)

{u'_id': ObjectId('552ff9cdaf9ee96e1c1df7c5'),
 u'aliases': [u'Bundeskanzleramt'],
 u'created': datetime.datetime(2015, 4, 16, 18, 5, 0, 25000),
 u'data': [{u'auto': True,
   u'created': datetime.datetime(2015, 4, 16, 18, 5, 2, 136000),
   u'desc': u'Quelle',
   u'format': u'link',
   u'id': u'868b0075130ff4b99d8c9681e9d3f69decb42891fcf5e607f3a6434c7e734a1a',
   u'key': u'source',
   u'updated': datetime.datetime(2015, 4, 16, 18, 5, 2, 136000),
   u'value': {u'remark': u'created by seitenwechsler importer',
    u'url': u'https://lobbypedia.de/wiki/Seitenwechsler_in_Deutschland_im_%C3%9Cberblick'}}],
 u'importer': u'seitenwechsler',
 u'name': u'Bundeskanzleramt',
 u'search': [u'bundeskanzleramt'],
 u'slug': u'bundeskanzleramt',
 u'tags': [u'seitenwechsler', u'spd', u'cdu'],
 u'type': u'entity',
 u'updated': datetime.datetime(2015, 4, 16, 18, 5, 5, 664000)}

__Datentypen für Personen__

- source/Quelle => dc:source
- titles/Titel => dc:title
- address/Adresse => vcard:ADR
- link/Link => vcard:RELATED
- surname/Nachname => foaf:familyName
- names/Vornamen => foaf:givenName
- photo/Foto => foaf:Image
* bundesland/Bundesland (629) 
* btcertuid/Benutzename BT-Cert (608)
* wahlkreis/Wahlkreis (524)
* landesliste/Landesliste (194) -> container with vcard:country ?
* partei/Partei (109)
- description/Beschreibung => dc:description
* url/URL (6)
- title/Titel => dc:title
* listenplatz/Listenplatz (1)
- www/Webseite => foaf:homepage

In [10]:
# For documents of type "person": flatten the "data" array and count how
# often each "<key>/<desc>" combination occurs, most frequent first.
pipeline = [
    {"$match": {"type": "person"}},
    {"$unwind": "$data"},
    {"$group": {
        "_id": {"$concat": ["$data.key", "/", "$data.desc"]},
        "count": {"$sum": 1},
    }},
    {"$sort": {"count": -1}},
]
for entity in Entities.aggregate(pipeline):
    # print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (entity['_id'], entity['count']))

source/Quelle (20338)
titles/Titel (14560)
address/Adresse (6165)
link/Link (5399)
surname/Nachname (3444)
names/Vornamen (3442)
photo/Foto (2433)
bundesland/Bundesland (629)
btcertuid/Benutzename BT-Cert (608)
wahlkreis/Wahlkreis (524)
landesliste/Landesliste (194)
partei/Partei (109)
verified/Verifiziert (22)
displayname/Anzeigename (20)
description/Beschreibung (10)
url/URL (6)
title/Titel (5)
listenplatz/Listenplatz (1)
www/Webseite (1)


# Relations

- Mitglied/member/mitglied => org:member
- executive => org:roleProperty => own:executiveOf (used with org:memberOf)

In [27]:
# Tally the "position" values attached to relations of type "subsidiary",
# most frequent first.
pipeline = [
    # Filter on the document-level "type" field first so that only matching
    # relations have their "data" array unwound.
    {"$match": {"type": "subsidiary"}},
    {"$unwind": "$data"},
    # After unwinding, keep only the data items whose key is "position".
    {"$match": {"data.key": "position"}},
    {"$group": {"_id": "$data.value", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
]
for entity in Relations.aggregate(pipeline):
    # print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (entity['_id'], entity['count']))

 (26)
Anteilseigentümer (2)
Hundertprozentige Tochtergesellschaft der Bertelsmann-Gruppe (1)
Untersteht dem Bundeskanzleramt (1)
Die Bundeswehr untersteht dem Bundesminister der Verteidigung (1)
Swiss Life Select Deutschland GmbH ist hervorgegangen aus dem AWD (1)
Medienholding der SPD (1)
Tochtergesellschaft der Nürnberger Versicherungsgruppe (1)
Tochterunternehmen von Omnicom (1)
Tochtergesellschaft (1)
Tochtergesellschaft der Deutschen Post (1)
Die Schott AG ist zu 100% in die ADLT-Gruppe eingegliedert (1)
Tochterunternehmen (1)


In [104]:
# Count relation documents per "type" value, most frequent first.
pipeline = [
    {"$group": {"_id": "$type", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
]
for rel in Relations.aggregate(pipeline):
    # print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (rel['_id'], rel['count']))

executive (17018)
member (4291)
donation (3820)
activity (2664)
business (1454)
position (1157)
government (580)
Mitglied (169)
association (64)
subsidiary (39)
Position (27)
committee (26)
Hausausweise (19)
mitglied (16)
consulting (5)
Tochterfirma (4)
sponsoring (3)
subisdiary (2)
ececutive (2)
Bundesdatenschutzbeauftragte (1)
Vorsitzender (1)
lobbyist (1)
publication (1)


In [31]:
# Cursor over all relation documents whose type is "government".
relations = Relations.find(dict(type='government'))

In [34]:
# Show the next document from the cursor; every execution of this cell
# advances the cursor by one document.
next(relations)

{u'_id': ObjectId('5501cb466e8aab706857b0c1'),
 u'created': datetime.datetime(2015, 3, 12, 17, 22, 14, 11000),
 u'data': [{u'auto': True,
   u'created': datetime.datetime(2015, 3, 12, 17, 22, 14, 11000),
   u'desc': u'Position',
   u'format': u'string',
   u'id': u'fab2cb74049249ba75a1f0f7cb0243f03b057aca062352fd1e2d7a4d46c8e5be',
   u'key': u'position',
   u'updated': datetime.datetime(2015, 3, 12, 17, 22, 14, 11000),
   u'value': u'Staatsminister'},
  {u'auto': True,
   u'created': datetime.datetime(2015, 3, 12, 17, 22, 14, 11000),
   u'desc': u'Von',
   u'format': u'date',
   u'id': u'29d902a134d370c73438c68c72fe96222ee4bb6436239b18239245e1f4da6438',
   u'key': u'begin',
   u'updated': datetime.datetime(2015, 3, 12, 17, 22, 14, 11000),
   u'value': datetime.datetime(2013, 12, 16, 23, 0)}],
 u'entities': [ObjectId('54c71e92349d25992bca523c'),
  ObjectId('552ff9cdaf9ee96e1c1df7c5')],
 u'importer': u'kabinette',
 u'tags': [u'kabinette'],
 u'type': u'government',
 u'updated': datetime.d

- 

- source/Quelle => dc:source
- position/Position => skos:Role

In [101]:
# Across all relations: flatten the "data" array and count how often each
# "<key>/<desc>" combination occurs, most frequent first.
pipeline = [
    {"$unwind": "$data"},
    {"$group": {
        "_id": {"$concat": ["$data.key", "/", "$data.desc"]},
        "count": {"$sum": 1},
    }},
    {"$sort": {"count": -1}},
]
for rel in Relations.aggregate(pipeline):
    # print() with one argument works on Python 2 and Python 3.
    print("%s (%i)" % (rel['_id'], rel['count']))

source/Quelle (25018)
donation/Parteispende (9691)
position/Position (2735)
activity/Angaben zur Nebentätigkeit (2690)
position/Funktion im Ausschuss (2014)
begin/Von (622)
end/Ende (308)
end/Bis (247)
begin/Beginn (173)
start/Beginn (141)
issued/Ausgestellt von (18)


In [54]:
from rdflib import Namespace, BNode, Literal
from rdflib.namespace import FOAF

In [55]:
# Start from an empty graph and load the Turtle file "ontology.ttl" into it.
g = rdflib.Graph()
g.parse(source='ontology.ttl', format='turtle')

<Graph identifier=N91971e16ba084f0fbd8de4c31256c59b (<class 'rdflib.graph.Graph'>)>

In [56]:
# Create a new blank node.
node = BNode()

In [57]:
# Show the URI reference rdflib's FOAF namespace provides for "Person".
FOAF['Person']

rdflib.term.URIRef(u'http://xmlns.com/foaf/0.1/Person')

In [58]:
# Namespace object rooted at the http://example.org/ base URI.
exampleorg = Namespace('http://example.org/')

In [59]:
# Show the term "Party" resolved against the example.org namespace.
exampleorg['Party']

rdflib.term.URIRef(u'http://example.org/Party')

In [60]:
# Add one example triple to the graph: blank node, example.org "Party" term,
# literal 'Die Linke'.
# NOTE(review): "Party" sits in the predicate position here - confirm that
# is intended, or whether a dedicated property should be used instead.
party_triple = (node, exampleorg.Party, Literal('Die Linke'))
g.add(party_triple)

In [61]:
# Serialize the whole graph as Turtle and print the result.
turtle_dump = g.serialize(format='turtle')
print(turtle_dump)

@prefix : <http://example.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcam: <http://purl.org/dc/dcam/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix org: <http://www.w3.org/ns/org#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:Address a rdfs:Class ;
    rdfs:subClassOf vcard:ADR .

:Description a rdfs:Class ;
    rdfs:subClassOf dc:description .

:From a rdfs:Class ;
    rdfs:subClassOf dc:date .

:Homepage a rdfs:Class ;
    rdfs:subClassOf foaf:homepage .

:LegalForm a rdfs:Class .

:Lobbyist a rdfs:Class ;
    rdfs:subClassOf foaf:Person .

:Party a rdfs:Class ;
    rdfs:subClassOf org:Organization .

:Photo a rdfs:Class ;
    rdfs:subClassOf foaf:Image .

:Politician a rdfs:Class ;
    rdfs:subClassOf foaf:Person .

:Source

In [None]:
# Unexecuted scratch cell: a bare reference to the ObjectId name, which only
# displays the object and changes no state.
ObjectId