In [1]:
# read, parse and serialize graph to nt format
import time
import rdflib
from rdflib.graph import Graph, ConjunctiveGraph
import rdflib.namespace
from rdflib.namespace import OWL, RDF, RDFS
from rdflib import Namespace, URIRef, Literal

# Cell purpose: load the OpenCyc OWL file with rdflib and write it back out as
# N-Triples, printing progress markers and the total wall-clock time.
print ('STARTING')
start = time.time()

# Source path is relative to the notebook's working directory.
s = "OpenCyc/opencyc-latest.owl"
g = rdflib.ConjunctiveGraph()
print ('START PARSING GRAPH')
# No explicit format is passed, so rdflib chooses the parser itself.
# `g` is rebound to whatever parse() returns.
g = g.parse(s)
print ('DONE PARSING GRAPH')
print ('START SERIALIZING GRAPH')
# The .nt file written here is the path later cells read line by line.
g.serialize(destination='OpenCyc/opencyc-latest.nt', format='nt')
print ('DONE SERIALIZING GRAPH')
print ('EXECUTION TIME: {} s'.format(time.time()-start))

STARTING
START PARSING GRAPH


IOError: [Errno 2] No such file or directory: u'/Users/curtis/git/SeminarPaper/OpenCyc/opencyc-latest.owl'

In [1]:
# get top 10 classes

import os
import time
import string
import itertools
import re
import operator

# Wall-clock reference for the timing line printed at the end of this cell.
start = time.time()

# N-Triples input that is scanned line by line.
# (For a quick run, point this at 'OpenCyc/opencyc-latest_sample.nt' instead.)
readFile = 'OpenCyc/opencyc-latest.nt'

# A progress line is printed every `lineProgress` input lines.
lineProgress = 10 ** 6

# Predicate / object terms that mark a subject as an OWL class.
rdfType = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
owlClass = '<http://www.w3.org/2002/07/owl#Class>'

# Mutable state filled by addToClasses() / countClasses().
allClasses, seenClasses = set(), set()
classCountDic = dict()  # key:class, value:count

# key:class, value:count -- not referenced by the code visible in this notebook.
classCount = dict()

def getSPO(splittedLine):
    """Return the (subject, predicate, object) tokens of one split line.

    Only the first three tokens are used; anything after them (for example
    the terminating '.') is ignored.

    Args:
        splittedLine: iterable of whitespace-separated tokens of one line.

    Returns:
        A 3-tuple ``(subj, pred, obj)``. Positions for which the line has no
        token are filled with ``None``, so a blank or truncated line yields
        values that match no term instead of raising ``UnboundLocalError``.
    """
    parts = list(splittedLine)[:3]
    # Pad short lines so callers can always unpack three values.
    parts.extend([None] * (3 - len(parts)))
    subj, pred, obj = parts
    return subj, pred, obj

def addToClasses(s):
    """Record `s` in the module-level `allClasses` set (no-op if already present)."""
    # set.add already ignores duplicates, so no membership test is needed.
    allClasses.add(s)
        
def countClasses(o):
    """Increment the tally for class `o` in the module-level `classCountDic`.

    The first time `o` is seen it is registered in `seenClasses` and its
    count starts at 1; every later call adds one.
    """
    if o not in seenClasses:
        # First sighting: register the class and start its counter.
        seenClasses.add(o)
        classCountDic[o] = 1
        return
    classCountDic[o] += 1
        
print('START')
try:
    # Pass 1: collect every subject that is declared as an owl:Class.
    lineCounter = 0
    with open(readFile, 'r') as f:  # `with` closes the file even if a line fails
        for line in f:
            splittedLine = line.rstrip('\n').split()
            s, p, o = getSPO(splittedLine)
            if (p == rdfType and o == owlClass):
                addToClasses(s)
            lineCounter += 1
            if (lineCounter % lineProgress == 0):
                print ('{} lines read'.format(lineCounter))
    print ('{} classes found'.format(len(allClasses)))

    # Pass 2: count rdf:type statements whose object is one of those classes.
    lineCounter = 0
    with open(readFile, 'r') as f:
        for line in f:
            splittedLine = line.rstrip('\n').split()
            s, p, o = getSPO(splittedLine)
            if (p == rdfType and o in allClasses):
                countClasses(o)
            lineCounter += 1
            if (lineCounter % lineProgress == 0):
                print ('{} lines read'.format(lineCounter))
    print ('{} different classes found'.format(len(classCountDic)))

    # Sort once by count (descending) and keep the ten largest; later cells
    # look classes up in `top10dic`.
    top10 = sorted(classCountDic.items(), key=operator.itemgetter(1), reverse=True)[:10]
    top10dic = dict(top10)
    print (top10)
    print ('DONE')
except Exception as exc:
    # Keep the best-effort behaviour (the cell still reports its run time),
    # but show what went wrong. KeyboardInterrupt is no longer swallowed.
    print('ERROR')
    print(repr(exc))
print ('EXECUTION TIME: {} s'.format(time.time()-start))

START
1000000 lines read
2000000 lines read
116822 classes found
1000000 lines read
2000000 lines read
7594 different classes found
[('<http://sw.opencyc.org/concept/Mx4rvWXYgJwpEbGdrcN5Y29ycA>', 11743), ('<http://sw.opencyc.org/concept/Mx4rpPHhAOB1EdqAAAACs6hRXg>', 10547), ('<http://sw.opencyc.org/concept/Mx4rFGe-SEsrEd2AAADggVfs8g>', 8863), ('<http://sw.opencyc.org/concept/Mx4rvVjf5JwpEbGdrcN5Y29ycA>', 7591), ('<http://sw.opencyc.org/concept/Mx4rvVjaApwpEbGdrcN5Y29ycA>', 7483), ('<http://sw.opencyc.org/concept/Mx4rvi9EhJwpEbGdrcN5Y29ycA>', 7419), ('<http://sw.opencyc.org/concept/Mx4rHIBS0h_TEdaAAABQ2rksLw>', 7091), ('<http://sw.opencyc.org/concept/Mx4rvVjnZ5wpEbGdrcN5Y29ycA>', 5243), ('<http://sw.opencyc.org/concept/Mx4rvViq35wpEbGdrcN5Y29ycA>', 4122), ('<http://sw.opencyc.org/concept/Mx4rvVjZ_ZwpEbGdrcN5Y29ycA>', 4109)]
DONE
EXECUTION TIME: 20.9853029251 s


In [3]:
# calculate class indegree & outdegree

import operator
import time

# Counts, for each class in `top10dic`, how many lines have it as the object
# (indegree) and how many have it as the subject (outdegree).
# Relies on names created by earlier cells: readFile, lineProgress, getSPO
# and top10dic.
print ('STARTING')
start = time.time()

indegreeDic = {} # key:class, value: indegree
outdegreeDic = {} #key: class, value: outdegree

lineCounter = 0

try:
    with open(readFile, 'r') as f:  # closed automatically, also on error
        for line in f:
            splittedLine = line.rstrip('\n').split()
            s, p, o = getSPO(splittedLine)
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if s in top10dic:
                outdegreeDic[s] = outdegreeDic.get(s, 0) + 1
            if o in top10dic:
                indegreeDic[o] = indegreeDic.get(o, 0) + 1
            lineCounter += 1
            if (lineCounter % lineProgress == 0):
                print ('{} lines read'.format(lineCounter))
    print ('DONE')
    print ('INDEGREE RESULT:')
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print (sorted(indegreeDic.items(), key=operator.itemgetter(1), reverse=True))
    print ('OUTDEGREE RESULT')
    print (sorted(outdegreeDic.items(), key=operator.itemgetter(1), reverse=True))
    print ('EXECUTION TIME: {} s'.format(time.time()-start))
except Exception as exc:
    # Report the failure instead of hiding it; KeyboardInterrupt propagates.
    print ('ERROR')
    print (repr(exc))
    



STARTING
1000000 lines read
2000000 lines read
DONE
INDEGREE RESULT:
[('<http://sw.opencyc.org/concept/Mx4rvWXYgJwpEbGdrcN5Y29ycA>', 12959), ('<http://sw.opencyc.org/concept/Mx4rHIBS0h_TEdaAAABQ2rksLw>', 11057), ('<http://sw.opencyc.org/concept/Mx4rpPHhAOB1EdqAAAACs6hRXg>', 10573), ('<http://sw.opencyc.org/concept/Mx4rvVjaApwpEbGdrcN5Y29ycA>', 9873), ('<http://sw.opencyc.org/concept/Mx4rFGe-SEsrEd2AAADggVfs8g>', 8864), ('<http://sw.opencyc.org/concept/Mx4rvVjf5JwpEbGdrcN5Y29ycA>', 7761), ('<http://sw.opencyc.org/concept/Mx4rvi9EhJwpEbGdrcN5Y29ycA>', 7422), ('<http://sw.opencyc.org/concept/Mx4rvVjnZ5wpEbGdrcN5Y29ycA>', 5326), ('<http://sw.opencyc.org/concept/Mx4rvViq35wpEbGdrcN5Y29ycA>', 4582), ('<http://sw.opencyc.org/concept/Mx4rvVjZ_ZwpEbGdrcN5Y29ycA>', 4147)]
OUTDEGREE RESULT
[('<http://sw.opencyc.org/concept/Mx4rvVjaApwpEbGdrcN5Y29ycA>', 40), ('<http://sw.opencyc.org/concept/Mx4rvVjnZ5wpEbGdrcN5Y29ycA>', 33), ('<http://sw.opencyc.org/concept/Mx4rvWXYgJwpEbGdrcN5Y29ycA>', 32), ('<ht

In [9]:
# class instances indegree & outdegree

import os
import time
import string
import itertools
import re
import operator
import numpy

# Wall-clock reference point; the elapsed time is printed at the end of this cell.
start = time.time()

class ClassInstances:
    """Occurrence tally for the instances of one class.

    Each call to addInstance() counts one occurrence of an instance;
    calculateDegrees() then summarises the per-instance counts as
    min / max / average / median.

    Attributes:
        name: label printed by printResults().
        uri: class URI string returned by getURI().
        allClassInstances: set of instances counted at least once.
        countDict: instance -> number of addInstance() calls for it.
        min, max, avg, median: summary values; until an instance has been
            counted they hold 9999999, 0, 0.0 and 0.0 respectively.
    """

    # Value of `min` while there is nothing to summarise.
    _NO_DATA_MIN = 9999999

    def __init__(self, name, uri):
        self.name = name
        self.uri = uri
        self.allClassInstances = set()
        self.countDict = dict()
        self._resetStats()

    def _resetStats(self):
        """Put the summary fields back to their "no data" values."""
        self.min = self._NO_DATA_MIN
        self.max = 0
        self.avg = 0.0
        self.median = 0.0

    def getURI(self):
        """Return the class URI given at construction."""
        return self.uri

    def getClassInstances(self):
        """Return the NUMBER of distinct instances counted so far."""
        return len(self.allClassInstances)

    def addInstance(self, instance):
        """Count one more occurrence of `instance`."""
        if instance in self.allClassInstances:
            self.countDict[instance] += 1
        else:
            self.countDict[instance] = 1
            self.allClassInstances.add(instance)

    def calculateDegrees(self):
        """Recompute min, max, avg and median from the current counts.

        The summary is rebuilt from scratch on every call, so calling it more
        than once (e.g. re-running the cell) yields the same values instead of
        accumulating onto the previous average.
        """
        # .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
        counts = list(self.countDict.values())
        if not counts:
            self._resetStats()
            return
        self.min = min(counts)
        self.max = max(counts)
        self.avg = float(sum(counts)) / len(counts)
        self.median = numpy.median(numpy.array(counts))

    def printResults(self):
        """Print the label followed by one line with the summary values."""
        print ('{}'.format(self.name))
        print ('min: {}, avg: {}, median: {}, max: {}'.format(self.min, self.avg, self.median, self.max))
        
# instances sets
# iAllSet holds every instance seen (allInstances); iSet1..iSet10 hold the
# instances of one class each, in this order:
#   1 instancesObject       2 instancesObjectNoOverlap   3 instancesNoun
#   4 instancesOrganism     5 instancesIndividual        6 instancesProgram
#   7 instancesCollection   8 instancesCity              9 instancesTempObject
#  10 instancesCoorp
iAllSet = set()
(iSet1, iSet2, iSet3, iSet4, iSet5,
 iSet6, iSet7, iSet8, iSet9, iSet10) = (set() for _slot in range(10))
iSetAll = [iSet1, iSet2, iSet3, iSet4, iSet5, iSet6, iSet7, iSet8, iSet9, iSet10]

# indegree class instantiation 
# One indegree tally per class, built from a (label, class URI) table.
# A generator is used so no loop variable is left behind at module level.
ciAll = list(
    ClassInstances(label, classUri)
    for label, classUri in (
        ('indegree object', '<http://sw.opencyc.org/concept/Mx4rvWXYgJwpEbGdrcN5Y29ycA>'),
        ('indegree object no overlap', '<http://sw.opencyc.org/concept/Mx4rpPHhAOB1EdqAAAACs6hRXg>'),
        ('indegree noun', '<http://sw.opencyc.org/concept/Mx4rFGe-SEsrEd2AAADggVfs8g>'),
        ('indegree organism', '<http://sw.opencyc.org/concept/Mx4rvVjf5JwpEbGdrcN5Y29ycA>'),
        ('indegree individual', '<http://sw.opencyc.org/concept/Mx4rvVjaApwpEbGdrcN5Y29ycA>'),
        ('indegree program', '<http://sw.opencyc.org/concept/Mx4rvi9EhJwpEbGdrcN5Y29ycA>'),
        ('indegree collection', '<http://sw.opencyc.org/concept/Mx4rHIBS0h_TEdaAAABQ2rksLw>'),
        ('indegree city', '<http://sw.opencyc.org/concept/Mx4rvVjnZ5wpEbGdrcN5Y29ycA>'),
        ('indegree temporal object', '<http://sw.opencyc.org/concept/Mx4rvViq35wpEbGdrcN5Y29ycA>'),
        ('indegree public coorporation', '<http://sw.opencyc.org/concept/Mx4rvVjZ_ZwpEbGdrcN5Y29ycA>'),
    )
)
# The individual names are still needed by code that refers to ci1..ci10 directly.
(ci1, ci2, ci3, ci4, ci5, ci6, ci7, ci8, ci9, ci10) = ciAll

# outdegree class instantiation 
# One outdegree tally per class, built from a (label, class URI) table.
# A generator is used so no loop variable is left behind at module level.
coAll = list(
    ClassInstances(label, classUri)
    for label, classUri in (
        ('outdegree object', '<http://sw.opencyc.org/concept/Mx4rvWXYgJwpEbGdrcN5Y29ycA>'),
        ('outdegree object no overlap', '<http://sw.opencyc.org/concept/Mx4rpPHhAOB1EdqAAAACs6hRXg>'),
        ('outdegree noun', '<http://sw.opencyc.org/concept/Mx4rFGe-SEsrEd2AAADggVfs8g>'),
        ('outdegree organism', '<http://sw.opencyc.org/concept/Mx4rvVjf5JwpEbGdrcN5Y29ycA>'),
        ('outdegree individual', '<http://sw.opencyc.org/concept/Mx4rvVjaApwpEbGdrcN5Y29ycA>'),
        ('outdegree program', '<http://sw.opencyc.org/concept/Mx4rvi9EhJwpEbGdrcN5Y29ycA>'),
        ('outdegree collection', '<http://sw.opencyc.org/concept/Mx4rHIBS0h_TEdaAAABQ2rksLw>'),
        ('outdegree city', '<http://sw.opencyc.org/concept/Mx4rvVjnZ5wpEbGdrcN5Y29ycA>'),
        ('outdegree temporal object', '<http://sw.opencyc.org/concept/Mx4rvViq35wpEbGdrcN5Y29ycA>'),
        ('outdegree public coorporation', '<http://sw.opencyc.org/concept/Mx4rvVjZ_ZwpEbGdrcN5Y29ycA>'),
    )
)
# The individual names are still needed by code that refers to co1..co10 directly.
(co1, co2, co3, co4, co5, co6, co7, co8, co9, co10) = coAll

def getSPO(splittedLine):
    """Return the (subject, predicate, object) tokens of one split line.

    Only the first three tokens are used; anything after them (for example
    the terminating '.') is ignored.

    Args:
        splittedLine: iterable of whitespace-separated tokens of one line.

    Returns:
        A 3-tuple ``(subj, pred, obj)``. Positions for which the line has no
        token are filled with ``None``, so a blank or truncated line yields
        values that match no term instead of raising ``UnboundLocalError``.
    """
    parts = list(splittedLine)[:3]
    # Pad short lines so callers can always unpack three values.
    parts.extend([None] * (3 - len(parts)))
    subj, pred, obj = parts
    return subj, pred, obj

# count all instances of the top10 classes
def countInstances(s, o):
    """Register `s` as an instance of class `o` (statement shape: s a o).

    `s` is always added to `iAllSet`. It is additionally added to the set in
    `iSetAll` that sits at the same position as the first entry of `ciAll`
    whose URI equals `o`; if no URI matches, nothing else happens.
    """
    iAllSet.add(s)
    # ciAll and iSetAll are parallel lists (ci1 <-> iSet1, ..., ci10 <-> iSet10).
    for classStats, members in zip(ciAll, iSetAll):
        if o == classStats.getURI():
            members.add(s)
            return

# count instance degrees
def countInstanceDegrees(s, o):
    """Count one occurrence of subject `s` and object `o` for their classes.

    If `s` is a known instance, every outdegree tally (`coAll`) whose matching
    set in `iSetAll` contains `s` gets one more occurrence of `s`. If `o` is a
    known instance, the indegree tallies (`ciAll`) are updated the same way.
    An instance that belongs to several sets is counted in each of them.
    """
    if s in iAllSet:  # outdegree
        for members, outStats in zip(iSetAll, coAll):
            if s in members:
                outStats.addInstance(s)
    if o in iAllSet:  # indegree
        for members, inStats in zip(iSetAll, ciAll):
            if o in members:
                inStats.addInstance(o)

def calculateClassDegrees():
    """Run calculateDegrees() on every indegree tally, then every outdegree tally."""
    for tallies in (ciAll, coAll):
        for tally in tallies:
            tally.calculateDegrees()
        
def printClassDegreeResults():
    """Print the results of every indegree tally, then every outdegree tally."""
    for tallies in (ciAll, coAll):
        for tally in tallies:
            tally.printResults()
        
# Relies on names created by earlier cells: readFile, lineProgress, rdfType
# and top10dic.
print('START')
try:
    # get all instances for the top10 classes
    lineCounter = 0
    print ('START COUNTING INSTANCES')
    with open(readFile, 'r') as f:  # closed automatically, also on error
        for line in f:
            splittedLine = line.rstrip('\n').split()
            s, p, o = getSPO(splittedLine)
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if (p == rdfType and o in top10dic):
                countInstances(s, o)
            lineCounter += 1
            if (lineCounter % lineProgress == 0):
                print ('{} lines read'.format(lineCounter))
    print ('DONE COUNTING INSTANCES')
    for i, item in enumerate(iSetAll):
        print ('iSet{}, #instances: {}'.format(i+1, len(item)))

    # count instance degrees
    print ('START COUNTING INSTANCE DEGREES')
    lineCounter = 0
    with open(readFile, 'r') as f:
        for line in f:
            splittedLine = line.rstrip('\n').split()
            s, p, o = getSPO(splittedLine)
            countInstanceDegrees(s, o)
            lineCounter += 1
            if (lineCounter % lineProgress == 0):
                print ('{} lines read'.format(lineCounter))
    print ('DONE COUNTING INSTANCE DEGREES')
    print ('START CALCULATING CLASS DEGREES')
    calculateClassDegrees()
    print ('DONE CALCULATING CLASS DEGREES')
    print ('RESULTS')
    printClassDegreeResults()
    print ('EXECUTION TIME: {} s'.format(time.time()-start))
except Exception as exc:
    # Report the failure instead of hiding it; KeyboardInterrupt propagates.
    print ('ERROR')
    print (repr(exc))

START
START COUNTING INSTANCES
1000000 lines read
2000000 lines read
DONE COUNTING INSTANCES
iSet1, #instances: 11743
iSet2, #instances: 10547
iSet3, #instances: 8863
iSet4, #instances: 7591
iSet5, #instances: 7483
iSet6, #instances: 7419
iSet7, #instances: 7091
iSet8, #instances: 5243
iSet9, #instances: 4122
iSet10, #instances: 4109
START COUNTING INSTANCE DEGREES
1000000 lines read
2000000 lines read
DONE COUNTING INSTANCE DEGREES
START CALCULATING CLASS DEGREES
DONE CALCULATING CLASS DEGREES
RESULTS
indegree object
min: 1, avg: 9.21316249272, median: 2.0, max: 4147
indegree object no overlap
min: 1, avg: 6.66605691057, median: 2.0, max: 1226
indegree noun
min: 9999999, avg: 0.0, median: 0.0, max: 0
indegree organism
min: 1, avg: 19.7437325905, median: 2.0, max: 3752
indegree individual
min: 1, avg: 1.48837209302, median: 1.0, max: 16
indegree program
min: 1, avg: 2.12116484659, median: 1.0, max: 150
indegree collection
min: 1, avg: 6.019826864, median: 1.0, max: 1195
indegree city
m