In [29]:
from rdflib import Graph
from rdflib.term import URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, OWL, XSD
import os

import tiktoken
import pandas as pd

In [30]:
# Tokenizer for the "gpt-4" model; used below to count the tokens in each ontology file.
enc = tiktoken.encoding_for_model("gpt-4")


In [38]:
# Root directory of each ontology collection to profile. The key is the dataset
# label used as the prefix of the `file` column in the resulting table.
# NOTE(review): these are absolute paths on one machine; consider deriving them
# from a configurable base directory so the notebook can run elsewhere.
paths = {
    'conference': '/home/guilherme/Documents/kg/complex/conference/ont',
    'populated_conference': '/home/guilherme/Documents/kg/complex/conference_100/ont',
    'geolink': '/home/guilherme/Documents/kg/complex/geolink',
    'hydrography': '/home/guilherme/Documents/kg/complex/hydrography_ontology/ontology',
    'taxon': '/home/guilherme/Documents/kg/complex/taxon/ont'
}


def _classify_subject(types):
    """Classify a named subject from the full set of its rdf:type values.

    Args:
        types: set of RDF terms that are objects of ``rdf:type`` for the subject.

    Returns:
        ``'class'``, ``'property'`` or ``'instance'``, or ``None`` when the
        subject has no rdf:type at all (such subjects are not counted).

    All types are considered, with precedence class > property > instance, so
    the result does not depend on which rdf:type triple the store returns first.
    """
    if not types:
        return None

    if OWL.Class in types:
        return 'class'

    # Compare on the namespace IRI instead of the bound prefix: prefix bindings
    # vary between files, and qname() can raise on IRIs it cannot split.
    owl_ns = str(OWL)
    # These OWL terms type individuals, so they must not trigger the
    # "typed in the OWL namespace => property" heuristic below.
    individual_markers = {owl_ns + 'NamedIndividual', owl_ns + 'Thing'}

    named_types = [str(t) for t in types if isinstance(t, URIRef)]

    # Heuristic kept from the original: any other OWL-namespace type counts as
    # a property. This still includes non-property OWL types such as
    # owl:Ontology.
    if any(iri.startswith(owl_ns) and iri not in individual_markers for iri in named_types):
        return 'property'

    if named_types or any(isinstance(t, Literal) for t in types):
        return 'instance'

    # Only blank-node types remain; the original counted these as properties.
    return 'property'


def _profile_ontology(path):
    """Compute size statistics for one ontology file.

    Args:
        path: path of an ``.owl``/``.rdf`` file readable by ``Graph().parse``.

    Returns:
        ``[tokens, triples, subjects, bnodes, classes, properties, instances]``
        where ``tokens`` is the token count of the raw file text under ``enc``,
        ``triples`` is the graph size, ``subjects`` the number of distinct
        subjects, ``bnodes`` the blank-node subjects, and the last three are
        counts of named subjects per ``_classify_subject``.
    """
    # Explicit encoding so the token count does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as handle:
        tokens = len(enc.encode(handle.read()))

    g = Graph().parse(path)
    sbj = set(g.subjects())

    bnodes = 0
    counts = {'class': 0, 'property': 0, 'instance': 0}

    for s in sbj:
        if isinstance(s, BNode):
            bnodes += 1
            continue

        kind = _classify_subject(set(g.objects(s, RDF.type)))
        if kind is not None:
            counts[kind] += 1

    return [tokens, len(g), len(sbj), bnodes,
            counts['class'], counts['property'], counts['instance']]


data = []

for k, v in paths.items():
    for p, d, f in os.walk(v):
        # Sort in place so the traversal, and therefore the row order, is reproducible.
        d.sort()
        for fs in sorted(f):
            if not fs.endswith(('.owl', '.rdf')):
                continue

            data.append([f'{k}/{fs}', *_profile_ontology(os.path.join(p, fs))])


df = pd.DataFrame(data, columns=['file', 'tokens', 'triples', 'subjects', 'bnodes', 'classes', 'properties', 'instances'])

In [39]:
# Save the per-file statistics table as CSV (path relative to the working directory), without the row index.
df.to_csv('complex_ontologies.csv', index=False)