In [3]:
from SPARQLWrapper import SPARQLWrapper, CSV
import pandas as pd
from io import StringIO

# Define the SPARQL endpoint and query
endpoint_url = "https://dbpedia.org/sparql"
def make_query(batch_size = 1000, offset = 0):
  return"""
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT * 
WHERE {
  {
    SELECT 
      ?authorLabel
      (GROUP_CONCAT(DISTINCT ?allGenres; separator=", ") AS ?combinedGenres)
      ?birthDate
      (COALESCE(?nationalityLabel, ?languageResourceLabel, ?languageLiteral) AS ?combinedNationalityOrLanguage)
      (GROUP_CONCAT(DISTINCT ?topBookLabel; separator=", ") AS ?topBooks)
    WHERE {
      ?author a dbo:Writer ;
              rdfs:label ?authorLabel ;
              dbo:birthDate ?birthDate.
      FILTER (lang(?authorLabel) = "en")

      # Genres from dbo:genre
      OPTIONAL {
        ?author dbo:genre ?genre.
        ?genre rdfs:label ?genreLabel2.
        FILTER (lang(?genreLabel2) = "en")
        BIND(?genreLabel2 AS ?allGenres)
      }

      # Genres from dbp:genre (literal)
      OPTIONAL {
        ?author dbp:genre ?plainGenre.
        BIND(STR(?plainGenre) AS ?genreLabel)
        BIND(?genreLabel AS ?allGenres)
      }

      # Nationality
      OPTIONAL {
        ?author dbo:nationality ?nationality.
        ?nationality rdfs:label ?nationalityLabel.
        FILTER (lang(?nationalityLabel) = "en")
      }

      # Language from dbo:language
      OPTIONAL {
        ?author dbo:language ?langResource.
        ?langResource rdfs:label ?languageResourceLabel.
        FILTER (lang(?languageResourceLabel) = "en")
      }

      # Language from dbp:language (literal)
      OPTIONAL {
        ?author dbp:language ?languageLit.
        FILTER (lang(?languageLit) = "en")
        BIND(STR(?languageLit) AS ?languageLiteral)
      }

      # Notable works
      OPTIONAL {
        { ?author dbo:notableWork ?topBook }
        UNION
        { ?author dbp:notableWork ?topBook }
        ?topBook rdfs:label ?topBookLabel.
        FILTER (lang(?topBookLabel) = "en")
      }
    }
    GROUP BY ?authorLabel ?birthDate ?nationalityLabel ?languageResourceLabel ?languageLiteral
    ORDER BY ?authorLabel
    """ +  f"LIMIT {batch_size} OFFSET {offset}" + """
  }
  FILTER (
    ?combinedGenres != "" && 
    ?combinedNationalityOrLanguage != "" && 
    ?topBooks != ""
  )
}
"""


ModuleNotFoundError: No module named 'SPARQLWrapper'

In [2]:
from tqdm.notebook import tqdm
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, CSV
from io import StringIO

# SPARQL Endpoint and Query
endpoint_url = "https://dbpedia.org/sparql"
batch_size = 500
offset = 0
total_rows = 0  # Counter for the total rows

sparql = SPARQLWrapper(endpoint_url)
sparql.setReturnFormat(CSV)
sparql.setTimeout(300)
# Initialize an empty DataFrame to store all results
all_data = pd.DataFrame()

# Estimated total pages
max_pages = 60

# Progress bar setup
with tqdm(total=max_pages, desc="Fetching pages") as pbar:
    while True:
        # Update the query with the current OFFSET
        paginated_query = make_query(batch_size, offset)
        sparql.setQuery(paginated_query)

        # Execute the query and fetch results
        results = sparql.query().convert()
        csv_str = results.decode("utf-8")
        data = pd.read_csv(StringIO(csv_str))

        # If no data is returned, stop the loop
        # if data.empty:
        #     print(f"No more data after offset {offset}.")
        #     break

        # Append the current batch to the all_data DataFrame
        all_data = pd.concat([all_data, data], ignore_index=True)

        # Update total rows count
        total_rows += len(data)
        print(f"Rows acquired in this batch: {len(data)} | Total rows: {total_rows}")

        # Increment the offset for the next batch
        offset += batch_size

        # Update progress bar
        pbar.update(1)

        # Stop after a maximum number of pages (safety to prevent infinite loops)
        if pbar.n >= max_pages:
            print("Reached maximum pages defined.")
            break


        # Optionally, save to CSV
        all_data.to_csv("tmp_authors_data.csv", index=False)
        

# Print the final DataFrame and total rows
print(f"Total rows acquired: {total_rows}")
print(all_data)

all_data.to_csv("authors_data.csv", index=False)

Fetching pages:   0%|          | 0/60 [00:00<?, ?it/s]

Rows acquired in this batch: 25 | Total rows: 25
Rows acquired in this batch: 27 | Total rows: 52
Rows acquired in this batch: 36 | Total rows: 88
Rows acquired in this batch: 32 | Total rows: 120
Rows acquired in this batch: 30 | Total rows: 150
Rows acquired in this batch: 31 | Total rows: 181
Rows acquired in this batch: 35 | Total rows: 216
Rows acquired in this batch: 30 | Total rows: 246
Rows acquired in this batch: 37 | Total rows: 283
Rows acquired in this batch: 43 | Total rows: 326
Rows acquired in this batch: 39 | Total rows: 365
Rows acquired in this batch: 41 | Total rows: 406
Rows acquired in this batch: 31 | Total rows: 437
Rows acquired in this batch: 32 | Total rows: 469
Rows acquired in this batch: 43 | Total rows: 512
Rows acquired in this batch: 26 | Total rows: 538
Rows acquired in this batch: 37 | Total rows: 575
Rows acquired in this batch: 27 | Total rows: 602
Rows acquired in this batch: 31 | Total rows: 633
Rows acquired in this batch: 34 | Total rows: 667
Row

In [4]:
import pandas as pd
import re
from rdflib import Graph, Namespace, Literal, RDF, URIRef
from rdflib.namespace import XSD, RDFS
import datetime
import sys
from datetime import datetime

# --------------------------
# Helper functions
# --------------------------

def split_and_clean(s):
    """
    Utility function to split comma or newline separated strings and clean them up.
    """
    if pd.isna(s):
        return []
    items = []
    # Split by commas, then split by newlines if necessary
    for item in s.split(","):
        items.extend(item.split("\n"))
    # Remove punctuation, strip spaces, and title-case
    clean_items = [re.sub(r'[^a-zA-Z0-9 ]', '', x.strip()).strip().title() for x in items if x.strip()]
    return clean_items

def safe_uri(base, raw_name):
    """
    Convert a raw string to a safe URI by removing or encoding special characters and lowercasing.
    """
    # Replace spaces with underscores, remove non-alphanumeric underscores, then lowercase
    clean_name = re.sub(r'[^a-zA-Z0-9_]+', '', raw_name.replace(" ", "_")).lower()
    return URIRef(base + clean_name)

In [5]:

# Read CSV
df = pd.read_csv("authors_fixed.csv")

# Pre-process data
df["author_id"] = df['authorLabel'].astype(str)  # Create a unique ID for each author
df['genres_list'] = df['combinedGenres'].apply(split_and_clean)
df['books_list']  = df['topBooks'].apply(split_and_clean)
df = df.drop_duplicates(subset=['author_id'])

# Initialize an RDFLib graph
graph = Graph()

# Define the base namespace for our ontology
BASE = "http://www.semanticweb.org/admin/ontologies/2024/9/authors-ontology#"
ONS = Namespace(BASE)
graph.bind("ons", ONS)

# Define our classes
AUTHOR_CLASS = ONS.Author
GENRE_CLASS  = ONS.Genre
BOOK_CLASS   = ONS.Book
NATLANG_CLASS = ONS.NationalityOrLanguage

# Add RDF:Class statements to the graph for clarity
graph.add((AUTHOR_CLASS, RDF.type, RDFS.Class))
graph.add((GENRE_CLASS,  RDF.type, RDFS.Class))
graph.add((BOOK_CLASS,   RDF.type, RDFS.Class))
graph.add((NATLANG_CLASS, RDF.type, RDFS.Class))

# Define properties (object/data properties)
# For example: hasGenre, wroteBook, hasNationality, name, birthDate, etc.
HAS_GENRE       = ONS.hasGenre
WROTE_BOOK      = ONS.wroteBook
HAS_NATLANG     = ONS.hasNationalityOrLanguage
NAME            = ONS.name
BIRTH_DATE      = ONS.birthDate

# A helper dictionary to store references to existing URIs (for re-use)
# This helps avoid duplicating the same genre/book/nationality resource
known_genres = {}
known_books  = {}
known_natlang = {}

def add_genre(genre_name):
    """
    Create or retrieve a genre resource in the graph.
    """
    if genre_name not in known_genres:
        genre_uri = safe_uri(BASE, f"genre_{genre_name}")
        # Add triple: (genre_uri, RDF.type, GENRE_CLASS)
        graph.add((genre_uri, RDF.type, GENRE_CLASS))
        # Add triple: (genre_uri, RDFS.label, Literal(genre_name))
        graph.add((genre_uri, RDFS.label, Literal(genre_name)))
        # You could also store a 'name' property
        graph.add((genre_uri, NAME, Literal(genre_name)))
        known_genres[genre_name] = genre_uri
    return known_genres[genre_name]

def add_book(book_title):
    """
    Create or retrieve a book resource in the graph.
    """
    if book_title not in known_books:
        book_uri = safe_uri(BASE, f"book_{book_title}")
        graph.add((book_uri, RDF.type, BOOK_CLASS))
        graph.add((book_uri, RDFS.label, Literal(book_title)))
        graph.add((book_uri, NAME, Literal(book_title)))
        known_books[book_title] = book_uri
    return known_books[book_title]

def add_natlang(natlang):
    """
    Create or retrieve a NationalityOrLanguage resource in the graph.
    """
    if natlang not in known_natlang:
        natlang_uri = safe_uri(BASE, f"natlang_{natlang}")
        graph.add((natlang_uri, RDF.type, NATLANG_CLASS))
        graph.add((natlang_uri, RDFS.label, Literal(natlang)))
        graph.add((natlang_uri, NAME, Literal(natlang)))
        known_natlang[natlang] = natlang_uri
    return known_natlang[natlang]

def add_author(row):
    """
    Create or retrieve an Author resource, and add properties and links.
    """
    author_name = row["authorLabel"]
    author_uri = safe_uri(BASE, f"author_{author_name}")
    
    # Add type triple
    graph.add((author_uri, RDF.type, AUTHOR_CLASS))
    # Add name triple
    graph.add((author_uri, NAME, Literal(author_name)))

    # If birthDate is present and parseable, add as date literal
    birth_str = row.get("birthDate", "")
    if pd.notna(birth_str) and birth_str.strip():
        try:
            # Validate the date string
            parsed_date = datetime.strptime(birth_str, "%Y-%m-%d")
            # If valid, add it as an XSD.date literal
            graph.add((author_uri, BIRTH_DATE, Literal(parsed_date.date(), datatype=XSD.date)))
        except ValueError:
            # Skip invalid dates and print a warning
            print(f"Warning: Invalid date format '{birth_str}' for author '{row['authorLabel']}'. Skipping.")

    
    # If there's nationality/language, add it
    natlang = row.get("combinedNationalityOrLanguage", "")
    if pd.notna(natlang) and natlang.strip():
        # We can further split if you suspect multiple nationalities in one row
        # but the sample only seems to have one. We'll treat it as a single string.
        # If multiple, you'd parse similarly to how we parse genres/books.
        natlang_uri = add_natlang(natlang.title().strip())
        graph.add((author_uri, HAS_NATLANG, natlang_uri))

    # Add genre links
    for g in row['genres_list']:
        genre_uri = add_genre(g)
        graph.add((author_uri, HAS_GENRE, genre_uri))
    
    # Add book links
    for b in row['books_list']:
        book_uri = add_book(b)
        graph.add((author_uri, WROTE_BOOK, book_uri))

# Iterate over each author row and add them to the graph
for idx, row in df.iterrows():
    add_author(row)

# Finally, serialize to an .owl (RDF/XML) file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUTPUT_OWL_FILE = f"authors_ontology_{timestamp}.owl"

try:
    graph.serialize(destination=OUTPUT_OWL_FILE, format='application/rdf+xml')
    print(f"Ontology successfully saved as {OUTPUT_OWL_FILE}")
except Exception as e:
    print(f"Failed to save ontology: {e}")
    sys.exit(1)


Ontology successfully saved as authors_ontology_20241217_184531.owl


In [6]:
num_triples = len(graph)
distinct_classes = set(graph.subjects(RDF.type, RDFS.Class))
distinct_properties = set(graph.predicates())

# You can also count specific instances like authors, books, etc.
num_authors = len(set(graph.subjects(RDF.type, AUTHOR_CLASS)))
num_genres = len(set(graph.subjects(RDF.type, GENRE_CLASS)))
num_books = len(set(graph.subjects(RDF.type, BOOK_CLASS)))
num_natlangs = len(set(graph.subjects(RDF.type, NATLANG_CLASS)))

print("Total number of triples:", num_triples)
print("Number of distinct classes:", len(distinct_classes))
print("Number of distinct properties:", len(distinct_properties))
print("Number of authors:", num_authors)
print("Number of genres:", num_genres)
print("Number of books:", num_books)
print("Number of nationalities or languages:", num_natlangs)

Total number of triples: 24692
Number of distinct classes: 4
Number of distinct properties: 7
Number of authors: 1629
Number of genres: 674
Number of books: 3408
Number of nationalities or languages: 165
