In [10]:
from SPARQLWrapper import SPARQLWrapper, CSV
import pandas as pd
from io import StringIO

# Define the SPARQL endpoint and query
endpoint_url = "https://dbpedia.org/sparql"
def make_query(batch_size = 1000, offset = 0):
  return"""
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbo:  <http://dbpedia.org/ontology/>
PREFIX dbp:  <http://dbpedia.org/property/>

SELECT DISTINCT ?filmTitle ?filmGenre ?bookTitle ?bookGenre ?author ?authorName ?birthDate
WHERE {
  # Ensure the resource is typed as a Film
  ?film rdf:type dbo:Film .
  
  # Check both dbo:basedOn and dbp:basedOn for the relation to the book
  {
    ?film dbo:basedOn ?book .
  }
  UNION
  {
    ?film dbp:basedOn ?book .
  }
  
  # Retrieve film labels in English
  ?film rdfs:label ?filmTitle .
  FILTER (lang(?filmTitle) = "en")

  # --------------------------------------------
  # 1) Film Genre (optional, because not all films have dbo:genre)
  # --------------------------------------------
  OPTIONAL {
    ?film dbo:genre ?filmGenreResource .
    ?filmGenreResource rdfs:label ?filmGenre .
    FILTER (lang(?filmGenre) = "en")
  }

  # Retrieve book label in English
  ?book rdfs:label ?bookTitle .
  FILTER (lang(?bookTitle) = "en")

  # --------------------------------------------
  # 2) Book Genre (optional, because not all books have dbo:genre)
  # --------------------------------------------
  OPTIONAL {
    ?book dbo:genre ?bookGenreResource .
    ?bookGenreResource rdfs:label ?bookGenre .
    FILTER (lang(?bookGenre) = "en")
  }

  # --------------------------------------------
  # 3) Book Author and Birth Date (both optional)
  # --------------------------------------------
  OPTIONAL {
    ?book dbo:author ?author .
    ?author rdfs:label ?authorName .
    FILTER (lang(?authorName) = "en")

    OPTIONAL {
      ?author dbo:birthDate ?birthDate .
    }
  }
}
ORDER BY ?filmTitle
"""


In [11]:
from tqdm.notebook import tqdm
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, CSV
from io import StringIO

# SPARQL Endpoint and Query
endpoint_url = "https://dbpedia.org/sparql"
batch_size = 500
offset = 0
total_rows = 0  # Counter for the total rows

sparql = SPARQLWrapper(endpoint_url)
sparql.setReturnFormat(CSV)
sparql.setTimeout(300)
# Initialize an empty DataFrame to store all results
all_data = pd.DataFrame()

# Estimated total pages
max_pages = 60

# Progress bar setup
# Update the query with the current OFFSET
paginated_query = make_query(batch_size, offset)
sparql.setQuery(paginated_query)

# Execute the query and fetch results
results = sparql.query().convert()
csv_str = results.decode("utf-8")
data = pd.read_csv(StringIO(csv_str))

# If no data is returned, stop the loop
# if data.empty:
#     print(f"No more data after offset {offset}.")
#     break

# Append the current batch to the all_data DataFrame
all_data = pd.concat([all_data, data], ignore_index=True)

# Update total rows count
total_rows += len(data)
print(f"Rows acquired in this batch: {len(data)} | Total rows: {total_rows}")

# Increment the offset for the next batch
offset += batch_size

# Optionally, save to CSV
all_data.to_csv("tmp_adaptations_data.csv", index=False)
        

# Print the final DataFrame and total rows
print(f"Total rows acquired: {total_rows}")
print(all_data)

all_data.to_csv("adaptations_data.csv", index=False)

Rows acquired in this batch: 1166 | Total rows: 1166
Total rows acquired: 1166
                         filmTitle filmGenre                      bookTitle  \
0                       18.05.2009       NaN           Sri Lankan Civil War   
1                     200 Halla Ho       NaN                     Akku Yadav   
2                          27 Guns       NaN               Ugandan Bush War   
3      50 Million Frenchmen (film)       NaN                 Herbert Fields   
4      50 Million Frenchmen (film)       NaN                    Cole Porter   
...                            ...       ...                            ...   
1161                   Yugadrashta       NaN          Pitambar Deva Goswami   
1162  Zack Snyder's Justice League       NaN  Lists of DC Comics characters   
1163            Zameer (2005 film)       NaN              Mazhayethum Munpe   
1164               Zehreela Insaan       NaN                   Naagarahaavu   
1165     Zorro (1975 Italian film)       NaN        

In [16]:
import pandas as pd
import re
from rdflib import Graph, Namespace, Literal, RDF, URIRef
from rdflib.namespace import XSD, RDFS
import datetime
import sys
from datetime import datetime

# --------------------------
# Helper functions
# --------------------------

def split_and_clean(s):
    """
    Utility function to split comma or newline separated strings and clean them up.
    """
    if pd.isna(s):
        return []
    items = []
    # Split by commas, then split by newlines if necessary
    for item in s.split(","):
        items.extend(item.split("\n"))
    # Remove punctuation, strip spaces, and title-case
    clean_items = [re.sub(r'[^a-zA-Z0-9 ]', '', x.strip()).strip().title() for x in items if x.strip()]
    return clean_items

def safe_uri(base, raw_name):
    """
    Convert a raw string to a safe URI by removing or encoding special characters and lowercasing.
    """
    # Replace spaces with underscores, remove non-alphanumeric underscores, then lowercase
    clean_name = re.sub(r'[^a-zA-Z0-9_]+', '', raw_name.replace(" ", "_")).lower()
    return URIRef(base + clean_name)

In [21]:

# Read CSV
df = pd.read_csv("authors_fixed.csv")

# Pre-process data
df["author_id"] = df['authorLabel'].astype(str)  # Create a unique ID for each author
df['genres_list'] = df['combinedGenres'].apply(split_and_clean)
df['books_list']  = df['topBooks'].apply(split_and_clean)
df = df.drop_duplicates(subset=['author_id'])

# Initialize an RDFLib graph
graph = Graph()

# Define the base namespace for our ontology
BASE = "http://www.semanticweb.org/admin/ontologies/2024/9/authors-ontology#"
ONS = Namespace(BASE)
graph.bind("ons", ONS)

# Define our classes
AUTHOR_CLASS = ONS.Author
GENRE_CLASS  = ONS.Genre
BOOK_CLASS   = ONS.Book
NATLANG_CLASS = ONS.NationalityOrLanguage
FILM_CLASS    = ONS.Film

# Add RDF:Class statements to the graph for clarity
graph.add((AUTHOR_CLASS, RDF.type, RDFS.Class))
graph.add((GENRE_CLASS,  RDF.type, RDFS.Class))
graph.add((BOOK_CLASS,   RDF.type, RDFS.Class))
graph.add((NATLANG_CLASS, RDF.type, RDFS.Class))
graph.add((FILM_CLASS,    RDF.type, RDFS.Class))

# Define properties (object/data properties)
# For example: hasGenre, wroteBook, hasNationality, name, birthDate, etc.
HAS_GENRE       = ONS.hasGenre
WROTE_BOOK      = ONS.wroteBook
HAS_NATLANG     = ONS.hasNationalityOrLanguage
NAME            = ONS.name
BIRTH_DATE      = ONS.birthDate
ADAPTED_FROM_BOOK   = ONS.adaptedFromBook

# A helper dictionary to store references to existing URIs (for re-use)
# This helps avoid duplicating the same genre/book/nationality resource
known_genres = {}
known_books  = {}
known_natlang = {}
known_authors  = {} 
known_films    = {}

def add_genre(genre_name):
    """
    Create or retrieve a genre resource in the graph.
    """
    if genre_name not in known_genres:
        genre_uri = safe_uri(BASE, f"genre_{genre_name}")
        # Add triple: (genre_uri, RDF.type, GENRE_CLASS)
        graph.add((genre_uri, RDF.type, GENRE_CLASS))
        # Add triple: (genre_uri, RDFS.label, Literal(genre_name))
        graph.add((genre_uri, RDFS.label, Literal(genre_name)))
        # You could also store a 'name' property
        graph.add((genre_uri, NAME, Literal(genre_name)))
        known_genres[genre_name] = genre_uri
    return known_genres[genre_name]

def add_book(book_title):
    """
    Create or retrieve a book resource in the graph.
    """
    if book_title not in known_books:
        book_uri = safe_uri(BASE, f"book_{book_title}")
        graph.add((book_uri, RDF.type, BOOK_CLASS))
        graph.add((book_uri, RDFS.label, Literal(book_title)))
        graph.add((book_uri, NAME, Literal(book_title)))
        known_books[book_title] = book_uri
    return known_books[book_title]

def add_natlang(natlang):
    """
    Create or retrieve a NationalityOrLanguage resource in the graph.
    """
    if natlang not in known_natlang:
        natlang_uri = safe_uri(BASE, f"natlang_{natlang}")
        graph.add((natlang_uri, RDF.type, NATLANG_CLASS))
        graph.add((natlang_uri, RDFS.label, Literal(natlang)))
        graph.add((natlang_uri, NAME, Literal(natlang)))
        known_natlang[natlang] = natlang_uri
    return known_natlang[natlang]

def add_author(row):
    """
    Create or retrieve an Author resource, and add properties and links.
    """
    author_name = row["authorLabel"]
    author_uri = safe_uri(BASE, f"author_{author_name}")
    
    # Add type triple
    graph.add((author_uri, RDF.type, AUTHOR_CLASS))
    # Add name triple
    graph.add((author_uri, NAME, Literal(author_name)))

    # If birthDate is present and parseable, add as date literal
    birth_str = row.get("birthDate", "")
    if pd.notna(birth_str) and birth_str.strip():
        try:
            # Validate the date string
            parsed_date = datetime.strptime(birth_str, "%Y-%m-%d")
            # If valid, add it as an XSD.date literal
            graph.add((author_uri, BIRTH_DATE, Literal(parsed_date.date(), datatype=XSD.date)))
        except ValueError:
            # Skip invalid dates and print a warning
            print(f"Warning: Invalid date format '{birth_str}' for author '{row['authorLabel']}'. Skipping.")

    
    # If there's nationality/language, add it
    natlang = row.get("combinedNationalityOrLanguage", "")
    if pd.notna(natlang) and natlang.strip():
        # We can further split if you suspect multiple nationalities in one row
        # but the sample only seems to have one. We'll treat it as a single string.
        # If multiple, you'd parse similarly to how we parse genres/books.
        natlang_uri = add_natlang(natlang.title().strip())
        graph.add((author_uri, HAS_NATLANG, natlang_uri))

    # Add genre links
    for g in row['genres_list']:
        genre_uri = add_genre(g)
        graph.add((author_uri, HAS_GENRE, genre_uri))
    
    # Add book links
    for b in row['books_list']:
        book_uri = add_book(b)
        graph.add((author_uri, WROTE_BOOK, book_uri))

# Iterate over each author row and add them to the graph
for idx, row in df.iterrows():
    add_author(row)

df_adaptations = pd.read_csv("adaptations_data.csv")

def add_author_resource(author_name, birthdate=None):
    """
    Create or retrieve an Author resource by name.
    Optionally add birthdate if provided.
    """
    if not author_name:
        return None
    if author_name not in known_authors:
        author_uri = safe_uri(BASE, f"author_{author_name}")
        graph.add((author_uri, RDF.type, AUTHOR_CLASS))
        graph.add((author_uri, NAME, Literal(author_name)))
        # If birthdate is valid, add it
        if birthdate and pd.notna(birthdate):
            print(birthdate)
            try:
                parsed_date = datetime.strptime(birthdate.strip(), "%Y-%m-%d")
                graph.add((author_uri, BIRTH_DATE, Literal(parsed_date.date(), datatype=XSD.date)))
            except ValueError:
                pass
        known_authors[author_name] = author_uri
    return known_authors[author_name]


def add_film(film_title):
    """
    Create or retrieve a Film resource by title.
    """
    if not film_title:
        return None
    if film_title not in known_films:
        film_uri = safe_uri(BASE, f"film_{film_title}")
        graph.add((film_uri, RDF.type, FILM_CLASS))
        graph.add((film_uri, RDFS.label, Literal(film_title)))
        graph.add((film_uri, NAME, Literal(film_title)))
        known_films[film_title] = film_uri
    return known_films[film_title]

for idx, row in df_adaptations.iterrows():
    film_title  = row.get('filmTitle', '')
    film_genre  = row.get('filmGenre', '')
    book_title  = row.get('bookTitle', '')
    book_genre  = row.get('bookGenre', '')
    author_id   = row.get('author', '')      # Possibly an ID if any
    author_name = row.get('authorName', '')
    birthdate   = row.get('birthDate', '')
    
    # 1. Add or get the Film resource
    film_uri = add_film(film_title)
    
    if film_uri:
        # 2. (Optional) Add film genre if present
        if film_genre:
            # either reuse hasGenre or define a new property
            fg_uri = add_genre(film_genre)
            graph.add((film_uri, HAS_GENRE, fg_uri))

        # 3. If there's a bookTitle, link film -> book via adaptedFromBook
        if book_title:
            book_uri = add_book(book_title)
            graph.add((film_uri, ADAPTED_FROM_BOOK, book_uri))
            
            # If there's a bookGenre, you can link the book to it
            if book_genre:
                bg_uri = add_genre(book_genre)
                graph.add((book_uri, HAS_GENRE, bg_uri))
    
        # 4. If an author is mentioned, add or get the Author resource
        if author_name:
            # Reuse the new helper to create the author
            a_uri = add_author_resource(author_name, birthdate)
            # Decide whether to link the Film directly to the Author
            # If you have a property like 'hasAuthor' or 'directedBy' or 'writtenBy', you can add it here.
            # Example (uncomment if you want):
            # graph.add((film_uri, ONS.hasAuthor, a_uri))
            pass


# Finally, serialize to an .owl (RDF/XML) file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUTPUT_OWL_FILE = f"authors_ontology_{timestamp}.owl"

try:
    graph.serialize(destination=OUTPUT_OWL_FILE, format='application/rdf+xml')
    print(f"Ontology successfully saved as {OUTPUT_OWL_FILE}")
except Exception as e:
    print(f"Failed to save ontology: {e}")
    sys.exit(1)


1939-04-12
1880-11-25
1968-09-23
1914-08-09
1908-01-19
1920-03-08
1818-07-30
1547-09-29
1774-08-12
1896-06-18
1888-09-16
1975-07-21
1914-05-19
1923-09-20
1935-12-08
1805-04-02
1566-02-15
1628-01-12
1835-11-30
1812-02-07
1896-07-19
1933-07-02
1918-04-16
1833-08-28
1957-01-24
1876-08-12
1873-08-18
1961-06-23
1973-12-24
1921-02-21
1936-01-28
Ontology successfully saved as authors_ontology_20250124_143007.owl


In [22]:
num_triples = len(graph)
distinct_classes = set(graph.subjects(RDF.type, RDFS.Class))
distinct_properties = set(graph.predicates())

# You can also count specific instances like authors, books, etc.
num_authors = len(set(graph.subjects(RDF.type, AUTHOR_CLASS)))
num_genres = len(set(graph.subjects(RDF.type, GENRE_CLASS)))
num_books = len(set(graph.subjects(RDF.type, BOOK_CLASS)))
num_natlangs = len(set(graph.subjects(RDF.type, NATLANG_CLASS)))

print("Total number of triples:", num_triples)
print("Number of distinct classes:", len(distinct_classes))
print("Number of distinct properties:", len(distinct_properties))
print("Number of authors:", num_authors)
print("Number of genres:", num_genres)
print("Number of books:", num_books)
print("Number of nationalities or languages:", num_natlangs)

Total number of triples: 33381
Number of distinct classes: 5
Number of distinct properties: 8
Number of authors: 1676
Number of genres: 719
Number of books: 4299
Number of nationalities or languages: 165
