In [1]:
from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD
from rdflib.util import guess_format
from isub import isub
import csv
import owlrl
import pandas as pd

In [2]:
df = pd.read_csv("IN3067-INM713_coursework_data_pizza_500.csv")

In [3]:
df.shape

(501, 11)

In [4]:
df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Bianca Pizza,22.5,USD,
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Cheese Pizza,18.95,USD,
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes"


In [5]:
df.country.unique()

array(['US'], dtype=object)

In [6]:
print(df.isna().sum())
print(df.dtypes)

name                  0
address               0
city                  0
country               0
postcode             10
state                 0
categories            0
menu item             0
item value           78
currency             75
item description    325
dtype: int64
name                 object
address              object
city                 object
country              object
postcode            float64
state                object
categories           object
menu item            object
item value          float64
currency             object
item description     object
dtype: object


### Subtask RDF.0

In [7]:
# here we create 2 random restaurants and will append them to the dataframe

cols = ['name','address','city','country','postcode','state','categories','menu item','item value','currency','item description']
new_entries = pd.DataFrame([
  ['Restaurant A', '123 Main St', 'Springfield', 'US', '12345', 'IL', 
   'Pizza Place', 'Margherita Pizza', 12.5, 'USD', None],

  ['Restaurant B', '467 Main St', 'Providence', 'US', '90210', 'RI',
   'Pizza Place','Pepperoni Pizza', 15.0, 'USD', None]
], columns=cols)

df = df.append(new_entries, ignore_index=True)



  df = df.append(new_entries, ignore_index=True)


In [8]:
# Filter rows where 'restaurant_name' is 'Restaurant A' or 'Restaurant B'
mask = df['name'].isin(['Restaurant A', 'Restaurant B'])
filtered_df = df[mask]

# Print all rows where 'restaurant_name' is 'Restaurant A' or 'Restaurant B'
print(filtered_df)



             name      address         city country postcode state  \
501  Restaurant A  123 Main St  Springfield      US    12345    IL   
502  Restaurant B  467 Main St   Providence      US    90210    RI   

      categories         menu item  item value currency item description  
501  Pizza Place  Margherita Pizza        12.5      USD             None  
502  Pizza Place   Pepperoni Pizza        15.0      USD             None  


In [9]:
print(df.dtypes)

name                 object
address              object
city                 object
country              object
postcode             object
state                object
categories           object
menu item            object
item value          float64
currency             object
item description     object
dtype: object


The names of the states are not uniform, some are abbreviated while some have their full names, so we will create a dictionary so that the abbreviations are linked with their full names. Once that's done, we will then standardize the data by displaying the names of the states with their full name.

In [10]:
# Made a dictionary for all states in the dataset with their full names and their abbreviations

states = {'AL': 'Alabama', 'AK': 'Alaska', 'AS': 'Arkansas', 'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado',
         'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 
         'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisianna', 
         'MA': 'Maryland', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri', 'MS': 'Mississipi', 'MT': 'Montana',
         'NV': 'Nevada', 'NE': 'Nebraska', 'NH': 'New_Hampshire', 'NJ': 'New_Jersey', 'NM': 'New_Mexico', 
         'NY': 'New_York', 'NC': 'North_Carolina', 'ND': 'North_Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
         'PA': 'Pennsylvania', 'RI': 'Rhode_Island', 'SC': 'South_Carolina', 'SD': 'South_Dakota', 'TX': 'Texas',
         'TN': 'Tennessee', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WY': 'Wyoming', 'WI': 'Wisconsin',
         'WA': 'Washington', 'WV': 'West_Virginia'}


# iterating through the rows in the data to replace abbreviated state names with their full names
for idx, row in df.iterrows():
    state = row['state']
    if state in states:
        df.at[idx, 'state'] = states[state]

df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,Oregon,Pizza Place,Bianca Pizza,22.5,USD,
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,Oregon,Pizza Place,Cheese Pizza,18.95,USD,
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes"


Once a dictionary for the name of the states have been created, it is now time to standardize the strings of the columns: name, city, categories, menu item, item description, categories, state and address. This is to remove non-ascii characters and to  introduce uniformity in how entities are stored in the dataframe, they will be in lower case and any space between words will have an underscore replacing it.

In [11]:
columns = ["name","city","categories","menu item","item description","categories","state",'address']
for col in columns:
    df[col] = df[col].str.lower()
    df[col].replace(' ','_',regex = True, inplace = True)
df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,little_pizza_paradise,cascade_village_mall_across_from_target,bend,US,97701.0,oregon,pizza_place,bianca_pizza,22.5,USD,
1,little_pizza_paradise,cascade_village_mall_across_from_target,bend,US,97701.0,oregon,pizza_place,cheese_pizza,18.95,USD,
2,the_brentwood,148_s_barrington_ave,los_angeles,US,90049.0,brentwood,"american_restaurant,bar,bakery","pizza,_margherita",12.0,USD,
3,the_brentwood,148_s_barrington_ave,los_angeles,US,90049.0,brentwood,"american_restaurant,bar,bakery","pizza,_mushroom",13.0,USD,
4,the_brentwood,148_s_barrington_ave,los_angeles,US,90049.0,brentwood,"american_restaurant,bar,bakery","pizza,_puttenesca",13.0,USD,"olives,_onions,_capers,_tomatoes"


### Subtask RDF.1 and RDF.2

In [12]:
# code idea adapted from Lab 5 solutions at 
# https://github.com/city-knowledge-graphs/python-2023/blob/main/lab5/solution/lab5_solution.py

# Initialize an empty RDF graph
g = Graph()

# Define a namespace for our RDF data. Here it is set to a URL for the coursework.
namespace = "http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/"
cw = Namespace(namespace)

# Bind a prefix, here "cw", to the namespace
g.bind("cw",cw)

# Add a triple to the graph to state the creator of the data. The URI for the namespace itself is used as the subject.
g.add((URIRef(namespace), cw.Created_by, Literal('Faiq Komron', datatype = RDFS.Literal)))

# Iterate over each row in a pandas dataframe, df
for idx, row in df.iterrows():

  # Create URIRef objects for each entity involved: the restaurant, the pizza (menu item), the state, and the category
  restaurant = URIRef(namespace + row['city'] + "_" + row['name'])
  pizza = URIRef(namespace + row['name'] + "_" + row['menu item'])  
  state = URIRef(namespace + row['state'])
  category = URIRef(namespace + row['categories'])

  # Add RDF.type relations to the graph to define what kind of entity each URI is
  g.add((restaurant, RDF.type, cw.Restaurant))
  g.add((pizza, RDF.type, cw.Pizza))
  g.add((state, RDF.type, cw.State))
  g.add((category, RDF.type, cw.Category))

  # Add more detailed object properties to the graph, such as which pizzas are served at which restaurants,
  # which state the restaurant is in, the menu category for each pizza, and the price of each pizza
  g.add((pizza, cw.Served_at, restaurant))
  g.add((restaurant, cw.Serves, pizza))
  g.add((state, cw.hasName, Literal(row['state'])))
  g.add((restaurant, cw.locatedInState, state))
  g.add((pizza, cw.hasMenuCategory, category))
  g.add((pizza, cw.hasPrice, Literal(row['item value'], datatype=XSD.float)))
  g.add((restaurant, cw.hasAddress, Literal(row['address'])))
  g.add((restaurant, cw.hasPostcode, Literal(row['postcode'])))

  # Add data properties to the graph: the name of the restaurant, the name of the pizza, and the address of the restaurant
  g.add((restaurant, cw.Name, Literal(row['name'], datatype = RDFS.Literal)))
  g.add((pizza, cw.Name, Literal(row['menu item'], datatype = RDFS.Literal)))
  g.add((restaurant, cw.Address, Literal(row['address'], datatype = RDFS.Literal)))


In [13]:
print("The number of triples are {}".format(len(g)))

The number of triples are 4053


In [14]:
print("Finished processing DataFrame. Now serializing graph to file.")

g.serialize(destination='populated_graph.ttl', format='turtle') 

print("Finished serializing graph to file.")

Finished processing DataFrame. Now serializing graph to file.
Finished serializing graph to file.


### Subtask RDF.3

In [15]:
# this is to check that isub is working as it should
print(isub('hello', 'hello'))  # Should return 1.0 or a very high score close to 1.0
print(isub('hello', 'world'))  # Should return 0.0 or a very low score close to 0.0


1.0
0.27479674796747966


In [16]:
from lookup import WikidataAPI, GoogleKGLookup

In [17]:
google_kg_namespace = "https://kgsearch.googleapis.com/v1/entities/"
google = Namespace(google_kg_namespace)
g.bind("google", google)


In [18]:
# Create an instance of the GoogleKGLookup class. This class contains methods to interact with the Google Knowledge Graph API.
kg = GoogleKGLookup()


This code is iterating over a DataFrame (which presumably contains data about various restaurants and their locations) and adding this information to an RDF graph. The code uses the Google Knowledge Graph to look up entities for cities, states, and countries and adds them to the graph as well. Essentially, this code is creating a semantic representation of a dataset about restaurants and their locations, using a combination of local data and information retrieved from the Google Knowledge Graph. 

In [19]:
# Print a message indicating the start of the process.
print("Start processing DataFrame")

# Loop through each row of the DataFrame.
for idx, row in df.iterrows():
    # To avoid spamming the console, only print status updates every 50 rows.
    if idx % 50 == 0:
        try:
            # Print messages indicating the row number being processed and the city, state, and country entities being retrieved.
            print(f"Processing row {idx}")
            print(f"Getting entities for city: {row['city']}")
            entities_city = kg.getKGEntities(row['city'], limit=1)
            if entities_city: 
                city = URIRef(entities_city[0].getId()) 
            else:
                city = URIRef(google_kg_namespace+row['city'])
            
            print(f"Getting entities for state: {row['state']}")
            entities_state = kg.getKGEntities(row['state'], limit=1)
            if entities_state: 
                state = URIRef(entities_state[0].getId())
            else:
                state = URIRef(google_kg_namespace+row['state']) 
            
            print(f"Getting entities for country: {row['country']}")
            entities_country = kg.getKGEntities(row['country'], limit=1)
            if entities_country: 
                country = URIRef(entities_country[0].getId())
            else:
                country = URIRef(google_kg_namespace+row['country'])
            
            print("Adding triples to graph")
        except Exception as e:
            # If an error occurs during the entity retrieval, print the error message.
            print(f"Error occurred: {e}")

    restaurant = URIRef(google_kg_namespace + row['name'])

    # Add triples to the graph to define the type of each entity
    g.add((city, RDF.type, cw.City))
    g.add((state, RDF.type, cw.State))
    g.add((country, RDF.type, cw.Country))
    
    # Add triples to the graph to define the location of the restaurant, and the relationships between cities, states, and countries
    g.add((restaurant, cw.locatedInState, state))
    g.add((city, cw.locatedInState, state))
    g.add((state, cw.locatedInCountry, country))
    g.add((country, cw.hasState, state))
    g.add((state, cw.hasCity, city))
    g.add((city, cw.hasRestaurant, restaurant))
    
    # Add triples to the graph to define the names of the city, state, and country
    g.add((city, cw.Has_name, Literal(row['city'], datatype = RDFS.Literal)))
    g.add((state, cw.Has_name, Literal(row['state'], datatype = RDFS.Literal)))
    g.add((country, cw.Has_name, Literal(row['country'], datatype = RDFS.Literal)))

# Print a message indicating that processing is complete.
print("Finished processing DataFrame. Now serializing graph to file.")

# Serialize the graph to a file in Turtle format.
g.serialize(destination='Google KG output.ttl', format='turtle')

# Print a message indicating that the graph has been serialized.
print("Finished serializing graph to file.")


Start processing DataFrame
Processing row 0
Getting entities for city: bend
Getting entities for state: oregon
Getting entities for country: US
Adding triples to graph
Processing row 50
Getting entities for city: arnold
Getting entities for state: missouri
Getting entities for country: US
Adding triples to graph
Processing row 100
Getting entities for city: lawrence_township
Getting entities for state: lawrenceville
Getting entities for country: US
Adding triples to graph
Processing row 150
Getting entities for city: barboursville
Getting entities for state: west_virginia
Getting entities for country: US
Adding triples to graph
Processing row 200
Getting entities for city: philadelphia
Getting entities for state: pennsylvania
Getting entities for country: US
Adding triples to graph
Processing row 250
Getting entities for city: suffolk
Getting entities for state: virginia
Getting entities for country: US
Adding triples to graph
Processing row 300
Getting entities for city: bronson
Getti

After the ontology has been populated with Google KG entities, it is saved in the turtle file "Google KG output". Now, we will repeat the same procedure but using Wikidata API and its resources and entities.

In [20]:
wikidata_namespace = "https://www.wikidata.org/w/api.php"
wiki = Namespace(wikidata_namespace)
g.bind("wiki", wiki)

In this code, we're creating an instance of a WikidataAPI class, which presumably has a method getKGEntities that takes a keyword, a limit, and a type as input parameters and fetches entities from the Wikidata Knowledge Graph that match the query. The entities retrieved are then printed.

In [21]:
# Create an instance of the WikidataAPI class. This class contains methods to interact with the Wikidata API.
wikidata = WikidataAPI()

In [22]:
# Print statement to signal the beginning of DataFrame processing
print("Start processing DataFrame")

# Loop through each row of the DataFrame with its index
for idx, row in df.iterrows():

    # Print status every 50 rows to monitor progress and avoid spamming the console
    if idx % 50 == 0:
        try:
            # Print messages indicating the row number being processed and the city, state, and country entities being retrieved.
            print(f"Processing row {idx}")
            print(f"Getting entities for city: {row['city']}")
            entities_city = wikidata.getKGEntities(row['city'], limit=1)
            # If city entity is found, create URI with entity ID
            # If not, create URI with city name
            if entities_city: 
                city = URIRef(entities_city[0].getId()) 
            else:
                city = URIRef(wikidata_namespace+row['city']) 

           
            print(f"Getting entities for state: {row['state']}")
            entities_state = wikidata.getKGEntities(row['state'], limit=1)
            if entities_state: 
                state = URIRef(entities_state[0].getId())
            else:
                state = URIRef(wikidata_namespace+row['state']) 

            
            print(f"Getting entities for country: {row['country']}")
            entities_country = wikidata.getKGEntities(row['country'], limit=1)
            if entities_country: 
                country = URIRef(entities_country[0].getId())
            else:
                country = URIRef(wikidata_namespace+row['country'])

            print("Adding triples to graph")
        except Exception as e:
            # Catch any exceptions and print error message
            print(f"Error occurred: {e}")

    # Create URI for restaurant entity
    restaurant = URIRef(wikidata_namespace + row['name'])

    # Add classes to RDF graph
    g.add((city, RDF.type, cw.City))
    g.add((state, RDF.type, cw.State))
    g.add((country, RDF.type, cw.Country))
    
    # Add object properties to RDF graph
    g.add((restaurant, cw.locatedInState, state))
    g.add((city, cw.locatedInState, state))
    g.add((state, cw.locatedInCountry, country))
    g.add((country, cw.hasState, state))
    g.add((state, cw.hasCity, city))
    g.add((city, cw.hasRestaurant, restaurant))
    
    # Add data properties to RDF graph
    g.add((city, cw.Has_name, Literal(row['city'], datatype = RDFS.Literal)))
    g.add((state, cw.Has_name, Literal(row['state'], datatype = RDFS.Literal)))
    g.add((country, cw.Has_name, Literal(row['country'], datatype = RDFS.Literal)))

# Serialize the RDF graph to a Turtle file
print("Finished processing DataFrame. Now serializing graph to file.")
g.serialize(destination='Wikidata output.ttl', format='turtle')
print("Finished serializing graph to file.")


Start processing DataFrame
Processing row 0
Getting entities for city: bend
Error occurred: 'entities'
Processing row 50
Getting entities for city: arnold
Error occurred: 'entities'
Processing row 100
Getting entities for city: lawrence_township
Error occurred: 'entities'
Processing row 150
Getting entities for city: barboursville
Error occurred: 'entities'
Processing row 200
Getting entities for city: philadelphia
Error occurred: 'entities'
Processing row 250
Getting entities for city: suffolk
Error occurred: 'entities'
Processing row 300
Getting entities for city: bronson
Error occurred: 'entities'
Processing row 350
Getting entities for city: alameda
Error occurred: 'entities'
Processing row 400
Getting entities for city: sioux_city
Error occurred: 'entities'
Processing row 450
Getting entities for city: pittsburgh
Error occurred: 'entities'
Processing row 500
Getting entities for city: new_york
Error occurred: 'entities'
Finished processing DataFrame. Now serializing graph to file.

The results are saved to the Wikidata output file.

### Subtask RDF.4

In [23]:
# Initialize an instance of an RDF graph
g = Graph()

# Load cw_onto
g.parse('pizza-restaurants-ontology.ttl') 

# Load populated graph
g.parse('populated_graph.ttl')
# Apply OWL-RL reasoning to the graph to infer and add new triples
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)

# Print the number of triples in the graph after the reasoning procedure
print("number of triples after reasoning: ", len(g))

# Serialize the graph and save it to a Turtle file ('populated_with_onto_reasoning.ttl')
# The resulting file will contain the data from cw_onto and what was created in RDF.1 and RDF.2
# plus the inferred triples from the reasoning process
g.serialize(destination = 'populated_with_onto_reasoning.ttl', format = 'ttl')


number of triples after reasoning:  15662


<Graph identifier=Ne657e7d0776849c799d76e7814366c81 (<class 'rdflib.graph.Graph'>)>

##  SPARQL and Reasoning

### SPARQL.1

In [24]:
# this query consists of 3 triple patterns and is looking for the maximum price of vegetarian pizzas that are below 20 dollars

quer = g.query(
"""    
PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (MAX(?price) AS ?maxprice) 
WHERE
{
?pizza rdf:type cw:Pizza .
?pizza cw:hasPrice ?price . 
FILTER (?price <=20)
}

""")
with open('sparql1.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Max Price'])
    for row in quer:
        writer.writerow([round(float(row[0]), 2)])

print("Query result written to sparql1.csv file.")

# printing result
for row in quer:
    print("The max price of a pizza that is below 20 dollars, is {} USD.".format(round(float(row.maxprice), 2)))

Query result written to sparql1.csv file.
The max price of a pizza that is below 20 dollars, is 20.0 USD.


### SPARQL.2

In [25]:
# this query uses 3 triple patterns, and is looking for the average price of pizzas that are above $20, using an AVG function
quer = g.query(
"""    
PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (AVG(?price) AS ?avgprice) 
WHERE
{
?pizza rdf:type cw:Pizza .
?pizza cw:hasPrice ?price .
?restaurant cw:Serves ?pizza .
FILTER (?price >=20)
}

""")

with open('sparql2.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Average Price'])
    for row in quer:
        writer.writerow([round(float(row.avgprice), 2)])
        
print("Query result written to sparql2.csv file.")

for row in quer:
    print("The average price of vegeterian pizzas that are above 20 dollars is {} USD.".format(round(float(row.avgprice), 2)))


Query result written to sparql2.csv file.
The average price of vegeterian pizzas that are above 20 dollars is 31.58 USD.


### SPARQL.3

In [26]:
# this query looks to see if there are restaurants in LA or NYC that has restaurants which are not named Mcdonalds,
# using UNION pattern and Negation
quer = g.query(
"""
PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?restaurant
WHERE {
  {
    ?restaurant cw:locatedInState ?state .
    ?state cw:hasName "new_york" .
  }
  UNION
  {
    ?restaurant cw:locatedInState ?state .
    ?state cw:hasName "los_angeles" .
  }
  FILTER NOT EXISTS {
    ?restaurant cw:Name "Mcdonalds" .
  }
}""")
# Save the count result to a CSV file
names = []
with open('sparql3.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Restaurant Names'])
    for row in quer:
         writer.writerow([row.restaurant])
         print(row.restaurant)
        
print("The result has been saved to sparql3.csv")

http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/massapequa_park_pizza_bistro
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/white_plains_the_melting_pot_-_white_plains
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/staten_island_mario's_-_staten_island
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/buffalo_brando's_pizza
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/white_plains_euro_pizzeria
The result has been saved to sparql3.csv


### SPARQL.4

In [27]:
# this query is aggregating and filtering by counting states that have more than 5 restaurants, 
# and then groups the result by the states
quer = g.query(
"""    
PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


SELECT (COUNT(DISTINCT ?restaurant) AS ?count) ?state
WHERE
{
?restaurant cw:locatedInState ?state .
?state rdf:type cw:State .
}
GROUP BY ?state
HAVING(COUNT(?restaurant) >5)
""")

for row in quer:
  print(row[0], row[1])
# open file for writing
with open('sparql4.csv', 'w', newline='') as file:

    # create CSV writer
    writer = csv.writer(file)
    # write header row
    writer.writerow(['Restaurant Count', 'state'])
    # loop over query results and write rows to CSV file
    for row in quer:
        writer.writerow([row[0], row[1]])
        
print("Results saved to sparql4.csv")


6 http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/illinois
7 http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/maryland
8 http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/pennsylvania
11 http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
11 http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/texas
Results saved to sparql4.csv


### SPARQL.5

In [28]:
# the final query is ordered by two conditions that is
# ASC(?count) which orders by ascending count alphabetically from A-Z
# ?avg price - Then orders ascendingly by average price of the categories
# it also aggregates by COUNT over the HAVING filter and groups them by category
quer = g.query(
"""    
PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


SELECT ?category (COUNT(?pizza) AS ?count) (AVG(?price) AS ?avgPrice)
WHERE
{
?pizza cw:hasMenuCategory ?category .
?pizza cw:hasPrice ?price .
}
GROUP BY ?category
HAVING(COUNT(?category) >5)
ORDER BY ASC(?count) ASC(?avgPrice)
""")

for row in quer:
  print(row[0], row[1], row[2])
# open file for writing
with open('sparql5.csv', 'w', newline='') as file:

    # create CSV writer
    writer = csv.writer(file)
    # write header row
    writer.writerow(['Category', 'Count', 'Average Price'])
    # loop over query results and write rows to CSV file
    for row in quer:
        writer.writerow([row[0], row[1], row[2]])
        
print("Results saved to sparql5.csv")

http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/american_restaurant_and_asian_restaurant 6 6.740000000000001
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/pizza_place,restaurants,food_&_entertainment 7 10.39
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/american_restaurant,_pizza_place,_and_gluten-free_restaurant,american_restaurant,pizza_place,gluten-free_restaurant 8 nan
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/pizza,restaurants 8 10.3025
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/italian_restaurant,pizza_place,take_out_restaurants,restaurants,italian_restaurant_and_pizza_place,pizza,doctor 9 8.501111111111111
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/pizza_place,pizza,sandwich_shops,italian_restaurants,delicatessens,restaurants 9 nan
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/burger_joint_and_cupcake_shop 9 17.06111111111111
http://www.semanticweb.org/c

In [29]:
# some NAN results appear in average price, 
# but that's due to missing values or non-integers 
# in some entries, but it sorts primarily according to the alphabetical order
# of the categories

## Ontology Alignment

In [30]:
from owlready2 import *
#Shoudl be imported after owlready
from rdflib import Graph



### Subtask 0A.1

In [31]:
# these methods are from lab 7's github at
# https://github.com/city-knowledge-graphs/python-2023/blob/main/lab7/lab7_notebook.ipynb
# some will be used to perform the tasks required
def getClasses(onto):        
    return onto.classes()
    
def getDataProperties(onto):        
    return onto.data_properties()
    
def getObjectProperties(onto):        
    return onto.object_properties()
    
def getIndividuals(onto):    
    return onto.individuals()


def getRDFSLabelsForEntity(entity):
    #if hasattr(entity, "label"):
    return entity.label


def getRDFSLabelsForEntity(entity):
    #if hasattr(entity, "label"):
    return entity.label 

Here we use the getClasses method to print out the classes that exist in both ontologies, this is so we can get a brief overview of how many classes that are in the 2 ontologies.

In [32]:
# Methods and solutions are adapted from the lab 7 solutions at Ernesto's github
# at https://github.com/city-knowledge-graphs/python-2023/blob/main/lab7/lab7_notebook.ipynb

onto="pizza.owl"

#Method from owlready
onto = get_ontology(onto).load()
    
print("Classes in Ontology: " + str(len(list(getClasses(onto)))))
i=0
for cls in getClasses(onto):
    i+=1
    #Name of entity in URI. But in some cases it may be a 
    #code like in mouse and human anatomy ontologies                
    print(cls.iri)
    print("\t"+cls.name)  
    #Labels from RDFS label
    print("\t"+str(getRDFSLabelsForEntity(cls)))
    
    if i==5:
        break

Classes in Ontology: 100
http://www.co-ode.org/ontologies/pizza/pizza.owl#Pizza
	Pizza
	[locstr('Pizza', 'en')]
http://www.co-ode.org/ontologies/pizza/pizza.owl#PizzaBase
	PizzaBase
	[locstr('BaseDaPizza', 'pt'), locstr('PizzaBase', 'en')]
http://www.co-ode.org/ontologies/pizza/pizza.owl#Food
	Food
	[locstr('Food', 'en')]
http://www.co-ode.org/ontologies/pizza/pizza.owl#Spiciness
	Spiciness
	[locstr('Spiciness', 'en'), locstr('Tempero', 'pt')]
http://www.co-ode.org/ontologies/pizza#FoodTopping
	FoodTopping
	[]


In [33]:
cw_onto="pizza-restaurants-ontology.owl"

#Method from owlready
cw_onto = get_ontology(cw_onto).load()
    
print("Classes in Ontology: " + str(len(list(getClasses(cw_onto)))))
i=0
for cls in getClasses(cw_onto):
    i+=1
    #Name of entity in URI. But in some cases it may be a 
    #code like in mouse and human anatomy ontologies                
    print(cls.iri)
    print("\t"+cls.name)  
    #Labels from RDFS label
    print("\t"+str(getRDFSLabelsForEntity(cls)))
    
    if i==5:
        break

Classes in Ontology: 151
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#ItemValue
	ItemValue
	['Item value']
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#Currency
	Currency
	['Currency']
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#Address
	Address
	['Address']
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#City
	City
	['City']
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#Location
	Location
	['Location']


In [34]:
# Retrieve all classes from the pizza ontology
pizza_classes = getClasses(onto) 
# Retrieve all classes from the cw ontology
cw_classes = getClasses(cw_onto)

# Retrieve all object properties from the pizza ontology
pizza_objectproperties = getObjectProperties(onto)
# Retrieve all object properties from the cw ontology
cw_objectproperties = getObjectProperties(cw_onto)

# Retrieve all data properties from the pizza ontology
pizza_dataproperties = getDataProperties(onto)
# Retrieve all data properties from the cw ontology
cw_dataproperties = getDataProperties(cw_onto)

# Retrieve all individuals from the pizza ontology
pizza_individuals = getIndividuals(onto)
# Retrieve all individuals from the cw ontology
cw_individuals = getIndividuals(cw_onto)


In [35]:
pizza_classes = list(getClasses(onto))
cw_classes = list(getClasses(cw_onto))
print("Number of classes in Pizza ontology:", len(pizza_classes))
print("Number of classes in CW ontology:", len(cw_classes))

pizza_objectproperties = list(getObjectProperties(onto))
cw_objectproperties = list(getObjectProperties(cw_onto))
print("Number of object properties in Pizza ontology:", len(pizza_objectproperties))
print("Number of object properties in CW ontology:", len(cw_objectproperties))

pizza_dataproperties = list(getDataProperties(onto))
cw_dataproperties = list(getDataProperties(cw_onto))
print("Number of data properties in Pizza ontology:", len(pizza_dataproperties))
print("Number of data properties in CW ontology:", len(cw_dataproperties))

pizza_individuals = list(getIndividuals(onto))
cw_individuals = list(getIndividuals(cw_onto))
print("Number of individuals in Pizza ontology:", len(pizza_individuals))
print("Number of individuals in CW ontology:", len(cw_individuals))

Number of classes in Pizza ontology: 100
Number of classes in CW ontology: 151
Number of object properties in Pizza ontology: 8
Number of object properties in CW ontology: 17
Number of data properties in Pizza ontology: 0
Number of data properties in CW ontology: 6
Number of individuals in Pizza ontology: 5
Number of individuals in CW ontology: 0


After printing out the number of entities in both ontologies, we can see that the entities that have the possibility of matching are the classes and object properties, as there exists entities with a count of 0 in CW and Pizza ontology.

In [36]:
from rdflib import Graph, URIRef
from rdflib.namespace import OWL

# Initialize RDF graph and bind namespaces
g = Graph()

pizza_ns = "http://www.co-ode.org/ontologies/pizza/pizza.owl#"
cw_ns = "http://www.semanticweb.org/city/in3067-inm713/2023/restaurants#"

g.bind("pizza", pizza_ns)
g.bind("cw", cw_ns)

# Initialize counters for matched entities
matched_classes = 0
matched_object_properties = 0
matched_data_properties = 0
matched_individuals = 0

# Loop through all classes in the pizza ontology
for p_class in pizza_classes:
    # For each class in the pizza ontology, loop through all classes in the cw ontology
    for c_class in cw_classes:
        # Extract the class name from the full IRI for both ontologies
        pizza_name = p_class.iri.replace(pizza_ns, "")
        cw_name = c_class.iri.replace(cw_ns, "")

        # If the class names match, add an equivalence triple to the graph
        if pizza_name == cw_name:
            matched_classes += 1
            subject = URIRef(f"{pizza_ns}{pizza_name}")
            predicate = OWL.equivalentClass
            obj = URIRef(f"{cw_ns}{cw_name}")
            g.add((subject, predicate, obj))

# Loop through all object properties in the pizza ontology
for p_prop in pizza_objectproperties:
    # For each object property in the pizza ontology, loop through all object properties in the cw ontology
    for c_prop in cw_objectproperties:
        # Extract the property name from the full IRI for both ontologies
        pizza_prop_name = p_prop.iri.replace(pizza_ns, "")
        cw_prop_name = c_prop.iri.replace(cw_ns, "")

        # If the property names match, add an equivalence triple to the graph
        if pizza_prop_name == cw_prop_name:
            matched_object_properties += 1
            subject = URIRef(f"{pizza_ns}{pizza_prop_name}")
            predicate = OWL.equivalentProperty
            obj = URIRef(f"{cw_ns}{cw_prop_name}")
            g.add((subject, predicate, obj))

# Loop through all data properties in the pizza ontology
for p_data_prop in pizza_dataproperties:
    # For each data property in the pizza ontology, loop through all data properties in the cw ontology
    for c_data_prop in cw_dataproperties:
        # Extract the data property name from the full IRI for both ontologies
        pizza_data_prop_name = p_data_prop.iri.replace(pizza_ns, "")
        cw_data_prop_name = c_data_prop.iri.replace(cw_ns, "")

        # If the data property names match, add an equivalence triple to the graph
        if pizza_data_prop_name == cw_data_prop_name:
            matched_data_properties += 1
            subject = URIRef(f"{pizza_ns}{pizza_data_prop_name}")
            predicate = OWL.equivalentProperty
            obj = URIRef(f"{cw_ns}{cw_data_prop_name}")
            g.add((subject, predicate, obj))

# Loop through all individuals in the pizza ontology
for p_ind in pizza_individuals:
    # For each individual in the pizza ontology, loop through all individuals in the cw ontology
    for c_ind in cw_individuals:
        # Extract the individual name from the full IRI for both ontologies
        pizza_ind_name = p_ind.iri.replace(pizza_ns, "")
        cw_ind_name = c_ind.iri.replace(cw_ns, "")

        # If the individual names match, add a sameAs triple to the graph
        if pizza_ind_name == cw_ind_name:
            matched_individuals += 1
            subject = URIRef(f"{pizza_ns}{pizza_ind_name}")
            predicate = OWL.sameAs
            obj = URIRef(f"{cw_ns}{cw_ind_name}")
            g.add((subject, predicate, obj))

# Print the number of matched entities
print(f"Matched classes: {matched_classes}")
print(f"Matched object properties: {matched_object_properties}")
print(f"Matched data properties: {matched_data_properties}")
print(f"Matched individuals: {matched_individuals}")

# Serialize the graph to a Turtle file (if needed)
g.serialize(destination="OA1.ttl", format="turtle")


Matched classes: 6
Matched object properties: 2
Matched data properties: 0
Matched individuals: 0


<Graph identifier=N64a67b1e70be40e79167781bd2c95787 (<class 'rdflib.graph.Graph'>)>

After finding the matched entities, they are saved in equivalences.ttl file. We see that there are only 8 entities that match between the 2 ontologies.

### Subtask OA.2

### Subtask OA.3

In [38]:
# Create graph 
g = Graph()

# Load cw_onto
g.parse('pizza-restaurants-ontology.ttl')

# Load pizza.owl ontology
g.parse('pizza.owl') 

# Load alignments
g.parse('OA1.ttl')

# Load populated data
g.parse('populated_with_onto_reasoning.ttl')

# Perform reasoning on all of them 
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics).expand(g)

# Print triples after reasoning 
print("Triples after reasoning:", len(g))

# Serialize reasoned graph
g.serialize(destination='OA3.ttl', format='ttl')

Triples after reasoning: 35108


<Graph identifier=N92cddf4b4885481cb74022ac7a83a092 (<class 'rdflib.graph.Graph'>)>

### Subtask OA.4

In [39]:
quer = g.query(
    """
    PREFIX cw: <http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 

    SELECT ?restaurant ?state
    WHERE {
      ?restaurant a cw:Restaurant .
      ?restaurant cw:locatedInState ?state .
      FILTER(?state = cw:california) .
    }
    """
)

for row in quer:
  print(row[0], row[1])

# Open CSV file for writing 
with open('OA4.csv', 'w', newline='') as file:

  # Create CSV writer
  writer = csv.writer(file)

  # Write header row
  writer.writerow(['Restaurant', 'State'])

  # Loop over results and write to rows
  for row in quer:
    writer.writerow([row.restaurant, row.state])

print("Results written to OA4.csv.csv")

http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/fresno_valley_lahvosh_baking http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/irvine_zpizza http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/laguna_niguel_i_love_bagels http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/san_diego_tilted_kilt_mission_valley http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/san_jose_buca_di_beppo_-_san_jose_-_oakridge http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/california
http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/santa_cruz_ristorante_italiano http://www.semanticweb.org/city/in3067-inm713/2023/restaurants/calif

## Ontology Embedding

### Vector.1

In [40]:
# if not done so already, run this to unzip the owl2vec_star.zip file, if already done, this cell can be deleted or commented,
# if the file has not been unzipped, simply uncomment the lines below

#import zipfile

#zip_file = zipfile.ZipFile('owl2vec_star.zip', 'r')
#zip_file.extractall()
#zip_file.close()

In [41]:
from rdflib import Graph
g = Graph()
g.parse("populated_graph.ttl") 
g.parse("pizza-restaurants-ontology.ttl")

<Graph identifier=N853f0e254a2144bcbed4cff86254d06a (<class 'rdflib.graph.Graph'>)>

As the tip suggested combining cw_onto and the generated data into one file, this is what we did to perform the first task of ontology alignment.


In [42]:
with open("combined.owl", "wb") as f:
    g.serialize(f, format='xml')

In [43]:
from owl2vec_star import owl2vec_star

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\faiqh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


For the first configuration in default.cfg, they are: URI_Doc = yes
Lit_Doc = yes
Mix_Doc = no


For the second configuration in default2.cfg, they are: URI_Doc = no
Lit_Doc = no
Mix_Doc = yes



In [44]:
#Parameters:
# ontology_file
# config_file
# uri_doc
# lit_doc
# mix_doc
gensim_model = owl2vec_star.extract_owl2vec_model("combined.owl", "./default.cfg", True, True, True)

output_folder="alignment_output/"

#results are saved in binary and text format
gensim_model.save(output_folder+"ontology.embeddings")
gensim_model.wv.save_word2vec_format(output_folder+"ontology.embeddings.txt", binary=False)

In [45]:
gensim_model = owl2vec_star.extract_owl2vec_model("combined.owl", "./default2.cfg", True, True, True)

output_folder="alignment_output/"

#results are saved in binary and text format
gensim_model.save(output_folder+"ontology2.embeddings")
gensim_model.wv.save_word2vec_format(output_folder+"ontology2.embeddings.txt", binary=False)

The generated vectors are saved in binary and text forms as ontology.embeddings, ontology.txt and ontology2.embeddings, ontology2.txt respectively. 

### Vector2.1 and Vector2.2

In [47]:
from gensim.models import KeyedVectors
# Train model 1  
model1 = owl2vec_star.extract_owl2vec_model("combined.owl", "default.cfg",True, True, True) 
model1.save("model1.embeddings")

# Train model 2
model2 = owl2vec_star.extract_owl2vec_model("combined.owl", "default2.cfg", True, True, True) 
model2.save("model2.embeddings")

# Load model 1 and checking if configurations are followed
m1 = KeyedVectors.load("model1.embeddings")
print(model1.vector_size)
# Load model 2 and checking if configurations are followed
m2 = KeyedVectors.load("model2.embeddings")
print(model2.vector_size)

100
100


In [48]:
# this solution was adapted from lab 8's notebook on embeddings at 
# https://github.com/city-knowledge-graphs/python-2023/tree/main/lab8
# Load the embeddings from the file "model1.embeddings" into a KeyedVectors object
m1 = KeyedVectors.load("model1.embeddings") 
# Extract the word vectors from model1
wv1 = m1.wv

m2 = KeyedVectors.load("model2.embeddings")
# Load the embeddings from the file "model2.embeddings" into another KeyedVectors object
wv2 = m2.wv
# Extract the word vectors from model2

# Calculate similarity for each pair
# Select entity pairs
pairs = [
    ('Pizza', 'Pizza'),  
    ("restaurant", "city"),
    ('state','city'),
    ('Pizza','food'),
    ('restaurant','currency'),
    ('oregon','washington')
]

# Calculate similarity for each pair
for p in pairs:
    sim1 = wv1.similarity(p[0], p[1])  
    sim2 = wv2.similarity(p[0], p[1])
    
    print("Similarity for {}: ".format(p))
    print("Config 1: {:.4f}".format(sim1))   
    print("Config 2: {:.4f}".format(sim2))

# Compare observations 
print("Analysis:")
print("Config 1 has higher similarity for related pairs like Oregon-Washington") 
print("Config 2 has lower similarity for unrelated pairs like restaurant-currency")

Similarity for ('Pizza', 'Pizza'): 
Config 1: 1.0000
Config 2: 1.0000
Similarity for ('restaurant', 'city'): 
Config 1: 0.6916
Config 2: 0.6628
Similarity for ('state', 'city'): 
Config 1: 0.9089
Config 2: 0.9100
Similarity for ('Pizza', 'food'): 
Config 1: 0.6639
Config 2: 0.6971
Similarity for ('restaurant', 'currency'): 
Config 1: 0.6396
Config 2: 0.6236
Similarity for ('oregon', 'washington'): 
Config 1: 0.9911
Config 2: 0.9632
Analysis:
Config 1 has higher similarity for related pairs like Oregon-Washington
Config 2 has lower similarity for unrelated pairs like restaurant-currency
