## Running OWL2Vec*

In [1]:
# Pinned package versions this notebook was run with.
# The install commands are kept commented out; uncomment them to recreate the
# environment (inside Jupyter, `%pip install ...` targets the running kernel).
#!pip install jupyter==1.0.0
#!pip install rdflib==7.0.0
#!pip install pyparsing==2.4.7
#!pip install scipy==1.12.0
#!pip install numpy==1.26.4
#!pip install gensim==4.3.1
#!pip install scikit-learn==1.4.0
#!pip install nltk==3.8.1
#!pip install OWLready2==0.45
#!pip install setuptools==69.2.0
#!pip install pandas==2.2.2

### from the lab

In [2]:
import os

from owl2vec_star import owl2vec_star


# extract_owl2vec_model arguments, in positional order:
#   ontology_file, config_file, uri_doc, lit_doc, mix_doc
gensim_model = owl2vec_star.extract_owl2vec_model(
    "./case_studies/pizza/pizza.owl",  # ontology_file
    "./default.cfg",                   # config_file
    True,                              # uri_doc
    True,                              # lit_doc
    True,                              # mix_doc
)

output_folder="./cache/output/"
# The two save calls below write into this folder, so make sure it exists first.
os.makedirs(output_folder, exist_ok=True)

# Gensim format (full model, reloadable with gensim)
gensim_model.save(output_folder+"ontology.embeddings")
# Txt format (plain-text word2vec format, vectors only)
gensim_model.wv.save_word2vec_format(output_folder+"ontology.embeddings.txt", binary=False)


[nltk_data] Downloading package punkt to C:\Users\Fasih
[nltk_data]     Munir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO: Access the ontology ...
INFO: There are 1945 triples in the ontology
INFO: Calculate the ontology projection ...
INFO: Creating ontology graph projection...
INFO: 	Extracting subsumption triples
INFO: 		Time extracting subsumption: 0.8154659271240234 seconds 
INFO: 	Extracting equivalence triples
INFO: 		Time extracting equivalences: 0.023654460906982422 seconds 
INFO: 	Extracting class membership triples.
INFO: 		Time extracting class membership: 0.1835026741027832 seconds 
INFO: 	Extracting sameAs triples
INFO: 		Time extracting sameAs: 0.007008552551269531 seconds 
INFO: 	Extracting triples associated to hasBase
INFO: 		Time extracting triples for property: 0.14656686782836914 seconds 
INFO: 	Extracting triples associated to hasIngredient
INFO: 		Time extracting triples for property: 0.09192728996276855 seconds 
INFO: 	Ex

#### Loading embeddings and getting similarities

In [3]:
from gensim.models import KeyedVectors

# Reload the embeddings written by the previous cell. That file was produced with
# the model's own .save(), so the loaded object still exposes its vectors as .wv.
model = KeyedVectors.load("./cache/output/ontology.embeddings", mmap='r')
wv = model.wv

# Numpy vector of a single word
vector = wv['pizza']
print("Vector for 'pizza'", vector, sep="\n")

# Cosine similarity between a word and the IRI of the matching entity
for first, second in (
    ('pizza', 'http://www.co-ode.org/ontologies/pizza/pizza.owl#Pizza'),
    ('http://www.co-ode.org/ontologies/pizza/pizza.owl#Margherita', 'margherita'),
):
    similarity = wv.similarity(first, second)
    print(similarity)

# Nearest neighbours by cosine similarity
result = wv.most_similar(positive=['margherita', 'pizza'])
print(result)

# Nearest neighbours using the cosmul objective
result = wv.most_similar_cosmul(positive=['margherita'])
print(result)

INFO: loading KeyedVectors object from ./cache/output/ontology.embeddings
INFO: loading wv recursively from ./cache/output/ontology.embeddings.wv.* with mmap=r
INFO: setting ignored attribute cum_table to None
INFO: Word2Vec lifecycle event {'fname': './cache/output/ontology.embeddings', 'datetime': '2024-05-12T10:29:27.230990', 'gensim': '4.3.1', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}


Vector for 'pizza'
[-0.35208783  0.4368985   0.20925592  0.6697823   0.06111518  0.30470192
 -0.3959306  -0.21964304 -0.4625729  -0.07736258 -0.0737689   0.36331636
  0.49076882 -0.22288145 -0.0102441   0.07475019  0.54505837 -0.95993793
  0.33339405  0.02049149  0.05990061  0.18572415  0.149573    0.58744216
  0.19489491 -0.20691772  0.39691544 -0.01546526 -0.18707787  0.29640996
 -0.517227   -0.44690946 -0.20935677 -0.45724204  0.19369175 -0.29058358
  0.3926295   0.32093066 -0.43909982 -0.11507002  0.19123915  0.07252536
 -0.43595603 -0.15474185 -0.06024755 -0.05622103 -0.01502387  0.1620362
  0.2283897   0.02902427  0.18861908  0.30497012 -0.25410876 -0.42386362
  0.11755227 -0.13312133  0.26299927 -0.30372876  0.2750289   0.00300724
  0.32037932 -0.02150279 -0.16717035 -0.09260186  0.09661588  0.27102265
  0.04700339  0.12033629 -0.30562702 -0.32709816  0.55238837 -0.7406147
 -0.30856127 -0.414008   -0.32200167  0.1304826   0.19954146 -0.316769
 -0.08567644 -0.49095532  0.01389351

### 2.5 Ontology Embeddings (Task Vector)

The original similarity scores are in the report. Re-running this code recalculates the scores, but they do not differ much: all scores stay within the same range, with differences of approximately 0.05, give or take.

Quick notes about parameters:

1. Walker Type - wl or random. Captures node structure: wl can capture more complex information than random walks, but wl is more computationally intensive than a simple random walk.
3. Walk Depth - how deep to walk or go down in the structure. going deeper brings more context and wider relations. going deeper increases computation and may get too deep connecting too different concepts
8. Embedding Size - size of the vectors. Larger vectors store more complex information but require more compute.
9. Iteration - how many times the data is passed through. more is good but too much more can over fit ie learn too much
10. Window Size - distance between the current word and the predicted word in a sentence. A small window only looks at the immediate context (like a uni- or bi-gram), so it can miss out on wider context.
11. Minimum Count - minimum number of times a word must occur to be included. A count of 1 means every word is included, but words that really occur only once are rare and can introduce noise.
12. Negative Sampling - amount of noise. used to distinguish words

#### Subtask Vector.1

##### Configuration 1

The code is taken from lab 9, following the instructions and just updating the respective files.

In [4]:
#loading the ontologies and saving in a single graph to pass to word2vec

from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD
from rdflib.util import guess_format

# Merge the generated instance data and the ontology into a single graph.
g = Graph()
g.parse('generated_rdf.ttl', format = 'ttl')
g.parse('pizza-restaurants-ontology.ttl', format = 'ttl')

# Write the merged graph once as Turtle and once as RDF/XML.
# The Turtle file is deliberately NOT parsed back into `g`: the graph already
# holds every triple it contains, and a second parse would re-create its blank
# nodes under new identifiers, duplicating any anonymous structures.
turtle_path = 'ontology_for_embeddings.ttl'
g.serialize(turtle_path, format = 'ttl')

output_owl = 'ontology_for_embeddings.owl'
g.serialize(output_owl, format='xml')

<Graph identifier=N9e26f1af33864f4cb90778f0fd79ab01 (<class 'rdflib.graph.Graph'>)>

In [5]:
# extract_owl2vec_model arguments, in positional order:
#   ontology_file, config_file, uri_doc, lit_doc, mix_doc

#Config 1:
#walker = wl
#walk_depth = 5
#URI_Doc = yes
#Lit_Doc = yes
#Mix_Doc = no
#Mix_Type = random
#embed_size = 100
#iteration = 10
#window = 2
#min_count = 1
#negative = 30
#seed = 42
#epoch = 100

# NOTE(review): the listing above says Mix_Doc = no, but the third flag (mix_doc)
# is passed as True in the call below - confirm which of the two is intended.

# Relative path: this is the RDF/XML file the graph-merging cell writes into the
# working directory, so the notebook no longer depends on one user's folder layout.
gensim_model1 = owl2vec_star.extract_owl2vec_model(
    "./ontology_for_embeddings.owl",
    "default_updated_config_1.cfg", True, True, True)

output_folder="./cache/output/"
# The two save calls below write into this folder, so make sure it exists first.
os.makedirs(output_folder, exist_ok=True)

# One plain-text (non-binary) word2vec file and one gensim model file
gensim_model1.wv.save_word2vec_format(output_folder+"ontology.embeddings.config1.txt", binary=False)
gensim_model1.save(output_folder+"ontology.embeddings.config1")

INFO: Access the ontology ...
* Owlready2 * Creating new ontology C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\ontology_for_embeddings <C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\ontology_for_embeddings.owl#>.
* Owlready2 * ADD TRIPLE C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\ontology_for_embeddings.owl http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Ontology
* Owlready2 *     ...loading ontology C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\ontology_for_embeddings from C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\ontology_for_embeddings.owl...
* Owlready2 *     ...25 properties found: locatedInCity, locatedInAddress, serves, hasIngr

* Owlready2 * Reseting property core.altLabel: new triples are now available.


INFO: 		Time extracting class membership: 0.7552568912506104 seconds 
INFO: 	Extracting sameAs triples
INFO: 		Time extracting sameAs: 0.007184505462646484 seconds 
INFO: 	Extracting triples associated to locatedInCity
INFO: 		Time extracting triples for property: 0.11867904663085938 seconds 
INFO: 	Extracting triples associated to locatedInAddress
INFO: 		Time extracting triples for property: 0.10186934471130371 seconds 
INFO: 	Extracting triples associated to serves
INFO: 		Time extracting triples for property: 0.1018974781036377 seconds 
INFO: 	Extracting triples associated to hasIngredient
INFO: 		Time extracting triples for property: 0.141737699508667 seconds 
INFO: 	Extracting triples associated to locatedInState
INFO: 		Time extracting triples for property: 0.10561513900756836 seconds 
INFO: 	Extracting triples associated to amountCurrency
INFO: 		Time extracting triples for property: 0.10904431343078613 seconds 
INFO: 	Extracting triples associated to locatedIn
INFO: 		Time ext

##### Configuration 2


In [6]:
# extract_owl2vec_model arguments, in positional order:
#   ontology_file, config_file, uri_doc, lit_doc, mix_doc

#Config 2:
#walker = random
#walk_depth = 1
#URI_Doc = yes
#Lit_Doc = yes
#Mix_Doc = no
#Mix_Type = random
#embed_size = 100
#iteration = 10
#window = 10
#min_count = 1
#negative = 5
#seed = 42
#epoch = 100

# NOTE(review): the listing above says Mix_Doc = no, but the third flag (mix_doc)
# is passed as True in the call below - confirm which of the two is intended.

# Relative path: this is the RDF/XML file the graph-merging cell writes into the
# working directory, so the notebook no longer depends on one user's folder layout.
gensim_model2 = owl2vec_star.extract_owl2vec_model(
    "./ontology_for_embeddings.owl",
    "default_updated_config_2.cfg", True, True, True)

output_folder="./cache/output/"
# The two save calls below write into this folder, so make sure it exists first.
os.makedirs(output_folder, exist_ok=True)

# One plain-text (non-binary) word2vec file and one gensim model file
gensim_model2.wv.save_word2vec_format(output_folder+"ontology.embeddings.config2.txt", binary=False)
gensim_model2.save(output_folder+"ontology.embeddings.config2")

INFO: Access the ontology ...
INFO: There are 6336 triples in the ontology
INFO: Calculate the ontology projection ...
INFO: Creating ontology graph projection...
INFO: 	Extracting subsumption triples
INFO: 		Time extracting subsumption: 0.11549663543701172 seconds 
INFO: 	Extracting equivalence triples
INFO: 		Time extracting equivalences: 0.02979445457458496 seconds 
INFO: 	Extracting class membership triples.
INFO: 		Time extracting class membership: 0.6956355571746826 seconds 
INFO: 	Extracting sameAs triples
INFO: 		Time extracting sameAs: 0.006412029266357422 seconds 
INFO: 	Extracting triples associated to locatedInCity
INFO: 		Time extracting triples for property: 0.11785340309143066 seconds 
INFO: 	Extracting triples associated to locatedInAddress
INFO: 		Time extracting triples for property: 0.10675168037414551 seconds 
INFO: 	Extracting triples associated to serves
INFO: 		Time extracting triples for property: 0.10326981544494629 seconds 
INFO: 	Extracting triples associated

#### Subtask Vector.2

In [7]:
#lets load the text files and parse them

import pandas as pd

In [8]:
# Plain-text embedding files written by the two configuration cells.
# Relative paths (same folder the embeddings are saved to) instead of an
# absolute path into one user's home directory.
file_1 = "./cache/output/ontology.embeddings.config1.txt"
file_2 = "./cache/output/ontology.embeddings.config2.txt"

In [9]:
# Split every line on whitespace and keep only the tokens without digits.
# The lists are built so that candidate words for the similarity checks can be
# picked by eye from the saved vocabulary.
# documentation: https://stackoverflow.com/questions/23051294/python-list-comprehension-for-words-that-do-not-consist-solely-of-digits

def _non_numeric_tokens(path):
    """Return the digit-free tokens of a text file, in reading order.

    Each line of ``path`` (read as UTF-8) is split on whitespace; a token is
    kept only if none of its characters is a digit. This drops the numeric
    vector components, and also any word that happens to contain a digit.

    Parameters
    ----------
    path : str
        Path of the text file to scan.

    Returns
    -------
    list of str
        Every kept token; duplicates are preserved.
    """
    kept = []
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            kept.extend(
                token for token in line.split()
                if not any(char.isdigit() for char in token)
            )
    return kept


words_embeddings_config_1 = _non_numeric_tokens(file_1)
words_embeddings_config_2 = _non_numeric_tokens(file_2)

In [10]:
# Display the digit-free tokens collected from the configuration 1 embedding file.
words_embeddings_config_1

['of',
 'type',
 'subclassof',
 'super',
 'class',
 'pizza',
 'ingredient',
 'in',
 'has',
 'is',
 'label',
 'located',
 'city',
 'food',
 'restaurant',
 'address',
 'served',
 'chicken',
 'state',
 'item',
 'value',
 'mushroom',
 'name',
 'meat',
 'location',
 'menu',
 'vegan',
 'sauce',
 'seafood',
 'barbecue',
 'fruit',
 'pineapple',
 'hawaiian',
 'supreme',
 'feta',
 'vegetable',
 'country',
 'beans',
 'vegetarian',
 'pepper',
 'marinara',
 'italian',
 'tomato',
 'serves',
 'pepperoni',
 'altlabel',
 'bbq',
 'currency',
 'amount',
 'sausage',
 'pizzeria',
 'cheese',
 'mexican',
 'place',
 'greek',
 'by',
 'onion',
 'bianca',
 'us',
 'mozzarella',
 'margherita',
 'funghi',
 'napolitana',
 'japanese',
 'garlic',
 'meaty',
 'ham',
 'basil',
 'and',
 'sause',
 'capers',
 'bakers',
 'mellow',
 'comment',
 'bar',
 "\\xbc'",
 'white',
 'style',
 'mediterranean',
 'grill',
 'american',
 'the',
 "b'@",
 'olives',
 'kitchen',
 'ernesto',
 "\\xbc'",
 'olathe',
 'shop',
 'coffee',
 'original',

In [11]:
# Display the digit-free tokens collected from the configuration 2 embedding file.
words_embeddings_config_2

['pizza',
 'type',
 'in',
 'Type',
 'located',
 'us',
 'ingredient',
 'city',
 'of',
 'chicken',
 'name',
 'subclassof',
 'restaurant',
 'address',
 'has',
 'state',
 'mushroom',
 'is',
 'pizzeria',
 'item',
 'SubClassOf',
 'value',
 'label',
 'st',
 'usd',
 'currency',
 'amount',
 'the',
 'and',
 'italian',
 'country',
 'bakers',
 'mellow',
 'grill',
 'served',
 'mexican',
 'serves',
 'supreme',
 'greek',
 'rd',
 'some',
 'kitchen',
 'meat',
 'n',
 'ave',
 'olathe',
 'original',
 'new',
 'e',
 'cafe',
 'giorgios',
 'york',
 'los',
 'street',
 'hawaiian',
 's',
 'den',
 'greeks',
 'zios',
 'bar',
 'cheese',
 'tx',
 'ca',
 'pepper',
 'little',
 'angeles',
 'place',
 'buffalo',
 'denver',
 'bbq',
 'mozzarella',
 'fire',
 'antonios',
 'onion',
 'vegetable',
 'village',
 'mi',
 'mount',
 'vegan',
 'white',
 'pub',
 'philadelphia',
 'sauce',
 'pepperoni',
 'seafood',
 'pasta',
 'marinara',
 'tomato',
 'brentwood',
 'bronson',
 'class',
 'super',
 'w',
 'pa',
 'johns',
 'green',
 'kennebunk'

##### Subtask Vector.2.1

similarities for the first configuration

In [12]:
# Embedding model saved by the configuration 1 cell. Relative path (the folder
# it was saved to) instead of an absolute path into one user's home directory.
model1 = KeyedVectors.load("./cache/output/ontology.embeddings.config1", mmap='r')
wv1 = model1.wv

# Cosine similarity

# Pairs expected to be similar
similarity1 = wv1.similarity('pizza', 'ingredient')
similarity2 = wv1.similarity('pizza', 'food')
similarity3 = wv1.similarity('fruit', 'pineapple')

# Pairs expected to be dissimilar
similarity4 = wv1.similarity('meat', 'seafood')
similarity5 = wv1.similarity('cheese', 'currency')
similarity6 = wv1.similarity('salmon', 'seeds')

print('Expect to be Similar')
print(f"The cosine similarity between 'pizza' and 'ingredient' is {similarity1:.3f}")
print(f"The cosine similarity between 'pizza' and 'food' is {similarity2:.3f}")
print(f"The cosine similarity between 'fruit' and 'pineapple' is {similarity3:.3f}")
print('----------------------------')
print('Expect to be Not Similar')
print(f"The cosine similarity between 'meat' and 'seafood' is {similarity4:.3f}")
print(f"The cosine similarity between 'cheese' and 'currency' is {similarity5:.3f}")
print(f"The cosine similarity between 'salmon' and 'seeds' is {similarity6:.3f}")

INFO: loading KeyedVectors object from C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\cache\output\ontology.embeddings.config1
INFO: loading wv recursively from C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\cache\output\ontology.embeddings.config1.wv.* with mmap=r
INFO: setting ignored attribute cum_table to None
INFO: Word2Vec lifecycle event {'fname': 'C:\\Users\\Fasih Munir\\Desktop\\Knowledge Graphs Final Coursework\\OWL2Vec-Star-IN3067-INM713\\OWL2Vec-Star-master\\cache\\output\\ontology.embeddings.config1', 'datetime': '2024-05-12T10:30:17.302633', 'gensim': '4.3.1', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}


Expect to be Similar
The cosine similarity between 'pizza' and 'ingredient' is 0.591
The cosine similarity between 'pizza' and 'food' is 0.632
The cosine similarity between 'fruit' and 'pineapple' is 0.785
----------------------------
Expect to be Not Similar
The cosine similarity between 'meat' and 'seafood' is 0.596
The cosine similarity between 'cheese' and 'currency' is 0.128
The cosine similarity between 'salmon' and 'seeds' is 0.408


##### Subtask Vector.2.2

similarities for the second configuration

In [13]:
# Embedding model saved by the configuration 2 cell. Relative path (the folder
# it was saved to) instead of an absolute path into one user's home directory.
model2 = KeyedVectors.load("./cache/output/ontology.embeddings.config2", mmap='r')
wv2 = model2.wv

INFO: loading KeyedVectors object from C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\cache\output\ontology.embeddings.config2
INFO: loading wv recursively from C:\Users\Fasih Munir\Desktop\Knowledge Graphs Final Coursework\OWL2Vec-Star-IN3067-INM713\OWL2Vec-Star-master\cache\output\ontology.embeddings.config2.wv.* with mmap=r
INFO: setting ignored attribute cum_table to None
INFO: Word2Vec lifecycle event {'fname': 'C:\\Users\\Fasih Munir\\Desktop\\Knowledge Graphs Final Coursework\\OWL2Vec-Star-IN3067-INM713\\OWL2Vec-Star-master\\cache\\output\\ontology.embeddings.config2', 'datetime': '2024-05-12T10:30:17.331147', 'gensim': '4.3.1', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}


In [14]:
# Same token list as displayed earlier, shown again next to the configuration 2 similarity checks.
words_embeddings_config_2

['pizza',
 'type',
 'in',
 'Type',
 'located',
 'us',
 'ingredient',
 'city',
 'of',
 'chicken',
 'name',
 'subclassof',
 'restaurant',
 'address',
 'has',
 'state',
 'mushroom',
 'is',
 'pizzeria',
 'item',
 'SubClassOf',
 'value',
 'label',
 'st',
 'usd',
 'currency',
 'amount',
 'the',
 'and',
 'italian',
 'country',
 'bakers',
 'mellow',
 'grill',
 'served',
 'mexican',
 'serves',
 'supreme',
 'greek',
 'rd',
 'some',
 'kitchen',
 'meat',
 'n',
 'ave',
 'olathe',
 'original',
 'new',
 'e',
 'cafe',
 'giorgios',
 'york',
 'los',
 'street',
 'hawaiian',
 's',
 'den',
 'greeks',
 'zios',
 'bar',
 'cheese',
 'tx',
 'ca',
 'pepper',
 'little',
 'angeles',
 'place',
 'buffalo',
 'denver',
 'bbq',
 'mozzarella',
 'fire',
 'antonios',
 'onion',
 'vegetable',
 'village',
 'mi',
 'mount',
 'vegan',
 'white',
 'pub',
 'philadelphia',
 'sauce',
 'pepperoni',
 'seafood',
 'pasta',
 'marinara',
 'tomato',
 'brentwood',
 'bronson',
 'class',
 'super',
 'w',
 'pa',
 'johns',
 'green',
 'kennebunk'

In [15]:
# Cosine similarity for configuration 2.

# Word pairs expected to score as similar
similarity7, similarity8, similarity9 = [
    wv2.similarity(left, right)
    for left, right in (
        ('chicken', 'ingredient'),
        ('mushroom', 'onion'),
        ('philadelphia', 'brentwood'),
    )
]

# Word pairs expected to score as dissimilar
similarity10, similarity11, similarity12 = [
    wv2.similarity(left, right)
    for left, right in (
        ('pineapple', 'bacon'),
        ('country', 'pepperoni'),
        ('artichokes', 'oregano'),
    )
]

# Report: one headed group per expectation, separated by a rule
print('Expect to be Similar')
print(
    f"The cosine similarity between 'chicken' and 'ingredient' is {similarity7:.3f}",
    f"The cosine similarity between 'mushroom' and 'onion' is {similarity8:.3f}",
    f"The cosine similarity between 'philadelphia' and 'brentwood' is {similarity9:.3f}",
    sep="\n",
)
print('----------------------------')
print('Expect to be Not Similar')
print(
    f"The cosine similarity between 'pineapple' and 'bacon' is {similarity10:.3f}",
    f"The cosine similarity between 'country' and 'pepperoni' is {similarity11:.3f}",
    f"The cosine similarity between 'artichokes' and 'oregano' is {similarity12:.3f}",
    sep="\n",
)

Expect to be Similar
The cosine similarity between 'chicken' and 'ingredient' is 0.707
The cosine similarity between 'mushroom' and 'onion' is 0.730
The cosine similarity between 'philadelphia' and 'brentwood' is 0.427
----------------------------
Expect to be Not Similar
The cosine similarity between 'pineapple' and 'bacon' is 0.827
The cosine similarity between 'country' and 'pepperoni' is 0.555
The cosine similarity between 'artichokes' and 'oregano' is 0.677
