## Example: running the pipelines to create an ontology

In [1]:
from notebooks import *
#from pathlib import Path

### Define parameters to run

- `data_file`: Path to the raw data file (CSV)
- `data_path`: Folder where the processed data is stored
- `out_file`: Path of the JSON entities file generated from the raw data
- `domain`: Base domain (IRI prefix) used for the ontology
- `ontology_name`: Name of the ontology
- `ontology_path`: Folder where the generated ontology is stored

In [2]:
# Data locations: the raw CSV and the generated entities JSON both live in data_path
data_path = '../data/canadacities'
data_file = f'{data_path}/canadacities.csv'
out_file = f'{data_path}/canada_provinces.json'

# Ontology identity; the ontology folder is named after the ontology so that
# the OWL file ends up as <ontology_path>/<ontology_name>.owl
domain = 'http://bast.ai'
ontology_name = 'canada_city'
ontology_path = f'../ontologies/{ontology_name}'

### Make sure the folders are created

In [3]:
# Create the data folder first, then the ontology folder, if they are missing
for required_folder in (data_path, ontology_path):
    FileIO.exists_or_create(required_folder)

### Initialize creation of ontology

In [4]:
from bast_ai_buildowl.svc import InitializeOwl
# Configure the initializer with the parameters defined above, then run it
owl_initializer = InitializeOwl(
    domain=domain,
    owl_name=ontology_name,
    output_path=ontology_path,
)
owl_initializer.run()

../ontologies/canada_city/canada_city.owl
File: ../ontologies/canada_city/canada_city.owl is already exist.
Pipeline will do nothing.
Owl base: http://bast.ai/canada_city


In [5]:
def cleanse(input_str: str) -> str:
    """Turn free text into an underscore-separated entity name.

    Spaces, commas, ampersands, straight and curly apostrophes, double
    quotes and slashes each become an underscore, the connective ``_and_``
    is reduced to a single underscore, and runs of underscores are
    collapsed to one. Letter case and all other characters are left as-is.
    """
    # Applied strictly in this order: "_and_" is matched after whitespace
    # and separators have been converted, but before quotes and slashes are.
    substitutions = (
        (" ", "_"),
        (",", "_"),
        ("&", "_"),
        ("_and_", "_"),
        ("'", "_"),
        ('"', "_"),
        ("/", "_"),
        ("’", "_"),
    )

    cleaned = input_str
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # A single pass only halves each run of underscores, so repeat until stable
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")

    return cleaned

In [6]:
def get_label(cleansed_str: str) -> str:
    """Build a display label from a cleansed entity name.

    Underscores and hyphens are replaced by spaces and the result is passed
    through ``TextUtils.title_case``.
    """
    spaced = cleansed_str.replace("_", " ").replace("-", " ")
    return TextUtils.title_case(spaced)

In [7]:
# def get_synonyms (input_str: str,
#                   prefix_str: str = None) -> list:
#     result = list()
#     tokens = re.split("&|,| and ",input_str)
#     for token in tokens:
#         token = token.strip()
#         if (prefix_str):
#             result.append(prefix_str + " " + token)
#         else:
#             result.append(token)
#     if ( len(result) > 1):
#         return result
#     else: 
#         return []

### Prepare data files to put in ontology

In [8]:
import pandas as pd
import json

# Load the raw rows. pandas.read_csv treats the first line as the header by
# default, so `records` holds one list of column values per data row.
source_file = data_file
records = pd.read_csv(source_file).values.tolist()

# The ontology starts with a single root node; provinces are attached to it later
out_data = [{"entity": "canada_provinces", "label": "Canada Provinces"}]

# province_id -> {'province_id', 'province_name', 'cities': [{city, city_ascii}, ...]}
provinces = {}

for row in records:
    # Columns are read by position: city, city_ascii, province_id, province_name
    city, city_ascii, province_id, province_name = row[0], row[1], row[2], row[3]

    # Skip any row that is just the header text repeated
    if province_id == 'province_id':
        continue

    # First city seen for a province creates its entry; later ones reuse it
    province_entry = provinces.setdefault(province_id, {
        'province_id': province_id,
        'province_name': province_name,
        'cities': []
    })
    province_entry['cities'].append({"city": city, "city_ascii": city_ascii})

# Province-code normaliser (separate from the `cleanse` helper used for city names)
def cleanser(entity_name):
    """Return `entity_name` lower-cased, with spaces turned into underscores
    and single quotes removed."""
    underscored = entity_name.replace(" ", "_")
    without_quotes = underscored.replace("'", "")
    return without_quotes.lower()

# Province entities already emitted. Two province ids could cleanse to the
# same entity name, so de-duplicate on the cleansed value rather than the raw id.
processed_provinces = set()

# Flatten the province -> cities grouping into parent/child entity records
for province_id, province_data in provinces.items():
    # Entity name for the province, derived from its id (lower-cased by cleanser)
    province_code_cleanse = cleanser(province_data['province_id'])

    # Emit the province once, as a child of the root node
    if province_code_cleanse not in processed_provinces:
        out_data.append({
            "entity": province_code_cleanse,
            "parentEntity": "canada_provinces",
            "label": province_data["province_name"]
        })
        processed_provinces.add(province_code_cleanse)

    # Emit every city as a child of its province
    for city_data in province_data["cities"]:
        city_cleanse = cleanse(city_data["city"])
        out_data.append({
            "entity": city_cleanse,
            "parentEntity": province_code_cleanse,
            # `label` is a plain string, like the root and province records;
            # the original (possibly accented) spelling goes into the
            # `synonyms` list. Previously the two types were swapped.
            "label": city_data["city_ascii"],
            "synonyms": [city_data["city"]]
        })
        # NOTE(review): cities with the same name in different provinces get
        # the same "entity" value - confirm how AddEntities treats duplicates.

# ensure_ascii=False keeps accented characters readable instead of \uXXXX escapes
with open(out_file, 'w', encoding='utf-8') as f:
    json.dump(out_data, f, indent=4, ensure_ascii=False)

# Report where the entities file was written
print(f"Output written in this file: {out_file}")


Output written in this file: ../data/canadacities/canada_provinces.json


In [9]:
# NOT DOING RIGHT NOW
from bast_ai_buildowl.svc import AddEntities
# The OWL file sits inside the ontology folder and is named after the ontology
owl_file = f'{ontology_path}/{ontology_name}.owl'
AddEntities(owl_path=owl_file).run(entities_file=out_file)