## Example process of running pipelines to create ontology

In [1]:
from notebooks import *

### Define parameters to run

- `data_file`: Path to the raw data file
- `data_path`: Folder where the processed data is stored
- `domain`: Domain to use for the ontology
- `ontology_name`: Name of the ontology
- `ontology_path`: Folder where the generated ontology is stored

In [2]:
# Raw data file location; left empty here and not read by the cells shown below.
data_file = ''
# Folder holding the processed data files.
data_path = '../data/uscities'

# Ontology settings: domain, ontology name, and the folder it is written to.
domain = 'http://bast.ai'
ontology_name = 'us_city'
ontology_path = '../ontologies/us_city'

### Make sure the folders are created

In [None]:
# Per the section heading, make sure both working folders are in place before
# any later cell reads from or writes to them.
# NOTE(review): FileIO is not defined in the cells shown here; presumably it
# arrives via `from notebooks import *` -- confirm.
FileIO.exists_or_create(data_path)
FileIO.exists_or_create(ontology_path)

### Initialize creation of ontology

In [None]:
from bast_ai_buildowl.svc import InitializeOwl
# Configure the initializer from the notebook parameters, then run it.
owl_initializer = InitializeOwl(
    domain=domain,
    owl_name=ontology_name,
    output_path=ontology_path,
)
owl_initializer.run()

In [5]:
def cleanse(input_str: str) -> str:
    """Turn a free-text name into an underscore-separated identifier.

    Spaces, commas and ampersands become underscores, every ``_and_`` is
    reduced to a single underscore, and then single quotes, double quotes and
    slashes become underscores as well. Finally each run of underscores is
    squeezed down to one.

    Args:
        input_str: The text to convert.

    Returns:
        The converted text; the input itself is not modified.
    """
    # The two character groups are kept apart on purpose: "_and_" is matched
    # before quotes and slashes have been turned into underscores.
    text = input_str.translate(str.maketrans(" ,&", "___"))
    text = text.replace("_and_", "_")
    text = text.translate(str.maketrans("'\"/", "___"))

    # Keep halving underscore runs until nothing changes any more.
    while True:
        squeezed = text.replace("__", "_")
        if squeezed == text:
            return text
        text = squeezed

In [6]:
def get_label(cleansed_str: str) -> str:
    """Build a display label from a cleansed identifier.

    Underscores and hyphens are replaced by spaces, and the spaced text is
    then passed through ``TextUtils.title_case``.

    Args:
        cleansed_str: Identifier text, e.g. the output of ``cleanse``.

    Returns:
        Whatever ``TextUtils.title_case`` returns for the spaced text.
    """
    spaced = cleansed_str.replace("_", " ").replace("-", " ")
    return TextUtils.title_case(spaced)

In [7]:
def get_synonyms (input_str: str,
                  prefix_str: str = None) -> list:
    """Split a compound name into its parts and return them as synonyms.

    The input is split on ``&``, ``,`` and the word ``and`` surrounded by
    spaces. Each piece is stripped of surrounding whitespace and blank pieces
    are discarded.

    Args:
        input_str: The compound name, e.g. ``"Arts, Crafts, and Sewing"``.
        prefix_str: Optional text put in front of every part, separated by a
            single space. ``None`` or an empty string means no prefix.

    Returns:
        The list of parts (prefixed if requested) when the name has more than
        one non-blank part; otherwise an empty list, because a single part is
        not a synonym of anything.
    """
    pieces = (piece.strip() for piece in re.split("&|,| and ", input_str))

    # "A, B, and C" yields an empty piece between "," and " and ", and a
    # trailing separator yields one at the end; neither is a real synonym.
    parts = [piece for piece in pieces if piece]

    if prefix_str:
        parts = [prefix_str + " " + part for part in parts]

    return parts if len(parts) > 1 else []

### Prepare data files to put in ontology

In [8]:
## Prepare States data
#
# Reads the raw city rows, groups them as state -> county -> cities, and
# writes a flat list of entity records (root, states, counties, cities) as
# JSON for the ontology build step.

source_file = f'{data_path}/uscities.csv'
out_file = f'{data_path}/us_states.json'

records = FileIO.read_lines(source_file)

# The root entity every state hangs off.
out_data = [
    {   "entity": "us_state",
        "label": "US State",
    }
]

# state_id -> {'state_id', 'state_name', 'counties': {county_id -> {'name', 'cities'}}}
states = dict()

for r in records:

    # A blank line (e.g. at the end of the file) has no columns to index.
    if not r.strip():
        continue

    # NOTE(review): a plain split assumes no field is quoted or contains a
    # comma -- confirm against the actual uscities.csv.
    tokens = r.split(',')
    city = tokens[0]
    state_id = tokens[2]
    state_name = tokens[3]
    county_id = tokens[4]
    county = tokens[5]

    # The header row carries the literal column name in the state_id column.
    if state_id == 'state_id':
        continue

    # The first row seen for a state / county decides its stored name.
    state_entry = states.setdefault(
        state_id,
        {
            'state_id': state_id,
            'state_name': state_name,
            'counties': dict(),
        },
    )
    county_entry = state_entry['counties'].setdefault(
        county_id,
        {
            "name": county,
            "cities": set(),
        },
    )
    county_entry["cities"].add(city)

# Parents are always appended before their children.
for state_code, state_entry in states.items():
    out_data.append (
        {
            "entity": state_code,
            "parentEntity": "us_state",
            "label": state_entry["state_name"],
        }
    )

    for county_code, county_entry in state_entry["counties"].items():
        county_name = county_entry["name"]
        out_data.append (
            {
                "entity": county_code,
                "parentEntity": state_code,
                "label": f"{county_name} county",
            }
        )

        # Sets have no stable order across runs; sorting keeps the written
        # file identical from one run to the next.
        for city in sorted(county_entry["cities"]):
            # NOTE(review): two cities with the same name in different
            # counties get the same entity id here -- confirm that is intended.
            out_data.append (
                    {
                        "entity": cleanse(city),
                        "parentEntity": county_code,
                        "label": city,
                    }
                )

FileIO.write_json (out_data, out_file)


In [9]:
from bast_ai_buildowl.svc import AddEntities
# Pass the entities JSON written above (out_file) to AddEntities for the
# ontology's .owl file.
owl_file = f'{ontology_path}/{ontology_name}.owl'
AddEntities(owl_path=owl_file).run(entities_file=out_file)
