# P&ID files contextualization
This notebook contains a workflow for contextualizing P&ID files in PDF format.

Authors: Alina Astrakova and Jan Inge Bergseth

In [None]:
import os
import urllib
import urllib.request
from getpass import getpass

from cognite.client.data_classes.files import FileMetadataUpdate
from cognite.experimental import CogniteClient

# Initialize

In [None]:
# Name of the project to connect to ("project" is a placeholder value).
project = "project"
# Prompt for the API key interactively so that it is not written into the notebook.
api_key = getpass()

In [None]:
# Create the client used by every following cell, authenticated with the
# API key and project entered above.
client = CogniteClient(api_key=api_key, project=project, client_name="dshub")
# Last expression of the cell, so the notebook displays the returned login
# status (presumably a quick check that the credentials were accepted).
client.login.status()

# Download and prepare data

In [None]:
# Download the assets of the subtree identified by root_id.
# The id below is specific to one project; replace it with your own root asset id.
root_id = 2932685165441395
assets = client.assets.retrieve_subtree(root_id)

In [None]:
# Map each asset name to the list of ids of all assets with that name.
# An asset name can correspond to several assets, hence a list per name.
asset_id_dict = {}
for asset in assets:
    # setdefault creates the empty list the first time a name is seen, so the
    # first and every later id are stored the same way.
    asset_id_dict.setdefault(asset.name, []).append(asset.id)

In [None]:
# Select the PDF files to process, filtered on their source field
source = "some_source"
pdf_filter = {"mime_type": "application/pdf", "source": source}
files = client.files.list(limit=-1, **pdf_filter)
print(f"Number of files: {len(files)}")

In [None]:
# Create entities based on asset names.
# Several assets can share one name, so duplicates are dropped before the
# names are handed to the parser; dict.fromkeys keeps the first-seen order.
entities = list(dict.fromkeys(asset.name for asset in assets))
print("Number of entities:", len(entities))

In [None]:
# Parse files, create SVGs with the found assets, contextualize the original
# files in CDF, and upload the new SVGs to CDF.
#
# For every file in `files` this cell:
#   1. runs a P&ID parsing job with `entities` as the search terms,
#   2. translates the detected texts to asset ids through `asset_id_dict`,
#   3. queues an asset-link update for the original file (the queue is only
#      filled here; nothing is sent to CDF for the original files in this cell),
#   4. downloads the annotated SVG into the current working directory and
#      uploads it to CDF linked to the same assets.

# Queued updates for the original files
file_metadata_updates = []

# Source for the new SVGs
svg_source = "contextualization"

# MIME type for the new SVGs
svg_mime_type = "image/svg+xml"

# Overwrite the SVG file if it has already been uploaded
svg_overwrite = True

for file in files:
    print(f"Parse and convert P&ID to SVG, input file: {file.name}")

    # Run the pnid_parse job
    job = client.pnid_parsing.parse(file_id=file.id, entities=entities, partial_match=True)
    result = job.result

    # URL of the SVG with the detected entities highlighted
    svg_url = result["svgUrl"]

    # Detected entities together with their bounding boxes
    items = result["items"]

    # Keep only the detected texts, without duplicates. Sorting makes the
    # printed list and the uploaded asset ids reproducible between runs.
    entities_found = sorted({item["text"] for item in items})
    if not entities_found:
        # Skip to the next file if no assets are found
        continue

    # Collect the asset ids for the detected texts. One asset name can
    # correspond to several asset ids. A detected text that is not a key of
    # asset_id_dict is reported and skipped instead of aborting the whole run
    # with a KeyError.
    asset_ids_found = set()
    unmatched = []
    for entity in entities_found:
        ids_for_entity = asset_id_dict.get(entity)
        if ids_for_entity is None:
            unmatched.append(entity)
        else:
            asset_ids_found.update(ids_for_entity)

    asset_names = ','.join(map(str, entities_found))
    print(f"\tFound assets: {asset_names}")
    if unmatched:
        print(f"\tNo asset with this exact name, skipped: {','.join(map(str, unmatched))}")
    if not asset_ids_found:
        # Nothing to link; do not queue an update that would set an empty
        # asset list on the original file.
        continue

    asset_ids = sorted(asset_ids_found)

    # Queue the update for the original file.
    # NOTE(review): .set() replaces whatever asset links the file already
    # has - confirm that this is intended rather than adding to them.
    file_metadata_update = FileMetadataUpdate(id=file.id).asset_ids.set(asset_ids)
    file_metadata_updates.append(file_metadata_update)

    # Add metadata to the new SVG, e.g. the source of the original file
    svg_metadata = {"OriginalSource": file.source}

    # Keep the name of the file, replacing the extension
    name_svg = os.path.splitext(file.name)[0] + ".svg"

    # Give the SVG a meaningful external_id. A file without an external_id
    # has None here, which cannot be concatenated with a string, so fall back
    # to the numeric id of the original file.
    base_external_id = file.external_id if file.external_id is not None else str(file.id)
    external_id_svg = base_external_id + "+svg"

    # Download the SVG file from the URL (written to the working directory)
    urllib.request.urlretrieve(svg_url, name_svg)

    # Upload the SVG with the same asset_ids
    client.files.upload(
        name_svg,
        external_id=external_id_svg,
        asset_ids=asset_ids,
        mime_type=svg_mime_type,
        source=svg_source,
        overwrite=svg_overwrite,
        metadata=svg_metadata,
    )
    print("... finished.\n")

In [None]:
# Update the metadata of the existing files: send the asset-link updates
# collected in file_metadata_updates to CDF in a single call.
client.files.update(file_metadata_updates)