In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import os
import json

import numpy as np
import pandas as pd
from cognite.client import CogniteClient

from cognite.client.data_classes.three_d import ThreeDAssetMapping

from cognite.datastudio.entity_matcher import EntityMatcher

In [6]:
from utils import chunk_create_rules_df, chunk_predict, get_matches_with_rules

In [7]:
project = "akerbp"
api_key_name = "AKERBP_API_KEY"

In [9]:
# In case you need to add api-key
#from add_client_api_key import ClientApiKeyWidget
#client_api_key_widget = ClientApiKeyWidget(api_key_name=api_key_name, project=project)

In [10]:
client = CogniteClient(os.environ[api_key_name], project, "local-jupyter-notebook")

  debug=debug,


In [11]:
# define 3d model_id and revision
model_id = 1078941578276888
revision_id = 506407845865623

# define root_id for assets
root_id = 8129784932439587

#define a name to store thre predicted result
entity_matcher_results_file = "enma_skarv_fpso.json"

In [None]:
# we provide functions to install 3d-nodes, assets and asset-mappings
from data_load_cdf import load_assets, load_threednodes, filter_df_threednodes, load_asset_mappings

In [12]:
# download 3d nodes, it might take time
df_threednodes = load_threednodes(client, model_id, revision_id)
# filter the names of the 3d nodes that do not need contexualization
df_threednodes = filter_df_threednodes(df_threednodes, key_words=("EQUIPMENT", "BRANCH", "STRUCTURE", " of "))
df_threednodes.rename(columns={"name": "left_side_name"}, inplace=True)

Loading data from local...
Filtering 3D-nodes:
361909 initially loaded
361909 after filtering on empty name
361909 after dropping duplicates
361909 after filtering on EQUIPMENT
361909 after filtering on BRANCH
361909 after filtering on STRUCTURE
361909 after filtering on  of 
Loading data from local...


In [None]:
# download assets
df_assets = load_assets(client, root_id).rename(columns={"name": "right_side_name"})

In [13]:
#download existing asset mappings from the 3d model
df_asset_mappings = load_asset_mappings(client, model_id, revision_id)

Loading data from local...


In [14]:
# Since df_asset_mappings includes only IDs,
# in order to get the names we join on the 
# df_assets, df_threednodes including available respective IDs and names.
df_existing_matches = (
        df_asset_mappings[["nodeId", "assetId"]]
        .merge(
            df_assets[["id", "right_side_name"]],
            how="left",
            left_on="assetId",
            right_on="id",
        )
        .drop(columns="id")
        .merge(
            df_threednodes[["id", "left_side_name"]],
            how="left",
            left_on="nodeId",
            right_on="id",
        )[["left_side_name", "right_side_name"]]
    )

In [15]:
#initialize the entity matcher
entity_matcher = EntityMatcher(client)

In [98]:
# create rules form the existing mappings if exist
df_matches = \
    df_existing_matches[["left_side_name", "right_side_name"]]\
    .dropna()\
    .rename(columns = {"left_side_name": "input", "right_side_name": "predicted"})
df_matches["score"] = 1.0

pd_rules_from_existing = chunk_create_rules_df(entity_matcher, df_matches.to_dict('records'), size=100000)

Finished with chunk nr. 0 .


In [68]:
# make predictions 
if os.path.exists(entity_matcher_results_file):
    print("Loading predicted from local...")
    with open(entity_matcher_results_file, "r") as f:
        predicted_matches = json.load(f)
else:
    model = entity_matcher.fit(df_assets["right_side_name"].tolist())
    predicted_matches = chunk_predict(model, df_threednodes["left_side_name"].tolist(), 100000)
    # store all predictions in a file
    with open(entity_matcher_results_file, "w") as f:
        json.dump(predicted_matches, f)

Finished with chunk nr. 0 .
Finished with chunk nr. 1 .
Finished with chunk nr. 2 .
Finished with chunk nr. 3 .


In [133]:
# predicted_matches filter on NAs, 
# NEEDreset index to match order before creating rules
df_predicted_matches = pd.DataFrame.from_dict(predicted_matches).dropna().reset_index(drop=True)

In [109]:
# create rules for predicted matches
pd_rules_from_predicted = chunk_create_rules_df(entity_matcher, df_predicted_matches.to_dict('records'), size=100000)

Finished with chunk nr. 0 .
Finished with chunk nr. 1 .
Finished with chunk nr. 2 .
Finished with chunk nr. 3 .


In [142]:
# associate matches with rules
df_predicted_with_rules = get_matches_with_rules(df_predicted_matches, pd_rules_from_predicted)

In [149]:
# assosicate predicted results with IDs
df_predicted_results_raw = df_predicted_with_rules\
    .merge(df_assets, left_on="predicted", right_on="right_side_name", how="inner")\
    .drop(columns=["right_side_name"])\
    .rename(columns={"id":"asset_id"})\
    .merge(df_threednodes, left_on="input", right_on="left_side_name", how="inner")\
    .drop(columns=["left_side_name"])\
    .rename(columns={"id":"node_id"})
df_predicted_results_raw.sample(5)

Unnamed: 0,input,predicted,score,inputPattern,predictPattern,numMatches,avgScore,asset_id,node_id
273641,/PS-P300-0407/SPNT,PS-P300-2772,0.75,/[L1]-[L2][D3]-D/L,[L1]-[L2][D3]-D,15536,0.755997,606648530772463,8835667779331163
111869,/PS-U800-2727/FRAM-02/MAIN,U800-S-3000,0.40825,/L-[L1][D2]-D/L-D/L,[L1][D2]-L-D,3778,0.435028,381752738470181,3963993522999103
164643,/NOZZ-EQP-45-30-14,30,0.57735,/L-L-D-[D1]-D,[D1],42,0.58044,4363862858249661,8047697019963956
225113,/PS-P500-0514/FRAM-01/MAIN/TMPL/BEAM-A,84-EB-500A-P01,0.61721,/L-[L1][D2]-D/L-[D3]/L/L/L-[L4],D-L-[D2][L4]-[L1][D3],13087,0.619927,1114763551659361,4849656034816847
63450,/PS-P500-0803/FRAM-01/CONN,PS-P500-1219,0.67082,/[L1]-[L2][D3]-D/L-D/L,[L1]-[L2][D3]-D,26928,0.651369,222810173061777,7069670572936062


In [150]:
# comment or uncomment for different filtering
df_predicted_results = df_predicted_results_raw.copy()

# filter on the score value 
df_predicted_result = df_predicted_result[df_predicted_result["score"] > 0.0]

# filter on the avgScore value
df_predicted_result = df_predicted_result[df_predicted_result["avgScore"] > 0.0]

# filter by the number of matcher per rule
df_predicted_result = df_predicted_result[df_predicted_result["numMatches"] > 0]

# filter by merging on existing rules only
df_predicted_result = df_predicted_result.merge(pd_rules_from_existing\
    .rename(columns={"numMatches": "numMatchesExisting"})
    .drop(columns=["avgScore","matchIndex"]), on=["inputPattern", "predictPattern"],
    how="inner")

# filter out input with existing asset mappings
df_predicted_result = df_predicted_result\
    .merge(df_existing_matches.rename(columns={"right_side_name": "existing_match"}), left_on=["input"], right_on=["left_side_name"], how="left")\
    .drop(columns=["left_side_name"])
df_predicted_result= df_predicted_result[df_predicted_result["existing_match"].isna()]

# filter based on a list of manual rules
"""
rules_from_list = [("/[D1]-[L2]-[D3]", "[D1]-[L2]-[D3]")]
def get_rule_tuple(row):
    return (row["inputPattern"], row["predictPattern"])

df_predicted_result = df_predicted_result[df_predicted_result.apply(get_rule_tuple, axis=1)\
    .isin(rules_from_list)]
"""

df_predicted_result.sample(5)

Unnamed: 0,input,predicted,score,inputPattern,predictPattern,numMatches,avgScore,asset_id,node_id,numMatchesExisting,existing_match
1117,/46-C-6006..,46-C-6006,1.0,/[D1]-[L2]-[D3],[D1]-[L2]-[D3],8691,1.0,4617634349308497,5371319586913922,8645,
8660,/65-CT-701.,65-CT-701,1.0,/[D1]-[L2]-[D3],[D1]-[L2]-[D3],8691,1.0,4580658744499941,7057340498779513,8645,
27093,/29-CZ-001/WETCOG,40-CZ-001,0.66667,/D-[L1]-[D2]/L,D-[L1]-[D2],15,0.642851,926483281787503,8056766072949524,1,
8897,/13-V-6090.,13-V-6090,1.0,/[D1]-[L2]-[D3],[D1]-[L2]-[D3],8691,1.0,2448976479245900,2579392606278530,8645,
21663,/27-V-6019,27-V-1543,0.66667,/[D1]-[L2]-D,[D1]-[L2]-D,106,0.66667,940266011560,5699151852329016,1,


In [153]:
# Create list of dictionaries to create ThreeDAssetMapping
resulting_asset_mappings =list(df_predicted_result[["node_id","asset_id"]].T.to_dict().values())
print(len(resulting_asset_mappings))
resulting_asset_mappings

166


[{'node_id': 1388685240252594, 'asset_id': 6058605378531024},
 {'node_id': 3644220755669401, 'asset_id': 332221397725721},
 {'node_id': 5371319586913922, 'asset_id': 4617634349308497},
 {'node_id': 8658613293743961, 'asset_id': 7513670350185809},
 {'node_id': 6178893682177146, 'asset_id': 8328160876101782},
 {'node_id': 521307451842620, 'asset_id': 3616266713471915},
 {'node_id': 6245595512554181, 'asset_id': 1329591692274365},
 {'node_id': 8890325298752301, 'asset_id': 8546797805361910},
 {'node_id': 1592669954683812, 'asset_id': 8242396260938696},
 {'node_id': 8063511847194956, 'asset_id': 8433968346665436},
 {'node_id': 905691562238341, 'asset_id': 7788302680161738},
 {'node_id': 2596085698585769, 'asset_id': 8879001088864882},
 {'node_id': 6501567433145041, 'asset_id': 6021432548043362},
 {'node_id': 5317066354543971, 'asset_id': 3301957454959280},
 {'node_id': 268321040081547, 'asset_id': 5506288362746893},
 {'node_id': 6503287689645967, 'asset_id': 6174613176572695},
 {'node_id':

In [154]:
# Create ThreeDAssetMappings
cdf_asset_mappings = []
for asset_mapping_dict in resulting_asset_mappings:
    cdf_asset_mappings.append(ThreeDAssetMapping(**asset_mapping_dict))

In [None]:
# Uncomment to write to clean:
#client.three_d.asset_mappings.create(model_id, revision_id, cdf_asset_mappings)