In [2]:
import os
import sys
import logging
import warnings
import pandas as pd

from pprint import pprint
from cocoon.core.llm.bedrock import BedrockLLM
from cocoon.core.embeddings.bedrock import BedrockEmbeddings
from cocoon.search_service import find_similar_indices, find_entity_relation_matches_and_cluster
from cocoon.embedding_service import initialize_faiss_index_from_embeddings, create_embeddings
from cocoon.output_service import generate_output

# Silence every Python warning for the rest of the session, and route
# INFO-and-above log records to stdout so they appear in the cell output.
warnings.simplefilter("ignore")
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format="%(levelname)s - %(message)s",
)

## 1. Basic setup

In [3]:
# Point the AWS_DEFAULT_PROFILE environment variable at the named profile
# that the rest of this notebook should run under.
os.environ.update(AWS_DEFAULT_PROFILE="sagar-wip-cli")

In [4]:
# Basic config for path/location settings.
# Everything a reader might need to tune lives here: the embedding model tag
# used in the reference file name, and the folder / base name of the input CSV.
output_model_name = "titan_g1"

folder_name = "testing"
input_file_name = "EOS_Run1"

# Single source of truth for the bucket and top-level prefix, so moving the
# data only requires editing one line.
s3_base_uri = "s3://esgflo-wip-eu-dev-rightwhale-vault-store/scope3"
# Common stem shared by the input file and every artefact derived from it.
input_file_stem = f"{s3_base_uri}/{folder_name}/{input_file_name}"

files = {
    # Pre-computed embeddings for NAICS dataset
    "embeddings": f"{s3_base_uri}/naics/{output_model_name}_embeddings_output.csv",

    # Location of input csv file
    "data": f"{input_file_stem}.csv",
    # Location of embeddings computed based on the input csv file
    "embedded_data": f"{input_file_stem}_embedded.csv",
    # Location of the output csv file
    "output": f"{input_file_stem}_output.csv",
}

## 2. Read Files

This section reads the input file and the pre-computed embeddings of the NAICS database from the locations configured above.

### 2.1. Input File

In [5]:
# Read the data file and keep only its last two columns, which are then
# renamed and reordered to (group_name, description1).
data_df = pd.read_csv(files["data"], encoding="latin-1").iloc[:, -2:]
data_df.columns = ["description1", "group_name"]
# .copy() gives an independent frame, so adding `label` below does not write
# to a derived slice.
data_df = data_df[["group_name", "description1"]].copy()

# Extract the columns to use in the future (captured before `label` is added,
# so it holds only the two source columns)
columns_to_use = data_df.columns

# Create a label i.e., a concatenation of all the useful columns.
# Blank CSV cells are read as NaN (a float), and ",".join raises TypeError on
# anything that is not a string, so the join runs on a NaN-free, all-string
# view. The source columns themselves are left untouched.
data_df["label"] = (
    data_df[columns_to_use]
    .fillna("")
    .astype(str)
    .apply(",".join, axis=1)
)

# Display the top 5 rows
data_df.head()

INFO - Credentials found in config file: ~/.aws/config


Unnamed: 0,group_name,description1,label
0,Drives,X-axis drive,"Drives,X-axis drive"
1,Other accessories,Z-guide used,"Other accessories,Z-guide used"
2,Modular assemblies,Dosing system,"Modular assemblies,Dosing system"
3,Coolers,Cooling system air-cooled 1.5kW complete,"Coolers,Cooling system air-cooled 1.5kW complete"
4,Scanner,Scanner head system,"Scanner,Scanner head system"


### 2.2. Pre-computed Embeddings of NAICS

In [6]:
# Read the pre-computed NAICS embeddings table from the configured location
reference_df = pd.read_csv(filepath_or_buffer=files["embeddings"])

In [7]:
# Preview the first five reference rows
reference_df.head(n=5)

Unnamed: 0,label,index_ids,2017 NAICS Title,embedding
0,Abrasive Product Manufacturing,[292],Abrasive Product Manufacturing,"[0.828125, 0.47851562, 0.32421875, 0.3125, -0...."
1,Adhesive Manufacturing,[253],Adhesive Manufacturing,"[0.8046875, -0.018798828, 0.21289062, 1.328125..."
2,Administrative Management and General Manageme...,[800],Administrative Management and General Manageme...,"[-0.14453125, -0.24609375, 0.41015625, -0.3828..."
3,Advertising Agencies,[811],Advertising Agencies,"[-0.3515625, 0.203125, 0.38671875, 0.58203125,..."
4,Advertising Material Distribution Services,[817],Advertising Material Distribution Services,"[0.3359375, 0.19042969, 0.088378906, 0.5429687..."


## 3. Create embeddings of the `label` column in the input data

In [8]:
# Set up the embedding client.
# The Titan-G1 text embedding model on Bedrock is used here.

embed_model_settings = {
    "model_id": "amazon.titan-embed-text-v1",
    "aws_region_name": "eu-central-1",
    "dims": 1536,
}
embed_model = BedrockEmbeddings(**embed_model_settings)

INFO - Credentials found in config file: ~/.aws/config


In [9]:
# Embed the `label` column of the input data.
# The resulting embeddings dataframe is saved at `output_csv_filepath`.

input_embedded_df = create_embeddings(
    df=data_df,
    embed_model=embed_model,
    output_csv_filepath=files["embedded_data"],
)

INFO - 'amazon.titan-embed-text-v1' used for Embedding Text
INFO - All labels embedded and CSV updated.


In [10]:
# Preview the first five embedded input rows
input_embedded_df.head(n=5)

Unnamed: 0,label,index_ids,group_name,description1,embedding
0,"AMCM Customized System M,Customized M 290-1 1k...",[5],AMCM Customized System M,Customized M 290-1 1kW Digital Can,"[-0.140625, 0.42578125, -0.439453125, -0.03295..."
1,"CO_ laser,CO2 laser FSV30SAG (30W, AIR, OEM)",[21],CO_ laser,"CO2 laser FSV30SAG (30W, AIR, OEM)","[-0.16015625, 0.53125, 0.34375, -0.146484375, ..."
2,"Coolers,Cooler 1.5kW air/water cooler complete",[31],Coolers,Cooler 1.5kW air/water cooler complete,"[-0.609375, 1.1171875, -0.63671875, -0.1455078..."
3,"Coolers,Cooler 4.5kW air/water cooler complete",[18],Coolers,Cooler 4.5kW air/water cooler complete,"[-0.5, 1.078125, -0.75390625, -0.2119140625, -..."
4,"Coolers,Cooling system air-cooled 1.5kW complete",[3],Coolers,Cooling system air-cooled 1.5kW complete,"[-0.267578125, 1.453125, -0.5390625, 0.2470703..."


## 4. Initialize FAISS Index from NAICS Embeddings

In [11]:
# Build the FAISS index from the NAICS reference embeddings
index = initialize_faiss_index_from_embeddings(df=reference_df, embed_model=embed_model)

## 5. Find Similar Indices between Input Label Embeddings and NAICS Embeddings

In [12]:
# Look up, for every embedded input row, its nearest entries in the NAICS index.
# Returns the distances and the matching index positions as two arrays.
distances, indexes = find_similar_indices(input_embedded_df, index)

In [13]:
# Show the first five rows of each result array, distances first
display(distances[:5], indexes[:5])

array([[350.10736, 351.23447, 352.7542 , 370.46704, 388.22443, 389.4735 ,
        392.26947, 393.13736, 394.7958 , 394.80554],
       [425.91248, 451.0672 , 459.53387, 468.84985, 477.5291 , 477.86914,
        485.52667, 495.46918, 498.49567, 498.9358 ],
       [397.44446, 424.2782 , 427.17703, 433.45987, 443.7939 , 470.3966 ,
        473.03143, 484.15982, 486.6496 , 499.37234],
       [398.2353 , 422.08936, 431.43433, 433.5531 , 439.87164, 467.75195,
        473.90787, 486.2905 , 492.7293 , 501.32294],
       [392.77338, 434.71942, 438.53342, 452.29214, 453.3752 , 477.81686,
        494.00903, 502.22293, 508.9557 , 525.5472 ]], dtype=float32)

array([[ 33,  30,   8,  24, 494, 798, 902, 431, 425,  83],
       [  8, 425,  33,  24,  30,  34, 431, 978, 508, 737],
       [  8, 756, 994, 912, 809, 978, 425,  33, 534, 270],
       [  8, 756, 912, 994, 809, 425, 978,  33, 534, 270],
       [  8, 912, 994, 978, 425, 756, 809,  33, 534,  30]])

## 6. Perform Entity Relation Match and Clustering between Input Embeddings <> NAICS Embeddings <> Similar Indices

In [14]:
# Initialize the LLM model
# We're using the Claude Sonnet from AWS Bedrock
#
# Temperature is set to 0 so that entity matching is as repeatable as the
# model allows; at 1 the model samples freely and the matches change on
# every run of the notebook.
# NOTE(review): the region here is us-east-1 while the embedding model above
# uses eu-central-1 — confirm this is intentional.

llm = BedrockLLM(
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    region_name="us-east-1",
    model_kwargs={
        "temperature": 0
    }
)

In [15]:
# Run the LLM-based entity matching over the nearest-neighbour candidates.
# The return value is not captured: the later preview of `input_embedded_df`
# shows a new `matches` column, i.e. the input frame is updated by the call.
# NOTE(review): the keyword is spelled `refernece_df`; the recorded run
# completed with this spelling, so it presumably mirrors the callee's
# signature — verify before renaming it here.
find_entity_relation_matches_and_cluster(
    input_df=input_embedded_df,
    similar_indexes=indexes,
    refernece_df=reference_df,
    llm=llm,
    columns_to_use=columns_to_use,
    verbose=False,
)

INFO - Begin processing entity match for 43 rows
INFO - 32 rows remain...
INFO - 21 rows remain...
INFO - 10 rows remain...
INFO - Completed processing entity match


In [16]:
# Preview the first five rows now that the entity relation match has run
input_embedded_df.head(n=5)

Unnamed: 0,label,index_ids,group_name,description1,embedding,matches
0,"AMCM Customized System M,Customized M 290-1 1k...",[5],AMCM Customized System M,Customized M 290-1 1kW Digital Can,"[-0.140625, 0.42578125, -0.439453125, -0.03295...",{'Input Entity Guess': 'The input entity appea...
1,"CO_ laser,CO2 laser FSV30SAG (30W, AIR, OEM)",[21],CO_ laser,"CO2 laser FSV30SAG (30W, AIR, OEM)","[-0.16015625, 0.53125, 0.34375, -0.146484375, ...",{'Input Entity Guess': 'The input entity appea...
2,"Coolers,Cooler 1.5kW air/water cooler complete",[31],Coolers,Cooler 1.5kW air/water cooler complete,"[-0.609375, 1.1171875, -0.63671875, -0.1455078...",{'Input Entity Guess': 'The input entity seems...
3,"Coolers,Cooler 4.5kW air/water cooler complete",[18],Coolers,Cooler 4.5kW air/water cooler complete,"[-0.5, 1.078125, -0.75390625, -0.2119140625, -...",{'Input Entity Guess': 'The input entity appea...
4,"Coolers,Cooling system air-cooled 1.5kW complete",[3],Coolers,Cooling system air-cooled 1.5kW complete,"[-0.267578125, 1.453125, -0.5390625, 0.2470703...",{'Input Entity Guess': 'The input entity seems...


## 7. Generate output

In [18]:
# Turn the matched dataframe into the final output structure.
# Per the outputs displayed below, `output` is a list of nested dicts with
# keys such as `input_data`, `exact_match` and `related`.
output = generate_output(input_embedded_df)

In [19]:
# Pretty-print only the first few records: the full list holds one large
# nested dict per input row, and dumping all of them buries the narrative.
pprint(output[:3])

[{'exact_match': {'entity': [],
                  'reason': 'None of the reference entities seem to be an '
                            'exact match for the input entity.'},
  'general': {'entity': ['All Other Miscellaneous General Purpose Machinery '
                         'Manufacturing',
                         'All Other Miscellaneous Electrical Equipment and '
                         'Component Manufacturing'],
              'reason': 'These entities are general categories that the input '
                        'entity could potentially fall under, but they are too '
                        'broad to be certain.'},
  'input_data': {'ai_input_description': 'The input entity appears to be some '
                                         'kind of customized machinery or '
                                         'equipment, potentially for '
                                         'industrial or manufacturing '
                                         'purposes.',
             

In [20]:
# Raw repr of the same structure: keys appear in insertion order
# (`input_data` first), whereas the pprint view above sorts them.
output

[{'input_data': {'entity': {'group_name': 'AMCM Customized System M',
    'description1': 'Customized M 290-1 1kW Digital Can'},
   'ai_input_description': 'The input entity appears to be some kind of customized machinery or equipment, potentially for industrial or manufacturing purposes.'},
  'exact_match': {'reason': 'None of the reference entities seem to be an exact match for the input entity.',
   'entity': []},
  'related': {'conflicted_assumption': {'reason': "The input entity mentions it is a 'Digital Can' which conflicts with the assumptions of the reference entities about being machinery or equipment.",
    'entity': []},
   'additional_assumption': {'reason': 'The input entity does not provide enough details to make assumptions about its specific use or purpose.',
    'entity': ['All Other Miscellaneous General Purpose Machinery Manufacturing',
     'All Other Miscellaneous Electrical Equipment and Component Manufacturing',
     'Air-Conditioning and Warm Air Heating Equipme