In [None]:
# Upgrade pip
!pip install --upgrade pip

# Install required libraries
# ------------------------
# Original Cocoon data library
# ! pip install --root-user-action=ignore cocoon_data==0.1.57 -qq

# ESG Flo Fork of Cocoon data
# !git clone https://github.com/esg-flo/cocoon.git
!git clone -b st/feature/bedrock https://github.com/esg-flo/cocoon.git

# Uninstall libraries
# !pip uninstall cocoon_data==0.1.57 -y

In [76]:
# Import the original cocoon package
# from cocoon_data import *

# Import the ESG Flo fork for cocoon package
from cocoon.cocoon_data import *

import os
import warnings
import pandas as pd

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

# Run tenant files against Ecoinvent or Naisc Databases

There are sections to run the files using either of emebeddings models from Amazon. The embedding models are: `Titan G1` and `Titan V2`. Difference between the two models:

- Output Vector Size (Embedding size):
    - `Titan G1`: 1536
    - `Titan V2`: 1024 (default), 512, 256
- Input format:
    - `Titan G1`:
        - The only available field is inputText, in which you can include text to convert into embeddings.
        ```json
        {
            "inputText": string
        }
        ```
    - `Titan V2`:
        - The inputText parameter is **required**. The normalize and dimensions parameters are optional.
        - `inputText` – Enter text to convert to embeddings.
        - `normalize` - flag indicating whether or not to normalize the output embeddings. Defaults to true.
        - `dimensions` - The number of dimensions the output embeddings should have. The following values are accepted: 1024 (default), 512, 256.
        ```json
        {
            "inputText": string,
            "dimensions": int,
            "normalize": boolean
        }
        ```

More info on the models checkout https://eu-central-1.console.aws.amazon.com/bedrock/home?region=eu-central-1#/providers?model=amazon.titan-embed-text-v1. <br>
More info on the `Request`/`Response` format and example code checkout https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html.

**Note**: While using `Titan V2` the default value for `normalize` is set to `False`. The two models in our codebase can be used by passing **`titan-g1`** or **`titan-v2`** as the parameter value for `model_name` parameter. You can find more details on this below

**Output files**: The output for each run can be found in the `output/{database}/{embedding_model_name}/{input_filename}_{database}_{output_model_name}.html`

For example, input_filename = "20240701_lzboy

# Run tenant files against Ecoinvent or Naisc Databases

There are sections to run the files using either of emebeddings models from Amazon. The embedding models are: `Titan G1` and `Titan V2`. Difference between the two models:

- Output Vector Size (Embedding size):
    - `Titan G1`: 1536
    - `Titan V2`: 1024 (default), 512, 256
- Input format:
    - `Titan G1`:
        - The only available field is inputText, in which you can include text to convert into embeddings.
        ```json
        {
            "inputText": string
        }
        ```
    - `Titan V2`:
        - The inputText parameter is **required**. The normalize and dimensions parameters are optional.
        - `inputText` – Enter text to convert to embeddings.
        - `normalize` - flag indicating whether or not to normalize the output embeddings. Defaults to true.
        - `dimensions` - The number of dimensions the output embeddings should have. The following values are accepted: 1024 (default), 512, 256.
        ```json
        {
            "inputText": string,
            "dimensions": int,
            "normalize": boolean
        }
        ```

More info on the models checkout https://eu-central-1.console.aws.amazon.com/bedrock/home?region=eu-central-1#/providers?model=amazon.titan-embed-text-v1. <br>
More info on the `Request`/`Response` format and example code checkout https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html.

**Note**: While using `Titan V2` the default value for `normalize` is set to `False`. The two models in our codebase can be used by passing **`titan-g1`** or **`titan-v2`** as the parameter value for `model_name` parameter. You can find more details on this below

**Output files**: The output for each run can be found in the `output/{database}/{embedding_model_name}/{input_filename}_{database}_{output_model_name}.html`

For example, 
```json
{
    "input_filename": "20240701_lzboy", 
    "database": "ecoinvent", 
    "embedding_model_name": "titan-g1"
}
```

Location of the output html file at `output/ecoinvent/titan_g1/20240701_lzboy_ecoinvent_titan_g1.html`

In [16]:
# Set API_TYPE to bedrock where we want to use the titan models and claude as llm
openai.api_type = "bedrock"

## Titan V2

In [125]:
model_name = "titan-v2"
output_model_name = "titan_v2"

### Ecoinvent

In [4]:
# Update `data`, `embedded_data`, and `output` fields based on input

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/ecoinventdb/20240201_scope3ecoinventdata.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/ecoinventdb/{output_model_name}_embeddings_output.csv',
    
    "data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers.csv',
    "embedded_data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_embedded.csv',
    "output": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_output.csv',
}

In [63]:
vocabulary_df = pd.read_csv(files["vocabulary"])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 20938


In [64]:
# Read the data file
data_df = pd.read_csv(files["data"])
columns_to_use = data_df.columns.values

# Select a subset of columns
# columns_to_use = []

data_df = data_df[columns_to_use].drop_duplicates() # Remove the duplicate rows

# In case, you need to run a subset of the rows. Otherwise, comment the below line.
data_df = data_df.iloc[:2]

print(f"Size of the data file: {data_df.shape[0]}")

Size of the data file: 2


In [65]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)"
0,Nonresidential maintenance and repair
1,Adhesive manufacturing


In [66]:
# Load the embeddings
# Note: We have specified the `model_name` as `titan-v2` to choose `Titan-V2` model. 
# Use the same value across all the code cells in this section.

reference_df = pd.read_csv(files["embeddings"])
index = load_embedding(reference_df, label_embedding="embedding", model_name=model_name)
faiss.write_index(index, "embeddings.index")

In [68]:
# Prepare label
data_df["label"] = data_df.apply(lambda x: ",".join(x), axis=1)

In [69]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",label
0,Nonresidential maintenance and repair,Nonresidential maintenance and repair
1,Adhesive manufacturing,Adhesive manufacturing


In [70]:
# Embed all the labels using the specified model
embedded_df = embed_labels(data_df, files["embedded_data"], model_name=model_name)

Embedding Labels: 100%|██████████| 2/2 [00:00<00:00,  2.56label/s]

All labels embedded and CSV updated.





In [71]:
embedded_df.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [72]:
synthea_embedded_label = pd.read_csv(files["embedded_data"])
synthea_embedded_label.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [73]:
D, I = df_search(synthea_embedded_label, index)

In [74]:
entity_relation_match_cluster(
    input_df=synthea_embedded_label,
    attributes=columns_to_use,
    I=I,
    refernece_df=reference_df,
    label="label"
)

👉 2 rows remain...
👉 1 rows remain...


In [98]:
clusters = compute_cluster(synthea_embedded_label)
final_html = generate_report_for_cluster(synthea_embedded_label, clusters)
# print(final_html)

In [None]:
filename = files["data"].split("/")[-1].split(".")[0]

OUTPUT_DIR = f"./output/ecoinvent/{output_model_name}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(f"{OUTPUT_DIR}/{filename}_ecoinvent_{output_model_name}.html", "w") as file:
    file.write(final_html)

### Naics

In [82]:
# Update `data`, `embedded_data`, and `output` fields based on input

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/naicsdb/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/naicsdb/{output_model_name}_embeddings_output.csv',
    
    "data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers.csv',
    "embedded_data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_embedded.csv',
    "output": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_output.csv',
}

In [83]:
vocabulary_df = pd.read_csv(files["vocabulary"])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 1016


In [84]:
# Read the data file
data_df = pd.read_csv(files["data"])
columns_to_use = data_df.columns.values

# Select a subset of columns
# columns_to_use = []

data_df = data_df[columns_to_use].drop_duplicates() # Remove the duplicate rows

# In case, you need to run a subset of the rows. Otherwise, comment the below line.
data_df = data_df.iloc[:2]

print(f"Size of the data file: {data_df.shape[0]}")

Size of the data file: 2


In [85]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)"
0,Nonresidential maintenance and repair
1,Adhesive manufacturing


In [86]:
# Load the embeddings
# Note: We have specified the `model_name` as `titan-v2` to choose `Titan-V2` model. 
# Use the same value across all the code cells in this section.

reference_df = pd.read_csv(files["embeddings"])
index = load_embedding(reference_df, label_embedding="embedding", model_name=model_name)
faiss.write_index(index, "embeddings.index")

In [87]:
# Prepare label
data_df["label"] = data_df.apply(lambda x: ",".join(x), axis=1)

In [88]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",label
0,Nonresidential maintenance and repair,Nonresidential maintenance and repair
1,Adhesive manufacturing,Adhesive manufacturing


In [89]:
# Embed all the labels using the specified model
embedded_df = embed_labels(data_df, files["embedded_data"], model_name=model_name)

All labels already embedded.


In [90]:
embedded_df.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [91]:
synthea_embedded_label = pd.read_csv(files["embedded_data"])
synthea_embedded_label.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [92]:
D, I = df_search(synthea_embedded_label, index)

In [93]:
entity_relation_match_cluster(
    input_df=synthea_embedded_label,
    attributes=columns_to_use,
    I=I,
    refernece_df=reference_df,
    label="label"
)

👉 2 rows remain...
👉 1 rows remain...


In [97]:
clusters = compute_cluster(synthea_embedded_label)
final_html = generate_report_for_cluster(synthea_embedded_label, clusters)
# print(final_html)

In [None]:
filename = files["data"].split("/")[-1].split(".")[0]

OUTPUT_DIR = f"./output/naics/{output_model_name}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(f"{OUTPUT_DIR}/{filename}_naics_{output_model_name}.html", "w") as file:
    file.write(final_html)

## Titan G1

In [131]:
model_name = "titan-g1"
output_model_name = "titan_g1"

### Ecoinvent

In [111]:
# Update `data`, `embedded_data`, and `output` fields based on input

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/ecoinventdb/20240201_scope3ecoinventdata.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/ecoinventdb/{output_model_name}_embeddings_output.csv',
    
    "data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers.csv',
    "embedded_data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_embedded.csv',
    "output": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_output.csv',
}

In [112]:
vocabulary_df = pd.read_csv(files["vocabulary"])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 20938


In [113]:
# Read the data file
data_df = pd.read_csv(files["data"])
columns_to_use = data_df.columns.values

# Select a subset of columns
# columns_to_use = []

data_df = data_df[columns_to_use].drop_duplicates() # Remove the duplicate rows

# In case, you need to run a subset of the rows. Otherwise, comment the below line.
data_df = data_df.iloc[:2]

print(f"Size of the data file: {data_df.shape[0]}")

Size of the data file: 2


In [114]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)"
0,Nonresidential maintenance and repair
1,Adhesive manufacturing


In [115]:
# Load the embeddings
# Note: We have specified the `model_name` as `titan-v2` to choose `Titan-V2` model. 
# Use the same value across all the code cells in this section.

reference_df = pd.read_csv(files["embeddings"])
index = load_embedding(reference_df, label_embedding="embedding", model_name=model_name)
faiss.write_index(index, "embeddings.index")

In [116]:
# Prepare label
data_df["label"] = data_df.apply(lambda x: ",".join(x), axis=1)

In [117]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",label
0,Nonresidential maintenance and repair,Nonresidential maintenance and repair
1,Adhesive manufacturing,Adhesive manufacturing


In [118]:
# Embed all the labels using the specified model
embedded_df = embed_labels(data_df, files["embedded_data"], model_name=model_name)

Embedding Labels: 100%|██████████| 2/2 [00:00<00:00,  2.63label/s]

All labels embedded and CSV updated.





In [119]:
embedded_df.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[1.09375, -0.16210938, 0.20898438, 1.4140625, ..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-0.21777344, -0.39453125, -0.32617188, 1.2343..."


In [120]:
synthea_embedded_label = pd.read_csv(files["embedded_data"])
synthea_embedded_label.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[1.09375, -0.16210938, 0.20898438, 1.4140625, ..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-0.21777344, -0.39453125, -0.32617188, 1.2343..."


In [121]:
D, I = df_search(synthea_embedded_label, index)

In [122]:
entity_relation_match_cluster(
    input_df=synthea_embedded_label,
    attributes=columns_to_use,
    I=I,
    refernece_df=reference_df,
    label="label"
)

👉 2 rows remain...
👉 1 rows remain...


In [123]:
clusters = compute_cluster(synthea_embedded_label)
final_html = generate_report_for_cluster(synthea_embedded_label, clusters)
# print(final_html)

In [124]:
filename = files["data"].split("/")[-1].split(".")[0]

OUTPUT_DIR = f"./output/ecoinvent/{output_model_name}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(f"{OUTPUT_DIR}/{filename}_ecoinvent_{output_model_name}.html", "w") as file:
    file.write(final_html)

### Naics

In [None]:
# Update `data`, `embedded_data`, and `output` fields based on input

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/naicsdb/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/naicsdb/{output_model_name}_embeddings_output.csv',
    
    "data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers.csv',
    "embedded_data": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_embedded.csv',
    "output": f's3://esgflo-ml-test-scope3/20240701/20240620_LZB_Spend_based_suppliers_output.csv',
}

In [None]:
vocabulary_df = pd.read_csv(files["vocabulary"])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 1016


In [None]:
# Read the data file
data_df = pd.read_csv(files["data"])
columns_to_use = data_df.columns.values

# Select a subset of columns
# columns_to_use = []

data_df = data_df[columns_to_use].drop_duplicates() # Remove the duplicate rows

# In case, you need to run a subset of the rows. Otherwise, comment the below line.
data_df = data_df.iloc[:2]

print(f"Size of the data file: {data_df.shape[0]}")

Size of the data file: 2


In [None]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)"
0,Nonresidential maintenance and repair
1,Adhesive manufacturing


In [None]:
# Load the embeddings
# Note: We have specified the `model_name` as `titan-v2` to choose `Titan-V2` model. 
# Use the same value across all the code cells in this section.

reference_df = pd.read_csv(files["embeddings"])
index = load_embedding(reference_df, label_embedding="embedding", model_name=model_name)
faiss.write_index(index, "embeddings.index")

In [None]:
# Prepare label
data_df["label"] = data_df.apply(lambda x: ",".join(x), axis=1)

In [None]:
data_df.head(2)

Unnamed: 0,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",label
0,Nonresidential maintenance and repair,Nonresidential maintenance and repair
1,Adhesive manufacturing,Adhesive manufacturing


In [None]:
# Embed all the labels using the specified model
embedded_df = embed_labels(data_df, files["embedded_data"], model_name=model_name)

All labels already embedded.


In [None]:
embedded_df.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [None]:
synthea_embedded_label = pd.read_csv(files["embedded_data"])
synthea_embedded_label.head(2)

Unnamed: 0,label,index_ids,"Category of Product/Service Being Purchased - Level 1\n(GICS, NAICS or other industry classification systems)",embedding
0,Adhesive manufacturing,[1],Adhesive manufacturing,"[-1.4375, 3.234375, -1.625, 0.38476562, -1.468..."
1,Nonresidential maintenance and repair,[0],Nonresidential maintenance and repair,"[-1.6875, 1.328125, 0.52734375, 0.5703125, 1.6..."


In [None]:
D, I = df_search(synthea_embedded_label, index)

In [None]:
entity_relation_match_cluster(
    input_df=synthea_embedded_label,
    attributes=columns_to_use,
    I=I,
    refernece_df=reference_df,
    label="label"
)

👉 2 rows remain...
👉 1 rows remain...


In [None]:
clusters = compute_cluster(synthea_embedded_label)
final_html = generate_report_for_cluster(synthea_embedded_label, clusters)
# print(final_html)

In [None]:
filename = files["data"].split("/")[-1].split(".")[0]

OUTPUT_DIR = f"./output/naics/{output_model_name}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(f"{OUTPUT_DIR}/{filename}_naics_{output_model_name}.html", "w") as file:
    file.write(final_html)

# Create Naics or Ecoinvent database embeddings

In [132]:
! pip install textblob -qq
! python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [133]:
import ast
from textblob import TextBlob

In [136]:
openai.api_type = "bedrock"
openai.api_type

'bedrock'

## Titan G1

Use `Titan G1` to embed the databases.

### NAICS

In [134]:
# Change the value for `embeddings` while creating new embeddings

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/naicsdb/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/naicsdb/titan_g1_embeddings_output.csv'
}

In [135]:
vocabulary_df = pd.read_csv(files['vocabulary'])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 1016


In [137]:
df = vocabulary_df[vocabulary_df["2017 NAICS Title"] != None]
naics_g1_new_df = df[["2017 NAICS Title"]].copy()
naics_g1_new_df['label'] = naics_g1_new_df['2017 NAICS Title'].apply(lambda x: ' '.join(TextBlob(x).words))

In [138]:
res_naics_g1 = embed_labels(naics_g1_new_df, files['embeddings'], model_name="titan-g1")

All labels already embedded.


In [None]:
res_naics_g1.head()

In [152]:
# Check the length of the embedding. It should be 1536
assert 1536 == len(ast.literal_eval(res_naics_g1["embedding"].iloc[0]))

### Ecoinvent

In [139]:
# Change the value for `embeddings` while creating new embeddings

files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/ecoinventdb/20240201_scope3ecoinventdata.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/ecoinventdb/titan_g1_embeddings_output.csv',
}

In [140]:
vocabulary_df = pd.read_csv(files['vocabulary'])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 20938


In [141]:
df = vocabulary_df[vocabulary_df["activity_name"] != None]
ecoinvent_g1_new_df = df[["activity_name"]].copy()
ecoinvent_g1_new_df['label'] = ecoinvent_g1_new_df['activity_name'].apply(lambda x: ' '.join(TextBlob(x).words))

In [142]:
res_ecoinvent_g1 = embed_labels(ecoinvent_g1_new_df, files['embeddings'], model_name="titan-g1")

All labels already embedded.


In [143]:
res_ecoinvent_g1.head()

Unnamed: 0,label,index_ids,embedding
0,"1,1-difluoroethane production HFC-152a","[14316, 15532]","[0.56640625, 0.44921875, -0.00091934204, -0.04..."
1,"1,1-dimethylcyclopentane to generic market for...",[20891],"[1.28125, 0.33203125, 0.36132812, -0.21679688,..."
2,1-methoxy-2-propanol production,[8412],"[0.56640625, -0.044189453, -0.34179688, -0.941..."
3,1-methylcyclopropene production,"[3983, 9856]","[0.31640625, 0.25585938, -0.18554688, -0.6875,..."
4,1-naphthylacetic acid production,[6058],"[0.23828125, 0.24316406, -0.20214844, -0.50390..."


In [153]:
# Check the length of the embedding. It should be 1536
assert 1536 == len(ast.literal_eval(res_ecoinvent_g1["embedding"].iloc[0]))

## Titan V2

Use `Titan G1` to embed the databases. Note: The value for `normalize` parameter is set to `False` by default.

### NAICS

In [154]:
files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/naicsdb/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/naicsdb/titan_v2_embeddings_output.csv'
}

In [155]:
vocabulary_df = pd.read_csv(files['vocabulary'])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 1016


In [156]:
df = vocabulary_df[vocabulary_df["2017 NAICS Title"] != None]
naics_v2_new_df = df[["2017 NAICS Title"]].copy()
naics_v2_new_df['label'] = naics_v2_new_df['2017 NAICS Title'].apply(lambda x: ' '.join(TextBlob(x).words))

In [157]:
res_naics_v2 = embed_labels(naics_v2_new_df, files['embeddings'], model_name="titan-v2")

All labels already embedded.


In [158]:
res_naics_v2.head()

Unnamed: 0,label,index_ids,2017 NAICS Title,embedding
0,Abrasive Product Manufacturing,[292],Abrasive Product Manufacturing,"[-3.75, 1.0703125, 0.08984375, 0.16699219, -2...."
1,Adhesive Manufacturing,[253],Adhesive Manufacturing,"[-1.921875, 3.328125, -1.828125, 0.74609375, -..."
2,Administrative Management and General Manageme...,[800],Administrative Management and General Manageme...,"[-3.71875, 2.921875, 1.0625, -1.2109375, 1.445..."
3,Advertising Agencies,[811],Advertising Agencies,"[2.109375, 2.4375, 2.859375, 1.78125, -1.55468..."
4,Advertising Material Distribution Services,[817],Advertising Material Distribution Services,"[-2.796875, 0.37109375, 2.046875, 1.109375, -4..."


In [159]:
# Check the length of the embedding. It should be 1024
assert 1024 == len(ast.literal_eval(res_naics_v2["embedding"].iloc[0]))

### Ecoinvent

In [160]:
files = {
    "vocabulary": f's3://esgflo-ml-test-scope3/ecoinventdb/20240201_scope3ecoinventdata.csv',
    "embeddings": f's3://esgflo-ml-test-scope3/ecoinventdb/titan_v2_embeddings_output.csv',
}

In [161]:
vocabulary_df = pd.read_csv(files['vocabulary'])
print(f"Vocabulary size: {vocabulary_df.shape[0]}")

Vocabulary size: 20938


In [162]:
df = vocabulary_df[vocabulary_df["activity_name"] != None]
ecoinvent_v2_new_df = df[["activity_name"]].copy()
ecoinvent_v2_new_df['label'] = ecoinvent_v2_new_df['activity_name'].apply(lambda x: ' '.join(TextBlob(x).words))

In [163]:
res_ecoinvent_v2 = embed_labels(ecoinvent_v2_new_df, files['embeddings'], model_name="titan-v2")

All labels already embedded.


In [164]:
res_ecoinvent_v2.head()

Unnamed: 0,label,index_ids,embedding
0,"1,1-difluoroethane production HFC-152a","[14316, 15532]","[0.73046875, 0.27734375, 1.2421875, 0.578125, ..."
1,"1,1-dimethylcyclopentane to generic market for...",[20891],"[-1.6953125, 0.8828125, 2.8125, -1.34375, -1.5..."
2,1-methoxy-2-propanol production,[8412],"[1.546875, 2.0625, 2.796875, -1.21875, 1.13281..."
3,1-methylcyclopropene production,"[3983, 9856]","[1.7265625, 2.484375, 4.21875, -3.015625, 0.14..."
4,1-naphthylacetic acid production,[6058],"[-0.12597656, 2.546875, 1.546875, 1.03125, -1...."


In [165]:
# Check the length of the embedding. It should be 1024
assert 1024 == len(ast.literal_eval(res_ecoinvent_v2["embedding"].iloc[0]))