In [2]:
import sys

sys.path.append("../")

import cohere
import json

from pathlib import Path
from config import Paths, APIKeys 

from bs4 import BeautifulSoup

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-v4")

In [4]:
def get_sections_by_tag(soup: BeautifulSoup, tag: str) -> dict:
    tags = soup.find_all(tag)

    if len(tags) == 0:
        raise ValueError(f"No {tag} tags found in the document")

    output = []

    elements = []
    for elem in soup.find_all():
        if elem != tags[0]:
            elements.append(str(elem))
        else:
            break
    output.append(
        {"section_name": "", "section_elements": elements}
    )

    for i in range(len(tags)):
        # Get the current tag
        current_tag = tags[i]

        # Get the next tag, if it exists
        if i < len(tags) - 1:
            next_tag = tags[i + 1]
        else:
            next_tag = None

        # Find all elements between the current tag and the next tag
        elements = []
        next_sibling = current_tag.next_sibling
        while next_sibling and next_sibling != next_tag:
            if next_sibling.name != tag:
                elements.append(str(next_sibling))
            next_sibling = next_sibling.next_sibling

        output.append(
            {"section_name": current_tag.get_text(), "section_elements": elements}
        )

    return output

def split_by_element(soup: BeautifulSoup, parent_name: str) -> dict:
    output = []
    curr_text, curr_elements = "", []
    for element in soup.find_all():
        text = element.get_text()
        if len(tokenizer(curr_text + text, padding=False, truncation=False)["input_ids"]) < 512:
            curr_text += " " + text
            curr_elements.append(str(element))
        else:
            text_formatted = BeautifulSoup("\n".join(curr_elements), "html.parser").get_text().strip()
            if text_formatted:
                output.append(
                    {
                        "section": f'{parent_name}',
                        "text": text_formatted,
                        "elements": curr_elements,
                    }
                )
            curr_text, curr_elements = "", []
    return output

def split_by_hierarchy(soup: BeautifulSoup, parent_name: str) -> dict:
    output = []
    try:
        h3s = get_sections_by_tag(soup, "h3")
        for h3 in h3s:
            soup = BeautifulSoup(
                "\n".join(h3["section_elements"]), "html.parser"
            )
            text = soup.get_text().strip()
            if len(tokenizer(text, padding=False, truncation=False)["input_ids"]) < 512:
                output.append(
                    {
                        "section": f'{parent_name}>{h3["section_name"]}',
                        "text": h3["section_name"] + "\n" + text,
                        "elements": h3["section_elements"],
                    }
                )
            else:
                try:
                    h4s = get_sections_by_tag(soup, "h4")
                    for h4 in h4s:
                        soup = BeautifulSoup(
                            "\n".join(h4["section_elements"]), "html.parser"
                        )
                        text = soup.get_text().strip()
                        if len(tokenizer(text, padding=False, truncation=False)["input_ids"]) < 512:
                            output.append(
                                {
                                    "section": f'{parent_name}>{h3["section_name"]}>{h4["section_name"]}',
                                    "text": h4["section_name"] + "\n" + text,
                                    "elements": h4["section_elements"],
                                }
                            )
                        else:
                            output.extend(
                                split_by_element(soup, f'{parent_name}>{h3["section_name"]}>{h4["section_name"]}')
                            )
                except ValueError:
                    output.extend(split_by_element(soup, f'{parent_name}>{h3["section_name"]}'))
    except ValueError:
        output.extend(split_by_element(soup, parent_name))

    return output

In [5]:
# test_file = "handbook_total-rewards_benefits_parental-leave-toolkit-index.json"

handbook_parsed = Path(f"{Paths.data}/handbook_parsed")
handbook_processed = Path(f"{Paths.data}/handbook_processed")

for f in handbook_processed.glob("*.json"):
    f.unlink()

for f in handbook_parsed.glob("*.json"):
    output = []
    with open(f, "r") as f:
        data = json.load(f)
        for section in data["elements"]:
            soup = BeautifulSoup("\n".join(section["section_elements"]), "html.parser")
            text = soup.get_text()
            if len(tokenizer(text, padding=False, truncation=False)["input_ids"]) < 512:
                output.append(
                    {
                        "title": data["title"],
                        "path": data["path"],
                        "section": section["section_name"],
                        "text": text,
                        "elements": section["section_elements"],
                    }
                )
            else:
                reduced_elements = split_by_hierarchy(soup, section["section_name"])
                for element in reduced_elements:
                    output.append(
                        {
                            "title": data["title"],
                            "path": data["path"],
                            "section": element["section"],
                            "text": element["text"],
                            "elements": element["elements"],
                        }
                    )
    filename = f.name.replace(f"{Paths.data}/handbook_parsed", "")
    full_filename = f"{Paths.data}/handbook_processed{filename}"    
    with open(full_filename, "w") as fw:
        json.dump(output, fw)

Token indices sequence length is longer than the specified maximum sequence length for this model (997 > 512). Running this sequence through the model will result in indexing errors


In [6]:
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = f"{handbook_merged}/handbook.json"

output_merged = []
for f in handbook_processed.glob("*.json"):
    with open(f, "r") as f:
        data = json.load(f)
        output_merged.extend(data)

with open(full_filename, "w") as fw:
    json.dump(output_merged, fw)

In [14]:
from functools import cache

co = cohere.Client(APIKeys.cohere)

@cache
def get_embeddings(batch: tuple):
    response = co.embed(
        model='large',
        texts=list(batch)
    )
    return response.embeddings

In [17]:
vectors = []
batch_size = 96 
batch = []

for i, elem in enumerate(output_merged):
    if i % 500 == 0:
        print(i, len(output_merged))

    if len(elem["text"]) > 5:
        batch.append("\n".join(elem["text"]))

    if len(batch) >= batch_size:
        print("Encoding batch", len(batch))
        vectors.append(get_embeddings(tuple(batch)))
        batch = []
        break

if len(batch) > 0:
    vectors.append(get_embeddings(tuple(batch)))
    batch = []

0 26488
Encoding batch 96


In [18]:
vectors

In [None]:
for m, v in zip(output_merged, vectors[0]):
    m["vector"] = v.tolist()

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

query_text = "what factors should you consider when producing a digital replica?"
embedding = model.encode(query_text).tolist()

scores = cosine_similarity([embedding], vectors[0])[0] 
top_scores_ids = np.argsort(scores)[-30:][::-1]

# print top 10 results
for i in top_scores_ids:
    print(f"Score: {scores[i]}")
    for meta_ in meta[i]:
        print(meta_["text"])
    print("---")

Score: 0.4446302883636793
Synthetic data production is the creation of 3d models with textures that are used in out ML training. These models are typically built and textured in Blender.
Time Estimation
This  spread sheet  can be used to calculate the time to build the product.
Conclusions
The six images provide a relatively quick solution to creating synthetic replicas.
With the six image process it is possible to create products within days of meeting with the customer. There is no question that this has to be one of the most time efficient processes. For a large proportion of boxed goods this process is efficient and effective.
---
Score: 0.44143337105584224
The six image process does not provide good labels on products with curved and complex surfaces and it would be better to extract the labels and scan them on a flat bed scanner. Scanner labels are better quality (in colour and distortion) and are quicker to place on the model.
More complex models and more accurate surface repres

In [None]:
df = pd.DataFrame(unnested_meta).to_csv(
    DATA / "processed" / "notion_research_9462314059bd4a9e8a997d481bf6b009.csv",
    index=False,
)