## Import libraries

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm


some function

In [16]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# Change the index name to be all lower case
pc.create_index("attorney", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [31]:
import json
data = json.load(open("patent_attorney.json"))
print(data[0])


{'name': 'Alice Johnson', 'specialty': 'Biotechnology', 'summary': 'Expert in patenting biopharmaceuticals and genetic engineering innovations.', 'budget_range': {'start': 5000, 'end': 15000}, 'location': 'San Francisco, CA', 'years_of_experience': 8, 'contact': 'alice.johnson@example.com'}


In [30]:
processed_data = []
client = OpenAI()
for attorney in data:  # Iterate over each attorney in the metadata
    response = client.embeddings.create(
        input=attorney["name"],  # Use the attorney name for embedding
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": sanitize_id(attorney["name"]),  # Sanitize the attorney name as ID
        "metadata": {  # Change 'summary' to 'metadata'
            "specialty": attorney["specialty"],
            "summary": attorney["summary"],
            "budget_range": json.dumps(attorney["budget_range"]),  # Convert to string
            "location": attorney["location"],
            "years_of_experience": attorney["years_of_experience"],
            "contact": attorney["contact"]
        }
    })

In [32]:
processed_data[0]

{'values': [-0.010891364887356758,
  -0.012150655500590801,
  -0.009639570489525795,
  0.05166089907288551,
  -0.0029608323238790035,
  -0.025665543973445892,
  0.03487035632133484,
  0.0299531277269125,
  0.012435494922101498,
  -0.02553061954677105,
  0.03088260442018509,
  -0.029863178730010986,
  -0.008147910237312317,
  0.0348103903234005,
  -0.02194763720035553,
  0.0411967933177948,
  0.0022168762516230345,
  0.01698543317615986,
  -0.04839274287223816,
  0.06344426423311234,
  0.028408998623490334,
  0.014249473810195923,
  -0.005434439051896334,
  -0.03891807794570923,
  0.02380659058690071,
  0.014609270729124546,
  0.0069710733368992805,
  -0.00279779895208776,
  0.017750002443790436,
  -0.03226182609796524,
  0.04935220256447792,
  -0.03088260442018509,
  -0.023506758734583855,
  0.0027134716510772705,
  0.05546875670552254,
  -0.03085262142121792,
  0.01971389539539814,
  -0.029293499886989594,
  0.04878251999616623,
  -0.00673495652154088,
  0.02593539096415043,
  -0.0452

In [33]:
index = pc.Index("attorney")
print(index)
index.upsert(vectors=processed_data)

<pinecone.data.index.Index object at 0x122290fa0>


{'upserted_count': 18}

In [26]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}