## Import libraries

In [6]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm


some function

In [11]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index("abstract", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [14]:
import json
data = json.load(open("metadata.json"))


In [15]:
import unicodedata

def sanitize_id(law_firm_name):
    # Normalize and encode to ASCII, replacing non-ASCII characters
    return unicodedata.normalize('NFKD', law_firm_name).encode('ascii', 'ignore').decode('ascii')

processed_data = []
client = OpenAI()
for patent in data:  # Iterate over each patent in the metadata
    response = client.embeddings.create(
        input=patent["title"],  # Use the patent title for embedding
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": sanitize_id(patent["title"]),  # Sanitize the patent title as ID
        "metadata": {
            "patent": patent["patent"],
            "assignee": patent["assignee"],
            "date": patent["date"],
            "abstract": patent["abstract"],
            "us_pat_no": patent["us_pat_no"]
        }
    })

In [16]:
processed_data[0]

{'values': [-0.009389418177306652,
  -0.012687225826084614,
  0.025810012593865395,
  0.03815501183271408,
  -0.06401479989290237,
  -0.02546156384050846,
  0.010042757727205753,
  -0.013639234937727451,
  -0.007485401351004839,
  -0.005372937768697739,
  -0.0007497845799662173,
  -0.028597593307495117,
  0.037931010127067566,
  0.03188295662403107,
  -0.004256038460880518,
  -0.009040971286594868,
  0.04358084127306938,
  0.03073805570602417,
  -0.028323812410235405,
  0.06371613591909409,
  0.012102331966161728,
  0.03813012316823006,
  -0.035417210310697556,
  0.03693544492125511,
  0.014410797506570816,
  -0.008400076068937778,
  0.02327132225036621,
  0.012438335455954075,
  0.02976738102734089,
  0.0004145593266002834,
  0.059584539383649826,
  -0.02486422471702099,
  0.014784133993089199,
  -0.069888636469841,
  -0.047463539987802505,
  0.03210695832967758,
  0.00045578190474770963,
  -0.02042151801288128,
  -0.028249144554138184,
  0.008624077774584293,
  0.025884678587317467,


In [17]:
index = pc.Index("abstract")
index.upsert(vectors=processed_data)

{'upserted_count': 50}

In [26]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}