In [1]:
import time

import numpy as np
import pandas as pd
import yaml

from pinecone import Pinecone, ServerlessSpec

# Municipality whose documents this notebook indexes; it prefixes every document id.
CITY = "los_angeles"

# All Pinecone settings (including the API key) are read from a YAML file kept
# outside the notebook instead of being hardcoded here.
CONFIG_PATH = '../../config.yaml'
with open(CONFIG_PATH, 'r') as f:
    config = yaml.safe_load(f)

# Credentials.
PINECONE_API_KEY = config['PINECONE_API_KEY']

# Index location and layout.
PINECONE_INDEX_NAME = config['PINECONE_INDEX_NAME']
PINECONE_NAMESPACE = config['PINECONE_NAMESPACE']
PINECONE_CLOUD = config['PINECONE_CLOUD']
PINECONE_REGION = config['PINECONE_REGION']

# Embedding model and the vector dimension the index is created with.
PINECONE_EMBEDDING_MODEL = config['PINECONE_EMBEDDING_MODEL']
PINECONE_EMBEDDING_DIMENSION = config['PINECONE_EMBEDDING_DIMENSION']

# Single client reused by every cell below.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [2]:
# Provision the serverless index on first run; skip creation when it already exists

index_exists = pc.has_index(PINECONE_INDEX_NAME)
if not index_exists:
    # Cloud provider and region come from the config file.
    serverless_spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=PINECONE_EMBEDDING_DIMENSION,
        metric='cosine',
        spec=serverless_spec,
    )
  

In [3]:
# Build one metadata record per chapter row in the chapter CSV

df = pd.read_csv("../../intermediate_data/chapter_data.csv")

# Each record pairs a city-prefixed document id with the chapter text; the
# 'document_text' field is what gets embedded in the next cell.
data = [
    {
        'document_id': CITY + '_' + chapter_id,
        'municipality': CITY,
        'document_type': 'Chapter',
        'document_text': chapter_text,
    }
    for chapter_id, chapter_text in zip(df['Chapter ID'], df['Chapter Text'])
]

# Show the first record as a sanity check.
data[0]

{'document_id': 'los_angeles_JD_C1',
 'municipality': 'los_angeles',
 'document_type': 'Chapter',
 'document_text': 'CHAPTER I | GENERAL PROVISIONS AND ZONING'}

In [4]:
# Embed the text of every record with the configured Pinecone model

# Texts are sent in the same order as `data`, so the i-th embedding is paired
# with the i-th record when the vectors are assembled for upsert.
passages = [record['document_text'] for record in data]

# Documents are embedded with input_type 'passage'; the search cell embeds its
# query with input_type 'query'.
embeddings = pc.inference.embed(
    model=PINECONE_EMBEDDING_MODEL,
    inputs=passages,
    parameters={'input_type': 'passage'},
)

print(embeddings[0])

{'values': [0.0233306884765625, -0.005184173583984375, ..., -0.021697998046875, -0.00855255126953125]}


In [6]:
# Upsert the data

# Poll until Pinecone reports the index as ready, but give up after a bounded
# wait so the notebook fails loudly instead of hanging forever if the index
# never reaches the ready state.
INDEX_READY_TIMEOUT_S = 300
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index {PINECONE_INDEX_NAME!r} was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

index = pc.Index(PINECONE_INDEX_NAME)

# One vector per record: the record's document id is the vector id, and the
# whole record is stored as metadata so query results carry the chapter text.
vectors = [
    {
        'id': d['document_id'],
        'values': e['values'],
        'metadata': d,
    }
    for d, e in zip(data, embeddings)
]

index.upsert(vectors=vectors, namespace=PINECONE_NAMESPACE)

# Display index statistics (vector counts per namespace) after the upsert.
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'main': {'vector_count': 20}},
 'total_vector_count': 20}

In [7]:
# Embed a natural-language question and fetch the closest chapters

query = "Which chapters are most relevant to a home builder?"
TOP_K = 3  # number of nearest matches to return

# The question is embedded with input_type 'query', whereas the stored
# documents were embedded with input_type 'passage'.
embedding = pc.inference.embed(
    model=PINECONE_EMBEDDING_MODEL,
    inputs=[query],
    parameters={'input_type': 'query'},
)
query_vector = embedding[0].values

# Return metadata only; the raw vector values are not needed to read results.
results = index.query(
    namespace=PINECONE_NAMESPACE,
    vector=query_vector,
    top_k=TOP_K,
    include_values=False,
    include_metadata=True,
)

results

{'matches': [{'id': 'los_angeles_JD_C9',
              'metadata': {'document_id': 'los_angeles_JD_C9',
                           'document_text': 'CHAPTER IX | BUILDING REGULATIONS',
                           'document_type': 'Chapter',
                           'municipality': 'los_angeles'},
              'score': 0.817296326,
              'values': []},
             {'id': 'los_angeles_JD_C16',
              'metadata': {'document_id': 'los_angeles_JD_C16',
                           'document_text': 'CHAPTER XVI | HOUSING REGULATIONS',
                           'document_type': 'Chapter',
                           'municipality': 'los_angeles'},
              'score': 0.816022575,
              'values': []},
             {'id': 'los_angeles_JD_C13',
              'metadata': {'document_id': 'los_angeles_JD_C13',
                           'document_text': 'CHAPTER XIII | THE EMERGENCY '
                                            'ENERGY CURTAILMENT PLAN OF THE '
          