In [1]:
# Load environment variables
# python -m ipykernel install --user --name=webnearme-venv
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())



True

In [2]:
# allows notebook to work
import requests
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

CIPHERS = (
    'ECDHE+AESGCM:ECDHE+CHACHA20:DHE+AESGCM:DHE+CHACHA20:ECDH+AESGCM:ECDH+CHACHA20:DH+AESGCM:DH+CHACHA20:'
    'ECDHE+AES:!aNULL:!eNULL:!EXPORT:!DES:!MD5:!PSK:!RC4:!HMAC_SHA1:!SHA1:!DHE+AES:!ECDH+AES:!DH+AES'
)

requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = CIPHERS
# Skip the following two lines if they cause errors
# requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST = CIPHERS
# requests.packages.urllib3.contrib.pyopenssl.inject_into_urllib3()
requests.packages.urllib3.util.ssl_.create_default_context = create_urllib3_context


# we will create a hybrid index that can search semantic and keyword

In [3]:
# index = pinecone.Index('websitenearme-fast-api')

In [4]:
# this is our text corpus
all_sentences = ["Fast websites, fast!","HIRE US","MAIN PRODUCT/SERVICE 1","OUR EX1: WEBSITE DESIGN","","Initial website, same template as this site	1 design + hosting and domain (godaddy or transferred to godaddy)= ~$550","AI-powered SEO initial setup	$0, included","multilingual	$0, included","chatbot + maintenance	NOT INCLUDED","Website design","MAIN PRODUCT/SERVICE 2","OUR EX2: CHATBOTS FOR OUR WEBSITES","","Website + 1 site-specific chatbot	$100 * 1 time, chatbot setup, $20/month chatGPT subscription","Train model with AI-generated + your questions & answers	$100/ training round","Multiple chatbots with different personalities/ purposes	$100 each","Connect chatbot to your existing website (i.e., we did not build your website)	Please contact us with specifics, these start at $500/each and we reserve the right to say no","Chatbot types and prices","MAIN PRODUCT/SERVICE 3","EX3: SUBSCRIPTIONS","","monthly site health + maintenance + minor updates + 3 emergency updates	$50/month","monthly site health + maintenance + minor updates + 1 emergency update/week	$250/month","emergency update	$50","content generation- 4 posts, 1 per week	$100/month","Subscriptions and prices"]

In [5]:
# this uses HuggingFace 
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')

all_embeddings = model.encode(all_sentences)
print(all_embeddings.shape)
print(all_embeddings.shape[1]) # this is the dimension we need to instantiate our pinecone index
print(all_embeddings[0:1])

(26, 768)
768
[[-2.35444633e-03  5.48605137e-02 -3.57232727e-02 -8.66033509e-03
   1.22838309e-02 -2.31732037e-02  2.15398464e-02  3.68994251e-02
  -1.17613141e-04 -4.66104783e-02 -5.90305449e-03 -6.81372806e-02
   3.35595235e-02  5.76601364e-02  5.46999602e-03  1.67380031e-02
   4.78123734e-03  1.83588676e-02 -1.51509894e-02  1.55020389e-03
   1.04834959e-02 -4.37937155e-02 -2.71082576e-02 -1.38526736e-02
   2.51252055e-02 -3.59955840e-02 -7.81918876e-03  3.21647860e-02
  -4.07713130e-02 -5.99804297e-02  2.53628823e-03 -1.73783060e-02
  -2.66849659e-02  2.87912805e-02  3.07434567e-09  1.53542096e-02
  -1.65473588e-03 -7.35954242e-03 -9.00205299e-02  1.03918694e-01
  -3.22410986e-02  9.26716160e-03 -3.19100432e-02  1.39510101e-02
   1.24675678e-02 -4.18858081e-02  5.70722297e-03 -5.45279644e-02
  -4.88943644e-02  5.97022660e-02 -3.53687368e-02  1.49220414e-02
  -3.00463904e-02 -2.08735559e-02  3.38984355e-02 -3.03387512e-02
  -1.00002084e-02 -5.85185178e-02  6.18790351e-02  6.68968447e

# the above is sufficient for upserting semantic searches, however the next step allows us to be able to do keyword searches

In [6]:
# this pip install allows notebooks to work
%pip install sacremoses
# this is HuggingFace transformer
from transformers import AutoTokenizer

# if using vast amounts of data, heed the parallelism warning
# - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

# transfo-xl tokenizer uses word-level encodings
tokenizer = AutoTokenizer.from_pretrained('transfo-xl-wt103')

all_tokens = [tokenizer.tokenize(sentence.lower()) for sentence in all_sentences]
all_tokens[0]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


['fast', 'websites', ',', 'fast', '!']

## we will connect to pinecone

In [7]:
#Import and initialize Pinecone client

import os
import pinecone
from langchain.vectorstores import Pinecone

# connect to pinecone environment
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV') 
)

# # Initialize Pinecone
INDEX_NAME = "websitenearme-fast-api"
DIMENSIONS=all_embeddings.shape[1]

# # Create and configure index if doesn't already exist
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=INDEX_NAME, 
        metric="eucladian",
        dimension=DIMENSIONS
        )

index = pinecone.Index(INDEX_NAME)  # this index var is necessary for upsert later
pinecone.list_indexes()
pinecone.describe_index(INDEX_NAME)

IndexDescription(name='websitenearme-fast-api', metric='cosine', replicas=1, dimension=768.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

## now we will upsert the data

In [8]:
# only do this if i want a local json file
import json
from datetime import datetime

# Assume all_embeddings and all_tokens are already defined

# reformat the data
upserts = {'vectors': []}
for i, (embedding, tokens) in enumerate(zip(all_embeddings, all_tokens)):
    vector = {
        'id': f'{i}',
        'metadata': {
            'tokens': tokens,
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # Place timestamp within metadata
        },
        'values': embedding.tolist()
    }
    upserts['vectors'].append(vector)

# Add namespace at the root level
upserts['namespace'] = 'websitenearme'

# save to JSON
with open('./upsert.json', 'w') as f:
    json.dump(upserts, f, indent=4)



In [9]:
# upsert data straight into pinecone
# import json
from datetime import datetime
from tqdm.auto import tqdm  # for progress bar

# Assume all_embeddings and all_tokens are already defined

# reformat the data
# data means the zipped embeddings and all tokens
# this correctly formats the json dump
# and is working to upload straight 
# into pincone as of Aguust 9, 2023
upserts = {'vectors': []}
for i, (embedding, tokens) in enumerate(zip(all_embeddings, all_tokens)):
    vector = {
        'id': f'{i}',
        'metadata': {
            'tokens': tokens,
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # Place timestamp within metadata
        },
        'values': embedding.tolist()
    }
    upserts['vectors'].append(vector)

# Add namespace at the root level
upserts['namespace'] = 'websitenearme'

# Use upserts['vectors'] as the dataset
dataset = upserts['vectors']

# Insert data as batches
batch_size = 100
for i in tqdm(range(0, len(dataset), batch_size)):
    # set end of current batch
    i_end = i + batch_size
    if i_end > len(dataset):
        # correct if batch is beyond dataset size
        i_end = len(dataset)
    batch = dataset[i: i_end]
    # Upsert the batch (assuming the structure of batch matches the expected format)
    index.upsert(vectors=batch)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# KEYWORD search
query_sentence = "prices"
xq = model.encode(query_sentence).tolist()
# get the data from Pinecone
result = index.query(xq, top_k=1, includeMetadata=True)
result

{'matches': [{'id': '25',
              'metadata': {'time_stamp': datetime.datetime(2023, 8, 9, 11, 13, 50),
                           'tokens': ['subscriptions', 'and', 'prices']},
              'score': 0.488167524,
              'values': []}],
 'namespace': ''}

In [16]:
# SEMANTIC search
query_sentence = "what are your prices"
xq = model.encode(query_sentence).tolist()
# get the data from Pinecone
result = index.query(xq, top_k=3, includeMetadata=True)
result


{'matches': [{'id': '17',
              'metadata': {'time_stamp': datetime.datetime(2023, 8, 9, 11, 13, 50),
                           'tokens': ['chatbot', 'types', 'and', 'prices']},
              'score': 0.294125766,
              'values': []},
             {'id': '24',
              'metadata': {'time_stamp': datetime.datetime(2023, 8, 9, 11, 13, 50),
                           'tokens': ['content',
                                      'generation-',
                                      '4',
                                      'posts',
                                      ',',
                                      '1',
                                      'per',
                                      'week',
                                      '$',
                                      '100',
                                      '/',
                                      'month']},
              'score': 0.264597684,
              'values': []},
             {'id': '15'

THIS IS EXTREMELY USEFUL INFO TO KNOW THAT A USER WOULD NOT BE ABLE TO ASK ABOUT PRICES AND GET BACK INFO ON SOME MAIN PRODUCTS SUCH AS:
WEBSITE DEVELOPMENT
OTHER SERVICES
SUBSCRIPTIONS
EMERGENCIES, ETC


there fore I must change the top_k=3 to a larger number to see all outputs and can more easily adjust vs messing with the prompts first


In [18]:
print("The row is enumerating the top_k=n value")
print(f"Query Sentence: {query_sentence}\n")
print("Results:\n")

# print results neatly:
import pandas as pd
from datetime import datetime

def dict_to_dataframe(data: dict) -> pd.DataFrame:
    """
    Convert the provided dictionary to a pandas DataFrame.
    
    Args:
    - data (dict): The dictionary containing token and metadata information.
    
    Returns:
    pd.DataFrame: The formatted DataFrame.
    """
    
    # Extract the 'matches' from the data
    matches = data['matches']
    
    # Prepare lists to store extracted data
    tokens_list, score_list, id_list = [], [], []

    # Iterate over matches and extract required data
    for match in matches:
        tokens = ', '.join(match['metadata']['tokens'])
        score = match['score']
        id = match['id']

        tokens_list.append(tokens)
        score_list.append(score)
        id_list.append(id)

    # Create a DataFrame
    df = pd.DataFrame({
        'Tokens': tokens_list,
        'Score': score_list,
        'ID': id_list
    })

    return df

# Test the function
# data = {
#     'matches': [
#         {'id': '17',
#          'metadata': {'time_stamp': datetime(2023, 8, 9, 11, 13, 50),
#                       'tokens': ['chatbot', 'types', 'and', 'prices']},
#          'score': 0.294125766,
#          'values': []},
#         {'id': '24',
#          'metadata': {'time_stamp': datetime(2023, 8, 9, 11, 13, 50),
#                       'tokens': ['content', 'generation-', '4', 'posts', ',', '1', 'per', 'week', '$', '100', '/', 'month']},
#          'score': 0.264597684,
#          'values': []},
#         {'id': '15',
#          'metadata': {'time_stamp': datetime(2023, 8, 9, 11, 13, 50),
#                       'tokens': ['multiple', 'chatbots', 'with', 'different', 'personalities', '/', 'purposes', '$', '100', 'each']},
#          'score': 0.259196728,
#          'values': []}
#     ],
#     'namespace': ''
# }

df = dict_to_dataframe(result)
print(df)


Query Sentence: what are your prices

Results:

                                              Tokens     Score  ID
0                        chatbot, types, and, prices  0.294126  17
1  content, generation-, 4, posts, ,, 1, per, wee...  0.264598  24
2  multiple, chatbots, with, different, personali...  0.259197  15
