In [7]:
import pandas as pd
import numpy as np
from openai import OpenAI

client = OpenAI(
      api_key=os.getenv('OPENAI_KEY')
)

In [17]:
# function to generate vector embeddings with the OpenAI text-embedding-3-large API from local CSV files
def generate_vector_embeddings(input_xls_file, output_csv_file):
    # import necessary libraries

    # load the input CSV file
    df = pd.read_excel(input_xls_file)

    # create a list of the text data
    text_data = df['conversation'].tolist()

    # generate vector embeddings for the text data
    vector_embeddings = []
    for text in text_data: 
        response = client.embeddings.create(
        model="text-embedding-3-small",
        dimensions=1536,
        input=text    # return_numpy=True,
    )
        vector_embeddings.append(response.data[0].embedding)
    
    # convert the list of vector embeddings to a numpy array
    # vector_embeddings = np.array(vector_embeddings)

    # add the vector embeddings to the output CSV file
    df['vector_embeddings'] = vector_embeddings

    # save the output CSV file
    df.to_csv(output_csv_file, index=False)

In [18]:
generate_vector_embeddings("data/support_tickets_sm.xlsx","data/output.csv")

In [28]:
df = pd.read_excel("data/support_tickets_sm.xlsx")

sample = df['conversation'][0:10]

sample

0    Agent: Thank you for calling BrownBox Customer...
1    Agent: Thank you for calling BrownBox customer...
2    Agent: Thank you for calling BrownBox Customer...
3    Customer: Hi, I am facing an issue while loggi...
4    Agent: Thank you for contacting BrownBox custo...
5    Agent: Thank you for calling BrownBox customer...
6    Agent: Hello, thank you for contacting BrownBo...
7    Agent: Hello, thank you for calling BrownBox C...
8    Agent: Hello, thank you for contacting BrownBo...
9    Agent: Thank you for calling BrownBox Customer...
Name: conversation, dtype: object

In [29]:
response = client.embeddings.create(
        model="text-embedding-3-small",
        dimensions=1536,
        input=sample    # return_numpy=True,
    )
len(response.data)

10

In [55]:
import inspect
for i in response.data:
    print(i.index)

0
1
2
3
4
5
6
7
8
9


In [1]:
import pinecone_datasets

dataset = pinecone_datasets.load_dataset('wikipedia-simple-text-embedding-ada-002-100K')
len(dataset)

  from tqdm.autonotebook import tqdm


100000

In [99]:
inspect(dataset)

NameError: name 'dataset' is not defined

In [6]:
# we drop sparse_values as they are not needed for this example
dataset.documents.drop(['metadata'], axis=1, inplace=True)
dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)
# we will use rows of the dataset up to index 30_000
dataset.documents.drop(dataset.documents.index[30_000:], inplace=True)

In [None]:
pinecone_api_key = api_key=os.getenv('PINECONE_KEY')
openai_api_key = os.getenv('OPENAI_KEY')

In [150]:
import os
pinecone_api_key = os.getenv('PINECONE_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
use_serverless = True

In [10]:
# Langchain quickstart
from pinecone import Pinecone, ServerlessSpec, PodSpec
import time

# configure client
pc = Pinecone(api_key=pinecone_api_key)

if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-west-2')
else:
    # if not using a starter index, you should specify a pod_type too
    spec = PodSpec()

# check for and delete index if already exists
index_name = 'langchain-retrieval-augmentation-fast'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# create a new index
pc.create_index(
    index_name,
    dimension=1536,  # dimensionality of text-embedding-ada-002
    metric='dotproduct',
    spec=spec
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [11]:
index = pc.Index(index_name)
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [12]:
for batch in dataset.iter_documents(batch_size=100):
    index.upsert(batch)

## native quickstart

In [13]:
# Create index
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

pc.create_index(
    name="servicecloud",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud='aws', 
        region='us-west-2'
    ) 
) 

# Indexing with native Pinecone SDK

In [22]:
def get_embeddings_batch(list):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        dimensions=1536,
        input=list    # return_numpy=True,
    )

### setup and embeddings

In [57]:
import os
import pandas as pd
from openai import OpenAI

CHUNK_SIZE=15
PINECONE_API_KEY = os.getenv('PINECONE_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_KEY')

client = OpenAI(
      api_key=OPENAI_API_KEY
)

def get_embeddings_batch(list):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        dimensions=1536,
        input=list    
    )
    return response

### import CSV, generate and store embeddings

In [97]:

# import csv data, get embeddings by chunk, append to dataframe, write out a copy to CSV
def process_csv(input_file, input_column_index, output_file):
    dflist = []
    for chunk in pd.read_csv(input_file, chunksize=CHUNK_SIZE,engine='python',nrows=100):
        tempdf = chunk
        embeddingslist = []
        embeddings = get_embeddings_batch(chunk[input_column_index].tolist())
        for i in embeddings.data: #list comprehension here? 
            embeddingslist.append(i.embedding)
        tempdf['embedding']=embeddingslist
        dflist.append(tempdf)
    df_with_embeddings = pd.concat(dflist) 
    df_with_embeddings.to_csv(output_file) # can we chain these?    
    return df_with_embeddings
        

In [137]:
df_embeddings = process_csv('data/support_tickets_out.csv','conversation','output_sm.csv')

### create pinecone index if it doesn't exist

In [133]:
# done
from pinecone import Pinecone, ServerlessSpec
from pinecone.core.client import exceptions
pc = Pinecone(api_key=PINECONE_API_KEY)

indexes = ['servicecloud', 'testindex5']
for i in indexes: 
    try: 
        pc.create_index(
        name=i,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west-2"
            )
        )
        print('created index: ' + i)
    except exceptions.PineconeApiException as e:
        if e.status == 409:
            print("error calling pinecone API: " + e.body)
        else: 
            raise


error calling pinecone API: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}
error calling pinecone API: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


### upsert documents from a dataframe

In [151]:
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index('testindex')

In [164]:

def upsert_to_pinecone(pc_client,df, namespace, PINECONE_BATCHSIZE=100):
     for i in range(0, len(df), PINECONE_BATCHSIZE):
     # Get the current chunk of data.
          batch = df[i:i+100]
          # batchvectors = dict(zip(batch['rowno'],batch['embedding']))
          batchvectors = []
          for i,row in batch.iterrows(): 
               batchvectors.append(dict(id=str(row['rowno']), 
                                        metadata=dict(conversation=row['conversation']),
                                        values=row['embedding']))
          print(batchvectors[0])
          pc_client.upsert(
               vectors=batchvectors,
               namespace=namespace
          )


In [165]:
upsert_to_pinecone(pc_client=index,df=df_embeddings,namespace='servicecloud')

{'id': '0', 'metadata': {'conversation': "Agent: Thank you for calling BrownBox Customer Support. My name is Tom. How may I assist you today?\n\nCustomer: Hi Tom, I'm trying to log in to my account to purchase an Oven Toaster Grill (OTG), but I'm unable to proceed as it's asking for mobile number or email verification. Can you help me with that?\n\nAgent: Sure, I can assist you with that. May I know your registered mobile number or email address, please?\n\nCustomer: My registered mobile number is +1 123-456-7890.\n\nAgent: Thank you. Let me check that for you. I'm sorry to inform you that we don't have this number on our records. Can you please confirm if this is the correct number?\n\nCustomer: Oh, I'm sorry. I might have registered with a different number. Can you please check with my email address instead? It's johndoe@email.com.\n\nAgent: Sure, let me check that for you. (After a few moments) I see that we have your email address on our records. We'll be sending you a verification