In [None]:
pip install cx-copilot

In [None]:
!pip install pandas
!pip install numpy
!pip install -qU pip pinecone-client

In [24]:
from cx_copilot import OpenAIEmbeddingBlock, PineconeVectorDBBlock, GPTCompletionBlock, CXCopilot
import pandas as pd
import numpy as np
import pinecone
import random
import itertools

# The key / datacenter strings below are placeholders: substitute real values
# before running, and avoid committing real credentials with the notebook.
OPEN_AI_KEY = 'YOUR_OPEN_AI_KEY'

# Embedding and completion blocks share the same OpenAI key.
embedding = OpenAIEmbeddingBlock(open_ai_key=OPEN_AI_KEY)
gpt = GPTCompletionBlock(open_ai_key=OPEN_AI_KEY)

# Vector database block, built from the Pinecone key and datacenter placeholders.
database = PineconeVectorDBBlock("PINECONE_KEY", "PINECONE_DATACENTER")

In [26]:
# Read the question/answer CSV into a DataFrame.
input_datapath = '../data/Substack_Data.csv'
df = pd.read_csv(input_datapath)

# Quick sanity check: which columns were loaded and how many rows.
print("Columns: ", df.columns)
print("Length: ", df.shape[0])

# Show three random rows as the cell's output.
df.sample(3)

Columns:  Index(['Question', 'Answer'], dtype='object')
Length:  374


Unnamed: 0,Question,Answer
103,How do I import my mailing list from another p...,Have a list of emails you'd like to import to ...
70,Some things to keep in mind when using sections:,We recommend using sections for publishing new...
165,How can I add a guest author to a post?\r,Have an author you've been collaborating with ...


In [27]:
def embed(x):
    """Embed ``x`` with the notebook's embedding block.

    Returns whatever ``embedding.embed_text(x)`` returns. If that call raises
    an ordinary exception, the string ``'error:' + str(x)`` is returned
    instead so that a single failing row does not abort a whole ``apply``.
    """
    try:
        return embedding.embed_text(x)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and a long
        # embedding run can be interrupted.
        return 'error:' + str(x)

# Embed every answer; embed() is passed directly since it already takes one value.
df['embedded_answers'] = df['Answer'].apply(embed)

In [28]:
# Build the (id, vector, metadata) tuples that will be sent to the index.
to_upsert = []

# ``row_id`` instead of ``id`` so the builtin is not shadowed for later cells.
for row_id, row in df.iterrows():
    answer = row['Answer']
    question = row['Question']

    # Missing CSV cells are not '' but non-string values (NaN), so require
    # real strings here; an empty answer is skipped as well.
    if not isinstance(answer, str) or not isinstance(question, str):
        continue
    if answer == '':
        continue

    vector = row['embedded_answers']
    # embed() returns an 'error:...' string when embedding failed; such a
    # value is not a vector and must not be queued for upsert.
    if isinstance(vector, str):
        continue

    meta = {'value': answer, 'question': question}
    to_upsert.append((str(row_id), vector, meta))

print(len(to_upsert))

374


In [31]:
# Initialise the Pinecone client (both values are placeholders to be replaced).
pinecone.init(api_key="PINECONE_KEY", environment="DATA")

index_name = 'substack-answers'

# Create the index only if it does not exist yet.
# NOTE(review): 1536 is presumably the size of the embedding vectors produced
# earlier in the notebook — confirm against the embedding model.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(index_name, dimension=1536)

# Open a handle to the index, then print its description.
index = pinecone.Index(index_name)
print(pinecone.describe_index(index_name))

IndexDescription(name='substack-answers', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


In [32]:

def chunks(iterable, batch_size=100):
    """Lazily split ``iterable`` into tuples of at most ``batch_size`` items.

    Items keep their original order; the final tuple may be shorter than
    ``batch_size``. An empty iterable yields nothing.
    """
    source = iter(iterable)

    def take_batch():
        # Pull the next (up to) ``batch_size`` items from the shared iterator.
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling take_batch() until it returns
    # the empty tuple, i.e. until the source is exhausted.
    yield from iter(take_batch, ())

# Upsert data with 100 vectors per upsert request.
# The index handle is the same for every batch, so it is created once here
# instead of once per iteration.
index = pinecone.Index(index_name)

for ids_vectors_chunk in chunks(to_upsert, batch_size=100):
    try:
        index.upsert(vectors=ids_vectors_chunk)
    except Exception as e:
        # Best effort: report the error together with the metadata of the
        # first record in the failing batch, then carry on with the next batch.
        print(e)
        print(ids_vectors_chunk[0][2])