# Prepare data and load into the vector database

In [80]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# Load settings from a .env file into the process environment (python-dotenv).
# PINECONE_API_KEY is read from os.environ further down in this file.
load_dotenv()

True

In [85]:
import pandas as pd

# Raw event export. The cells below read these columns from it:
# 'Event Id', 'Event Name', 'Start Date', 'End Date', 'City', 'Country Code',
# 'Checkin Count' and 'Bookmark Count'.
df = pd.read_csv('data-1717596851080.csv')
def prepare_text(row):
    """Render one event record as a single comma-separated description.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) exposing
            the keys 'Event Name', 'Start Date', 'End Date', 'City' and
            'Country Code'.

    Returns:
        A string of the form
        ``"Event Name: ..., Start Date: ..., End Date: ..., City: ...,
        Country Code: ..."`` with exactly one space after every comma.
    """
    # Adjacent f-string literals are joined by the parser. A backslash
    # continuation inside a single literal would instead pull the next
    # line's indentation into the text.
    return (
        f"Event Name: {row['Event Name']}, "
        f"Start Date: {row['Start Date']}, "
        f"End Date: {row['End Date']}, "
        f"City: {row['City']}, "
        f"Country Code: {row['Country Code']}"
    )

# Embedding client; the model name selects which OpenAI embedding model is used.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# One description string per DataFrame row (prepare_text is applied row-wise),
# plus the ids taken from the same frame so both lists share the row order.
texts = df.apply(prepare_text, axis="columns").tolist()
event_ids = df["Event Id"].tolist()

# Embed every description in a single embed_documents call.
embedding_vectors = embeddings.embed_documents(texts)

In [139]:
# Cast the counter columns to integers before they are copied into metadata.
df['Checkin Count'] = df['Checkin Count'].astype(int)
df['Bookmark Count'] = df['Bookmark Count'].astype(int)
# Replace every remaining missing value with the literal string "NaN".
df.fillna("NaN", inplace=True)

# Build one upsert record per event: id, embedding values and metadata.
#
# Each row is materialised once through iterrows() and zipped with its id and
# embedding, instead of calling df.iloc[idx] for every metadata field.
#
# NOTE(review): 'text' is regenerated here, after fillna, while the strings
# that were embedded were built before it. For rows with missing fields the
# stored text may say "NaN" where the embedded text said "nan" - confirm
# this is acceptable.
pinecone_vectors = [
    {
        'id': str(event_id),
        'values': embedding,
        'metadata': {
            'Event Name': row['Event Name'],
            'Start Date': row['Start Date'],
            'End Date': row['End Date'],
            'City': row['City'],
            'Country Code': row['Country Code'],
            # int() turns pandas/numpy integers into plain Python ints.
            'Checkin Count': int(row['Checkin Count']),
            'Bookmark Count': int(row['Bookmark Count']),
            'text': prepare_text(row),
        },
    }
    for event_id, embedding, (_, row) in zip(
        event_ids, embedding_vectors, df.iterrows()
    )
]

In [141]:
from pinecone import Pinecone
import os
# Name of the Pinecone index that receives the event vectors.
index_name = "events-qa-index"

# The API key comes from the environment; os.getenv yields None when unset.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(index_name)

In [143]:
def upload_in_batches(index, vectors, batch_size=100):
    """Upsert ``vectors`` into ``index`` in consecutive chunks.

    Args:
        index: Object exposing ``upsert(vectors=...)``; it is called once per
            non-empty chunk.
        vectors: Iterable of records to upload. Lists, tuples and one-shot
            iterables such as generators are all accepted; the iterable is
            consumed exactly once, in order.
        batch_size: Maximum number of records sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative value used
            to upload nothing without any error.
    """
    from itertools import islice  # local import keeps the cell self-contained

    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    remaining = iter(vectors)
    while True:
        # islice pulls at most batch_size items; an empty list means the
        # iterable is exhausted.
        batch = list(islice(remaining, batch_size))
        if not batch:
            break
        index.upsert(vectors=batch)

# Push every prepared record to the index, at most 100 per upsert call.
upload_in_batches(index=index, vectors=pinecone_vectors, batch_size=100)