### Indexing Data
This notebook indexes a subset of the [Yelp Reviews Dataset](https://business.yelp.com/data/resources/open-dataset/) into a Qdrant vector database for retrieval-augmented generation (RAG). 
This is a **research-only** dataset released by Yelp for educational purposes. 
More details about the license and terms of use of the dataset can be found in the [description of the dataset](../../data/Yelp_Dataset_Documentation_and_ToS_copy.pdf).

In [1]:
# Import wall
import tiktoken
import os
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http.models import Distance, VectorParams
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredHTMLLoader
from langchain_community.vectorstores import Qdrant
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

In [2]:
# Load environment variables via python-dotenv; later cells read the
# Hugging Face token and the Qdrant URL / API key from os.environ.
from dotenv import load_dotenv

_ = load_dotenv()  # bound to `_` so the cell renders no output

In [3]:
import os

# Directory holding the Yelp JSON files, relative to this notebook.
DATA_ROOT = "../../data/yelp/Yelp_JSON/"

# Full path of every dataset file, each joined onto DATA_ROOT.
# DATA_SUBSET is the JSON Lines file the loader cell reads.
(REVIEWS, BUSINESS, CHECKIN, TIP, USER, DATA_SUBSET) = (
    os.path.join(DATA_ROOT, filename)
    for filename in (
        "yelp_academic_dataset_review.json",
        "yelp_academic_dataset_business.json",
        "yelp_academic_dataset_checkin.json",
        "yelp_academic_dataset_tip.json",
        "yelp_academic_dataset_user.json",
        "review_text.jsonl",
    )
)

In [9]:
import numpy as np
from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Read DATA_SUBSET as JSON Lines, taking the `.full_review` field of each
# record as the document text.
loader = JSONLoader(file_path=DATA_SUBSET, jq_schema=".full_review", json_lines=True)

# lazy_load() is consumed incrementally by the next cell, which stops once
# it has collected the desired number of reviews.
documents = loader.lazy_load()

In [10]:
# Upper bound on how many reviews are pulled from the lazy loader.
SUBSET_SIZE = 500_000

# Pairing the stream with range(SUBSET_SIZE) stops iteration after at most
# SUBSET_SIZE documents (or earlier if the file is shorter).
docs = [doc for _, doc in zip(range(SUBSET_SIZE), documents)]

# From here on `documents` is the materialised list, not the lazy stream.
documents = docs

In [11]:
# Split every review into chunks of at most 700 characters, with 250
# characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # lengths are measured with len(), i.e. in characters
    chunk_size=700,
    chunk_overlap=250,
)
documents = text_splitter.split_documents(documents)

In [15]:
# Reduce each chunk's `source` metadata to the text after the last "/",
# i.e. the file name without its directory part.
for chunk in documents:
    source_path = chunk.metadata["source"]
    chunk.metadata["source"] = source_path.rsplit("/", 1)[-1]

In [16]:
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

# URL of the embedding endpoint used for this run.
# REPLACE THIS WITH APPROPRIATE DEPLOYMENT URL
EMBED_MODEL_URL = "https://klnki3w1q88gr09t.us-east-1.aws.endpoints.huggingface.cloud"

# HF_TOKEN must be present in the environment: os.environ[...] raises
# KeyError when it is missing.
embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
    task="feature-extraction",
    model=EMBED_MODEL_URL,
)

In [17]:
# Connect to the Qdrant cluster (the cluster is named "bitter_mammal").
# The variables are indexed with os.environ[...] rather than .get() so that
# a missing setting raises KeyError right here, the same way HF_TOKEN is
# read, instead of silently passing None to the client.
client = QdrantClient(
    url=os.environ["QDRANT_DB_BITTER_MAMMAL"],
    api_key=os.environ["QDRANT_API_KEY_BITTER_MAMMAL"],
)

In [18]:
# Create the collection only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an existing collection.
if not client.collection_exists("yelp_reviews"):
    client.create_collection(
        collection_name="yelp_reviews",
        # NOTE(review): size must equal the dimensionality of the vectors the
        # embedding endpoint returns — presumably 1024; confirm for the
        # deployed model.
        vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
    )

True

In [19]:
# LangChain vector-store wrapper around the "yelp_reviews" collection,
# using the endpoint-backed embedding object defined earlier.
vector_store = QdrantVectorStore(
    embedding=embeddings,
    collection_name="yelp_reviews",
    client=client,
)



In [20]:
from tqdm import tqdm
def add_documents(vector_store, documents, start=0, batch_size=10,
                  checkpoint_path="checkpoint.txt"):
    """Add ``documents`` to ``vector_store`` in batches, recording progress.

    After each batch is added, the index of the next document still to be
    added is written to ``checkpoint_path``. Passing that number back as
    ``start`` resumes an interrupted run.

    Args:
        vector_store: Object exposing ``add_documents(documents=...)``.
        documents: Sliceable sequence of documents to add.
        start: Index of the first document to add (default 0).
        batch_size: Number of documents sent per call (default 10).
        checkpoint_path: File that receives the resume index after each
            batch (default "checkpoint.txt").

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for i in tqdm(range(start, len(documents), batch_size)):
        batch = documents[i:i + batch_size]
        vector_store.add_documents(
            documents=batch,
        )
        # `i` already starts at `start`, so adding `start` again would
        # inflate the checkpoint on resumed runs. len(batch) keeps the
        # value exact for a short final batch.
        with open(checkpoint_path, "w") as f:
            f.write(str(i + len(batch)))

In [21]:
# Index every chunk. This is the expensive step: the recorded run below
# took more than five hours.
add_documents(vector_store=vector_store, documents=documents)


100%|██████████| 83764/83764 [5:19:24<00:00,  4.37it/s]    
