In [14]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Qdrant

import csv
import os
import sys

In [10]:
# Point the loader at the cleaned help CSV; the "url" column is passed as the
# loader's source column.
loader = CSVLoader(
    file_path="/mnt/code/data/clean_help.csv",
    source_column="url",
)

# Raise the csv module's per-field size limit to the largest value Python
# offers before any rows are read, so very long fields are accepted.
csv.field_size_limit(sys.maxsize)

# Read every row of the CSV into memory as documents.
data = loader.load()

In [11]:
# Splitter configuration: maximum characters per chunk, no overlap between
# neighbouring chunks, and surrounding whitespace trimmed from each chunk.
chunk_size = 5000
chunk_overlap = 0
strip_whitespace = True

# Text that precedes the URL value in a document's content. The loader is
# created with source_column="url"; this assumes the CSV's first column is
# "url" and is rendered as "url: <value>" -- TODO confirm against the file.
URL_LINE_PREFIX = "url: "


def _strip_url_line(text, prefix=URL_LINE_PREFIX):
    """Return ``text`` without its leading URL line.

    Only a first line that starts with ``prefix`` is removed. Any other text
    is returned unchanged, so chunks that continue an article that was split
    in the middle keep their first line instead of losing it.

    Args:
        text: Content of one chunk.
        prefix: Marker identifying the URL line.

    Returns:
        The text after the URL line, or ``""`` when the chunk held nothing
        but the URL line, or ``text`` itself when it has no URL line.
    """
    if not text.startswith(prefix):
        return text
    _, _, remainder = text.partition("\n")
    return remainder


splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
)

# Split the documents already loaded into `data` rather than calling
# loader.load_and_split(), which would read and parse the CSV a second time.
article_text = splitter.split_documents(data)

# Refine texts: remove the URL line from the chunks that actually carry it.
for chunk in article_text:
    chunk.page_content = _strip_url_line(chunk.page_content)

# A chunk that consisted solely of the URL line is now empty; drop it so it is
# not embedded and indexed as a content-free document.
article_text = [chunk for chunk in article_text if chunk.page_content.strip()]

# Downstream cells read the chunks through article_texts[0].
article_texts = [article_text]

In [12]:
# Spot-check a single chunk to eyeball the quality of the refined text
sample_batch = article_texts[0]
sample_batch[20].page_content

'text: How Do We Make Money? Our business is based on a simple idea: When you follow our links to visit a store, that store pays us a commission on whatever you buy during your visit. We then share that commission with you, our members, as Cash Back. In other words, retailers pay to partner with us because we send shoppers to their websites or brick-and-mortar stores. They may also pay us to help grow their brand. That means we might feature them on our website, in emails, etc. Because of these partnerships, members like you benefit by earning Cash Back and getting great offers. To sum it up, we help retailers succeed and, at the same time, help our members save money. What does it mean when Rakuten states that a cash back percentage "was" a certain amount? Stores pay us a commission for sending our members to their websites, and we share it with you as cash back. When a store joins Rakuten, we set a cash back percentage for that store (a "base cash back percentage"). We work with our 

In [15]:
# Embedding model setup: which model to load, where to run it, and how to
# encode (embeddings are normalized).
embedding_model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Exported before the model object is built.
# NOTE(review): presumably this is the directory sentence-transformers uses to
# cache downloaded model files -- confirm.
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/mnt/code/model_cache/"

embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=embedding_model_name,
)

In [16]:
# The API key is read from the environment so it is never stored in the
# notebook. Fail fast with a clear message when it is missing, instead of
# passing api_key=None and failing later with an opaque authentication error.
qdrant_key = os.environ.get("QDRANT_KEY")
if not qdrant_key:
    raise RuntimeError(
        "QDRANT_KEY environment variable is not set; "
        "cannot authenticate against the Qdrant cluster."
    )

qdrant_url = "https://59f8f159-fb60-44e8-bfc4-9f35c77ca8d4.us-east4-0.gcp.cloud.qdrant.io:6333"

# Embed the refined chunks and upload them to the "rakuten" collection.
# NOTE(review): re-running this cell against an existing collection appears to
# add the documents again rather than replace them -- confirm, and consider
# force_recreate=True if a clean rebuild is intended.
doc_store = Qdrant.from_documents(
    article_texts[0],
    embedding=embeddings,
    url=qdrant_url,
    api_key=qdrant_key,
    collection_name="rakuten",
)