# Motivation
This notebook indexes all of the data in the [Kaggle Jobs Dataset](https://www.kaggle.com/datasets/promptcloud/indeed-job-posting-dataset) into a Qdrant Cloud database so that it is available to the chatbot. The preprocessed data is read from the file `../data/jobs.csv`.

In [21]:
import tiktoken
import os
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http.models import Distance, VectorParams
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredHTMLLoader
from langchain_community.vectorstores import Qdrant
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

In [22]:
# Load variables from a local .env file into the process environment; later
# cells read their credentials from os.environ. The return value is bound to
# `_` so the cell does not echo it.
from dotenv import load_dotenv

_ = load_dotenv()

In [31]:
import numpy as np
from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def get_median_document_length(documents: "List[Document]") -> float:
    """Return the median character length of ``page_content`` across documents.

    NOTE(review): this helper was called but not defined anywhere in the
    notebook, so the cell raised NameError on a fresh kernel. The definition
    here is a reconstruction (median of ``len(doc.page_content)``) - confirm
    it matches the original intent.

    Args:
        documents: Loaded LangChain documents.

    Returns:
        The median length as a float, or 0.0 when ``documents`` is empty.
    """
    if not documents:
        return 0.0
    return float(np.median([len(doc.page_content) for doc in documents]))


# Load the preprocessed job postings from the CSV file.
loader = CSVLoader('../data/jobs.csv')
documents = loader.load()

# Median document length, kept for reference; the splitter below does not use it.
median_len = get_median_document_length(documents)

# Split into 500-character chunks with a 250-character overlap between
# neighbouring chunks; length is measured in characters (len).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=250,
    length_function=len,
)
documents = text_splitter.split_documents(documents)

In [32]:
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

# URL of the Hugging Face inference endpoint that serves the embedding model.
EMBED_MODEL_URL = "https://kokpt1wbsv2ul4jl.us-east-1.aws.endpoints.huggingface.cloud"

# Embedding client used by the vector store; the access token is read from
# the HF_TOKEN environment variable (KeyError if it is not set).
embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
    task="feature-extraction",
    model=EMBED_MODEL_URL,
)

# Build the Vectorstore

In [41]:
# Connect to the Qdrant cluster (the cluster is named bitter_mammal).
# Strict os.environ[...] lookups are used, matching the HF_TOKEN lookup in the
# embeddings cell: os.environ.get would silently pass None to the client when
# a variable is missing instead of failing here with a clear KeyError.
client = QdrantClient(
    url=os.environ["QDRANT_DB_BITTER_MAMMAL"],
    api_key=os.environ["QDRANT_API_KEY_BITTER_MAMMAL"],
)

In [42]:
# Create the collection only if it does not exist yet, so this cell can be
# re-run (e.g. after a kernel restart) without failing on an existing
# collection.
# NOTE(review): size=768 must equal the embedding endpoint's output
# dimension - confirm against the deployed model.
if not client.collection_exists(collection_name="indeed_jobs_db3"):
    client.create_collection(
        collection_name="indeed_jobs_db3",
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

True

In [43]:
# LangChain vector-store wrapper over the "indeed_jobs_db3" collection, using
# the Qdrant client and the embedding model configured in the earlier cells.
vector_store = QdrantVectorStore(
    embedding=embeddings,
    collection_name="indeed_jobs_db3",
    client=client,
)

In [48]:
from tqdm import tqdm
def add_documents(vector_store, documents, start=0, batch_size=10,
                  checkpoint_path="checkpoint.txt"):
    """Add documents to the vector store in batches, checkpointing progress.

    After each successful batch the checkpoint file is overwritten with the
    absolute offset of the next document still to be indexed, so an
    interrupted run can be resumed with
    ``add_documents(store, all_docs[offset:], start=offset)``.

    Args:
        vector_store: Object exposing ``add_documents(documents=...)``.
        documents: Sequence of documents to add (sliceable, supports ``len``).
        start: Offset of ``documents[0]`` within the full corpus; added to
            the value written to the checkpoint.
        batch_size: Number of documents sent per ``add_documents`` call.
        checkpoint_path: File that receives the resume offset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for i in tqdm(range(0, len(documents), batch_size)):
        batch = documents[i:i + batch_size]
        vector_store.add_documents(
            documents=batch,
        )
        # Use the real batch length: the last batch may be shorter than
        # batch_size, and the checkpoint must not point past the documents
        # that were actually indexed.
        with open(checkpoint_path, "w") as f:
            f.write(str(start + i + len(batch)))

In [None]:
# Resume indexing part-way through the corpus. The offset is bound once so
# the slice and the `start` argument (used for the checkpoint value) cannot
# drift apart.
# NOTE(review): 110610 is presumably the offset recorded in checkpoint.txt by
# an earlier, interrupted run - verify before re-running this cell.
RESUME_FROM = 110610
add_documents(vector_store, documents[RESUME_FROM:], start=RESUME_FROM)

 37%|███████████████████████████████████████████▎                                                                        | 16424/43931 [54:24<1:30:19,  5.08it/s]