## Load Text

In [4]:
# Read the whole HR policy document into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the source file.
with open("data/hr_policy.txt", "r", encoding="utf-8") as f:
    content = f.read()

In [5]:
import tiktoken


# Tokenizer used to measure text length in tokens rather than characters.
# NOTE(review): the embedding model used later in this notebook
# (text-embedding-ada-002) is tokenized with cl100k_base according to OpenAI's
# documentation — confirm whether p50k_base is intentional here.
tokenizer = tiktoken.get_encoding("p50k_base")


def tiktoken_len(text):
    """Return the number of tokens that `tokenizer` produces for `text`.

    Special-token checking is disabled (`disallowed_special=()`), so
    special-token strings appearing in `text` are encoded as ordinary text
    instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [6]:
# Quick sanity check: count the tokens in a short sample sentence.
tiktoken_len(
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)

28

## Chunking Function

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Splitter whose chunk size and overlap are measured with tiktoken_len,
# i.e. in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    # Candidate split points, from paragraph breaks down to single characters.
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

# Split the loaded policy text and preview the first chunk.
chunk = text_splitter.split_text(content)
chunk[0]

'HR POLICY MANUAL - LEAVE POLICY\nI. PURPOSE\nThis policy is designed to provide a clear and consistent understanding of the leave benefits provided by our company to its employees. It covers the rules and regulations regarding Vacation Leave, Sick Leave, and Service Incentive Leave.\nII. SCOPE\nThis policy applies to all regular full-time employees of the company, regardless of their position or department.\nIII. LEAVE POLICY\nA. Vacation Leave\n1.\tEligibility and Accrual: All regular full-time employees are eligible for Vacation Leave. Employees will earn 1.25 days of Vacation Leave per month of service, accruing to 15 days per year.\n2.\tApplication: Leave applications must be submitted through the Employee Self Service portal at least one day before the intended leave date. Approval from the immediate supervisor is required.\n3.\tUnused Leave: Unused Vacation Leave can be carried over to the next year. However, the total accumulated leave should not exceed 30 days. Any excess leav

## Create embedding

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from uuid import uuid4
import os

# Embedding client for OpenAI's text-embedding-ada-002 model.
# NOTE(review): presumably reads the OpenAI API key from the environment — confirm.
embed = OpenAIEmbeddings(model='text-embedding-ada-002')

# Embed all chunks with a single embed_documents call instead of one API
# round-trip per chunk; the result has one embedding per input text, in order.
chunk_embeddings = embed.embed_documents(chunk)

# One (id, embedding, metadata) tuple per chunk.
# NOTE(review): ids are random UUIDs, so re-running this cell and upserting again
# adds new records rather than overwriting the earlier ones.
vector = [
    (str(uuid4()), embedding, {"text": text})
    for text, embedding in zip(chunk, chunk_embeddings)
]

In [10]:
# Display the (id, embedding, metadata) tuples built in the previous cell.
# NOTE(review): this renders every embedding in full, which bloats the saved
# notebook; consider showing a summary (e.g. len(vector) and one id) instead.
vector

[('f0215f08-3bb8-4933-a08d-8a311a880b27',
  [0.0031668582262554426,
   0.024034974235727112,
   0.02202014546203329,
   -0.04625010324764336,
   -0.03288723038064544,
   0.008371295107581299,
   0.003961416285160742,
   -0.008618274394942714,
   -0.009040738132140758,
   0.0024064221954474066,
   -0.007233890445715695,
   -0.008065820382725979,
   -0.018432447026599852,
   0.027375693383799134,
   -0.011088065405911795,
   -0.0027703916593047233,
   0.014792753785010496,
   -0.04401429212400481,
   0.011478033437000247,
   0.010932079124798955,
   -0.006960913755276319,
   0.011465034036969362,
   -0.02556884569737407,
   0.030079466144743827,
   -0.0037111873806222394,
   0.0071298994364200445,
   0.03712487087794693,
   -0.010334129075119202,
   0.004224643891238224,
   -0.012056484386633672,
   0.02914354473277662,
   -0.004091405163195622,
   -0.0015899285803125287,
   0.008618274394942714,
   -0.021981147261940632,
   -0.005306802965617645,
   0.0012812045875260762,
   -0.00110409

#### Prep Pinecone Index

In [None]:
# import pinecone


# index_name = 'tk-policy'
# dimension=1536

# pinecone.init(
#         api_key="<your pinecone api key>",  # get yours from pinecone.io. there is a free tier.
#         environment="<your pinecone environment>"  
# )

# # delete index if it exists
# if index_name in pinecone.list_indexes():
#     pinecone.delete_index(index_name)

# # create index
# pinecone.create_index(
#         name=index_name,
#         metric='cosine',
#         dimension=dimension       
# )

## Upsert vectors to index

In [11]:
import os

import pinecone

# Connect to Pinecone using credentials taken from the environment rather than
# literals in the notebook, so secrets are never saved or committed with it.
# The API key that was previously hardcoded in this cell should be treated as
# compromised and rotated.
#   PINECONE_API_KEY      - required; a KeyError is raised if it is not set.
#   PINECONE_ENVIRONMENT  - optional; falls back to the environment used so far.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp-free"),
)

# Handle to the existing 'tk-policy' index that the vectors are upserted into.
index = pinecone.Index('tk-policy')

  from tqdm.autonotebook import tqdm


In [12]:
# Upper bound on the number of vectors sent in one upsert request; Pinecone
# limits the size of a single request, so larger documents must be sent in slices.
UPSERT_BATCH_SIZE = 100

# `values` / `include_metadata` are read-side (query/fetch) options, not upsert
# parameters, so only the vectors themselves are passed here.
for start in range(0, len(vector), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vector[start:start + UPSERT_BATCH_SIZE])

# Show the index statistics to confirm the vectors were stored.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}