## 1. Document Loading & Splitting

In [1]:
from langchain.document_loaders import TextLoader
loader = TextLoader("content_test.txt")
pages = loader.load() #list of docs

In [2]:
type(pages[0])

langchain.schema.document.Document

In [3]:
len(pages)

1

In [4]:
page = pages[0]
page.page_content[:500]

'Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The award'

In [5]:
page.metadata

{'source': 'content_test.txt'}

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

![alternative text](obrazki/splits.png)


In [7]:
chunk_size = 26
chunk_overlap = 4

![alternative text](obrazki/chunki.png)

In [8]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [9]:
text1 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

## 2. Vectorstores & Embeddings

In [10]:
chunk_size = 1500
chunk_overlap = 150

In [11]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)


In [12]:
splits = r_splitter.split_documents(pages)

In [13]:
#splits

In [14]:
len(splits)

48

[embedding model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

In [15]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [16]:
from langchain.vectorstores import Chroma

In [17]:
persist_directory = 'docs/chroma/'

In [18]:
!rm -rf ./docs/chroma  # remove old database files if any

In [19]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

Using embedded DuckDB with persistence: data will be stored in: docs/chroma/


In [20]:
print(vectordb._collection.count())

48


In [21]:
question = "What is CodiStars Program?"

In [22]:
docs = vectordb.similarity_search(question,k=3)

In [23]:
docs

[Document(page_content="Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The awards are given in 3 categories that relate directly to our values: Team Up To Win - Team Player Disrupt category is Grow - Improver Act to Deliver category - Do-er category Below you will find a detailed description of each category: Team Player - a distinction for a person whose actions have positively influenced contacts between employees and the mood in teams. It can be organizing integration, helping a teammate, or engaging in additional activities such as volunteering. 

In [24]:
docs[0].page_content

"Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The awards are given in 3 categories that relate directly to our values: Team Up To Win - Team Player Disrupt category is Grow - Improver Act to Deliver category - Do-er category Below you will find a detailed description of each category: Team Player - a distinction for a person whose actions have positively influenced contacts between employees and the mood in teams. It can be organizing integration, helping a teammate, or engaging in additional activities such as volunteering. Improver - a distinctio

In [25]:
vectordb.persist()

## 3. Question Answering - RetreivalQA

![alternative text](obrazki/retreivalQA.png)


In [26]:
from langchain.chains import RetrievalQA

In [27]:
from typing import Dict, Any, List
from langchain.callbacks.stdout import StdOutCallbackHandler

class CustomStdOutCallbackHandler(StdOutCallbackHandler):
    def on_llm_start(
            self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        print("***")
        print(prompts)
        print("***")

    def on_chain_start(
            self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
    ) -> None:
        print("***")
        print(inputs)
        print("***")

In [28]:
from langchain import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-base",
    task="text2text-generation",
    model_kwargs={"temperature": 0, "max_length": 600},
    callbacks=[CustomStdOutCallbackHandler()],
)


In [29]:
from langchain import PromptTemplate

template = """
Use provided context, otherwise do not make the answer up.
{context}
Question: {question}
Answer:"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])


chain_type_kwargs = {"prompt": prompt}

In [30]:
qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vectordb.as_retriever(),
        return_source_documents=True, chain_type_kwargs=chain_type_kwargs, callbacks=[CustomStdOutCallbackHandler()])

In [31]:
query = input("\nEnter a query: ")
print(llm.generate(prompts=[query]))
res = qa(query)
answer, docs = res['result'], res['source_documents']
print("\n\n> Question:")
print(query)
print("\n> Answer:")
print(answer)


***
['Provide the rules for using LuxMed card']
***




generations=[[Generation(text='LuxMed card is a card that is used to pay for medical services.', generation_info=None)]] llm_output=None run=[RunInfo(run_id=UUID('59e11750-d4ae-4914-a456-9b0209d2c44a'))]
***
{'query': 'Provide the rules for using LuxMed card'}
***


Token indices sequence length is longer than the specified maximum sequence length for this model (1480 > 512). Running this sequence through the model will result in indexing errors


***
['\nUse provided context, otherwise do not make the answer up.\nPage Title: Luxmed card / Luxmed card Content:INLINE General rules for using LUX MED services: By joining the LUX MED Program, you undertake to use the services for a minimum of one year. You cannot cancel the package earlier, the only exception to this rule is the termination of cooperation with CodiLime. This also applies to your family members and partners If you register your family members or partners for the Family Package, it must be identical to the Individual package you have chosen. Persons who want to receive care at LUX MED are required to submit a completed accession form to the HR and payroll department (payroll @codilime.com) by the 20th day of the month preceding the month from which the package is to apply at the latest (forms to be completed below). The co-financing for the card from CodiLime is granted to employees with at least 3 months of work experience in the company. Before they work in the comp