## 1. Document Loading & Splitting

In [32]:
from langchain.document_loaders import TextLoader
# Load the raw text file as a list of documents.
# The encoding is pinned explicitly: the file contains non-ASCII characters
# (e.g. "Gdańsk"), and without an explicit encoding the read depends on the
# platform's default, which is not UTF-8 everywhere.
# NOTE(review): assumes content_test.txt is stored as UTF-8 — confirm.
loader = TextLoader("content_test.txt", encoding="utf-8")
pages = loader.load()  # list of docs

In [33]:
# Check what kind of object the loader returned for the first entry.
type(pages[0])

langchain.schema.document.Document

In [3]:
# Number of documents the loader produced.
len(pages)

1

In [34]:
# Take the first loaded document and preview the first 500 characters of its text.
page = pages[0]
page.page_content[:500]

'Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The award'

In [35]:
# Metadata attached to the loaded document.
page.metadata

{'source': 'content_test.txt'}

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

![alternative text](obrazki/splits.png)


In [40]:
# Deliberately tiny settings so the effect of the overlap is visible on a toy string.
chunk_size, chunk_overlap = 26, 4

![alternative text](obrazki/chunki.png)

In [41]:
# Splitter configured from the chunk_size / chunk_overlap values currently in scope.
r_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)

In [42]:
# 33-character sample: the full alphabet followed by 'abcdefg'. It is longer
# than the 26-character chunk size, so the splitter has to cut it.
text1 = 'abcdefghijklmnopqrstuvwxyz' + 'abcdefg'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

## 2. Vectorstores & Embeddings

In [11]:
# Chunk settings used for the documents that are embedded and stored below.
chunk_size, chunk_overlap = 1500, 150

In [12]:
# Rebuild the splitter so it picks up the chunk_size / chunk_overlap values currently in scope.
r_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)


In [13]:
# Split the loaded documents into chunks with the splitter configured above.
splits = r_splitter.split_documents(pages)

In [14]:
#splits

In [15]:
# Number of chunks produced by the split.
len(splits)

107

[embedding model](https://huggingface.co/hkunlp/instructor-xl)

In [16]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Embedding function backed by the "hkunlp/instructor-xl" model; it is handed to the vector store below.
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [17]:
from langchain.vectorstores import Chroma

In [18]:
# Directory passed to Chroma as the on-disk location of the vector store.
persist_directory = "docs/chroma/"

In [19]:
# Remove old database files, if any, so the vector store is rebuilt from scratch.
# Pure Python instead of a shell `rm -rf`: works without a POSIX shell and
# reuses `persist_directory` rather than repeating the path by hand.
import shutil

# ignore_errors=True mirrors `-f`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [20]:
# Build a Chroma vector store from the chunks, using the embedding function
# defined above and persist_directory as its storage location.
vectordb = Chroma.from_documents(persist_directory=persist_directory, embedding=embeddings, documents=splits)

Using embedded DuckDB with persistence: data will be stored in: docs/chroma/


In [21]:
# Print the count reported by the underlying collection.
# NOTE(review): `_collection` is a private attribute (leading underscore) —
# it may not be stable across library versions.
print(vectordb._collection.count())

107


In [22]:
# Sample question used to try out the similarity search below.
question = "What is CodiStars Program?"

In [23]:
# Run a similarity search for the question, asking for k=3 results.
docs = vectordb.similarity_search(question,k=3)

In [24]:
# Display the documents returned by the search.
docs

[Document(page_content="Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The awards are given in 3 categories that relate directly to our values: Team Up To Win - Team Player Disrupt category is Grow - Improver Act to Deliver category - Do-er category Below you will find a detailed description of each category: Team Player - a distinction for a person whose actions have positively influenced contacts between employees and the mood in teams. It can be organizing integration, helping a teammate, or engaging in additional activities such as volunteering. 

In [31]:
# Text of the first document in the search results.
docs[0].page_content

"Page Title: Recognition Program - CodiStars Content: The CodiStars awards program are awards from you for you - for people who stand out in the company in a special way. Thanks to it, you can appreciate people who actively represent the activities and thus deserve an award. Your voice has power, so we encourage you to get involved. Remember that the CodiStars program is for employees and associates only, therefore you cannot award titles to people who are no longer working at CodiLime. The awards are given in 3 categories that relate directly to our values: Team Up To Win - Team Player Disrupt category is Grow - Improver Act to Deliver category - Do-er category Below you will find a detailed description of each category: Team Player - a distinction for a person whose actions have positively influenced contacts between employees and the mood in teams. It can be organizing integration, helping a teammate, or engaging in additional activities such as volunteering. Improver - a distinctio

In [25]:
# Persist the vector store.
# NOTE(review): presumably this writes to persist_directory — confirm against the Chroma wrapper docs.
vectordb.persist()

## 3. Question Answering - RetrievalQA

![alternative text](obrazki/retreivalQA.png)


In [26]:
from langchain.chains import RetrievalQA

In [27]:
from typing import Dict, Any, List
from langchain.callbacks.stdout import StdOutCallbackHandler

class CustomStdOutCallbackHandler(StdOutCallbackHandler):
    """Stdout callback handler that echoes LLM prompts and chain inputs.

    Each hook prints its payload between two ``***`` marker lines.
    """

    def on_llm_start(
            self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Print the list of prompts received by the hook, framed by markers."""
        print("***", prompts, "***", sep="\n")

    def on_chain_start(
            self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
    ) -> None:
        """Print the chain inputs received by the hook, framed by markers."""
        print("***", inputs, "***", sep="\n")

In [28]:
from langchain import HuggingFacePipeline

# Text2text pipeline around google/flan-t5-base; the custom handler is
# attached so prompts sent to the model are echoed to stdout.
llm = HuggingFacePipeline.from_model_id(
    task="text2text-generation",
    model_id="google/flan-t5-base",
    model_kwargs=dict(temperature=0, max_length=600),
    callbacks=[CustomStdOutCallbackHandler()],
)


In [29]:
# prompt template
from langchain import PromptTemplate

# Prompt text with {context} and {question} placeholders, spelled out line by
# line; the resulting string starts with a newline and ends with "Answer:".
template = (
    "\n"
    "Use provided context, otherwise do not make the answer up.\n"
    "{context}\n"
    "Question: {question}\n"
    "Answer:"
)
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Keyword arguments carrying the prompt, passed on when the QA chain is built.
chain_type_kwargs = dict(prompt=prompt)

In [30]:
# Retrieval QA chain of type "stuff" over the vector store's retriever; source
# documents are returned alongside the answer.
# NOTE(review): the recorded run warns that the prompt was 1644 tokens against
# a 512-token model limit. Consider retrieving fewer documents (e.g. via
# as_retriever(search_kwargs={"k": ...})) or using a smaller chunk_size.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    callbacks=[CustomStdOutCallbackHandler()],
)

In [31]:
# Read the question interactively (this blocks a non-interactive "Run All").
query = input("\nEnter a query: ")

# Baseline: send the bare question straight to the model.
print(llm.generate(prompts=[query]))

# Same question through the retrieval chain.
res = qa(query)
# NOTE(review): this rebinds `docs`, which an earlier cell set to the
# similarity-search results.
answer, docs = res['result'], res['source_documents']

print("\n\n> Question:", query, "\n> Answer:", answer, sep="\n")

***
['Who is Katarzyna Hewelt?']
***




generations=[[Generation(text='szczecin', generation_info=None)]] llm_output=None run=[RunInfo(run_id=UUID('1eb678b9-9f94-44be-89e3-37e63b6e9919'))]
***
{'query': 'Who is Katarzyna Hewelt?'}
***


Token indices sequence length is longer than the specified maximum sequence length for this model (1644 > 512). Running this sequence through the model will result in indexing errors


***
["\nUse provided context, otherwise do not make the answer up.\nPage Title: Employees Content: Katarzyna Hewelt - Junior Data Scientist\n/   Tomasz Jedroska - Head of Unit: BU Data Engineering\n/   Sebastina Pecio - Software Engineer\n/   Maciej Manturewicz - Director of Engineering\n/   Marek Niedzwiedz - CEO\n\nPage Title: Trip to Gdansk Content:Office Address GdanskOffice Address GdanskCodiLime Sp. z o.o.Olivia Four (Olivia Business Centre)Al. Grunwaldzka 472B room 2.12 (second floor)80-309 GdańskCodiLime Sp. z o.o.Olivia Four (Olivia Business Centre)Al. Grunwaldzka 472B p. 2.12 (second floor)80-309 GdańskRecommended hotelsRecommended hotelsThe list of hotels close to the office with a good reputation and availability is:Hotel OliviaSmart Hotel\xa0B&B RatuszHotel OliwskiRailway station relatively close to the office: Gdańsk OliwaFor more information: Access to the office / Access to the office The list of hotels, close to the office with good reputation and availability are:Hote