In [35]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

### Document Loaders

In [38]:
loader = TextLoader('sample.txt')
documents = loader.load()

In [39]:
len(documents)

1

### Document Transformer

In [40]:
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

Created a chunk of size 555, which is longer than the specified 200
Created a chunk of size 291, which is longer than the specified 200
Created a chunk of size 511, which is longer than the specified 200
Created a chunk of size 739, which is longer than the specified 200
Created a chunk of size 738, which is longer than the specified 200
Created a chunk of size 520, which is longer than the specified 200
Created a chunk of size 599, which is longer than the specified 200
Created a chunk of size 351, which is longer than the specified 200
Created a chunk of size 992, which is longer than the specified 200
Created a chunk of size 567, which is longer than the specified 200
Created a chunk of size 479, which is longer than the specified 200
Created a chunk of size 300, which is longer than the specified 200
Created a chunk of size 364, which is longer than the specified 200


In [41]:
len(texts)

14

### Text Embedding Model
Convert the text into numerical representation

In [42]:
embeddings = SentenceTransformerEmbeddings()

### Vector Stores

Store and search over embedded data

In [43]:
db = Chroma.from_documents(texts,embeddings)

In [44]:
db._collection.get(include=['embeddings'])

{'ids': ['109115af-3bd1-11ef-b61d-8c1d96954c36',
  '109115b0-3bd1-11ef-b76e-8c1d96954c36',
  '109115b1-3bd1-11ef-9115-8c1d96954c36',
  '109115b2-3bd1-11ef-acf6-8c1d96954c36',
  '109115b3-3bd1-11ef-8adf-8c1d96954c36',
  '109115b4-3bd1-11ef-b070-8c1d96954c36',
  '109115b5-3bd1-11ef-84e7-8c1d96954c36',
  '109115b6-3bd1-11ef-a9c8-8c1d96954c36',
  '109115b7-3bd1-11ef-88c9-8c1d96954c36',
  '109115b8-3bd1-11ef-8258-8c1d96954c36',
  '109115b9-3bd1-11ef-b075-8c1d96954c36',
  '109115ba-3bd1-11ef-b717-8c1d96954c36',
  '109115bb-3bd1-11ef-907c-8c1d96954c36',
  '109115bc-3bd1-11ef-9b75-8c1d96954c36',
  'b07d81a1-3bce-11ef-86df-8c1d96954c36',
  'b07d95e4-3bce-11ef-ac2f-8c1d96954c36',
  'b07d95e5-3bce-11ef-a484-8c1d96954c36',
  'f91e7d3f-3bcf-11ef-849c-8c1d96954c36',
  'f91e90eb-3bcf-11ef-b404-8c1d96954c36',
  'f91e90ec-3bcf-11ef-b0b8-8c1d96954c36',
  'f91e90ed-3bcf-11ef-8cce-8c1d96954c36'],
 'embeddings': [[0.015375418588519096,
   -0.01167468074709177,
   -0.015028324909508228,
   0.040298879146575

### Retrievers
Query your data

In [55]:
retriever = db.as_retriever(search_kwargs={"k":2})

In [46]:
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001FC52A22610>, search_kwargs={'k': 1})

### Question 1

In [56]:
docs = retriever.get_relevant_documents("What is the capital of Boogie Land?")

In [57]:
docs

[Document(page_content='The president of Boogie Land is Boogie Boo.\n\n  The capital city of Boogie Land is Boogie City.\n\n  The population of Boogie Land is 7843.', metadata={'source': 'sample.txt'}),
 Document(page_content='The president of Boogie Land is Boogie Boo.\n\n  The capital city of Boogie Land is Boogie City.\n\n  The population of Boogie Land is 7843.', metadata={'source': 'sample.txt'})]

### Question 2

In [58]:
docs2 = retriever.get_relevant_documents("What is main festival of mithila culture")

In [59]:
docs2

[Document(page_content='Historically, the region of Mithila has been known for its scholarship and artistic achievements. The ancient scholars of Mithila contributed to various fields such as philosophy, theology, and law, making it a prominent center of learning in ancient India. The regionâ€™s cultural expressions are diverse and colorful, encompassing traditional music, dance, and unique art forms such as Madhubani painting. Madhubani art, characterized by intricate patterns and vibrant colors, often depicts mythological themes and is an integral part of Maithili cultural identity.', metadata={'source': 'sample.txt'}),
 Document(page_content='Maithili culture, rooted in the Mithila region of Nepal and India, is a rich tapestry of traditions, customs, and artistic expressions. The Maithili language, a prominent Indo-Aryan language, is spoken by millions and is recognized as one of the 22 scheduled languages of India. It boasts a rich literary tradition, with ancient texts and poetry 

### Question 3

In [60]:
docs3 = retriever.get_relevant_documents("When was constitution promulgated in Nepal?")

In [61]:
docs3

[Document(page_content='Nepal adopted its new constitution in 2015, which marked a milestone in its political evolution. The constitution establishes Nepal as a federal democratic republic, divides the country into seven provinces, and outlines the separation of powers among the executive, legislative, and judicial branches. It guarantees fundamental rights and freedoms to its citizens, including freedom of speech, equality before the law, and the right to education and healthcare. The constitution also emphasizes the inclusion of marginalized communities and aims to promote social justice and equitable development.', metadata={'source': 'sample.txt'}),
 Document(page_content="Despite these advancements, Nepal's political scene continues to face challenges, including political instability, governance issues, and the need for effective implementation of constitutional provisions. However, the resilience and determination of the Nepalese people remain strong as they work towards building