In [29]:
from utils.file_handler import list_of_files
files = list_of_files("data/raw/en/",".md")
files = [f for f in files if ("README" not in f and "SUMMARY" not in f)] # Remove README files

In [34]:
from src.data_ingestion.parsers import read_file
from utils.logger import get_logger
logger = get_logger(__name__)
import re

def process_md_adv(path,headings:list[str])->dict:
    """
    Reads a Markdown (.md) file and returns the required headers
    :param path: path to the Markdown file
    :param headings: list of headings to extract
    :return: a dictionary of the extracted headings
    """
    content = read_file(path)

    resp = dict((zip(headings, [""] * len(headings))))
    resp['Reference'] = ":".join(path.split('/')[-2:]).replace('.md', '')
    if not path.endswith('.md'):
        logger.warning('File extension must be .md', path)
        return resp
    for i in range(len(headings)):
        try:
            key = headings[i]
            pattern = rf"# {key}(.*?)(?=#|$)"
            match = re.search(pattern, content, re.DOTALL)
            match_text = match.group(1) if match else ""
            resp[key] = match_text.replace(':\n\n', ' ').strip()
        except Exception as e:
            logger.error(f"Error parsing file: {path}")
    if not any(resp.values()):
        logger.warning(f"No valid headings found in {path}")
    else:
        logger.info(f"Successfully parsed headings from {path}")
    return resp


process_md_adv("data/raw/en/18/78.md",["Text","Translation","Purport"])

2025-05-28 17:39:29,485 - __main__ - INFO - Successfully parsed headings from data/raw/en/18/78.md
2025-05-28 17:39:29,485 - __main__ - INFO - Successfully parsed headings from data/raw/en/18/78.md


{'Text': '78\n\nयत्र योगेश्वरः कृष्णो यत्र पार्थो धनुर्धरः ।  \nतत्र श्रीर्विजयो भूतिर्ध्रुवा नीतिर्मतिर्मम ॥७८॥\n\nyatra yogeśvaraḥ kṛṣṇo  \nyatra pārtho dhanur-dharaḥ  \ntatra śrīr vijayo bhūtir  \ndhruvā nītir matir mama',
 'Translation': '**Wherever there is Krishna, the master of all mystics, and wherever there is Arjuna, the supreme archer, there will also certainly be opulence, victory, extraordinary power, and morality. That is my opinion.**',
 'Purport': 'The Bhagavad-gita began with an inquiry of Dhritarashtra\'s. He was hopeful of the victory of his sons, assisted by great warriors like Bhishma, Drona and Karna. He was hopeful that the victory would be on his side. But after describing the scene on the battlefield, Sanjaya told the King, "You are thinking of victory, but my opinion is that where Krishna and Arjuna are present, there will be all good fortune." He directly confirmed that Dhritarashtra could not expect victory for his side. Victory was certain for the side

In [1]:
docs = []
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        try:
            content = f.read()
            doc = process_md_adv(content, 
                                ["Text", "Translation", "Purport"])
        except Exception as e:
            print(file)
            break
        docs.append(doc)

NameError: name 'files' is not defined

In [36]:
len(docs)

661

In [37]:
import pandas as pd
df = pd.DataFrame(docs)
path = "data/processed/processed_data.csv"
df.to_csv(path, index=False)
print("Data saved to data/processed/processed_data.csv")

Data saved to data/processed/processed_data.csv


In [38]:
df.head()

Unnamed: 0,Text,Translation,Purport,Reference
0,,,,### Setting the Scene\n\nAlthough widely publi...
1,,,,### THE DISCIPLIC SUCCESSION\n\nThis _Bhagavad...
2,,,,### Preface\n\nOriginally I wrote _Bhagavad-gī...
3,,,,"gurum"">gurum evabhigacchet: [MU 1.2.12] one mu..."
4,,,,# Text 3\n\nपश्यैतां पाण्डुपुत्राणामाचार्य महत...


In [17]:
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = FAISS.from_texts(docs, embeddings)

TypeError: expected string or bytes-like object, got 'dict'

In [None]:
def get_relevant_documents(query: str):
  """
  Retrieves relevant documents from the vectorstore for a given query.

  Args:
    query: The search query.

  Returns:
    A list of relevant documents.
  """
  return vectorstore.similarity_search(query)

In [None]:

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model

model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

def get_response_from_query(query: str):
    """
    Retrieves relevant documents and generates a response for a given query
    using newer LangChain patterns.

    Args:
      query: The search query.

    Returns:
      The generated response from the relevant documents.
    """
    # Define a template for the prompt
    template = """Answer the following question based only on the provided context:
    {context}

    Question: {input}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # Create a stuff document chain
    # This chain combines documents into a single prompt and passes it to the model
    document_chain = create_stuff_documents_chain(model, prompt)

    # Create a retriever from your vectorstore
    # This retriever will fetch the most relevant documents based on the query
    retriever = vectorstore.as_retriever()

    # Create a retrieval chain
    # This chain first retrieves documents using the retriever,
    # then passes them to the document_chain to generate the response
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    # Invoke the retrieval chain with the query
    # Use .invoke() instead of .run()
    response = retrieval_chain.invoke({"input": query})

    # The response from the retrieval_chain.invoke will be a dictionary.
    # The generated answer is typically in the 'answer' key.
    return response['answer']

In [None]:
get_response_from_query("What is the purpose of life according to the Bhagavad Gita?")

'According to the Bhagavad-gita, the purpose of life is to deliver mankind from the nescience of material existence. It is also to revive our sanatana occupation, or sanatana-dharma, which is the eternal occupation of the living entity, which is the rendering of service to the Supreme Personality of Godhead.'

In [None]:
get_response_from_query("What is real religion")

'According to the provided text, the real principle of religious faith is situated in the mode of pure goodness, and the essence of all religion is to surrender unto Krishna'

In [None]:
print(get_response_from_query("How to control my lust?"))

Based on the provided text, here are some ways to control lust:

*   **Regulate the senses from the very beginning** and curb lust, the greatest sinful enemy.
*   **Learn Krishna consciousness from the very beginning of life.**
*   **Transform lust into love for the Supreme** or transform it into Krishna consciousness by desiring everything for Krishna.
*   **Steady the mind by deliberate spiritual intelligence (Krishna consciousness)** and conquer lust by spiritual strength.
*   **Elevate the mode of passion to the mode of goodness** by the prescribed method of living and acting.
*   **Develop Krishna consciousness gradually** to be situated in a transcendental position without being influenced by the material senses and the mind.


In [None]:
query = input("Enter your query: ")
print(get_response_from_query(query))

According to the provided text:

*   Everything that takes place is due to the combination of kshetra and kshetra-jna, the body and the spirit soul. This combination of material nature and the living entity is made possible by the Supreme God Himself.
*   The Supreme Personality of Godhead provides the seed, and living entities seem to come out as products of material nature.
*   Every living entity, according to his past activities, has a different body, created by this material nature, so that the entity can enjoy or suffer according to his past deeds.
