# NYD Hackathon
#### This notebook is a collaborative effort of:
Deeksha Athreya, Deekshita Sriyaa K, Arushi Prakash

## Installations and Imports

In [1]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are not pinned here; consider pinning them (and using
# `%pip` so the install targets the running kernel) for reproducible runs.
!pip install sentence-transformers
!pip install faiss-gpu
!pip install transformers
!pip install scikit-learn
!pip install numpy
!pip install ipywidgets

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [2]:
# necessary Imports
import pandas as pd
import numpy as np
import faiss
import json
import re
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
import base64
from transformers import AutoTokenizer, AutoModelForCausalLM
from collections import defaultdict
from sentence_transformers import SentenceTransformer


## RAG Pipeline

### 1. Initialize Models

This cell initializes the tokenizer and models for text generation (AutoModelForCausalLM) and semantic similarity (SentenceTransformer).

In [3]:
# Initialize models
# Causal language model and its tokenizer; both are used by generate_text()
# to build the prompt and produce the free-text answer.
model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Sentence-embedding model; used to encode the verse texts when building the
# FAISS indices and to encode each incoming query at search time.
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### 2. Load DataFrames

In [4]:

# Load all DataFrames
# The CSV paths are relative to the notebook's working directory.
# *_verses_df: verse tables that the FAISS search indices are built from.
gita_verses_df = pd.read_csv('Bhagwad_Gita_Verses_English_Questions.csv')
yoga_verses_df = pd.read_csv('Patanjali_Yoga_Sutras_Verses_English_Questions.csv')
# *_concepts_df: tables whose optional 'keyword' column extends the keyword sets.
gita_concepts_df = pd.read_csv('Bhagwad_Gita_Verses_Concepts.csv')
yoga_concepts_df = pd.read_csv('Patanjali_Yoga_Sutras_Verses_English.csv')

### 3. Standardize DataFrame Columns and Handle Null Values
Ensure uniform column names and handle missing values.

In [5]:
# Standardize columns for all DataFrames
# Column names are stripped and lower-cased so later cells can rely on names
# such as 'translation', 'question' and 'keyword'. Missing values are replaced
# with empty strings. Both operations mutate the loaded frames in place.
for df in [gita_verses_df, yoga_verses_df, gita_concepts_df, yoga_concepts_df]:
    df.columns = df.columns.str.strip().str.lower()
    df.fillna("", inplace=True)

# Define keywords
# Seed vocabularies used by determine_source() to route a query to one of the
# two texts; they are extended from the concepts tables in a later cell.
gita_keywords = {'krishna', 'arjuna', 'dharma', 'bhakti', 'gita','mahabharata', 'war'}
patanjali_keywords = {'samadhi', 'asana', 'pranayama', 'meditation'}


### 4. Define Keywords

In [6]:
# Functions to handle keywords and mappings
def get_keywords_from_concepts(df):
    """Collect the lower-cased words found in a frame's 'keyword' column.

    Args:
        df: DataFrame that may contain a 'keyword' column.

    Returns:
        A set of whitespace-separated, lower-cased tokens taken from every
        string entry of the 'keyword' column. Null and non-string entries
        are ignored. An empty set is returned when the column is absent.
    """
    if 'keyword' not in df.columns:
        return set()

    return {
        token
        for entry in df['keyword'].dropna()
        if isinstance(entry, str)
        for token in entry.lower().split()
    }

# Extend the seed keyword sets in place with the words found in each concepts
# table's 'keyword' column (adds nothing when that column is absent).
gita_keywords.update(get_keywords_from_concepts(gita_concepts_df))
patanjali_keywords.update(get_keywords_from_concepts(yoga_concepts_df))

### 5. Create Semantic Search Index
Generate embeddings and FAISS indices for efficient similarity search.

In [7]:

def create_source_index(df):
    """Embed a verse table and build a flat L2 FAISS index over it.

    Side effect: writes a 'semantic_text' column to ``df`` holding the
    'translation' text, followed by a space and the 'question' text when a
    'question' column exists.

    Args:
        df: DataFrame with a 'translation' column and optionally 'question'.

    Returns:
        Tuple ``(index, embeddings)``: the populated ``faiss.IndexFlatL2`` and
        the embedding tensor returned by ``semantic_model.encode``.
    """
    combined_text = df['translation']
    if 'question' in df.columns:
        combined_text = combined_text + ' ' + df['question']
    df['semantic_text'] = combined_text

    embeddings = semantic_model.encode(
        df['semantic_text'].tolist(),
        convert_to_tensor=True,
    )

    # FAISS works on host-side numpy arrays, so move the tensor off the device.
    vectors = embeddings.cpu().detach().numpy()
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(vectors)
    return index, embeddings

# Build one search index per text. Row i of each index corresponds to row i of
# the matching verses DataFrame, which retrieve_and_answer() relies on when it
# looks a hit up with .iloc.
gita_index, gita_embeddings = create_source_index(gita_verses_df)
yoga_index, yoga_embeddings = create_source_index(yoga_verses_df)


### 6. Determine Query Source

In [8]:

# Determine source
def determine_source(query):
    """Pick which text a query should be answered from.

    The query is lower-cased and split on whitespace. Each token is considered
    both as written and with surrounding punctuation removed, so that a word
    such as "krishna?" still matches the keyword "krishna". This matters
    because split_questions() appends '?' to every question.

    Args:
        query: The user's question as a string.

    Returns:
        'gita' when the query shares strictly more words with
        ``gita_keywords`` than with ``patanjali_keywords``; otherwise 'yoga'
        (ties and queries with no keyword hits default to 'yoga').
    """
    strip_chars = ".,;:!?\"'()[]{}"
    raw_words = query.lower().split()

    # Keep the raw tokens as well so any keyword that itself contains
    # punctuation continues to match exactly as before.
    query_words = set(raw_words)
    query_words.update(word.strip(strip_chars) for word in raw_words)
    query_words.discard("")

    gita_matches = query_words.intersection(gita_keywords)
    yoga_matches = query_words.intersection(patanjali_keywords)
    return 'gita' if len(gita_matches) > len(yoga_matches) else 'yoga'

### 7. Generate Text
Use the language model to generate a response based on context and question.

In [9]:
# Generate text
def generate_text(context, question, source, max_new_tokens=50, temperature=0.7, top_p=0.9):
    """Generate a short answer to ``question`` grounded in ``context``.

    Args:
        context: Verse text inserted into the prompt.
        question: The user's question.
        source: Human-readable name of the text, inserted into the prompt.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The generated answer with surrounding whitespace removed, or
        "Answer not available." when nothing usable was generated. Sampling
        is enabled, so repeated calls may return different text.
    """
    prompt = f"""
Answer the following question based on the verse from {source}.
Provide a concise and focused answer that directly addresses the question.

Verse:
{context}

Question:
{question}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly: pad and EOS share an id here, so the model
        # cannot reliably infer it from the input ids alone.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Searching the full decoded text
    # for "Answer:" picks the wrong segment whenever the verse or the question
    # happens to contain that marker.
    generated_ids = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # If the model starts a new "Answer:" section, keep only the first one.
    answer = generated_text.split("Answer:")[0].strip()
    return answer if answer else "Answer not available."

###  8. Retrieve and Answer
Handle query processing, semantic search, and response generation.

In [10]:
# Retrieve and answer query
# Every query passed to retrieve_and_answer() is appended here, in call order.
query_history = []

# Set a distance threshold
# A nearest-neighbour hit whose FAISS distance exceeds this value is treated as
# "no relevant verse". NOTE(review): IndexFlatL2 is documented to report squared
# L2 distances, so this is presumably on the squared scale — confirm when tuning.
DISTANCE_THRESHOLD = 50.0  # Adjust this based on experimentation

def retrieve_and_answer(query):
    """Find the closest verse for ``query`` and generate an answer from it.

    The query is recorded in ``query_history``, routed to one of the two texts
    by ``determine_source``, embedded, and matched against that text's FAISS
    index (top-1 search).

    Args:
        query: A single question as a string.

    Returns:
        A dict with keys "source", "context" and "answer". When the query is
        blank, the index yields no neighbour, or the best match is farther
        than ``DISTANCE_THRESHOLD``, "source" is "No source" and the other
        fields carry explanatory messages.
    """
    no_match = {
        "source": "No source",
        "context": "No relevant context found.",
        "answer": "Your query does not match the content of the available verses. Please try rephrasing your question."
    }

    query_history.append(query)

    # A blank query has no meaning to embed; report "no match" instead of
    # returning whichever verse happens to sit nearest to the empty string.
    if not query or not query.strip():
        return no_match

    source = determine_source(query)
    df = gita_verses_df if source == 'gita' else yoga_verses_df
    index = gita_index if source == 'gita' else yoga_index

    # Generate embedding for the query
    query_embedding = semantic_model.encode([query], convert_to_tensor=True).cpu().detach().numpy()

    # Perform FAISS search (top 1 match)
    distances, indices = index.search(query_embedding, 1)
    closest_distance = float(distances[0][0])
    match_position = int(indices[0][0])

    # FAISS reports -1 when it has no neighbour to return (e.g. an empty
    # index); df.iloc[-1] would then silently pick the last verse.
    if match_position < 0 or closest_distance > DISTANCE_THRESHOLD:
        return no_match

    # Retrieve the relevant row from the DataFrame
    verse_row = df.iloc[match_position]
    context = f"Chapter {verse_row['chapter']}, Verse {verse_row['verse']}:\n{verse_row['translation']}"

    # Generate an answer using the language model. The Sanskrit text is
    # appended only afterwards so it is shown to the user but kept out of
    # the prompt.
    answer = generate_text(context, query, "Bhagavad Gita" if source == "gita" else "Patanjali Yoga Sutras")
    if 'sanskrit' in verse_row.index:
        context += f"\nSanskrit {verse_row['sanskrit']}"
    return {"source": source, "context": context, "answer": answer}

# File to store the questions and answers
# Path is relative to the working directory. It is written by
# retrieve_and_answer_multiple() and read back by get_json_download_link().
qa_log_file = "qna_log.json"


### 9. Support for Multiple Questions

In [11]:

def split_questions(query):
    """Break a free-text query into individual questions.

    The text is split at sentence boundaries followed by a capital letter,
    directly after question marks, at the standalone words "and" / "or",
    and at semicolons. Fragments of five characters or fewer (after
    trimming) are dropped, and every kept fragment is made to end with '?'.

    Returns:
        A list of question strings in their original order.
    """
    boundary = re.compile(r'(?<=[.!?])\s+(?=[A-Z])|(?<=\?)\s*|\band\b|\bor\b|;')
    fragments = (piece.strip() for piece in boundary.split(query))

    return [
        fragment if fragment.endswith('?') else fragment + '?'
        for fragment in fragments
        if len(fragment) > 5  # minimum length to count as a question
    ]

In [12]:
def retrieve_and_answer_multiple(query):
    """Answer every question contained in ``query`` and log each result.

    The query is split with ``split_questions``; each question is answered by
    ``retrieve_and_answer`` and the resulting record is appended to the JSON
    log at ``qa_log_file`` as soon as it is produced.

    Returns:
        A list of dicts with keys "question", "answer", "context" and
        "source", one per question, in the order the questions appeared.
    """
    collected = []

    for item in split_questions(query):
        outcome = retrieve_and_answer(item)
        record = {
            "question": item,
            "answer": outcome["answer"],
            "context": outcome["context"],
            "source": outcome["source"],
        }
        collected.append(record)

        # Persist each Q&A pair as it is produced.
        append_to_json_file(record, qa_log_file)

    return collected


### 10. Log Results
Save questions and answers to a JSON log file.

In [13]:

def append_to_json_file(data, filename):
    """Append ``data`` to the JSON array stored in ``filename``.

    The file is created when it does not exist. An empty or whitespace-only
    file is treated as an empty log instead of raising a decode error. If the
    file holds a JSON value that is not a list, that value becomes the first
    element of the new list so nothing is lost.

    Args:
        data: Any JSON-serialisable object.
        filename: Path of the log file to read and rewrite.

    Raises:
        json.JSONDecodeError: If the file has non-empty content that is not
            valid JSON. The file is left untouched in that case.
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            raw_text = file.read()
    except FileNotFoundError:
        raw_text = ""

    existing_data = json.loads(raw_text) if raw_text.strip() else []
    if not isinstance(existing_data, list):
        existing_data = [existing_data]

    existing_data.append(data)

    with open(filename, "w", encoding="utf-8") as file:
        json.dump(existing_data, file, indent=4)

### 11. Create Interface

In [14]:

def get_json_download_link():
    """Return an HTML snippet for downloading the Q&A log.

    The contents of ``qa_log_file`` are embedded in a base64 ``data:`` URI so
    the link works without a file server. When the log file does not exist
    yet, a short placeholder paragraph is returned instead.
    """
    try:
        with open(qa_log_file, "r") as handle:
            payload = handle.read()
    except FileNotFoundError:
        return "<p>No Q&A history available for download.</p>"

    # Inline the whole log as a base64-encoded data URI.
    encoded = base64.b64encode(payload.encode()).decode()
    href = f'data:application/json;base64,{encoded}'

    return f"""
        <a href="{href}"
           download="qa_history.json"
           class="download-button">
           📥 Download Q&A History
        </a>
        """

In [15]:
# Stylesheet for the interface, injected once via display(HTML(custom_style))
# in create_interface(). It styles the classes used by the result cards
# (.result-container, .question-header, .content-section) and the download
# link (.download-button), plus generic button/textarea/select elements.
custom_style = """
<style>
    .result-container {
        padding: 15px;
        background: #e8f0fe;
        border: 1px solid #cbd5e0;
        border-radius: 8px;
        margin-bottom: 15px;
        font-family: Arial, sans-serif;
        color: #333;
        max-height: 500px;
        overflow-y: auto;
    }
    .question-header {
        background: #d4e6f1;
        padding: 8px;
        margin: -15px -15px 10px -15px;
        border-radius: 8px 8px 0 0;
        font-weight: bold;
        position: sticky;
        top: 0;
        z-index: 1;
    }
    .content-section {
        margin: 10px 0;
        padding: 5px;
        background: rgba(255, 255, 255, 0.5);
        border-radius: 4px;
    }
    .content-section pre {
        white-space: pre-wrap;
        word-wrap: break-word;
        max-width: 100%;
        margin: 5px 0;
        padding: 8px;
        background: rgba(255, 255, 255, 0.7);
        border-radius: 4px;
        overflow-x: auto;
    }
    .result-container strong {
        color: #0056b3;
        display: block;
        margin-top: 10px;
    }
    .feedback-container {
        margin-top: 15px;
        padding: 10px;
        background: #fef9e7;
        border: 1px solid #f7dc6f;
        border-radius: 8px;
        font-family: Arial, sans-serif;
        color: #5d4037;
    }
    .download-button {
        display: inline-block;
        padding: 8px 16px;
        background: #4CAF50;
        color: white;
        text-decoration: none;
        border-radius: 4px;
        margin: 10px 0;
        font-family: Arial, sans-serif;
        transition: background-color 0.3s;
    }
    .download-button:hover {
        background: #45a049;
    }
    button:hover {
        background-color: #e3f2fd;
        color: #0d47a1;
    }
    textarea {
        border-radius: 8px;
        border: 1px solid #cbd5e0;
        font-family: Arial, sans-serif;
        padding: 10px;
    }
    select {
        border-radius: 8px;
        border: 1px solid #cbd5e0;
        padding: 5px;
        font-family: Arial, sans-serif;
    }
    /* Custom scrollbar styling */
    .result-container::-webkit-scrollbar {
        width: 8px;
    }
    .result-container::-webkit-scrollbar-track {
        background: #f1f1f1;
        border-radius: 4px;
    }
    .result-container::-webkit-scrollbar-thumb {
        background: #888;
        border-radius: 4px;
    }
    .result-container::-webkit-scrollbar-thumb:hover {
        background: #555;
    }
</style>
"""

In [16]:
def create_interface():
    """Build and display the question-answering widget interface.

    Lays out a query box, a search button, a download link for the Q&A log
    and an output area. Clicking the button runs
    ``retrieve_and_answer_multiple`` on the entered text, renders one result
    card per question, and refreshes the download link.

    All dynamic text (the question, the verse context and the generated
    answer) is HTML-escaped before being rendered, because it comes from user
    input, CSV content and model output.
    """
    # Local import keeps this cell self-contained; aliased so it cannot be
    # confused with IPython's HTML display class.
    from html import escape as html_escape

    query_input = widgets.Textarea(
        placeholder="Ask your questions here... (You can ask multiple questions separated by ?)",
        layout=widgets.Layout(width="100%", height="80px"),
        style={'description_width': 'initial'}
    )
    search_button = widgets.Button(description="🔍 Search", button_style='info')
    # NOTE(review): this feedback box is created but never added to the layout
    # or read; it is kept as-is for a future feedback feature.
    feedback_input = widgets.Textarea(
        placeholder="Provide your feedback...",
        layout=widgets.Layout(width="100%", height="60px"),
        style={'description_width': 'initial'}
    )

    download_area = widgets.HTML(value=get_json_download_link())

    output_area = widgets.Output()

    source_labels = {
        'gita': 'Bhagavad Gita',
        'yoga': 'Patanjali Yoga Sutras',
    }

    def format_source(source):
        """Map an internal source key to its display name.

        Unknown values (such as the "No source" marker returned when no verse
        matched) are shown unchanged rather than being labelled as one of the
        two texts.
        """
        label = str(source)
        return source_labels.get(label.lower(), label)

    def on_search_clicked(b):
        """Run the search for the current query text and render the results."""
        with output_area:
            clear_output()
            results = retrieve_and_answer_multiple(query_input.value)

            if not results:
                # Nothing long enough to count as a question was entered.
                display(HTML(
                    '<div class="result-container">'
                    'Please enter a question to search.'
                    '</div>'
                ))

            # Display multiple results with improved content sections.
            # NOTE(review): the "Chapter-Verse Answer" card shows the verse
            # context and the "Context" card shows the generated answer;
            # confirm this labelling is intended.
            for result in results:
                question_html = html_escape(str(result['question']))
                source_html = html_escape(format_source(result['source']))
                context_html = html_escape(str(result['context']))
                answer_html = html_escape(str(result['answer']))

                display(HTML(f"""
                <div class="result-container">
                    <div class="question-header">{question_html}</div>
                    <div class="content-section">
                        <strong>Source:</strong>
                        <pre>{source_html}</pre>
                    </div>
                    <div class="content-section">
                        <strong>Chapter-Verse Answer:</strong>
                        <pre>{context_html}</pre>
                    </div>
                    <div class="content-section">
                        <strong>Context:</strong>
                        <pre>{answer_html}</pre>
                    </div>
                </div>
                """))

            # Update download link after new results
            download_area.value = get_json_download_link()

    search_button.on_click(on_search_clicked)

    display(HTML(custom_style))
    display(widgets.VBox([
        query_input,
        widgets.HBox([search_button]),
        download_area,
        output_area
    ]))

# Render the interactive interface in this cell's output.
create_interface()

VBox(children=(Textarea(value='', layout=Layout(height='80px', width='100%'), placeholder='Ask your questions …