In [1]:
import os
import json
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

from geminiAPI import geminAPI

In [3]:
# Sentence-embedding model used by the cells below; inference is pinned to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})


# RAG (Retrieval-Augmented Generation)

In [5]:
from langchain.vectorstores import FAISS
from geminiAPI import geminAPI
# Reload the index stored under "faiss_index" (the "Create embedding" cell writes it
# with `save_local`), using the same embedding model for query encoding.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled data —
# only use it on index files you built yourself; confirm against the LangChain docs.
vector_store = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

def rag(query, k=5, store=None, llm=None):
    """Answer `query` with retrieval-augmented generation.

    The `k` chunks most similar to `query` are fetched from the vector store,
    joined into a single context string, and sent to the LLM together with
    the question.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve.
        store: Object exposing `similarity_search_with_score(query, k=...)`.
            Defaults to the notebook-level `vector_store`, looked up at call
            time.
        llm: Callable that receives the prompt string and returns the answer.
            Defaults to `geminAPI`.

    Returns:
        Whatever the LLM callable returns for the assembled prompt.

    Side effects:
        Prints the distinct `file_path` values of the retrieved chunks.
    """
    if store is None:
        store = vector_store
    if llm is None:
        llm = geminAPI

    retrieved_docs_with_scores = store.similarity_search_with_score(query, k=k)
    # Each hit is a (document, score) pair; only the document is needed here.
    docs = [hit[0] for hit in retrieved_docs_with_scores]
    text = '\n'.join(doc.page_content for doc in docs)

    # dict.fromkeys de-duplicates while keeping the order the store returned the
    # hits in, so the printed list is stable across runs (set iteration order is not).
    file_names = list(dict.fromkeys(doc.metadata.get("file_path", "") for doc in docs))
    print(f"Files used: {file_names}")

    prompt = f"""
    Role: You are a helpful assistant. Answer the question based on the context provided.
    Context: {text}
    Question: {query}
    """
    response = llm(prompt)
    return response

## Using RAG

In [9]:
# Run the full RAG pipeline on one sample question and show both sides of the exchange.
question = "What are the advantages of sexual reproduction over asexual reproduction?"
response = rag(question)
print(f"Question: {question}", f"Response: {response}", sep="\n")

Files used: ['book\\jesc108.txt', 'book\\jesc107.txt']
Question: What are the advantages of sexual reproduction over asexual reproduction?
Response: Based on the context provided, the advantage of sexual reproduction over asexual reproduction is that sexual reproduction allows for greater variation to be generated.



# Word Embedding
A way to represent words as vectors so that computers can make sense of them

## Embedding Models
NLP techniques to convert words into vectors


### The binary representation of the word

"man" is  : 01101101 01100001 01101110

"woman" is: 01110111 01101111 01101101 01100001 01101110.

For computers "man" and "woman" are two different strings

In [10]:
# Plain string comparison is an exact-equality check: it yields only True or False,
# with no measure of how related the two words are.
"man" == "woman"

False

## With vector representation
Two words can be compared

In [19]:
# Embed two words and measure how closely their vectors align.
word1, word2 = "man", "man"
vec1, vec2 = (np.array(embeddings.embed_query(word)) for word in (word1, word2))

# cos(theta) = (v1 . v2) / (|v1| * |v2|)
cos_sim = vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Show only the first five components of each vector; the full length is printed last.
print(f"vector representation of '{word1}':", vec1[:5], "", sep="\n")
print(f"vector representation of '{word2}':", vec2[:5], "", sep="\n")

print(f"Cosine similarity between '{word1}' and '{word2}':", cos_sim)
print(vec1.shape[0])

vector representation of 'man':
[-0.10967574  0.04374195 -0.02631918 -0.02932645  0.02155485]

vector representation of 'man':
[-0.10967574  0.04374195 -0.02631918 -0.02932645  0.02155485]

Cosine similarity between 'man' and 'man': 1.0
384


# Word to vectors

## Cosine Similarity Calculation

<svg width="300" height="300" viewBox="0 0 200 200">
  <!-- Axes -->
  <line x1="0" y1="100" x2="200" y2="100" stroke="gray" stroke-width="1" marker-end="url(#arrow)"/>
  <line x1="100" y1="200" x2="100" y2="0" stroke="gray" stroke-width="1" marker-end="url(#arrow)"/>

  <!-- Vector v₁: (50, 25) -->
  <line x1="100" y1="100" x2="150" y2="75" stroke="red" stroke-width="2" marker-end="url(#arrow)"/>

  <!-- Vector v₂: (25, 50) -->
  <line x1="100" y1="100" x2="125" y2="50" stroke="blue" stroke-width="2" marker-end="url(#arrow)"/>

  <!-- Arrowhead definition -->
  <defs>
    <marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto"
            markerUnits="strokeWidth">
      <path d="M0,0 L0,6 L9,3 z" fill="black" />
    </marker>
  </defs>

  
</svg>


Let's take $\vec{v}_1$ and $\vec{v}_2$:

The dot product of $\vec{v}_1$ and $\vec{v}_2$ is:
$$
\vec{v}_1 \cdot \vec{v}_2 = \sum_{i=1}^n v_{1,i} v_{2,i}
$$

The dot product can also be expressed in terms of the cosine of the angle $\theta$ between the two vectors:
$$
\vec{v}_1 \cdot \vec{v}_2 = \|\vec{v}_1\| \|\vec{v}_2\| \cos\theta
$$
The cosine similarity between the two vectors is therefore:
$$
\text{cos\_sim} = \frac{\vec{v}_1 \cdot \vec{v}_2}{\|\vec{v}_1\| \|\vec{v}_2\|}
$$

where $\vec{v}_1 \cdot \vec{v}_2$ is the dot product, and $\|\vec{v}_1\|$, $\|\vec{v}_2\|$ are the Euclidean norms of the vectors. 

The Euclidean norm (or length) of a vector $\vec{v}$ is calculated as:

$$
\|\vec{v}\| = \sqrt{v_1^2 + v_2^2 + \cdots + v_n^2}
$$

### The result ranges from $-1$ to $1$:
- $-1$: $\vec{v}_1$ and $\vec{v}_2$ point in opposite directions
- $1$: $\vec{v}_1$ and $\vec{v}_2$ point in the same direction (for example, identical vectors)
- $0$: $\vec{v}_1$ and $\vec{v}_2$ are orthogonal (at $90^\circ$) to each other


# Embed documents
### Collect data into list

In [21]:
folder_name = "book"

# Chunking parameters, in characters. Consecutive chunks share `chunk_overlap` characters.
chunk_size = 2000
chunk_overlap = 200


def _split_into_chunks(content, chunk_size, chunk_overlap):
    """Split `content` into overlapping fixed-size character windows.

    Args:
        content: The text to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of substrings of `content`, in order. Empty input gives an
        empty list. No chunk is emitted once an earlier chunk has already
        reached the end of the text, so the last chunk is never wholly
        contained in the one before it.

    Raises:
        ValueError: If `chunk_overlap` is negative or not smaller than
            `chunk_size` (the window would never advance).
    """
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
    if not content:
        return []
    step = chunk_size - chunk_overlap
    # A window starting at or beyond len(content) - chunk_overlap would only repeat
    # text the previous window already covers, so stop before it (but always keep
    # the first window).
    stop = max(len(content) - chunk_overlap, 1)
    return [content[start:start + chunk_size] for start in range(0, stop, step)]


# os.listdir returns entries in arbitrary order; sort so the documents (and therefore
# the index built from them) come out in the same order on every run and platform.
file_list = sorted(
    os.path.join(folder_name, f)
    for f in os.listdir(folder_name)
    if os.path.isfile(os.path.join(folder_name, f))
)
print(file_list)

# One Document per chunk, tagged with the file it came from.
documents_with_metadata = []
for file in file_list:
    with open(file, 'r', encoding='utf8') as f:
        content = f.read()
    documents_with_metadata.extend(
        Document(page_content=chunk, metadata={"file_path": file})
        for chunk in _split_into_chunks(content, chunk_size, chunk_overlap)
    )
    

['book\\jesc101.txt', 'book\\jesc102.txt', 'book\\jesc103.txt', 'book\\jesc104.txt', 'book\\jesc105.txt', 'book\\jesc106.txt', 'book\\jesc107.txt', 'book\\jesc108.txt', 'book\\jesc109.txt', 'book\\jesc110.txt', 'book\\jesc111.txt', 'book\\jesc112.txt', 'book\\jesc113.txt', 'book\\jesc1an.txt', 'book\\jesc1ps.txt']


### Create embedding

In [22]:
# Build a FAISS vector store from the chunked documents using the embedding model.
# This rebinds `vector_store`, replacing the index loaded earlier in the notebook.
vector_store = FAISS.from_documents(documents_with_metadata, embeddings)
# Persist the store under "faiss_index", the same path FAISS.load_local reads from.
vector_store.save_local("faiss_index")

### What does this look like?

In [23]:
import numpy as np

# The raw FAISS index object held by the vector store.
faiss_index = vector_store.index

# Reconstruct all `ntotal` stored vectors, starting from position 0, into one array.
vectors = faiss_index.reconstruct_n(0, faiss_index.ntotal)

print(f"Shape of vectors: {vectors.shape}")
print(f"First vector: {len(vectors[0])} {vectors[0]!s}")

Shape of vectors: (284, 384)
First vector: 384 [-1.30706206e-02 -3.32267061e-02  4.71651778e-02  9.56698507e-02
  7.65684769e-02  2.49516703e-02  4.57166471e-02 -7.65561871e-03
  1.17983045e-02  2.69596484e-02  1.16835469e-02 -2.09506620e-02
 -6.20148564e-03 -8.95153452e-03 -1.56559777e-02 -2.14680582e-02
 -2.59503815e-02  5.37109934e-03 -7.11160302e-02  7.57401139e-02
  6.56213537e-02 -6.49396777e-02  1.61241796e-02  3.80813852e-02
 -3.47737558e-02  6.63031489e-02  9.18284133e-02  4.23495620e-02
 -5.02698980e-02  2.05755830e-02  3.88805270e-02  1.12818234e-01
  1.86516270e-02 -4.36741039e-02  3.26950438e-02  9.06238183e-02
 -3.49207707e-02  2.55973730e-02 -2.90852115e-02 -2.53039715e-03
 -7.08191022e-02 -6.88577890e-02  3.08742113e-02 -5.70027195e-02
  7.66699985e-02 -5.78395426e-02  9.98084173e-02 -7.23517537e-02
 -2.28918251e-02  5.20388372e-02 -5.66026531e-02  3.04690208e-02
 -7.19665661e-02 -3.22357938e-02  5.22843488e-02 -2.61843242e-02
 -6.68237731e-03 -7.73008391e-02  6.6476645

### Using Embedding to retrieve related text

In [28]:
# Embed the query and fetch the 5 closest chunks from the vector store.
query = "What is the difference between conductors and insulators?"
# Each hit is a tuple whose first element is the Document (see the output below);
# presumably the second element is the similarity score — confirm against the FAISS docs.
retrieved_docs = vector_store.similarity_search_with_score(query, k=5)
# Bare last expression so the notebook displays the hits.
retrieved_docs

[(Document(id='ad8ea2a8-e359-4c55-b88f-d67633f922cf', metadata={'file_path': 'book\\jesc111.txt'}, page_content=' and (iii) on the nature of its material. Precise measurements\nhave shown that resistance of a uniform metallic conductor is directly\nproportional to its length (l ) and inversely proportional to the area of\ncross-section ( A). That is,\nR ∝ l (11.8)\nand R ∝ 1/A (11.9)\nCombining Eqs. (11.8) and (11.9) we get\nR ∝ l\nA\nor, R = ρl\nA(11.10)\nwhere ρ (rho) is a constant of proportionality and is called the electrical\nresistivity of the material of the conductor . The SI unit of resistivity is\nΩ m. It is a characteristic property of the material. The metals and alloysFigure 11.5 Figure 11.5 Figure 11.5 Figure 11.5 Figure 11.5 Electric circuit to study the factors on which the  resistance of conducting wires depends\n/square6Now, plug the key. Note the curr ent in the ammeter .\n/square6Replace the nichrome wire by another nichrome wire of same thickness but twice the\nle

### Collect retrieved text

In [29]:
# Keep only the first two hits and join their text into a single context string.
text = '\n'.join(doc.page_content for doc, _score in retrieved_docs[:2])
text

' and (iii) on the nature of its material. Precise measurements\nhave shown that resistance of a uniform metallic conductor is directly\nproportional to its length (l ) and inversely proportional to the area of\ncross-section ( A). That is,\nR ∝ l (11.8)\nand R ∝ 1/A (11.9)\nCombining Eqs. (11.8) and (11.9) we get\nR ∝ l\nA\nor, R = ρl\nA(11.10)\nwhere ρ (rho) is a constant of proportionality and is called the electrical\nresistivity of the material of the conductor . The SI unit of resistivity is\nΩ m. It is a characteristic property of the material. The metals and alloysFigure 11.5 Figure 11.5 Figure 11.5 Figure 11.5 Figure 11.5 Electric circuit to study the factors on which the  resistance of conducting wires depends\n/square6Now, plug the key. Note the curr ent in the ammeter .\n/square6Replace the nichrome wire by another nichrome wire of same thickness but twice the\nlength, that is 2 l [marked (2) in the Fig. 11.5].\n/square6Note the ammeter reading.\n/square6Now replace the wir

### Create prompt and ask LLM (Gemini)

In [30]:
# Assemble the prompt by hand: a fixed role instruction, then the retrieved context,
# then the user's question.
prompt = f"""
    Role: You are a helpful assistant. Answer the question based on the context provided.
    Context: {text}
    Question: {query}"""

# Send the prompt to the geminAPI helper and display what it returns.
response = geminAPI(prompt)
response

'Conductors have very low resistivity (in the range of 10–8 Ω m to 10–6 Ω m), while insulators have high resistivity (of the order of 1012 to 1017 Ω m).\n'