In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q torch transformers sentence-transformers pypdf faiss-cpu python-dotenv accelerate

!pip install torch faiss-cpu langchain PyPDF2 sentence-transformers langdetect google-generativeai

!pip install PyPDF2

!pip install pytesseract pdf2image
!sudo apt-get install tesseract-ocr tesseract-ocr-te  # Telugu language support

!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package tesseract-ocr-te
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [3]:
!pip install flask flask-ngrok sentence-transformers google-generativeai langdetect faiss-cpu PyPDF2




In [4]:
code = """
import torch  #  PyTorch library - used for tensor computations and model operations (e.g., if using a transformer model with GPU)

import google.generativeai as genai  # Google Generative AI client - used to interact with Gemini models via Google AI Studio API

from sentence_transformers import SentenceTransformer  # SentenceTransformer - used to convert text into high-quality vector embeddings for semantic search

from langchain.text_splitter import RecursiveCharacterTextSplitter  # Langchain's splitter - breaks long text into smaller overlapping chunks (useful for context windows in LLMs)

from PyPDF2 import PdfReader  # PDF reader - extracts raw text from PDF files page by page (can be replaced with `pypdf` for better support)

import faiss  # Facebook AI Similarity Search - helps in fast and efficient similarity search over vector embeddings (used for retrieving relevant chunks)

import numpy as np  # NumPy - numerical computing library used for handling arrays and converting embeddings to FAISS-compatible formats
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from pdf2image import convert_from_path # to convert pdf to images
import pytesseract # to extract text from images

genai.configure(api_key="AIzaS")

class Config:
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 100
    TOP_K = 3
    GEMINI_MODEL = "gemini-1.5-flash"

class PDFQuestionAnswering:
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedder = SentenceTransformer(Config.EMBEDDING_MODEL)
        self.docs = self._load_pdf()
        self.chunks = self._split_chunks()
        self.chunk_embeddings = self.embedder.encode(self.chunks, convert_to_tensor=False, normalize_embeddings=True)
        self.index = self._create_faiss_index()
        self.model = genai.GenerativeModel(Config.GEMINI_MODEL)

    def _load_pdf(self) -> str:
        reader = PdfReader(self.pdf_path)
        text = ""
        for page in reader.pages:
            content = page.extract_text()
            if content:
                text += content + "\\n"
        return text

    def _split_chunks(self):
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP
        )
        return splitter.split_text(self.docs)

    def _create_faiss_index(self):
        dim = len(self.chunk_embeddings[0])
        index = faiss.IndexFlatL2(dim)
        index.add(np.array(self.chunk_embeddings).astype("float32"))
        return index

    def _search_similar_chunks(self, query: str):
        query_embedding = self.embedder.encode([query], convert_to_tensor=False, normalize_embeddings=True)
        distances, indices = self.index.search(np.array(query_embedding).astype("float32"), Config.TOP_K)
        return [self.chunks[i] for i in indices[0]]

    def _detect_language(self, text_sample: str) -> str:
        try:
            return detect(text_sample)
        except LangDetectException:
            return "en"

    def generate_answer(self, question: str, max_new_tokens: int = 300):
        top_chunks = self._search_similar_chunks(question)
        context = "\\n".join(top_chunks)
        detect_lang = self._detect_language(context[:500])

        if detect_lang == "te":
            instruction = 'Answer in both Telugu and English.'
        elif detect_lang == "ta":
            instruction = 'Answer in both Tamil and English.'
        elif detect_lang == "hi":
            instruction = 'Answer in both Hindi and English.'
        else:
            instruction = 'Answer in English'

        prompt = (
            f"You are a multilingual AI tutor. Based on the following context, answer the question clearly.\\n\\n"
            f"Context:\\n{context}\\n\\n"
            f"Question:\\n{question}\\n\\n"
            f"{instruction}"
        )

        response = self.model.generate_content(prompt)
        return response.text.strip()

if __name__ == "__main__":
    pdf_path = "/content/drive/MyDrive/AI_Tutor/AI_Tutor/TELUGU.pdf"
    bot = PDFQuestionAnswering(pdf_path)
    while True:
        question = input("\\nYour question (or 'exit'): ")
        if question.lower() == "exit":
            print("Goodbye!")
            break
        answer = bot.generate_answer(question)
        print("\\n🤖 Answer:", answer)
"""

# Now write it to the file
with open("pdf_qa.py", "w") as f:
    f.write(code)


In [5]:
from google.colab import files
uploaded = files.upload()


Saving TELUGU.pdf to TELUGU.pdf


In [5]:
# Remove old ngrok if it exists
!rm -f ngrok
!rm -f ngrok.zip

# Download latest ngrok version
!wget -O ngrok.zip https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
!unzip -o ngrok.zip
!chmod +x ngrok
!mv ngrok /usr/local/bin/




--2025-04-22 14:46:42--  https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 75.2.60.68, 35.71.179.82, 99.83.220.108, ...
Connecting to bin.equinox.io (bin.equinox.io)|75.2.60.68|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9462138 (9.0M) [application/octet-stream]
Saving to: ‘ngrok.zip’


2025-04-22 14:46:45 (3.46 MB/s) - ‘ngrok.zip’ saved [9462138/9462138]

Archive:  ngrok.zip
  inflating: ngrok                   


In [6]:
!ngrok authtoken 2w5RcODAAz6JnvLoG1fEEE7GsRN_4SAQXdZHh5rU59x82Xb6n


Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [8]:
%%writefile app.py
from flask import Flask, request, render_template_string
from pdf_qa import PDFQuestionAnswering

app = Flask(__name__)

bot = None
pdf_uploaded = False

HTML_TEMPLATE = """
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>AI PDF Tutor</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            background: linear-gradient(to right, #dfe9f3, #ffffff);
            padding: 2rem;
            font-family: 'Segoe UI', sans-serif;
        }
        .container {
            max-width: 720px;
            background: #fff;
            padding: 2.5rem;
            border-radius: 15px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
        }
        .logo {
            display: block;
            margin: 0 auto 20px;
            width: 100px;
        }
        .answer-box {
            background: #e0fce7;
            padding: 1rem;
            border-left: 5px solid #28a745;
            border-radius: 10px;
        }
        .btn-primary {
            background: #6c63ff;
            border: none;
        }
        .btn-success {
            background: #00b894;
            border: none;
        }
        .btn:hover {
            opacity: 0.9;
        }
        .form-label {
            font-weight: bold;
        }
    </style>
</head>
<body>
    <div class="container text-center">
        <img src="https://img.icons8.com/fluency/96/artificial-intelligence.png" alt="AI Logo" class="logo">
        <h2 class="text-center mb-4">🤖 <span style="color:#6c63ff">AI PDF Tutor</span></h2>

        <form method="post" enctype="multipart/form-data" class="mb-4 text-start">
            <div class="mb-3">
                <label class="form-label">Upload a PDF file</label>
                <input type="file" name="pdf" class="form-control" required>
            </div>
            <button type="submit" class="btn btn-primary w-100">📤 Upload PDF</button>
        </form>

        {% if uploaded %}
        <h4 class="text-start mt-4">Ask a question</h4>
        <form method="post" class="mb-4 text-start">
            <div class="mb-3">
                <input type="text" name="question" class="form-control" placeholder="Type your question..." required>
            </div>
            <button type="submit" class="btn btn-success w-100">❓ Ask</button>
        </form>
        {% endif %}

        {% if answer %}
        <div class="answer-box text-start mt-3">
            <h5><strong>🤖 Answer:</strong></h5>
            <p>{{ answer }}</p>
        </div>
        {% endif %}
    </div>
</body>
</html>
"""

@app.route("/", methods=["GET", "POST"])
def home():
    global bot, pdf_uploaded
    answer = None

    if request.method == "POST":
        if "pdf" in request.files:
            pdf_file = request.files["pdf"]
            pdf_path = "uploaded.pdf"
            pdf_file.save(pdf_path)
            bot = PDFQuestionAnswering(pdf_path)
            pdf_uploaded = True

        elif "question" in request.form and bot:
            question = request.form["question"]
            answer = bot.generate_answer(question)

    return render_template_string(HTML_TEMPLATE, answer=answer, uploaded=pdf_uploaded)

if __name__ == '__main__':
    app.run(port=5000)


Overwriting app.py


In [9]:
%%writefile app.py
from flask import Flask, request, render_template_string
from pdf_qa import PDFQuestionAnswering

app = Flask(__name__)

bot = None
pdf_uploaded = False

HTML_TEMPLATE = """
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>AI PDF Tutor</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            background: linear-gradient(135deg, #f6f8ff, #e6f0ff);
            padding: 2rem;
            font-family: 'Segoe UI', sans-serif;
        }
        .container {
            max-width: 750px;
            background: white;
            padding: 2.5rem;
            border-radius: 15px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
        }
        .logo {
            width: 60px;
            height: auto;
            margin-right: 10px;
        }
        .header {
            display: flex;
            align-items: center;
            justify-content: center;
            margin-bottom: 1.5rem;
        }
        .answer-box {
            background: #e6ffed;
            padding: 1rem;
            border-left: 6px solid #28a745;
            border-radius: 10px;
            white-space: pre-wrap;
            font-family: inherit;
        }
        .btn-primary {
            background-color: #0056d2;
            border: none;
        }
        .btn-success {
            background-color: #198754;
            border: none;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <img src="https://img.icons8.com/fluency/96/artificial-intelligence.png" alt="AI Tutor" class="logo">
            <h2 class="text-primary">AI PDF Tutor</h2>
        </div>

        <form method="post" enctype="multipart/form-data" class="mb-4">
            <div class="mb-3">
                <label class="form-label">📤 Upload a PDF file</label>
                <input type="file" name="pdf" class="form-control" required>
            </div>
            <button type="submit" class="btn btn-primary w-100">Upload PDF</button>
        </form>

        {% if uploaded %}
        <form method="post" class="mb-4">
            <div class="mb-3">
                <label class="form-label">💬 Ask a question</label>
                <input type="text" name="question" class="form-control" placeholder="Type your question..." required>
            </div>
            <button type="submit" class="btn btn-success w-100">Ask</button>
        </form>
        {% endif %}

        {% if answer %}
        <div class="answer-box mt-3">
            <h5><strong>🤖 Answer:</strong></h5>
            {{ answer }}
        </div>
        {% endif %}
    </div>
</body>
</html>
"""

@app.route("/", methods=["GET", "POST"])
def home():
    global bot, pdf_uploaded
    answer = None

    if request.method == "POST":
        if "pdf" in request.files:
            pdf_file = request.files["pdf"]
            pdf_path = "uploaded.pdf"
            pdf_file.save(pdf_path)
            bot = PDFQuestionAnswering(pdf_path)
            pdf_uploaded = True

        elif "question" in request.form and bot:
            question = request.form["question"]
            answer = bot.generate_answer(question)

    return render_template_string(HTML_TEMPLATE, answer=answer, uploaded=pdf_uploaded)

if __name__ == '__main__':
    app.run(port=5000)


Overwriting app.py


In [10]:
import subprocess
import time

# Run app.py in background
process = subprocess.Popen(["python3", "app.py"])
time.sleep(3)  # wait for the server to start


In [11]:
# Start ngrok
import requests

# Start ngrok tunnel
ngrok_process = subprocess.Popen(["/usr/local/bin/ngrok", "http", "5000"])
time.sleep(5)

# Get public URL
try:
    r = requests.get("http://localhost:4040/api/tunnels")
    public_url = r.json()['tunnels'][0]['public_url']
    print("🔗 Access your app here:", public_url)
except Exception as e:
    print("❌ Error fetching ngrok URL:", e)


🔗 Access your app here: https://0410-34-124-247-105.ngrok-free.app


In [45]:

ngrok_process.terminate()
