# Imports

In [1]:
import os
import json
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings
import chromadb
from server.v1.config import config

2025-06-05 13:14:05,924 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-06-05 13:14:06,587 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/auth/identity "HTTP/1.1 200 OK"
2025-06-05 13:14:06,605 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-06-05 13:14:07,203 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
2025-06-05 13:14:07,229 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"
2025-06-05 13:14:08,224 - httpx - INFO - HTTP Request: POST http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"


In [2]:
class UniversityDataProcessor:
    def __init__(self, data_root="data/courses"):
        self.data_root = Path(data_root)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, 
            chunk_overlap=200
        )
        self.embedding_model = OllamaEmbeddings(model=config.retrieval.embedding_model)
        
    def extract_metadata_from_path(self, file_path: Path):
        """Extract metadata from file path structure"""
        parts = file_path.parts
        
        # Find course, content_type, and specific info
        course_idx = None
        for i, part in enumerate(parts):
            if part == "courses" and i + 1 < len(parts):
                course_idx = i + 1
                break
                
        if course_idx is None:
            return {}
            
        course = parts[course_idx] if course_idx < len(parts) else "unknown"
        content_type = parts[course_idx + 1] if course_idx + 1 < len(parts) else "unknown"
        
        metadata = {
            "course": course,
            "content_type": content_type,  # "books" or "lectures"
            "file_name": file_path.stem,
            "file_type": file_path.suffix.lower(),
            "source_path": str(file_path)
        }
        
        if content_type == "lectures" and course_idx + 2 < len(parts):
            metadata["teacher"] = parts[course_idx + 2]
            
        return metadata
    
    def load_teacher_metadata(self, teacher_dir: Path):
        """Load teacher-specific metadata if available"""
        metadata_file = teacher_dir / "metadata.json"
        if metadata_file.exists():
            with open(metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {}
    
    def process_book(self, file_path: Path):
        """Process a book PDF with enhanced metadata"""
        base_metadata = self.extract_metadata_from_path(file_path)
        
        # Add book-specific metadata
        base_metadata.update({
            "document_type": "textbook",
            "subject_area": self.infer_subject_from_filename(file_path.stem)
        })
        
        loader = PyPDFLoader(str(file_path))
        pages = loader.load()
        
        documents = []
        for page in pages:
            # Split long pages into chunks
            chunks = self.text_splitter.split_text(page.page_content)
            
            for i, chunk in enumerate(chunks):
                if len(chunk.strip()) < 50:  # Skip very short chunks
                    continue
                    
                doc_metadata = {
                    **base_metadata,
                    "page_number": page.metadata.get("page", 0),
                    "chunk_index": i,
                    "total_pages": len(pages)
                }
                
                # Extract chapter info if possible
                chapter_info = self.extract_chapter_info(chunk)
                if chapter_info:
                    doc_metadata.update(chapter_info)
                
                documents.append(Document(
                    page_content=chunk,
                    metadata=doc_metadata
                ))
        
        return documents
    
    def process_lecture(self, file_path: Path):
        """Process a lecture PDF with teacher and course metadata"""
        base_metadata = self.extract_metadata_from_path(file_path)
        
        # Load teacher metadata
        teacher_dir = file_path.parent
        teacher_metadata = self.load_teacher_metadata(teacher_dir)
        
        base_metadata.update({
            "document_type": "lecture",
            "teacher_info": teacher_metadata.get("teacher_name", base_metadata.get("teacher", "unknown")),
            "course_code": teacher_metadata.get("course_code", ""),
            "semester": teacher_metadata.get("semester", ""),
            "academic_year": teacher_metadata.get("academic_year", "")
        })
        
        # Extract lecture-specific info from filename
        lecture_info = self.extract_lecture_info(file_path.stem)
        base_metadata.update(lecture_info)
        
        loader = PyPDFLoader(str(file_path))
        pages = loader.load()
        
        documents = []
        for page in pages:
            chunks = self.text_splitter.split_text(page.page_content)
            
            for i, chunk in enumerate(chunks):
                if len(chunk.strip()) < 50:
                    continue
                    
                doc_metadata = {
                    **base_metadata,
                    "page_number": page.metadata.get("page", 0),
                    "chunk_index": i,
                    "total_pages": len(pages)
                }
                
                documents.append(Document(
                    page_content=chunk,
                    metadata=doc_metadata
                ))
        
        return documents
    
    def extract_chapter_info(self, text: str):
        """Extract chapter information from text content"""
        import re
        
        # Look for chapter patterns
        chapter_patterns = [
            r"Chapter\s+(\d+)[:\s]*(.+?)(?:\n|$)",
            r"CHAPTER\s+(\d+)[:\s]*(.+?)(?:\n|$)",
            r"Ch\.\s*(\d+)[:\s]*(.+?)(?:\n|$)"
        ]
        
        for pattern in chapter_patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                return {
                    "chapter_number": int(match.group(1)),
                    "chapter_title": match.group(2).strip()
                }
        
        return {}
    
    def extract_lecture_info(self, filename: str):
        """Extract lecture-specific info from filename"""
        import re
        
        info = {}
        
        # Extract week number
        week_match = re.search(r"week(\d+)", filename, re.IGNORECASE)
        if week_match:
            info["week_number"] = int(week_match.group(1))
        
        # Extract lecture number
        lecture_match = re.search(r"lecture(\d+)", filename, re.IGNORECASE)
        if lecture_match:
            info["lecture_number"] = int(lecture_match.group(1))
        
        # Extract topic from filename (everything after week/lecture info)
        topic_match = re.search(r"(?:week\d+_|lecture\d+_)(.+)", filename, re.IGNORECASE)
        if topic_match:
            info["topic"] = topic_match.group(1).replace("_", " ").title()
        
        return info
    
    def infer_subject_from_filename(self, filename: str):
        """Infer subject area from filename"""
        filename_lower = filename.lower()
        
        subject_keywords = {
            "mechanics": ["static", "dynamic", "mechanic", "force", "motion"],
            "thermodynamics": ["thermo", "heat", "energy", "entropy"],
            "materials": ["material", "steel", "concrete", "composite"],
            "fluid_mechanics": ["fluid", "flow", "hydraulic", "pneumatic"],
            "mathematics": ["calculus", "algebra", "differential", "linear"],
            "electronics": ["circuit", "electronic", "digital", "analog"]
        }
        
        for subject, keywords in subject_keywords.items():
            if any(keyword in filename_lower for keyword in keywords):
                return subject
        
        return "general"
    
    def process_all_files(self):
        """Process all PDF files in the data structure"""
        all_documents = []
        
        for course_dir in self.data_root.iterdir():
            if not course_dir.is_dir():
                continue
                
            print(f"Processing course: {course_dir.name}")
            
            # Process books
            books_dir = course_dir / "books"
            if books_dir.exists():
                for book_file in books_dir.glob("*.pdf"):
                    print(f"  Processing book: {book_file.name}")
                    documents = self.process_book(book_file)
                    all_documents.extend(documents)
            
            # Process lectures
            lectures_dir = course_dir / "lectures"
            if lectures_dir.exists():
                for teacher_dir in lectures_dir.iterdir():
                    if not teacher_dir.is_dir():
                        continue
                    
                    print(f"  Processing teacher: {teacher_dir.name}")
                    for lecture_file in teacher_dir.glob("*.pdf"):
                        print(f"    Processing lecture: {lecture_file.name}")
                        documents = self.process_lecture(lecture_file)
                        all_documents.extend(documents)
        
        return all_documents
    
    def create_vector_database(self, documents, batch_size=250, collection_name="university_study"):
        """Create ChromaDB with processed documents"""
        persistent_client = chromadb.HttpClient(port=config.retrieval.chroma_port)
        vector_db = Chroma(
            embedding_function=self.embedding_model,
            client=persistent_client,
            collection_name=collection_name,
            collection_metadata={'id': collection_name},
            create_collection_if_not_exists=True,
        )

        for i in range(0, len(documents), batch_size):
            vector_db.add_documents(documents[i:i+batch_size])
        
        return vector_db

In [3]:
# Usage example
processor = UniversityDataProcessor()
all_documents = processor.process_all_files()
print(f"Processing {len(all_documents)} document chunks")

vector_db = processor.create_vector_database(all_documents)

print(f"Processed {len(all_documents)} document chunks")

Processing course: Fluid Machinery
  Processing teacher: mamun
    Processing lecture: S1_Introduction.pdf
    Processing lecture: S2_Reciprocating_Pump.pdf
    Processing lecture: S3_Centrifugal_Pumps.pdf
Processing course: Internal Combustion Engines
  Processing book: Ferguson 3rd ed.pdf
  Processing book: Heywood 2nd ed.pdf
  Processing teacher: anup
    Processing lecture: Lec1_Engine Basics.pdf
    Processing lecture: Lec2_Engine Basics.pdf
    Processing lecture: Lec3_Engine Performance Parameters.pdf
    Processing lecture: Lec4_IC Engine Cycles.pdf
    Processing lecture: Lec5_IC Engine Cycles.pdf
    Processing lecture: Lec6_IC Engine-Air standard Cycle.pdf
  Processing teacher: monjur
    Processing lecture: 1.IC-Engine-History.pdf
    Processing lecture: 2.IC-Engine-Fuel.pdf
    Processing lecture: 3.IC-Engine-Combustion-SI.pdf
    Processing lecture: 4.IC-Engine-Combustion-CI.pdf
Processing course: Noise and Vibration
  Processing book: Rao 6th ed.pdf
  Processing teacher:

2025-06-05 13:18:37,714 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Processing 8452 document chunks


2025-06-05 13:18:38,389 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/auth/identity "HTTP/1.1 200 OK"
2025-06-05 13:18:38,391 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-06-05 13:18:38,868 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
2025-06-05 13:18:38,879 - httpx - INFO - HTTP Request: GET http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"
2025-06-05 13:18:39,074 - httpx - INFO - HTTP Request: POST http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"
2025-06-05 13:18:39,087 - backoff - INFO - Backing off send_request(...) for 0.9s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by NameResolutionErr

Processed 8452 document chunks


In [None]:
def query_with_filters(query_text: str, db: Chroma, course=None, teacher=None, content_type=None, subject_area=None):
    # Build filter based on parameters
    filter_dict = {}
    if course:
        filter_dict["course"] = course
    if teacher:
        filter_dict["teacher"] = teacher
    if content_type:
        filter_dict["content_type"] = content_type
    if subject_area:
        filter_dict["subject_area"] = subject_area
    
    # Search with filters
    results = db.similarity_search_with_score(
        query_text, 
        k=5, 
        filter=filter_dict if filter_dict else None
    )
    
    return results

# Example usage:
# Get only lecture content from Prof Smith
results = query_with_filters(
    "fire", 
    db=vector_db,
    # course="mechanical_engineering",
    # teacher="prof_smith",
    content_type="lectures"
)
print(results)
# Get only book content about thermodynamics
results = query_with_filters(
    "explain force equilibrium",
    db=vector_db,
    # subject_area="thermodynamics",
    content_type="books"
)
print(results)

2025-06-05 13:28:49,958 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-06-05 13:28:54,547 - httpx - INFO - HTTP Request: POST http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database/collections/4b254101-5eb3-4574-93bf-13c7c83010d8/query "HTTP/1.1 200 OK"
2025-06-05 13:28:54,643 - httpx - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


[(Document(id='5d37b919-3c67-4bcf-ba9a-dc891d28489c', metadata={'total_pages': 31, 'course': 'Refrigeration and Building Mechanical System', 'course_code': 'ME415', 'document_type': 'lecture', 'file_name': 'Lecture 2_ME 415_Basics on Fire Dynamics_19 April 2025', 'content_type': 'lectures', 'chunk_index': 0, 'file_type': '.pdf', 'academic_year': '4', 'source_path': 'data\\courses\\Refrigeration and Building Mechanical System\\lectures\\ashiqur\\Lecture 2_ME 415_Basics on Fire Dynamics_19 April 2025.pdf', 'semester': '1', 'page_number': 7, 'teacher': 'ashiqur', 'teacher_info': 'Dr. Md. Ashiqur Rahman'}, page_content='ME 415 Md. Ashiqur Rahman, ME, BUET\noFire: \no A fire is self-sustained oxidation of a fuel. \no NFPA 921: "A rapid oxidation process, which is a chemical reaction \nresulting in the evolution of light and heat”.\no Combustion or burning, in which substances combine chemically \nwith oxygen from the air and typically give out bright light, heat, \nand smoke.\n\uf09b Fire m

2025-06-05 13:29:02,101 - httpx - INFO - HTTP Request: POST http://localhost:9999/api/v2/tenants/default_tenant/databases/default_database/collections/4b254101-5eb3-4574-93bf-13c7c83010d8/query "HTTP/1.1 200 OK"


[(Document(id='bd856e97-7781-4300-a6c9-68e0fab684e1', metadata={'chunk_index': 1, 'file_name': 'Rao 6th ed', 'page_number': 161, 'content_type': 'books', 'course': 'Noise and Vibration', 'file_type': '.pdf', 'subject_area': 'general', 'document_type': 'textbook', 'total_pages': 1291, 'source_path': 'data\\courses\\Noise and Vibration\\books\\Rao 6th ed.pdf'}, page_content='F  \n>\n1t2 - mx  \n>¶ = 0 (2.4a) \n M  \n>\n1t2 - Ju  \n>¶ = 0 (2.4b) \nThese equations can be considered equilibrium equations provided that -mx >¶ and -Ju  \n>¶ are \ntreated as a force and a moment, respectively. This fictitious force (or moment) is known \nas the inertia force (or inertia moment) and the artificial state of equilibrium implied by \nEq. (2.4a) or (2.4b) is known as dynamic equilibrium. This principle, implied in Eq. (2.4a) \nor (2.4b), is called D’Alembert’s principle. Applying it to the system shown in Fig. 2.1(c) \nyields the equation of motion:\n \n-kx - mx$ = 0 or mx$ + kx = 0 (2.3) \nPrincip

: 