In [29]:
"""
Simple PDF RAG System - Complete Setup
Using FREE: Ollama LLM + HuggingFace Embeddings

Prerequisites:
- Install Ollama: https://ollama.ai
- Run: ollama pull llama3.2
- Run: ollama pull codellama  (optional code-focused model; see OLLAMA_MODEL below)
- TODO: not sure yet whether we can switch between llama3.2 and codellama depending on the question
- !pip install langchain langchain-community langchain-huggingface sentence-transformers chromadb pymupdf langsmith
- 
"""
# !pip install langchain langchain-community langchain-huggingface sentence-transformers chromadb pymupdf langsmith

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------- ----------------- 10.7/18.7 MB 61.3 MB/s eta 0:00:01
   ---------------------------------------- 18.7/18.7 MB 51.5 MB/s eta 0:00:00
Installing collected packages: pymupdf, langchain-huggingface
Successfully installed langchain-huggingface-0.3.1 pymupdf-1.26.4


In [1]:
# Setup Environment Variables

import os

# LangSmith Configuration
# The API key is a secret and must not be written into the notebook: it is read
# from the LANGCHAIN_API_KEY environment variable, which has to be set before the
# kernel starts (or via %env / getpass in a cell that is never committed).
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'rag-project'  # Name your project for organization

# Only switch tracing on when a key is actually available; enabling it with a
# placeholder key makes every traced call try to upload with bad credentials.
if os.environ.get('LANGCHAIN_API_KEY', '').strip():
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    print("‚úì LangSmith tracing enabled")
else:
    os.environ['LANGCHAIN_TRACING_V2'] = 'false'
    print("LangSmith tracing disabled: set LANGCHAIN_API_KEY in the environment to enable it")

‚úì LangSmith tracing enabled


In [3]:
# Import Libraries

import json
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import BaseRetriever
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

print("Libraries imported")

‚úì Libraries imported


In [5]:
# Configuration
# Paths and model names read by the cells below; edit them here rather than inline.

PARSED_JSON_FOLDER = "./parsed_json"  # passed to process_json_files, which globs *.json in it
PDF_FOLDER = "./raw_data"  # passed to process_pdfs, which searches it recursively for *.pdf
CHROMA_DIR = "./chroma_db"  # used as Chroma's persist_directory
DEFAULT_COURSE = "RAG ALIN"  # course label used when a file does not provide its own

# Embedding model (name handed to HuggingFaceEmbeddings)
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Ollama model (name handed to the Ollama LLM wrapper)
OLLAMA_MODEL = "llama3.2"  # alternative noted by the author: "codellama"

In [7]:
# Initialize Embeddings
# One shared embedding function; the model is chosen by EMBEDDING_MODEL.

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
print(f"Embeddings model loaded: {EMBEDDING_MODEL}")

‚úì Embeddings model loaded: sentence-transformers/all-MiniLM-L6-v2


In [9]:
#Process JSON Files (Code) - With LangSmith Tracing

from langsmith import traceable

@traceable(name="process_json_files")
def process_json_files(json_folder: str, chroma_dir: str):
    """
    Load parsed code files from JSON and split them into chunks (LangSmith-traced).

    Every ``*.json`` file directly inside ``json_folder`` is read as a JSON object.
    Its ``content`` string becomes the text of a ``Document``; ``name``,
    ``extension``, ``course`` and ``st_size`` are copied into the metadata (with
    fallbacks when absent). Files that cannot be used are reported and skipped so
    one bad file does not abort the whole run.

    Args:
        json_folder: Folder containing the parsed ``*.json`` files.
        chroma_dir: Not used by this function; kept so existing calls keep working.

    Returns:
        A list with the chunks of all successfully processed files.
    """
    json_path = Path(json_folder)
    # glob() yields files in arbitrary order; sort so the chunk order is
    # identical on every run and platform.
    json_files = sorted(json_path.glob("*.json"))

    print(f"\nProcessing {len(json_files)} JSON files (code)...\n")

    all_chunks = []

    # Code-aware text splitter: prefer breaking before class/def boundaries
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\nclass ", "\n\ndef ", "\n\n", "\n", " ", ""]
    )

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # A top-level list/string/number has no .get(); report it clearly
            # instead of surfacing an AttributeError through the handler below.
            if not isinstance(data, dict):
                print(f"Skipping {json_file.name}: expected a JSON object, got {type(data).__name__}")
                continue

            content = data.get("content", "")
            if not content:
                print(f"Skipping {json_file.name}: no content")
                continue
            if not isinstance(content, str):
                print(f"Skipping {json_file.name}: content is not a string")
                continue

            # Create Document with metadata
            doc = Document(
                page_content=content,
                metadata={
                    "source": data.get("name", "unknown"),
                    "extension": data.get("extension", ""),
                    "course": data.get("course", DEFAULT_COURSE),
                    "file_type": "code",
                    "original_size": data.get("st_size", 0),
                }
            )

            # Split into chunks
            chunks = text_splitter.split_documents([doc])
            all_chunks.extend(chunks)

            print(f"{json_file.name}: {len(chunks)} chunks")

        except Exception as e:
            # Deliberate best-effort: report the failing file and continue.
            print(f"Error processing {json_file.name}: {e}")

    return all_chunks

# Chunk all parsed code files (the function itself is wrapped by @traceable)
code_chunks = process_json_files(json_folder=PARSED_JSON_FOLDER, chroma_dir=CHROMA_DIR)
print(f"\nTotal code chunks: {len(code_chunks)}")


üìÅ Processing 89 JSON files (code)...

‚úì 0010_Input.json: 5 chunks
‚úì 0010_Loop_Example.json: 29 chunks
‚úì 0020_If_Statement_ComparisonOperators.json: 15 chunks
‚úì 0050_loops_nested.json: 5 chunks
‚úì 0060_Relational_Operators.json: 7 chunks
‚úì 0_Showcase_Shiny.json: 1 chunks
‚úì 1_Shiny_UI_Editor.json: 1 chunks
‚úì 2_1_mini_example_TextInput.json: 1 chunks
‚úì 2_2_mini_example_NumericInput.json: 1 chunks
‚úì 2_3_advanced_example_MultipleInputOutput.json: 6 chunks
‚úì 3_Getting_Started.json: 1 chunks
‚úì 4_R_PullMechanism.json: 1 chunks
‚úì 5_Error_Example_ReactiveContext.json: 1 chunks
‚úì 6_Error_Example_OutputName.json: 2 chunks
‚úì 7_Error_Example_ReactiveExpressionsBrackets.json: 3 chunks
‚úì Bad_Example_01.json: 2 chunks
‚úì Bad_Example_Autoformatting_01.json: 2 chunks
‚úì Basic_Rmarkdown.json: 1 chunks
‚úì challenge_plot.json: 2 chunks
‚úì CleanUp_iris.json: 10 chunks
‚úì comp_lambda_i.json: 2 chunks
‚úì comp_lambda_ii.json: 2 chunks
‚úì comp_lambda_ii_solution.json: 1 

In [11]:
# Process PDFs - With LangSmith Tracing

@traceable(name="process_pdfs")
def process_pdfs(pdf_folder: str):
    """
    Load every PDF under ``pdf_folder`` and split it into chunks (LangSmith-traced).

    PDFs are searched recursively. Each file is loaded with ``PyMuPDFLoader``,
    split into chunks, and every chunk is tagged with:

    * ``course``: the first folder name below ``pdf_folder`` when the file sits
      in a subfolder, otherwise ``DEFAULT_COURSE``;
    * ``file_type``: always ``"pdf"``;
    * ``source``: the file name without extension (replaces any ``source`` value
      already present in the chunk metadata).

    A file that fails to load or split is reported and skipped.

    Args:
        pdf_folder: Root folder to search for ``*.pdf`` files.

    Returns:
        A list with the chunks of all successfully processed PDFs.
    """
    pdf_path = Path(pdf_folder)
    # glob() yields files in arbitrary order; sort so the chunk order is
    # identical on every run and platform.
    pdf_files = sorted(pdf_path.glob("**/*.pdf"))

    print(f"\nüìÑ Processing {len(pdf_files)} PDF files (including subfolders)...\n")

    all_chunks = []

    # PDF text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )

    for pdf_file in pdf_files:
        try:
            # Load PDF
            loader = PyMuPDFLoader(str(pdf_file))
            pages = loader.load()

            # Extract course from folder structure
            try:
                relative_path = pdf_file.relative_to(pdf_path)
                course = relative_path.parts[0] if len(relative_path.parts) > 1 else DEFAULT_COURSE
            except ValueError:
                # relative_to() raises ValueError when the file is not under
                # pdf_path; a bare except here would also hide Ctrl-C.
                course = DEFAULT_COURSE

            # Split into chunks
            chunks = text_splitter.split_documents(pages)

            # Add metadata
            for chunk in chunks:
                chunk.metadata["course"] = course
                chunk.metadata["file_type"] = "pdf"
                chunk.metadata["source"] = pdf_file.stem

            all_chunks.extend(chunks)

            print(f"‚úì {pdf_file.name} (course: {course}): {len(chunks)} chunks")

        except Exception as e:
            # Deliberate best-effort: report the failing PDF and continue.
            print(f"‚úó Error processing {pdf_file.name}: {e}")

    return all_chunks

# Chunk all PDFs below PDF_FOLDER (the function itself is wrapped by @traceable)
pdf_chunks = process_pdfs(pdf_folder=PDF_FOLDER)
print(f"\nüìä Total PDF chunks: {len(pdf_chunks)}")


üìÑ Processing 64 PDF files (including subfolders)...

‚úì Problems_09.pdf (course: Bayesian and Classic Statistics): 21 chunks
‚úì Slides_09_handout.pdf (course: Bayesian and Classic Statistics): 65 chunks
‚úì Problems_01.pdf (course: Bayesian and Classic Statistics): 17 chunks
‚úì Slides_handout_01.pdf (course: Bayesian and Classic Statistics): 106 chunks
‚úì Problems_10.pdf (course: Bayesian and Classic Statistics): 18 chunks
‚úì Slides_10_print.pdf (course: Bayesian and Classic Statistics): 32 chunks
‚úì Problems_11.pdf (course: Bayesian and Classic Statistics): 15 chunks
‚úì Slides_11_print.pdf (course: Bayesian and Classic Statistics): 35 chunks
‚úì Problems_12.pdf (course: Bayesian and Classic Statistics): 27 chunks
‚úì Slides_12_handout.pdf (course: Bayesian and Classic Statistics): 73 chunks
‚úì Problems_13.pdf (course: Bayesian and Classic Statistics): 34 chunks
‚úì Slides_13_handout.pdf (course: Bayesian and Classic Statistics): 16 chunks
‚úì Problems_02.pdf (course: Bayes

In [15]:
#Create Chroma Vector Store - With Tracing

@traceable(name="create_vectorstore")
def create_vectorstore(all_documents: list, chroma_dir: str):
    """
    Embed the given documents into a Chroma store (LangSmith-traced).

    Uses the notebook-level ``embeddings`` object as the embedding function and
    ``chroma_dir`` as Chroma's ``persist_directory``.

    Args:
        all_documents: Document chunks to embed and store.
        chroma_dir: Directory handed to Chroma for persistence.

    Returns:
        The Chroma vector store built from ``all_documents``.
    """
    # NOTE(review): chroma_dir is reused between runs; re-running this cell may
    # add the same chunks to an existing collection again — confirm and clear
    # the directory first if that is not intended.
    print(f"\nüì¶ Total documents: {len(all_documents)}")
    print("‚è≥ Adding to Chroma (this may take a minute)...")

    store = Chroma.from_documents(
        all_documents,
        embedding=embeddings,
        persist_directory=chroma_dir,
    )

    print("‚úÖ Vector store created successfully!")
    return store

# Merge code chunks and PDF chunks into a single corpus (code first, then PDFs)
all_documents = [*code_chunks, *pdf_chunks]

# Embed the corpus and persist it under CHROMA_DIR
vectorstore = create_vectorstore(all_documents=all_documents, chroma_dir=CHROMA_DIR)

# Sanity check: ask the underlying collection how many entries it holds
collection = vectorstore._collection
print(f"\nüìä Total chunks in Chroma: {collection.count()}")


üì¶ Total documents: 2795
‚è≥ Adding to Chroma (this may take a minute)...
‚úÖ Vector store created successfully!

üìä Total chunks in Chroma: 2795


In [17]:
# Setup RAG Chain with Ollama - With Full Tracing

# Prompt: instructs the model to answer from the retrieved context only
prompt_template = """You are a helpful assistant answering questions based on course materials and code.
Use the following context to answer the question. If you don't know the answer based on the context, say so.

Context:
{context}

Question: {question}

Answer: """

PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# Local LLM served by Ollama
llm = Ollama(model=OLLAMA_MODEL, temperature=0.7)

# Retrieval chain: request k=5 chunks from the vector store, place them all in
# one prompt ("stuff"), and return the source documents next to the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

for status_line in (
    "‚úì RAG chain created with Ollama",
    f"‚úì Model: {OLLAMA_MODEL}",
    "‚úì LangSmith will trace all queries automatically!",
):
    print(status_line)

‚úì RAG chain created with Ollama
‚úì Model: llama3.2
‚úì LangSmith will trace all queries automatically!


  llm = Ollama(


In [20]:
# Query the RAG System - Fully Traced!

def ask_question(question: str):
    """
    Send one question through ``qa_chain`` and print the answer with its sources.

    Prints the question, the chain's answer, and for every returned source
    document its source name, file type, course, page (PDF sources only) and the
    first 150 characters of its text.

    Args:
        question: The question passed to the chain under the ``"query"`` key.

    Returns:
        The chain's result, unchanged.
    """
    print(f"\nüîç Question: {question}\n")

    # Run retrieval + generation
    response = qa_chain.invoke({"query": question})

    print(f"üí° Answer:\n{response['result']}\n")

    print("üìö Sources used:")
    for rank, source_doc in enumerate(response['source_documents'], start=1):
        meta = source_doc.metadata
        is_pdf = meta.get('file_type') == 'pdf'

        print(f"\n{rank}. {meta.get('source', 'unknown')}")
        print(f"   Type: {meta.get('file_type', 'unknown')}")
        print(f"   Course: {meta.get('course', 'unknown')}")
        if is_pdf:
            # Page is only printed for PDF sources
            print(f"   Page: {meta.get('page', 'N/A')}")
        print(f"   Preview: {source_doc.page_content[:150]}...")

    return response

# Smoke test: run one example question through the chain
result = ask_question(question="How do I create a pandas dataframe?")


üîç Question: How do I create a pandas dataframe?

üí° Answer:
To create a pandas DataFrame, you can use the `pd.DataFrame()` function or the `pd.read_csv()`, `pd.read_json()`, etc. functions depending on how your data is formatted.

Here are some examples:

- From a dictionary:
```python
data = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
        'Age': [28, 24, 35, 32],
        'City': ['New York', 'Paris', 'Tokyo', 'Sydney']}
df = pd.DataFrame(data)
```

- From a CSV file:
```python
import pandas as pd

url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
tips = pd.read_csv(url)
print(tips.head())
```

- From an Excel file:
```python
df = pd.read_excel('data.xlsx')
```

- From a JSON file:
```python
import pandas as pd

url = "https://next.json-generator.com/api/json/get/4yIPxvyGt"
data = pd.read_json(url)
print(data.head())
```

Note that the `pd.DataFrame()` function takes in a dictionary, list of lists or a numpy array.

üìö Sources used:

1. sw10.ex