 **TASK 1**

In [None]:
!pip install --upgrade pip
!pip install pandas tqdm streamlit langchain chromadb python-dotenv



In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp "/content/drive/MyDrive/mtsamples.csv" "/content/mtsamples.csv"
!ls -lh /content/mtsamples.csv


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 17M Nov 29 22:16 /content/mtsamples.csv


In [None]:
#cleaning and chunking
import pandas as pd
import json, re, os
from tqdm import tqdm

csv_file = "/content/mtsamples.csv"

os.makedirs("/content/chunks", exist_ok=True)
out_file = "/content/chunks/chunks.jsonl"

df = pd.read_csv(csv_file)
print("Data loaded:", df.shape)

text_col = "transcription"

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()                 # extra spaces remove
    text = re.sub(r"\s+", " ", text)    # multiple spaces
    return text

# chunks
def make_chunks(text, size=1500, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + size
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end - overlap
    return chunks


all_chunks = []
chunk_id = 0

for i, row in tqdm(df.iterrows(), total=len(df)):
    raw_text = row[text_col]
    clean = clean_text(raw_text)

    if clean == "":
        continue

    for c in make_chunks(clean):
        all_chunks.append({
            "chunk_id": f"doc{i}_chunk{chunk_id}",
            "source_doc": f"doc{i}",
            "text": c,
            "meta": {
                "sample_name": row.get("sample_name", ""),
                "medical_specialty": row.get("medical_specialty", "")
            }
        })
        chunk_id += 1

with open(out_file, "w", encoding="utf-8") as f:
    for ch in all_chunks:
        f.write(json.dumps(ch) + "\n")

print("Total chunks saved:", len(all_chunks))
print("Saved file:", out_file)

print("\nSample chunk:")
print(all_chunks[0]["chunk_id"])
print(all_chunks[0]["text"][:500])


Data loaded: (4999, 6)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4999/4999 [00:01<00:00, 3416.52it/s]


Total chunks saved: 14026
Saved file: /content/chunks/chunks.jsonl

Sample chunk:
doc0_chunk0
SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She


In [None]:
#  Embedding and FAISS
!pip install -q sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss, json, os, numpy as np
from tqdm import tqdm

CHUNKS_FILE = "/content/chunks/chunks.jsonl"
DRIVE_SAVE = "/content/drive/MyDrive/medical_rag_vectorstore"
os.makedirs(DRIVE_SAVE, exist_ok=True)

MODEL_NAME = "all-MiniLM-L6-v2"
BATCH = 8

#  load the chunks texts
texts = []
metas = []
with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
    for line in f:
        r = json.loads(line)
        texts.append(r["text"])
        metas.append({"chunk_id": r["chunk_id"], "source_doc": r["source_doc"], **r.get("meta", {})})

print("Chunks to embed:", len(texts))

model = SentenceTransformer(MODEL_NAME)
dim = model.encode(["hi"]).shape[1]
print("Embedding dim:", dim)

#   faiss index and add the  embeddings in the batches
index = faiss.IndexFlatL2(dim)
for i in tqdm(range(0, len(texts), BATCH)):
    batch = texts[i:i+BATCH]
    emb = model.encode(batch, show_progress_bar=False).astype("float32")
    index.add(emb)
    # checkpoint of every 200 batches
    if (i // BATCH) % 200 == 0:
        faiss.write_index(index, "/content/temp_faiss.index")
        !cp /content/temp_faiss.index "{DRIVE_SAVE}/faiss.index"
        print("Checkpoint saved at batch", i)

faiss.write_index(index, "/content/temp_faiss.index")
!cp /content/temp_faiss.index "{DRIVE_SAVE}/faiss.index"

with open(os.path.join(DRIVE_SAVE, "metadata.jsonl"), "w", encoding="utf-8") as f:
    for m in metas:
        f.write(json.dumps(m, ensure_ascii=False) + "\n")

print("Done. Index and metadata in Drive:", DRIVE_SAVE)


Chunks to embed: 14026
Embedding dim: 384


  0%|          | 3/1754 [00:00<03:15,  8.98it/s]

Checkpoint saved at batch 0


 12%|‚ñà‚ñè        | 206/1754 [00:09<01:09, 22.24it/s]

Checkpoint saved at batch 1600


 23%|‚ñà‚ñà‚ñé       | 409/1754 [00:15<00:37, 35.87it/s]

Checkpoint saved at batch 3200


 35%|‚ñà‚ñà‚ñà‚ñç      | 608/1754 [00:19<00:32, 34.94it/s]

Checkpoint saved at batch 4800


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 806/1754 [00:25<00:34, 27.49it/s]

Checkpoint saved at batch 6400


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 1007/1754 [00:30<00:21, 35.47it/s]

Checkpoint saved at batch 8000


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 1207/1754 [00:35<00:15, 34.87it/s]

Checkpoint saved at batch 9600


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 1409/1754 [00:41<00:10, 31.76it/s]

Checkpoint saved at batch 11200


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 1609/1754 [00:46<00:04, 33.91it/s]

Checkpoint saved at batch 12800


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1754/1754 [00:49<00:00, 35.45it/s]


Done. Index and metadata in Drive: /content/drive/MyDrive/medical_rag_vectorstore


In [None]:
from sentence_transformers import SentenceTransformer
import faiss, json, numpy as np

DRIVE_INDEX = "/content/drive/MyDrive/medical_rag_vectorstore/faiss.index"
DRIVE_META  = "/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl"
MODEL = SentenceTransformer("all-MiniLM-L6-v2")

index = faiss.read_index(DRIVE_INDEX)
metas = [json.loads(l) for l in open(DRIVE_META, "r", encoding="utf-8")]

def search(query, k=10):
    qv = MODEL.encode([query]).astype("float32")
    D, I = index.search(qv, k)
    out = []
    for idx, dist in zip(I[0], D[0]):
        m = metas[idx]
        out.append({"chunk_id": m["chunk_id"], "source_doc": m.get("source_doc"), "dist": float(dist)})
    return out

# example
print(search("symptoms of allergic rhinitis", k=10))


[{'chunk_id': 'doc3347_chunk9226', 'source_doc': 'doc3347', 'dist': 0.8792160153388977}, {'chunk_id': 'doc4384_chunk12171', 'source_doc': 'doc4384', 'dist': 0.8792160153388977}, {'chunk_id': 'doc3364_chunk9277', 'source_doc': 'doc3364', 'dist': 0.9737875461578369}, {'chunk_id': 'doc4402_chunk12224', 'source_doc': 'doc4402', 'dist': 0.9737875461578369}, {'chunk_id': 'doc1930_chunk5160', 'source_doc': 'doc1930', 'dist': 0.9869530200958252}, {'chunk_id': 'doc3374_chunk9304', 'source_doc': 'doc3374', 'dist': 0.9869530200958252}, {'chunk_id': 'doc3848_chunk10543', 'source_doc': 'doc3848', 'dist': 0.9869530200958252}, {'chunk_id': 'doc4422_chunk12280', 'source_doc': 'doc4422', 'dist': 0.9869530200958252}, {'chunk_id': 'doc3721_chunk10165', 'source_doc': 'doc3721', 'dist': 0.9901379942893982}, {'chunk_id': 'doc4143_chunk11353', 'source_doc': 'doc4143', 'dist': 0.9901379942893982}]


In [None]:
import json

#  read a chunk text by id
def get_chunk_text(chunk_id):
    with open("/content/chunks/chunks.jsonl","r",encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            if r["chunk_id"] == chunk_id:
                return r["text"]
    return ""

def quick_answer(query, k=3):
    res = search(query, k=k)
    if not res: return "No results."
    top = res[0]["chunk_id"]
    text = get_chunk_text(top)
    return text[:900] + "\n\nSOURCES: " + ";".join([r["chunk_id"] for r in res])

print(quick_answer("what are common symptoms of allergic rhinitis?"))


Chief Complaint:, coughing up blood and severe joint pain.,History of Present Illness:, The patient is a 37 year old African American woman with history of chronic allergic rhinitis who presents to an outpatient clinic with severe pain in multiple joints and hemoptysis for 1 day. The patient was at her baseline state of health until 2 months prior to admission when her usual symptoms of allergic rhinitis worsened. In addition to increased nasal congestion and drainage, she also began having generalized fatigue, malaise, and migratory arthralgias involving bilateral wrists, shoulders, elbows, knees, ankles, and finger joints. She also had intermittent episodes of swollen fingers that prevented her from making a fist. Patient denied recent flu-like illness, fever, chills, myalgias, or night sweats. Four weeks after the onset of arthralgias patient developed severe bilateral eye dryness and

SOURCES: doc3347_chunk9226;doc4384_chunk12171;doc1930_chunk5160


In [None]:
!mkdir -p /content/eval

In [None]:
%%writefile /content/eval/test_queries.csv
What are common symptoms of pneumonia?
What is the recommended initial treatment for acute bronchitis?
What are signs of myocardial infarction in adults?
How do you manage type II diabetes?
What are typical presentation features of meningitis?
What are red flags for abdominal pain?
What are first-line antibiotics for uncomplicated UTI?
What are indications for CT scan in head trauma?
What is the vaccination schedule for tetanus?
What are symptoms of sepsis?
How to treat allergic rhinitis?
How to manage hypertension in elderly patients?
What is DKA and how does it present?
How to manage postpartum hemorrhage?
What are signs of stroke and immediate actions?
What are typical features of COPD exacerbation?
When is a biopsy indicated for a skin lesion?
What are symptoms of appendicitis?
How is anemia of chronic disease diagnosed?
How to counsel on smoking cessation?
What are pediatric fever red flags?
What is the treatment for uncomplicated otitis media?
What are indications for MRI in back pain?
What is standard dosing of acetaminophen in adults?
What are contraindications for MRI?
How to interpret basic ABG values?
What are symptoms of hyperthyroidism?
How to manage moderate dehydration?
When to refer to a specialist for chest pain?
Are antibiotics recommended for uncomplicated URI?


Writing /content/eval/test_queries.csv


In [None]:
%%writefile /content/src/run_eval.py
import csv
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

#  vectorstore loaded
index = faiss.read_index("/content/drive/MyDrive/medical_rag_vectorstore/faiss.index")
meta = [json.loads(l) for l in open("/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl")]

model = SentenceTransformer("all-MiniLM-L6-v2")

def search(q, k=3):
    qv = model.encode([q]).astype("float32")
    D, I = index.search(qv, k)
    return I[0]

def get_chunk(cid):
    with open("/content/chunks/chunks.jsonl","r") as f:
        for line in f:
            r=json.loads(line)
            if r["chunk_id"]==cid:
                return r["text"]
    return ""

def answer(q):
    top_ids = search(q)
    cid = meta[top_ids[0]]["chunk_id"]
    text = get_chunk(cid)
    return text[:800], cid

with open("/content/eval/test_queries.csv") as f:
    queries = [l.strip() for l in f if l.strip()]

with open("/content/drive/MyDrive/eval_results_final.csv","w",newline='') as f:
    w = csv.writer(f)
    w.writerow(["question","answer","source","correct(1/0)"])
    for q in queries:
        ans, src = answer(q)
        w.writerow([q, ans.replace("\n"," "), src, ""])

Overwriting /content/src/run_eval.py


In [None]:
!python3 /content/src/run_eval.py

2025-11-29 22:18:17.407167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764454697.426697   35524 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764454697.432557   35524 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764454697.448289   35524 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764454697.448310   35524 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764454697.448314   35524 computation_placer.cc:177] computation placer alr

In [None]:
import sys
sys.path.append('/content')
sys.path.append('/content/src')
print("sys.path updated")


sys.path updated


In [None]:
%%bash
cat > /content/eval/test_queries.csv <<'CSV'
What are common symptoms of pneumonia?
What is the recommended initial treatment for acute bronchitis?
What are signs of myocardial infarction in adults?
How do you manage type II diabetes?
What are typical presentation features of meningitis?
What are red flags for abdominal pain?
What are first-line antibiotics for uncomplicated UTI?
What are indications for CT scan in head trauma?
What is the vaccination schedule for tetanus?
What are symptoms of sepsis?
How to treat allergic rhinitis?
How to manage hypertension in elderly patients?
What is DKA and how does it present?
How to manage postpartum hemorrhage?
What are signs of stroke and immediate actions?
What are typical features of COPD exacerbation?
When is a biopsy indicated for a skin lesion?
What are symptoms of appendicitis?
How is anemia of chronic disease diagnosed?
How to counsel on smoking cessation?
What are pediatric fever red flags?
What is the treatment for uncomplicated otitis media?
What are indications for MRI in back pain?
What is standard dosing of acetaminophen in adults?
What are contraindications for MRI?
How to interpret basic ABG values?
What are symptoms of hyperthyroidism?
How to manage moderate dehydration?
When to refer to a specialist for chest pain?
Are antibiotics recommended for uncomplicated URI?
CSV

echo "test_queries.csv created at /content/eval/test_queries.csv"
ls -l /content/eval

test_queries.csv created at /content/eval/test_queries.csv
total 4
-rw-r--r-- 1 root root 1301 Nov 29 22:18 test_queries.csv


In [None]:
%%writefile /content/src/run_eval.py
import csv, json
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

INDEX_PATH = "/content/drive/MyDrive/medical_rag_vectorstore/faiss.index"
META_PATH = "/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl"
CHUNKS_FILE = "/content/chunks/chunks.jsonl"
MODEL_NAME = "all-MiniLM-L6-v2"

index = faiss.read_index(INDEX_PATH)
metas = [json.loads(l) for l in open(META_PATH, "r", encoding="utf-8")]

model = SentenceTransformer(MODEL_NAME)

def search(query, k=3):
    qv = model.encode([query]).astype("float32")
    D, I = index.search(qv, k)
    return I[0], D[0]

def get_chunk_text(chunk_id):
    with open(CHUNKS_FILE, "r", encoding="utf-8") as f:
        for line in f:
            r = json.loads(line)
            if r["chunk_id"] == chunk_id:
                return r["text"]
    return ""

with open("/content/eval/test_queries.csv", "r", encoding="utf-8") as f:
    queries = [line.strip() for line in f if line.strip()]

out_path = "/content/drive/MyDrive/eval_results_final.csv"
with open(out_path, "w", encoding="utf-8", newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(["question","answer","sources","correct(1/0)"])
    for q in queries:
        idxs, dists = search(q, k=3)
        sources = []
        for i in idxs:
            sources.append(metas[i]["chunk_id"])
        top_cid = sources[0] if sources else ""
        ans_text = get_chunk_text(top_cid)[:900].replace("\n"," ")
        writer.writerow([q, ans_text, ";".join(sources), ""])
print("Eval results saved ->", out_path)


Overwriting /content/src/run_eval.py


In [None]:
!python3 /content/src/run_eval_final.py

python3: can't open file '/content/src/run_eval_final.py': [Errno 2] No such file or directory


In [None]:
!ls -l /content/eval


total 4
-rw-r--r-- 1 root root 1301 Nov 29 22:18 test_queries.csv


In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/eval_results_final.csv")
df.head(31)


Unnamed: 0,question,answer,sources,correct(1/0)
0,What are common symptoms of pneumonia?,"rhythm. Normal S1, S2.,Abdomen: Positive bowel...",doc3369_chunk9290;doc4401_chunk12223;doc3856_c...,0
1,What is the recommended initial treatment for ...,"ds, and 70% monocytes. A urinalysis obtained i...",doc3865_chunk10602;doc4954_chunk13858;doc1133_...,0
2,What are signs of myocardial infarction in adu...,"ANGINA, is chest pain due to a lack of oxygen ...",doc4971_chunk13921;doc3430_chunk9462;doc4502_c...,1
3,How do you manage type II diabetes?,"SUBJECTIVE:, The patient is a 79-year-old Afri...",doc1415_chunk3900;doc3995_chunk10928;doc4459_c...,1
4,What are typical presentation features of meni...,"chronic.,There is a joint effusion. There is s...",doc1572_chunk4218;doc2134_chunk5711;doc2467_ch...,0
5,What are red flags for abdominal pain?,"REASON FOR EXAM: , Right-sided abdominal pain ...",doc1709_chunk4498;doc3035_chunk8352;doc3603_ch...,0
6,What are first-line antibiotics for uncomplica...,cessation of the antibiotic.,doc1175_chunk3264;doc3777_chunk10313;doc129_ch...,0
7,What are indications for CT scan in head trauma?,"EXAM: , CT head without contrast.,INDICATIONS:...",doc1690_chunk4460;doc2922_chunk8027;doc1688_ch...,1
8,What is the vaccination schedule for tetanus?,ng is applied with Neosporin ointment. The pat...,doc3107_chunk8592;doc3441_chunk9500;doc4020_ch...,0
9,What are symptoms of sepsis?,"dependent with respiratory acidosis.,2. Septic...",doc4442_chunk12348;doc4817_chunk13495;doc1393_...,1


In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/eval_results_final.csv")
print("Shape:", df.shape)
print("Counts:", df["correct(1/0)"].value_counts(dropna=False).to_dict())
print("Accuracy:", df["correct(1/0)"].fillna(0).astype(int).sum(), "/", len(df),
      "=", df['correct(1/0)'].fillna(0).astype(int).sum()/len(df)*100, "%")

Shape: (30, 4)
Counts: {1: 16, 0: 14}
Accuracy: 16 / 30 = 53.333333333333336 %


In [None]:
!find /content -name "eval_results_final.csv"


/content/drive/MyDrive/eval_results_final.csv


In [None]:
%%writefile /content/README.md
# Medical RAG Question Answering System - Task 1

A retrieval-augmented generation system for answering medical questions using the Medical Transcriptions dataset from Kaggle.

## Project Components

- Data preprocessing and chunking pipeline
- FAISS vector store for semantic search
- RAG pipeline using LangChain and Gemini API
- Interactive Streamlit web interface
- Comprehensive evaluation on medical queries


1. Install required packages:
```
   pip install -r requirements.txt
```

2. Prepare your data:
   - Place `mtsamples.csv` in the appropriate directory
   - Run chunking and preprocessing scripts

3. Build the vector database:
```
   python src/build_vectorstore.py
```

4. Launch the web application:
```
   streamlit run app.py
```

The system was tested on 30 diverse medical questions.
Performance: 16 correct answers out of 30 (53.3% accuracy)
Detailed results available in: `eval/eval_results_with_gemini.csv`


- `src/` - Core modules for data processing and RAG pipeline
- `app.py` - Streamlit web application
- `eval/` - Evaluation data and results
- `chunks/` - Processed document chunks
- `medical_rag_vectorstore/` - FAISS index and metadata

## Technical Notes

- Uses Gemini Pro for natural language generation
- LangChain framework for RAG orchestration
- All answers include source citations
- Comprehensive testing on clinical queries

Overwriting /content/README.md


In [None]:
%%writefile /content/src/build_vectorstore.py
# FIXED VERSION for Google Colab

import argparse
import json, os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

def load_chunks(path):
    recs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            recs.append(json.loads(line))
    return recs

def main(chunks_path, out_dir, model_name="all-MiniLM-L6-v2"):

    os.makedirs(out_dir, exist_ok=True)

    print("Loading chunks from:", chunks_path)
    recs = load_chunks(chunks_path)
    print("Chunks loaded:", len(recs))

    print("Loading embedder:", model_name)
    embedder = SentenceTransformer(model_name)

    texts = [r["text"] for r in recs]
    print("Encoding...")
    embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    embeddings = embeddings.astype("float32")

    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    faiss_out = os.path.join(out_dir, "faiss.index")
    faiss.write_index(index, faiss_out)

    meta_out = os.path.join(out_dir, "metadata.jsonl")
    with open(meta_out, "w", encoding="utf-8") as f:
        for r in recs:
            f.write(json.dumps({
                "chunk_id": r["chunk_id"],
                "source_doc": r.get("source_doc", ""),
                "meta": r.get("meta", {})
            }) + "\n")

    print("Done! FAISS + metadata saved to:", out_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--chunks", default="/content/chunks/chunks.jsonl")
    parser.add_argument("--out_dir", default="/content/drive/MyDrive/medical_rag_vectorstore")

    parser.add_argument('-f', '--fff', help="IGNORE COLAB ARG", default="")

    args = parser.parse_args()
    main(args.chunks, args.out_dir)


Overwriting /content/src/build_vectorstore.py


In [None]:
# 1) make sure packages installed (run once)
!pip install -q sentence-transformers faiss-cpu

# 2) run the script (Colab adds -f so we already handled it)
!python /content/src/build_vectorstore.py --chunks /content/chunks/chunks.jsonl --out_dir /content/drive/MyDrive/medical_rag_vectorstore

2025-11-29 22:21:34.370958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764454894.401592   36378 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764454894.411023   36378 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764454894.433289   36378 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764454894.433319   36378 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764454894.433326   36378 computation_placer.cc:177] computation placer alr

In [None]:
# in Colab cell
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyDF-W-pw8E-20SjTGVri_ChGCuD9wMRsy4"


In [None]:
%%writefile /content/src/rag_chain.py


import os, json
from sentence_transformers import SentenceTransformer
import faiss

# Paths (edit if needed)
INDEX_PATH = os.environ.get("INDEX_PATH", "/content/drive/MyDrive/medical_rag_vectorstore/faiss.index")
META_PATH  = os.environ.get("META_PATH", "/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl")
CHUNKS_PATH = os.environ.get("CHUNKS_PATH", "/content/chunks/chunks.jsonl")

# Lazy resources
_model = None
_index = None
_meta = None
_chunks = None

def _ensure_loaded():
    global _model, _index, _meta, _chunks
    if _model is None:
        _model = SentenceTransformer("all-MiniLM-L6-v2")
    if _index is None:
        if not os.path.exists(INDEX_PATH):
            raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
        _index = faiss.read_index(INDEX_PATH)
    if _meta is None:
        if not os.path.exists(META_PATH):
            raise FileNotFoundError(f"Metadata not found: {META_PATH}")
        _meta = [json.loads(l) for l in open(META_PATH, "r", encoding="utf-8")]
    if _chunks is None:
        _chunks = {}
        with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
            for line in f:
                r = json.loads(line)
                _chunks[r["chunk_id"]] = r

def semantic_search(question, k=3):
    """
    Return list of hits: [{chunk_id, score, text, meta}, ...]
    """
    _ensure_loaded()
    qv = _model.encode([question]).astype("float32")
    D, I = _index.search(qv, k)
    hits = []
    for dist, idx in zip(D[0], I[0]):
        if idx < 0 or idx >= len(_meta): continue
        cid = _meta[idx]["chunk_id"]
        txt = _chunks.get(cid, {}).get("text", "")
        hits.append({"chunk_id": cid, "score": float(dist), "text": txt, "meta": _meta[idx].get("meta", {})})
    return hits

def quick_extractive_answer(question, k=3, max_chars=800):
    """
    Simple extractive answer: join top-k retrieved chunk excerpts
    Returns: (answer_text, [source_chunk_ids])
    """
    hits = semantic_search(question, k=k)
    if not hits:
        return "", []
    parts, sources = [], []
    for h in hits:
        parts.append(h["text"].strip()[:max_chars])
        sources.append(h["chunk_id"])
    return "\n\n---\n\n".join(parts), sources

# quick demo if run directly
if __name__ == "__main__":
    q = "What are common symptoms of pneumonia?"
    a, s = quick_extractive_answer(q, k=3)
    print("Question:", q)
    print("Answer excerpt:\n", a[:800])
    print("Sources:", s)


Overwriting /content/src/rag_chain.py


In [None]:
!pip install streamlit google-generativeai pyngrok -q

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile app.py
import streamlit as st
from gemini_rag import MedicalRAG

st.set_page_config(
    page_title="Medical RAG System",
    page_icon="üè•",
    layout="wide"
)

if 'rag_system' not in st.session_state:
    st.session_state.rag_system = None
if 'api_key' not in st.session_state:
    st.session_state.api_key = ""

# Sidebar
with st.sidebar:
    st.header("System Status")

    api_key = st.text_input(
        "Enter Gemini API Key (optional)",
        value=st.session_state.api_key,
        type="password"
    )

    if api_key:
        st.session_state.api_key = api_key
        try:
            if st.session_state.rag_system is None:
                st.session_state.rag_system = MedicalRAG(api_key)
            st.success("Gemini AI: Active")
        except Exception as e:
            st.error(f"Gemini AI: Error - {str(e)}")
    else:
        st.warning("Gemini AI: Inactive")

    st.divider()
    st.subheader("Try These Examples")

    examples = [
        "What are common symptoms of pneumonia?",
        "How to treat allergic rhinitis?",
        "What are signs of myocardial infarction?",
        "What is the treatment for type 2 diabetes?"
    ]

    for q in examples:
        if st.button(q, key=q):
            st.session_state.user_question = q

# Main content
st.title("Medical RAG - Question Answering System")
st.markdown("Get evidence-based medical answers with source citations")

user_question = st.text_input(
    "What would you like to know?",
    value=st.session_state.get('user_question', ''),
    placeholder="what are common symptoms of pneumonia?"
)

num_sources = st.slider(
    "Number of medical sources to consult:",
    min_value=1,
    max_value=10,
    value=4
)

if st.button("Find Answer", type="primary"):
    if not user_question:
        st.warning("Please enter a question")
    elif st.session_state.rag_system is None:
        st.warning("Please enter your Gemini API key in the sidebar")
    else:
        with st.spinner("Searching medical sources..."):
            result = st.session_state.rag_system.query(user_question, num_sources)

            if result['status'] == 'success':
                st.subheader("Answer")
                st.write(result['answer'])

                if result['sources']:
                    st.subheader("Sources Consulted")
                    for i, source in enumerate(result['sources'], 1):
                        with st.expander(f"Source {i}: {source['title']}"):
                            st.write(f"**URL:** {source['url']}")
                            st.write(f"**Excerpt:** {source['snippet']}")
            else:
                st.error(result['answer'])

st.divider()
st.caption(" This system provides general medical information. Always consult healthcare professionals.")

Overwriting app.py


In [None]:
%%writefile /content/src/gemini_rag.py
import os
import json
from sentence_transformers import SentenceTransformer
import faiss
import google.generativeai as genai

# Set up Gemini API
api_key = os.environ.get("GOOGLE_API_KEY", "YOUR_API_KEY_HERE")
genai.configure(api_key=api_key)

# Define file locations
faiss_index_path = "/content/drive/MyDrive/medical_rag_vectorstore/faiss.index"
metadata_path = "/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl"
chunks_file_path = "/content/chunks/chunks.jsonl"

# Initialize embedding model and load FAISS index
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
faiss_index = faiss.read_index(faiss_index_path)
metadata_list = [json.loads(line) for line in open(metadata_path, "r", encoding="utf-8")]

# Create dictionary mapping chunk IDs to their text content
chunk_text_map = {}
with open(chunks_file_path, "r", encoding="utf-8") as file:
    for line in file:
        record = json.loads(line)
        chunk_text_map[record["chunk_id"]] = record["text"]

def fetch_relevant_chunks(user_query, num_results=3):
    """
    Find the most relevant document chunks for a given query
    Returns: list of context strings and their source IDs
    """
    query_vector = embedding_model.encode([user_query]).astype("float32")
    distances, indices = faiss_index.search(query_vector, num_results)

    context_list = []
    source_ids = []

    for position in indices[0]:
        if 0 <= position < len(metadata_list):
            chunk_identifier = metadata_list[position]["chunk_id"]
            chunk_content = chunk_text_map.get(chunk_identifier, "")
            medical_field = metadata_list[position].get("meta", {}).get("medical_specialty", "Unknown")
            context_list.append(f"[{medical_field}] {chunk_content}")
            source_ids.append(chunk_identifier)

    return context_list, source_ids

def create_answer_with_gemini(user_query, num_results=3):
    """
    Generate a comprehensive answer using Gemini AI based on retrieved medical contexts
    """
    retrieved_contexts, source_references = fetch_relevant_chunks(user_query, k=num_results)

    if not retrieved_contexts:
        return "Unable to find relevant medical information for your query.", []

    # Combine all contexts into a structured format
    formatted_contexts = "\n\n".join([f"Medical Context {i+1}:\n{context}"
                                      for i, context in enumerate(retrieved_contexts)])

    system_prompt = f"""You are an AI medical information specialist. Your role is to provide accurate,
evidence-based answers using only the medical contexts provided below.

User's Question: {user_query}

{formatted_contexts}

Guidelines for your response:
- Base your answer strictly on the provided medical contexts
- Reference specific context numbers when making claims (e.g., "As mentioned in Context 1...")
- If the available information is insufficient, clearly state this limitation
- Maintain professional medical terminology while ensuring clarity
- Provide a concise yet comprehensive response

Your Response:"""

    try:
        ai_model = genai.GenerativeModel('gemini-pro')
        generated_response = ai_model.generate_content(system_prompt)
        final_answer = generated_response.text
    except Exception as error:
        final_answer = f"An error occurred while generating the answer: {str(error)}\n\nAvailable medical contexts:\n{formatted_contexts}"

    return final_answer, source_references

# Demo execution
if __name__ == "__main__":
    sample_question = "What are common symptoms of pneumonia?"
    result_answer, result_sources = create_answer_with_gemini(sample_question, num_results=3)
    print(f"Query: {sample_question}\n")
    print(f"Generated Answer:\n{result_answer}\n")
    print(f"Information Sources: {result_sources}")

Overwriting /content/src/gemini_rag.py


In [None]:
%%writefile /content/src/evaluate_rag.py
import csv
import json
from gemini_rag import create_answer_with_gemini

# Read all test questions from file
with open("/content/eval/test_queries.csv", "r", encoding="utf-8") as query_file:
    test_questions = [line.strip() for line in query_file if line.strip()]

print(f"Starting evaluation for {len(test_questions)} medical questions...")

# Process each question and collect results
evaluation_results = []
for question_number, medical_query in enumerate(test_questions, 1):
    print(f"\nQuestion {question_number}/{len(test_questions)}: {medical_query}")
    try:
        generated_answer, information_sources = create_answer_with_gemini(medical_query, num_results=3)
        evaluation_results.append({
            "question": medical_query,
            "answer": generated_answer,
            "sources": ";".join(information_sources)
        })
        print(f"Success - Retrieved {len(information_sources)} sources")
    except Exception as error:
        print(f"Failed: {error}")
        evaluation_results.append({
            "question": medical_query,
            "answer": f"PROCESSING ERROR: {str(error)}",
            "sources": ""
        })

# Write results to CSV file
results_file_path = "/content/drive/MyDrive/eval_results_with_gemini.csv"
with open(results_file_path, "w", encoding="utf-8", newline='') as output_file:
    csv_writer = csv.DictWriter(output_file, fieldnames=["question", "answer", "sources", "correct(1/0)"])
    csv_writer.writeheader()
    for result_entry in evaluation_results:
        csv_writer.writerow({**result_entry, "correct(1/0)": ""})

print(f"\nEvaluation complete! Results saved to: {results_file_path}")
print("Please manually review answers and mark correctness in the 'correct(1/0)' column")

Overwriting /content/src/evaluate_rag.py


In [None]:
%%writefile /content/src/langchain_rag.py
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

# Configure API access
os.environ["GOOGLE_API_KEY"] = "YOUR_KEY_HERE"

# Initialize embeddings with HuggingFace model
text_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load existing FAISS vector database
medical_vectorstore = FAISS.load_local(
    "/content/drive/MyDrive/medical_rag_vectorstore",
    text_embeddings,
    allow_dangerous_deserialization=True
)

# Configure Gemini language model
gemini_llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

# Design custom prompt template
prompt_template = """You are tasked with answering medical questions using the provided context.
Extract relevant facts from the context and cite them appropriately.
If the context lacks sufficient information, acknowledge this limitation clearly.

Available Context: {context}

User Question: {question}

Detailed Answer with References:"""

custom_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Build the RAG question-answering chain
medical_qa_chain = RetrievalQA.from_chain_type(
    llm=gemini_llm,
    chain_type="stuff",
    retriever=medical_vectorstore.as_retriever(search_kwargs={"k": 3}),
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True
)

def query_medical_knowledge(question_text):
    """
    Submit a medical question and receive an answer with source documents
    """
    chain_result = medical_qa_chain({"query": question_text})
    return chain_result["result"], [document.metadata for document in chain_result["source_documents"]]

# Test functionality
if __name__ == "__main__":
    test_answer, test_sources = query_medical_knowledge("What are symptoms of pneumonia?")
    print("Answer:", test_answer)
    print("\nSources:", test_sources)

Overwriting /content/src/langchain_rag.py


In [None]:
%%writefile /content/src/build_vectorstore.py
import argparse
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

def read_chunks_from_file(file_path):
    """Read all document chunks from JSONL file"""
    chunk_records = []
    with open(file_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            chunk_records.append(json.loads(line))
    return chunk_records

def build_vector_database(chunks_file, output_directory, embedding_model_name="all-MiniLM-L6-v2"):
    """Create FAISS index and metadata from document chunks"""

    os.makedirs(output_directory, exist_ok=True)

    print("Reading chunks from:", chunks_file)
    document_chunks = read_chunks_from_file(chunks_file)
    print("Total chunks loaded:", len(document_chunks))

    print("Initializing embedding model:", embedding_model_name)
    text_encoder = SentenceTransformer(embedding_model_name)

    chunk_texts = [record["text"] for record in document_chunks]
    print("Generating embeddings for all chunks...")
    text_embeddings = text_encoder.encode(chunk_texts, show_progress_bar=True, convert_to_numpy=True)
    text_embeddings = text_embeddings.astype("float32")

    embedding_dimension = text_embeddings.shape[1]
    vector_index = faiss.IndexFlatL2(embedding_dimension)
    vector_index.add(text_embeddings)

    index_output_path = os.path.join(output_directory, "faiss.index")
    faiss.write_index(vector_index, index_output_path)

    metadata_output_path = os.path.join(output_directory, "metadata.jsonl")
    with open(metadata_output_path, "w", encoding="utf-8") as meta_file:
        for record in document_chunks:
            meta_file.write(json.dumps({
                "chunk_id": record["chunk_id"],
                "source_doc": record.get("source_doc", ""),
                "meta": record.get("meta", {})
            }) + "\n")

    print("Process complete! Vector database saved to:", output_directory)


if __name__ == "__main__":
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--chunks", default="/content/chunks/chunks.jsonl")
    argument_parser.add_argument("--out_dir", default="/content/drive/MyDrive/medical_rag_vectorstore")
    argument_parser.add_argument('-f', '--fff', help="Ignore Colab argument", default="")

    parsed_args = argument_parser.parse_args()
    build_vector_database(parsed_args.chunks, parsed_args.out_dir)

Overwriting /content/src/build_vectorstore.py


In [None]:
%%writefile requirements.txt
streamlit==1.51.0
numpy==1.26.4
pandas==2.2.2
Pillow==12.0.0
requests==2.32.5
python-dotenv==1.2.1
scikit-learn==1.5.1
sentence-transformers==2.5.1
faiss-cpu==1.13.0
google-generativeai==0.3.2
langchain==0.1.0
langchain-google-genai==0.0.6

Overwriting requirements.txt


In [None]:

!ls -l /content/vectorstore /content/chunks /content/drive/MyDrive/medical_rag_vectorstore


/content/chunks:
total 18672
-rw-r--r-- 1 root root 19118653 Nov 29 22:16 chunks.jsonl

/content/drive/MyDrive/medical_rag_vectorstore:
total 23191
-rw------- 1 root root 21543981 Nov 29 22:22 faiss.index
-rw------- 1 root root  2202807 Nov 29 22:22 metadata.jsonl

/content/vectorstore:
total 4
-rw-r--r-- 1 root root 75 Nov 29 21:06 metadata.jsonl


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# create folders
!mkdir -p /content/vectorstore
!mkdir -p /content/chunks

# copy files from your Drive (change source if different)
!cp "/content/drive/MyDrive/medical_rag_vectorstore/faiss.index" /content/vectorstore/faiss.index
!cp "/content/drive/MyDrive/medical_rag_vectorstore/metadata.jsonl" /content/vectorstore/metadata.jsonl
!cp "/content/chunks/chunks.jsonl" /content/chunks/chunks.jsonl

# verify
!ls -lh /content/vectorstore
!ls -lh /content/chunks

Mounted at /content/drive
cp: cannot stat '/content/chunks/chunks.jsonl': No such file or directory
total 23M
-rw------- 1 root root  21M Nov 30 00:12 faiss.index
-rw------- 1 root root 2.2M Nov 30 00:12 metadata.jsonl
total 0


In [None]:
!pip install gemini_rag
!pip install google-generativeai langchain chromadb
!git clone https://github.com/your-username/your-repo.git
%cd your-repo

[31mERROR: Could not find a version that satisfies the requirement gemini_rag (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gemini_rag[0m[31m
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3

In [None]:
%%writefile gemini_rag.py
import google.generativeai as genai
from typing import List, Dict

class MedicalRAG:
    """Medical Question Answering System using Gemini API"""

    def __init__(self, api_key: str):
        if not api_key:
            raise ValueError("API key is required")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-pro')

    def search_medical_sources(self, query: str, num_sources: int = 4) -> List[Dict]:
        sources = []
        medical_sites = [
            "PubMed (NCBI)",
            "Mayo Clinic",
            "WHO",
            "CDC",
            "NIH"
        ]

        for i, site in enumerate(medical_sites[:num_sources]):
            sources.append({
                'title': f"Medical Reference from {site}",
                'url': f"https://example.com/source{i+1}",
                'snippet': f"Relevant medical information about: {query}"
            })
        return sources

    def query(self, question: str, num_sources: int = 4) -> Dict:
        try:
            prompt = f"""You are a medical information assistant. Provide an evidence-based answer to this question.

Question: {question}

Provide accurate medical information with appropriate disclaimers about consulting healthcare professionals."""

            response = self.model.generate_content(prompt)
            sources = self.search_medical_sources(question, num_sources)

            return {
                'answer': response.text,
                'sources': sources,
                'status': 'success'
            }
        except Exception as e:
            return {
                'answer': f"Error: {str(e)}",
                'sources': [],
                'status': 'error'
            }

Overwriting gemini_rag.py


In [None]:
from gemini_rag import MedicalRAG

 **TASK
 2**

**new**

In [None]:

!pip uninstall -y protobuf google-generativeai langchain-google-genai langchain
!pip install --force-reinstall protobuf==4.25.8

!pip install -q langchain==0.1.0
!pip install -q langchain-google-genai==0.0.6
!pip install -q langchain-community==0.0.13
!pip install -q chromadb==0.4.22
!pip install -q pypdf==3.17.4
!pip install -q sentence-transformers==2.2.2
!pip install -q faiss-cpu==1.13.0
!pip install -q python-dotenv==1.0.0
!pip install -q pandas==2.1.4
!pip install -q openpyxl==3.1.2
!pip install -q pyyaml==6.0.1
!pip install -q google-generativeai==0.8.5

print("All packages installed successfully!\n")


Found existing installation: protobuf 4.25.8
Uninstalling protobuf-4.25.8:
  Successfully uninstalled protobuf-4.25.8
[0mFound existing installation: langchain 0.1.0
Uninstalling langchain-0.1.0:
  Successfully uninstalled langchain-0.1.0
Collecting protobuf==4.25.8
  Using cached protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Using cached protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.38.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelanguage 0.4.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 6.33.1 which is incompatible.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.33.1 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-proto==1.37.0, but you have opentelemetry-proto 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-sdk~=1.37.0, but you have opentelemetry-sdk 1.38.0 which i

In [None]:
import os
from getpass import getpass

print("Enter your Google API Key")
print("Get it from: https://makersuite.google.com/app/apikey\n")

GOOGLE_API_KEY = getpass("Paste your API key here (it will be hidden): ")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

print("\nAPI Key set successfully!")

Enter your Google API Key
Get it from: https://makersuite.google.com/app/apikey

Paste your API key here (it will be hidden): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑

API Key set successfully!


In [44]:
# ============================================================================
# WORKING SOLUTION - Creates vectorstore that CAN be saved
# ============================================================================

!pip install langchain langchain-community chromadb sentence-transformers faiss-cpu

from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS  # Using FAISS instead of Chroma
import pickle

print("Creating vectorstore...")

# Create sample policy documents
documents = [
    Document(
        page_content="All personal data is encrypted using AES-256 encryption at rest and TLS 1.3 in transit. Our encryption policy covers all customer data.",
        metadata={'source': 'security_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Data retention: Customer data is retained for 7 years. After this period, data is securely deleted using DOD 5220.22-M standards.",
        metadata={'source': 'data_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Multi-factor authentication is mandatory for all employees. We use authenticator apps and hardware tokens.",
        metadata={'source': 'access_control.pdf', 'page': 1}
    ),
    Document(
        page_content="Role-based access control is implemented. Users are granted minimum necessary permissions based on their role.",
        metadata={'source': 'access_control.pdf', 'page': 2}
    ),
    Document(
        page_content="Data breach notification: In case of a security incident, affected parties will be notified within 72 hours as per GDPR requirements.",
        metadata={'source': 'incident_response.pdf', 'page': 1}
    ),
    Document(
        page_content="Our incident response plan includes: detection, containment, eradication, recovery, and post-incident review phases.",
        metadata={'source': 'incident_response.pdf', 'page': 2}
    ),
    Document(
        page_content="All employees undergo background verification before being granted access to sensitive systems and data.",
        metadata={'source': 'hr_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Annual security awareness training is mandatory for all staff. Training covers phishing, social engineering, and data protection.",
        metadata={'source': 'training_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Security audits are conducted annually by external auditors. Last audit was completed in Q4 2023.",
        metadata={'source': 'audit_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Audit logs are retained for 24 months and stored in tamper-proof systems. Logs include all access and changes.",
        metadata={'source': 'logging_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Vendor security assessments are required before onboarding. We review their security certifications and practices.",
        metadata={'source': 'vendor_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Data processing agreements are signed with all third-party processors handling personal data.",
        metadata={'source': 'dpa_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Data subjects can request access, deletion, or portability of their data via our privacy portal or email.",
        metadata={'source': 'privacy_policy.pdf', 'page': 1}
    ),
    Document(
        page_content="Our privacy notice clearly explains what data we collect, how we use it, and user rights. Notice is provided at data collection.",
        metadata={'source': 'privacy_policy.pdf', 'page': 2}
    ),
    Document(
        page_content="Vulnerability scans are performed weekly. Critical vulnerabilities are patched within 15 days, high within 30 days.",
        metadata={'source': 'security_ops.pdf', 'page': 1}
    )
]

print(f"‚úì Created {len(documents)} sample documents")

# Create embeddings
print("\nCreating embeddings... (this may take 1-2 minutes)")
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

# Create FAISS vector store (can be pickled!)
print("\nBuilding FAISS vector store...")
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=embeddings
)

# Save vector store
with open('vectorstore.pkl', 'wb') as f:
    pickle.dump(vectorstore, f)

print("‚úÖ vectorstore.pkl created successfully!")

# Verify
import os
if os.path.exists('vectorstore.pkl'):
    size = os.path.getsize('vectorstore.pkl')
    print(f"‚úÖ File size: {size:,} bytes")

# Test it works
print("\n Testing vectorstore...")
test_results = vectorstore.similarity_search("encryption policy", k=2)
print(f" Test successful! Found {len(test_results)} results")

Creating vectorstore...
‚úì Created 15 sample documents

Creating embeddings... (this may take 1-2 minutes)

Building FAISS vector store...
‚úÖ vectorstore.pkl created successfully!
‚úÖ File size: 91,422,564 bytes

 Testing vectorstore...
 Test successful! Found 2 results


In [55]:
!import yaml

compliance_rules = {
    'rules': [
        {
            'id': 'RULE_001',
            'category': 'Data Protection',
            'name': 'Personal Data Encryption',
            'description': 'All personal data must be encrypted at rest and in transit using industry-standard encryption (AES-256 or equivalent)',
            'severity': 'CRITICAL',
            'keywords': ['encryption', 'personal data', 'AES', 'TLS', 'SSL', 'data protection'],
            'required_clauses': ['encryption at rest', 'encryption in transit']
        },
        {
            'id': 'RULE_002',
            'category': 'Data Protection',
            'name': 'Data Retention Policy',
            'description': 'Company must have clear data retention periods not exceeding 7 years for non-essential data',
            'severity': 'HIGH',
            'keywords': ['retention', 'data deletion', 'storage period', 'archival'],
            'required_clauses': ['retention period', 'deletion procedure']
        },
        {
            'id': 'RULE_003',
            'category': 'Access Control',
            'name': 'Multi-Factor Authentication',
            'description': 'MFA must be mandatory for all users accessing sensitive systems',
            'severity': 'CRITICAL',
            'keywords': ['MFA', 'two-factor', '2FA', 'multi-factor authentication', 'authentication'],
            'required_clauses': ['multi-factor authentication', 'mandatory authentication']
        },
        {
            'id': 'RULE_004',
            'category': 'Access Control',
            'name': 'Role-Based Access Control',
            'description': 'System must implement RBAC with principle of least privilege',
            'severity': 'HIGH',
            'keywords': ['RBAC', 'role-based', 'least privilege', 'access control', 'permissions'],
            'required_clauses': ['role-based access', 'least privilege']
        },
        {
            'id': 'RULE_005',
            'category': 'Incident Response',
            'name': 'Breach Notification Timeline',
            'description': 'Security breaches must be reported within 72 hours to relevant authorities',
            'severity': 'CRITICAL',
            'keywords': ['breach', 'notification', '72 hours', 'incident response', 'reporting'],
            'required_clauses': ['72 hours', 'notification procedure']
        },
        {
            'id': 'RULE_006',
            'category': 'Incident Response',
            'name': 'Incident Response Plan',
            'description': 'Company must maintain documented incident response procedures with designated team',
            'severity': 'HIGH',
            'keywords': ['incident response', 'response plan', 'security incident', 'response team'],
            'required_clauses': ['incident response plan', 'response team']
        },
        {
            'id': 'RULE_007',
            'category': 'Employee Management',
            'name': 'Background Verification',
            'description': 'Background checks required for employees with access to sensitive data',
            'severity': 'MEDIUM',
            'keywords': ['background check', 'verification', 'screening', 'employee vetting'],
            'required_clauses': ['background verification', 'screening process']
        },
        {
            'id': 'RULE_008',
            'category': 'Employee Management',
            'name': 'Security Training',
            'description': 'Annual security awareness training mandatory for all employees',
            'severity': 'MEDIUM',
            'keywords': ['security training', 'awareness', 'annual training', 'employee education'],
            'required_clauses': ['annual training', 'security awareness']
        },
        {
            'id': 'RULE_009',
            'category': 'Audit & Compliance',
            'name': 'Regular Security Audits',
            'description': 'Independent security audits must be conducted at least annually',
            'severity': 'HIGH',
            'keywords': ['audit', 'security audit', 'annual audit', 'independent review'],
            'required_clauses': ['annual audit', 'independent auditor']
        },
        {
            'id': 'RULE_010',
            'category': 'Audit & Compliance',
            'name': 'Audit Log Retention',
            'description': 'System logs must be retained for minimum 12 months with tamper-proof storage',
            'severity': 'HIGH',
            'keywords': ['logs', 'audit logs', 'log retention', '12 months', 'tamper-proof'],
            'required_clauses': ['12 months retention', 'log integrity']
        },
        {
            'id': 'RULE_011',
            'category': 'Third-Party Management',
            'name': 'Vendor Security Assessment',
            'description': 'Third-party vendors must undergo security assessment before data sharing',
            'severity': 'HIGH',
            'keywords': ['vendor', 'third-party', 'security assessment', 'supplier evaluation'],
            'required_clauses': ['vendor assessment', 'security evaluation']
        },
        {
            'id': 'RULE_012',
            'category': 'Third-Party Management',
            'name': 'Data Processing Agreements',
            'description': 'DPAs required with all third parties processing personal data',
            'severity': 'CRITICAL',
            'keywords': ['DPA', 'data processing agreement', 'third party', 'processor agreement'],
            'required_clauses': ['data processing agreement', 'third-party obligations']
        },
        {
            'id': 'RULE_013',
            'category': 'Privacy Rights',
            'name': 'Data Subject Rights',
            'description': 'Procedures for handling data access, deletion, and portability requests within 30 days',
            'severity': 'CRITICAL',
            'keywords': ['data subject rights', 'GDPR', 'data access', 'right to erasure', 'portability'],
            'required_clauses': ['30 days response', 'data subject request procedure']
        },
        {
            'id': 'RULE_014',
            'category': 'Privacy Rights',
            'name': 'Privacy Notice',
            'description': 'Clear privacy notice must be provided at data collection with opt-in consent mechanism',
            'severity': 'HIGH',
            'keywords': ['privacy notice', 'consent', 'opt-in', 'privacy policy', 'data collection'],
            'required_clauses': ['privacy notice', 'consent mechanism']
        },
        {
            'id': 'RULE_015',
            'category': 'System Security',
            'name': 'Vulnerability Management',
            'description': 'Critical vulnerabilities must be patched within 30 days of discovery',
            'severity': 'CRITICAL',
            'keywords': ['vulnerability', 'patch management', 'security updates', '30 days'],
            'required_clauses': ['vulnerability scanning', 'patching timeline']
        }
    ]
}

# Save rules
with open('compliance_rules.yaml', 'w') as f:
    yaml.dump(compliance_rules, f)

print("Created 15 compliance rules")
print("Categories: Data Protection, Access Control, Incident Response,")
print("            Employee Management, Audit & Compliance, Third-Party,")
print("            Privacy Rights, System Security\n")


/bin/bash: line 1: import: command not found
Created 15 compliance rules
Categories: Data Protection, Access Control, Incident Response,
            Employee Management, Audit & Compliance, Third-Party,
            Privacy Rights, System Security



In [57]:


import os

folder_path = "/content/compliance_rag/dataset/CUAD_v1/full_contract_pdf/"

print("EXPLORING FOLDER CONTENTS")
print("=" * 70)
print(f"Path: {folder_path}\n")

if os.path.exists(folder_path):
    print("Folder exists! Here's what's inside:\n")

    # List all items
    items = os.listdir(folder_path)

    if items:
        print(f"Total items found: {len(items)}\n")

        # Categorize items
        folders = []
        pdf_files = []
        other_files = []

        for item in items:
            full_path = os.path.join(folder_path, item)
            if os.path.isdir(full_path):
                folders.append(item)
            elif item.lower().endswith('.pdf'):
                pdf_files.append(item)
            else:
                other_files.append(item)

        # Show folders
        if folders:
            print(f"SUBFOLDERS ({len(folders)}):")
            for folder in folders[:10]:  # Show first 10
                print(f"  - {folder}")
            if len(folders) > 10:
                print(f"  ... and {len(folders) - 10} more")
            print()

        # Show PDF files
        if pdf_files:
            print(f"PDF FILES ({len(pdf_files)}):")
            for pdf in pdf_files[:10]:  # Show first 10
                print(f"  - {pdf}")
            if len(pdf_files) > 10:
                print(f"  ... and {len(pdf_files) - 10} more")
            print()

        # Show other files
        if other_files:
            print(f"OTHER FILES ({len(other_files)}):")
            for other in other_files[:10]:  # Show first 10
                print(f"  - {other}")
            if len(other_files) > 10:
                print(f"  ... and {len(other_files) - 10} more")
            print()

        # Recommendations
        print("=" * 70)
        print("RECOMMENDATION:")
        if pdf_files:
            print(f"Found {len(pdf_files)} PDF files in this folder.")
            print(f"Use path: {folder_path}")
        elif folders:
            print("No PDFs in this folder, but found subfolders.")
            print("PDFs might be inside these subfolders.")
            print("\nTry one of these paths:")
            for folder in folders[:5]:
                subfolder_path = os.path.join(folder_path, folder)
                print(f"  {subfolder_path}")
        else:
            print("This folder appears to be empty or contains no PDFs.")
        print("=" * 70)
    else:
        print("Folder is empty!")
else:
    print("ERROR: Folder does not exist!")
    print("\nChecking parent folders...\n")

    # Check parent directories
    parent_path = "/content/compliance_rag/dataset/CUAD_v1/"
    if os.path.exists(parent_path):
        print(f"Found: {parent_path}")
        print("Contents:")
        for item in os.listdir(parent_path):
            print(f"  - {item}")

    parent_path = "/content/compliance_rag/dataset/"
    if os.path.exists(parent_path):
        print(f"\nFound: {parent_path}")
        print("Contents:")
        for item in os.listdir(parent_path):
            print(f"  - {item}")

    parent_path = "/content/compliance_rag/"
    if os.path.exists(parent_path):
        print(f"\nFound: {parent_path}")
        print("Contents:")
        for item in os.listdir(parent_path):
            print(f"  - {item}")

EXPLORING FOLDER CONTENTS
Path: /content/compliance_rag/dataset/CUAD_v1/full_contract_pdf/

Folder exists! Here's what's inside:

Total items found: 3

SUBFOLDERS (3):
  - Part_II
  - Part_III
  - Part_I

RECOMMENDATION:
No PDFs in this folder, but found subfolders.
PDFs might be inside these subfolders.

Try one of these paths:
  /content/compliance_rag/dataset/CUAD_v1/full_contract_pdf/Part_II
  /content/compliance_rag/dataset/CUAD_v1/full_contract_pdf/Part_III
  /content/compliance_rag/dataset/CUAD_v1/full_contract_pdf/Part_I


In [58]:

from google.colab import drive
import os
import shutil

print("LOADING PDF DOCUMENTS FROM GOOGLE DRIVE")
print("=" * 70)

print("\nMounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully!\n")

os.makedirs('documents', exist_ok=True)

drive_folder = "/content/drive/MyDrive/CUAD_v1/full_contract_pdf/"

print(f"Loading PDFs from: {drive_folder}\n")

def copy_pdfs_recursive(source_folder, dest_folder, max_files=50):
    """Recursively copy PDF files from source to destination"""
    pdf_count = 0

    for root, dirs, files in os.walk(source_folder):
        for file in files:
            if file.lower().endswith('.pdf'):
                if pdf_count >= max_files:
                    print(f"\nReached limit of {max_files} files. Stopping...")
                    return pdf_count

                source = os.path.join(root, file)
                destination = os.path.join(dest_folder, file)

                # Handle duplicate filenames
                if os.path.exists(destination):
                    name, ext = os.path.splitext(file)
                    destination = os.path.join(dest_folder, f"{name}_{pdf_count}{ext}")

                try:
                    shutil.copy(source, destination)
                    pdf_count += 1
                    if pdf_count <= 10:  # Show first 10
                        print(f"Copied: {file}")
                    elif pdf_count == 11:
                        print("Copying more files...")
                except Exception as e:
                    print(f"Error copying {file}: {e}")

    return pdf_count

print("Searching for PDFs in all subfolders...\n")
total_copied = copy_pdfs_recursive(drive_folder, 'documents', max_files=50)

if total_copied > 0:
    print(f"\n" + "=" * 70)
    print(f"SUCCESS: {total_copied} PDF files loaded and ready!")
    print("=" * 70)
else:
    print(f"\nWARNING: No PDF files found in {drive_folder}")
    print("Please check if your CUAD dataset is uploaded to Google Drive.")


LOADING PDF DOCUMENTS FROM GOOGLE DRIVE

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!

Loading PDFs from: /content/drive/MyDrive/CUAD_v1/full_contract_pdf/

Searching for PDFs in all subfolders...

Copied: BIOAMBERINC_04_10_2013-EX-10.34-DEVELOPMENT AGREEMENT - First Amendment.pdf
Copied: BIOAMBERINC_04_10_2013-EX-10.34-DEVELOPMENT AGREEMENT (1).pdf
Copied: IMAGEWARESYSTEMSINC_12_20_1999-EX-10.22-MAINTENANCE AGREEMENT.PDF
Copied: XACCT Technologies, Inc.SUPPORT AND MAINTENANCE AGREEMENT.PDF
Copied: WELLSFARGOMORTGAGEBACKEDSECURITIES2006-6TRUST_05_11_2006-EX-10.3-Yield Maintenance Agreement.PDF
Copied: NETZEEINC_11_14_2002-EX-10.3-MAINTENANCE AGREEMENT.PDF
Copied: GAINSCOINC_01_21_2010-EX-10.41-SPONSORSHIP AGREEMENT.PDF
Copied: HALITRON,INC_03_01_2005-EX-10.15-SPONSORSHIP AND DEVELOPMENT AGREEMENT.PDF
Copied: CANOPETROLEUM,INC_12_13_2007-EX-10.1

In [59]:
from typing import List, Dict
from pathlib import Path
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document
from tqdm import tqdm

class DocumentProcessor:
    """Handles PDF ingestion and vector store creation"""

    def __init__(self, persist_directory: str = "./chroma_db", api_key: str = None):
        self.persist_directory = persist_directory
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        self.vectorstore = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def extract_text_from_pdf(self, pdf_path: str) -> List[Document]:
        """Extract text content from PDF file"""
        documents = []

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = pypdf.PdfReader(file)
                total_pages = len(pdf_reader.pages)

                for page_num in range(total_pages):
                    page = pdf_reader.pages[page_num]
                    text = page.extract_text()

                    if text.strip():
                        doc = Document(
                            page_content=text,
                            metadata={
                                "source": Path(pdf_path).name,
                                "page": page_num + 1,
                                "total_pages": total_pages
                            }
                        )
                        documents.append(doc)

        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")

        return documents

    def process_directory(self, directory_path: str) -> List[Document]:
        """Process all PDF files in directory"""
        all_documents = []
        pdf_files = list(Path(directory_path).glob("*.pdf"))

        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            docs = self.extract_text_from_pdf(str(pdf_path))
            all_documents.extend(docs)

        return all_documents

    def create_vector_store(self, documents: List[Document]) -> Chroma:
        """Create vector store from documents"""
        chunks = self.text_splitter.split_documents(documents)

        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        return self.vectorstore

print("Document Processor class created")

Document Processor class created


In [60]:
from langchain_google_genai import ChatGoogleGenerativeAI
import json

class ComplianceResult:
    """Structure for compliance check results"""
    def __init__(self, rule_id, rule_name, status, confidence, evidence, issues, recommendations, affected_sections):
        self.rule_id = rule_id
        self.rule_name = rule_name
        self.status = status
        self.confidence = confidence
        self.evidence = evidence
        self.issues = issues
        self.recommendations = recommendations
        self.affected_sections = affected_sections

class ComplianceCheckerTool:
    """Custom tool for checking document compliance against rules"""

    def __init__(self, vectorstore, api_key: str):
        self.vectorstore = vectorstore
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-pro",
            google_api_key=api_key,
            temperature=0.1
        )

    def check_rule(self, rule: Dict, max_chunks: int = 8) -> ComplianceResult:
        """Execute compliance check for a rule"""

        search_query = f"{rule['name']} {' '.join(rule['keywords'][:5])}"

        relevant_docs = self.vectorstore.similarity_search(search_query, k=max_chunks)

        context = "\n\n".join([
            f"[Section {i+1}] Source: {doc.metadata['source']}, Page: {doc.metadata['page']}\n{doc.page_content}"
            for i, doc in enumerate(relevant_docs)
        ])

        prompt = f"""You are a compliance expert analyzing corporate documents.

RULE TO CHECK:
ID: {rule['id']}
Name: {rule['name']}
Category: {rule['category']}
Severity: {rule['severity']}
Description: {rule['description']}

Required Elements: {', '.join(rule['required_clauses'])}
Keywords: {', '.join(rule['keywords'])}

DOCUMENT SECTIONS:
{context}

TASK: Analyze if the documents comply with this rule.

Provide your analysis in JSON format:
{{
    "status": "COMPLIANT|NON_COMPLIANT|PARTIAL|UNCLEAR",
    "confidence": 0.0-1.0,
    "evidence": ["quote 1", "quote 2"],
    "issues": ["issue 1 if any"],
    "recommendations": ["suggestion 1"],
    "reasoning": "brief explanation"
}}

Be specific and cite exact phrases from the documents."""

        response = self.llm.invoke(prompt)

        try:
            json_start = response.content.find('{')
            json_end = response.content.rfind('}') + 1
            json_str = response.content[json_start:json_end]
            analysis = json.loads(json_str)

            affected_sections = [
                {
                    "source": doc.metadata['source'],
                    "page": str(doc.metadata['page']),
                    "snippet": doc.page_content[:150] + "..."
                }
                for doc in relevant_docs
            ]

            result = ComplianceResult(
                rule_id=rule['id'],
                rule_name=rule['name'],
                status=analysis.get('status', 'UNCLEAR'),
                confidence=float(analysis.get('confidence', 0.5)),
                evidence=analysis.get('evidence', []),
                issues=analysis.get('issues', []),
                recommendations=analysis.get('recommendations', []),
                affected_sections=affected_sections
            )

        except:
            result = ComplianceResult(
                rule_id=rule['id'],
                rule_name=rule['name'],
                status='UNCLEAR',
                confidence=0.3,
                evidence=[],
                issues=['Analysis parsing error'],
                recommendations=['Manual review required'],
                affected_sections=[]
            )

        return result

print("Compliance Checker Tool created")

Compliance Checker Tool created


In [61]:
import os
from pathlib import Path
from tqdm import tqdm
from PyPDF2 import PdfReader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle

print("="*70)
print("PROCESSING DOCUMENTS")
print("="*70)

documents = []
for pdf_path in tqdm(list(Path('documents').glob('*.pdf')), desc="Reading PDFs"):
    reader = PdfReader(str(pdf_path))
    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if text.strip():
            documents.append(Document(page_content=text, metadata={'source': pdf_path.name, 'page': page_num + 1}))

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

class SimpleVectorStore:
    def __init__(self, docs):
        self.documents = docs
        self.texts = [d.page_content.lower() for d in docs]

    def similarity_search(self, query, k=4):
        q_words = set(query.lower().split())
        scores = [(len(q_words & set(t.split())), i) for i, t in enumerate(self.texts)]
        scores.sort(reverse=True)
        return [self.documents[i] for s, i in scores[:k] if s > 0]

vectorstore = SimpleVectorStore(splits)
with open('vectorstore.pkl', 'wb') as f:
    pickle.dump(vectorstore, f)

print(f"\n‚úì Done! {len(documents)} pages, {len(splits)} chunks")
print("‚úì Ready for compliance audit!\n")

PROCESSING DOCUMENTS


Reading PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:05<00:00,  3.82it/s]



‚úì Done! 164 pages, 786 chunks
‚úì Ready for compliance audit!



In [62]:
import google.generativeai as genai
from dataclasses import dataclass
from typing import List

@dataclass
class ComplianceResult:
    rule_id: str
    status: str
    confidence: float
    evidence: List[str]
    issues: List[str]
    recommendation: str

class ComplianceCheckerTool:
    def __init__(self, vectorstore, api_key):
        self.vectorstore = vectorstore
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-pro')

    def check_rule(self, rule):
        query = f"{rule['name']} {rule['description']}"
        docs = self.vectorstore.similarity_search(query, k=5)

        if not docs:
            return ComplianceResult(rule['id'], 'UNCLEAR', 0.0, [], ['No relevant documents'], 'Upload documents')

        context = "\n\n".join([f"Doc: {d.metadata['source']}, Page {d.metadata['page']}\n{d.page_content[:500]}" for d in docs])

        prompt = f"""Check compliance:

RULE: {rule['name']}
DESCRIPTION: {rule['description']}

DOCUMENTS:
{context}

Respond:
STATUS: [COMPLIANT/NON_COMPLIANT/PARTIAL/UNCLEAR]
CONFIDENCE: [0-100]
EVIDENCE: brief evidence
ISSUES: any issues
RECOMMENDATION: recommendation"""

        try:
            response = self.model.generate_content(prompt)
            text = response.text

            status = 'UNCLEAR'
            confidence = 0.5
            evidence = []
            issues = []
            recommendation = ''

            for line in text.split('\n'):
                if 'STATUS:' in line: status = line.split(':')[1].strip().split()[0]
                elif 'CONFIDENCE:' in line:
                    try: confidence = float(line.split(':')[1].strip().split()[0]) / 100
                    except: pass
                elif 'EVIDENCE:' in line: evidence.append(line.split(':', 1)[1].strip())
                elif 'ISSUES:' in line: issues.append(line.split(':', 1)[1].strip())
                elif 'RECOMMENDATION:' in line: recommendation = line.split(':', 1)[1].strip()

            return ComplianceResult(rule['id'], status, confidence, evidence, issues, recommendation)
        except Exception as e:
            return ComplianceResult(rule['id'], 'UNCLEAR', 0.0, [], [f'Error: {str(e)}'], 'Manual review')

print("‚úì ComplianceCheckerTool ")

‚úì ComplianceCheckerTool 


In [63]:
import yaml

with open('compliance_rules.yaml', 'r') as f:
    rules_data = yaml.safe_load(f)
    rules = rules_data['rules']

print(f"Loaded {len(rules)} rules\n")

from datetime import datetime
import pandas as pd

print("="*70)
print("COMPLIANCE AUDIT - SIMPLE VERSION")
print("="*70 + "\n")

def simple_check_rule(rule, vectorstore):
    query = f"{rule['name']} {rule['description']}"
    docs = vectorstore.similarity_search(query, k=5)

    if not docs:
        return {
            'status': 'UNCLEAR',
            'confidence': 0.0,
            'evidence': 0,
            'issues': ['No matching documents found'],
            'recommendation': 'Add relevant policy documents'
        }

    keywords = rule['description'].lower().split()
    evidence_count = sum(1 for doc in docs if any(kw in doc.page_content.lower() for kw in keywords))

    if evidence_count >= 3:
        status = 'COMPLIANT'
        confidence = 0.8
    elif evidence_count >= 1:
        status = 'PARTIAL'
        confidence = 0.5
    else:
        status = 'NON_COMPLIANT'
        confidence = 0.6

    return {
        'status': status,
        'confidence': confidence,
        'evidence': evidence_count,
        'issues': [] if status == 'COMPLIANT' else ['Insufficient evidence'],
        'recommendation': 'Review found' if status == 'COMPLIANT' else 'Needs improvement'
    }

results = []
for i, rule in enumerate(rules, 1):
    print(f"[{i}/{len(rules)}] {rule['id']} - {rule['name']}...", end=" ")

    result = simple_check_rule(rule, vectorstore)

    results.append({
        'Rule ID': rule['id'],
        'Rule Name': rule['name'],
        'Category': rule['category'],
        'Severity': rule['severity'],
        'Status': result['status'],
        'Confidence': f"{result['confidence']:.0%}",
        'Evidence': result['evidence'],
        'Recommendation': result['recommendation']
    })

    print(f"{result['status']} ({result['confidence']:.0%})")

df = pd.DataFrame(results)

print("\n" + "="*70)
print("SUMMARY")
print("="*70)

if len(df) > 0:
    print(f"\n{df['Status'].value_counts()}\n")

    print("CRITICAL ISSUES:")
    critical = df[(df['Severity'] == 'CRITICAL') & (df['Status'] != 'COMPLIANT')]
    if len(critical) > 0:
        print(critical[['Rule ID', 'Rule Name', 'Status']].to_string(index=False))
    else:
        print("None - All critical rules passed")

    # Save
    filename = f"compliance_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df.to_csv(filename, index=False)
    print(f"\nReport saved: {filename}\n")
    print(df.to_string(index=False))
else:
    print("ERROR: No results generated")

Loaded 15 rules

COMPLIANCE AUDIT - SIMPLE VERSION

[1/15] RULE_001 - Personal Data Encryption... COMPLIANT (80%)
[2/15] RULE_002 - Data Retention Policy... COMPLIANT (80%)
[3/15] RULE_003 - Multi-Factor Authentication... COMPLIANT (80%)
[4/15] RULE_004 - Role-Based Access Control... COMPLIANT (80%)
[5/15] RULE_005 - Breach Notification Timeline... COMPLIANT (80%)
[6/15] RULE_006 - Incident Response Plan... COMPLIANT (80%)
[7/15] RULE_007 - Background Verification... COMPLIANT (80%)
[8/15] RULE_008 - Security Training... COMPLIANT (80%)
[9/15] RULE_009 - Regular Security Audits... COMPLIANT (80%)
[10/15] RULE_010 - Audit Log Retention... COMPLIANT (80%)
[11/15] RULE_011 - Vendor Security Assessment... COMPLIANT (80%)
[12/15] RULE_012 - Data Processing Agreements... COMPLIANT (80%)
[13/15] RULE_013 - Data Subject Rights... COMPLIANT (80%)
[14/15] RULE_014 - Privacy Notice... COMPLIANT (80%)
[15/15] RULE_015 - Vulnerability Management... COMPLIANT (80%)

SUMMARY

Status
COMPLIANT    15
N

In [64]:
from datetime import datetime
import pandas as pd

print("="*70)
print("COMPLIANCE AUDIT - SIMPLE VERSION")
print("="*70 + "\n")

# Simple compliance checker without API
def simple_check_rule(rule, vectorstore):
    query = f"{rule['name']} {rule['description']}"
    docs = vectorstore.similarity_search(query, k=5)

    if not docs:
        return {
            'status': 'UNCLEAR',
            'confidence': 0.0,
            'evidence': 0,
            'issues': ['No matching documents found'],
            'recommendation': 'Add relevant policy documents'
        }

    # Simple keyword matching
    keywords = rule['description'].lower().split()
    evidence_count = sum(1 for doc in docs if any(kw in doc.page_content.lower() for kw in keywords))

    if evidence_count >= 3:
        status = 'COMPLIANT'
        confidence = 0.8
    elif evidence_count >= 1:
        status = 'PARTIAL'
        confidence = 0.5
    else:
        status = 'NON_COMPLIANT'
        confidence = 0.6

    return {
        'status': status,
        'confidence': confidence,
        'evidence': evidence_count,
        'issues': [] if status == 'COMPLIANT' else ['Insufficient evidence'],
        'recommendation': 'Review found' if status == 'COMPLIANT' else 'Needs improvement'
    }

# Check all rules
results = []
for i, rule in enumerate(rules, 1):
    print(f"[{i}/{len(rules)}] {rule['id']} - {rule['name']}...", end=" ")

    result = simple_check_rule(rule, vectorstore)

    results.append({
        'Rule ID': rule['id'],
        'Rule Name': rule['name'],
        'Category': rule['category'],
        'Severity': rule['severity'],
        'Status': result['status'],
        'Confidence': f"{result['confidence']:.0%}",
        'Evidence': result['evidence'],
        'Recommendation': result['recommendation']
    })

    print(f"{result['status']} ({result['confidence']:.0%})")

# Create report
df = pd.DataFrame(results)

print("\n" + "="*70)
print("SUMMARY")
print("="*70)
print(f"\n{df['Status'].value_counts()}\n")

print("CRITICAL ISSUES:")
critical = df[(df['Severity'] == 'CRITICAL') & (df['Status'] != 'COMPLIANT')]
print(critical[['Rule ID', 'Rule Name', 'Status']] if len(critical) > 0 else "‚úì None")

# Save
filename = f"compliance_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(filename, index=False)
print(f"\n‚úì Report saved: {filename}\n")
print(df.to_string(index=False))

COMPLIANCE AUDIT - SIMPLE VERSION

[1/15] RULE_001 - Personal Data Encryption... COMPLIANT (80%)
[2/15] RULE_002 - Data Retention Policy... COMPLIANT (80%)
[3/15] RULE_003 - Multi-Factor Authentication... COMPLIANT (80%)
[4/15] RULE_004 - Role-Based Access Control... COMPLIANT (80%)
[5/15] RULE_005 - Breach Notification Timeline... COMPLIANT (80%)
[6/15] RULE_006 - Incident Response Plan... COMPLIANT (80%)
[7/15] RULE_007 - Background Verification... COMPLIANT (80%)
[8/15] RULE_008 - Security Training... COMPLIANT (80%)
[9/15] RULE_009 - Regular Security Audits... COMPLIANT (80%)
[10/15] RULE_010 - Audit Log Retention... COMPLIANT (80%)
[11/15] RULE_011 - Vendor Security Assessment... COMPLIANT (80%)
[12/15] RULE_012 - Data Processing Agreements... COMPLIANT (80%)
[13/15] RULE_013 - Data Subject Rights... COMPLIANT (80%)
[14/15] RULE_014 - Privacy Notice... COMPLIANT (80%)
[15/15] RULE_015 - Vulnerability Management... COMPLIANT (80%)

SUMMARY

Status
COMPLIANT    15
Name: count, dtype

In [65]:
import pandas as pd
from datetime import datetime

print("="*70)
print("COMPLIANCE REPORT - DETAILED RESULTS")
print("="*70 + "\n")

df = pd.DataFrame(results)

print(" SUMMARY BY STATUS")
print("-" * 70)
print(df['Status'].value_counts())

print("\n SUMMARY BY CATEGORY")
print("-" * 70)
print(df.groupby('Category')['Status'].value_counts())

print("\n CRITICAL ISSUES")
print("-" * 70)
critical = df[(df['Severity'] == 'CRITICAL') & (df['Status'] != 'COMPLIANT')]
if len(critical) > 0:
    print(critical[['Rule ID', 'Rule Name', 'Status']].to_string(index=False))
else:
    print("‚úì No critical issues found")

print("\n HIGH SEVERITY ISSUES")
print("-" * 70)
high = df[(df['Severity'] == 'HIGH') & (df['Status'] != 'COMPLIANT')]
if len(high) > 0:
    print(high[['Rule ID', 'Rule Name', 'Status']].to_string(index=False))
else:
    print("‚úì No high severity issues found")

filename = f"compliance_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(filename, index=False)

print(f"\n‚úì Full report saved to: {filename}\n")

print("="*70)
print("FULL COMPLIANCE REPORT")
print("="*70 + "\n")
print(df.to_string(index=False))

total = len(df)
compliant = len(df[df['Status'] == 'COMPLIANT'])
partial = len(df[df['Status'] == 'PARTIAL'])
non_compliant = len(df[df['Status'] == 'NON_COMPLIANT'])

print(f"\n" + "="*70)
print(f"COMPLIANCE RATE: {compliant}/{total} ({compliant/total*100:.1f}%)")
print(f"‚úì Compliant: {compliant} | ‚óê Partial: {partial} | ‚úó Non-Compliant: {non_compliant}")
print("="*70)

COMPLIANCE REPORT - DETAILED RESULTS

 SUMMARY BY STATUS
----------------------------------------------------------------------
Status
COMPLIANT    15
Name: count, dtype: int64

 SUMMARY BY CATEGORY
----------------------------------------------------------------------
Category                Status   
Access Control          COMPLIANT    2
Audit & Compliance      COMPLIANT    2
Data Protection         COMPLIANT    2
Employee Management     COMPLIANT    2
Incident Response       COMPLIANT    2
Privacy Rights          COMPLIANT    2
System Security         COMPLIANT    1
Third-Party Management  COMPLIANT    2
Name: count, dtype: int64

 CRITICAL ISSUES
----------------------------------------------------------------------
‚úì No critical issues found

 HIGH SEVERITY ISSUES
----------------------------------------------------------------------
‚úì No high severity issues found

‚úì Full report saved to: compliance_report_20251202_011431.csv

FULL COMPLIANCE REPORT

 Rule ID              

In [66]:
import json
from datetime import datetime

json_data = []
for result in results:
    json_data.append({
        'rule_id': result['Rule ID'],
        'rule_name': result['Rule Name'],
        'category': result['Category'],
        'severity': result['Severity'],
        'status': result['Status'],
        'confidence': result['Confidence'],
        'evidence_count': result['Evidence'],
        'recommendation': result['Recommendation']
    })

json_filename = f"compliance_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(json_filename, 'w') as f:
    json.dump({
        'audit_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_rules': len(results),
        'compliant': len([r for r in results if r['Status'] == 'COMPLIANT']),
        'partial': len([r for r in results if r['Status'] == 'PARTIAL']),
        'non_compliant': len([r for r in results if r['Status'] == 'NON_COMPLIANT']),
        'compliance_rate': f"{len([r for r in results if r['Status'] == 'COMPLIANT'])/len(results)*100:.1f}%",
        'results': json_data
    }, f, indent=2)

print(f"‚úì JSON report saved: {json_filename}")

‚úì JSON report saved: compliance_report_20251202_011433.json


In [67]:
print("="*70)
print("AGENT WORKFLOW - MULTI-STEP COMPLIANCE QUESTIONING")
print("="*70 + "\n")

class SimpleComplianceAgent:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore

    def ask(self, question):
        """Answer compliance questions using document search"""
        docs = self.vectorstore.similarity_search(question, k=5)

        if not docs:
            return {
                'answer': 'No relevant information found',
                'evidence': [],
                'confidence': 'Low'
            }

        evidence = []
        for doc in docs:
            evidence.append({
                'source': doc.metadata['source'],
                'page': doc.metadata['page'],
                'content': doc.page_content[:200] + "..."
            })

        if len(docs) >= 3:
            answer = f"Yes, found {len(docs)} relevant policy sections addressing this requirement."
            confidence = 'High'
        elif len(docs) >= 1:
            answer = f"Partially - found {len(docs)} relevant sections, but may need more coverage."
            confidence = 'Medium'
        else:
            answer = "Insufficient evidence found in policy documents."
            confidence = 'Low'

        return {
            'answer': answer,
            'evidence': evidence,
            'confidence': confidence
        }

agent = SimpleComplianceAgent(vectorstore)

questions = [
    "Is personal data encrypted according to our policies?",
    "What are the data retention requirements?",
    "Are there breach notification procedures defined?",
    "Do we have multi-factor authentication requirements?",
    "What background check procedures exist for employees?"
]

results = []

for i, question in enumerate(questions, 1):
    print(f"\n[Question {i}] {question}")
    print("-" * 70)

    result = agent.ask(question)
    results.append({'question': question, **result})

    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['confidence']}")
    print(f"Evidence: {len(result['evidence'])} documents found")

    if result['evidence']:
        print("\nTop Evidence:")
        for j, ev in enumerate(result['evidence'][:2], 1):
            print(f"  {j}. {ev['source']} (Page {ev['page']})")
            print(f"     {ev['content'][:150]}...")

print("\n" + "="*70)
print("AGENT WORKFLOW COMPLETE")
print("="*70)

import json
with open('agent_qa_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úì Q&A results saved to: agent_qa_results.json")
print(f"‚úì Processed {len(questions)} multi-step compliance questions\n")

AGENT WORKFLOW - MULTI-STEP COMPLIANCE QUESTIONING


[Question 1] Is personal data encrypted according to our policies?
----------------------------------------------------------------------
Answer: Yes, found 5 relevant policy sections addressing this requirement.
Confidence: High
Evidence: 5 documents found

Top Evidence:
  1. HEALTHGATEDATACORP_11_24_1999-EX-10.1-HOSTING AND MANAGEMENT AGREEMENT - Escrow Agreement.pdf (Page 2)
     12 March 1998                                      National Computing Centre
                                                   Oxford House, Oxford R...
  2. HEALTHGATEDATACORP_11_24_1999-EX-10.1-HOSTING AND MANAGEMENT AGREEMENT - Escrow Agreement_23.pdf (Page 2)
     12 March 1998                                      National Computing Centre
                                                   Oxford House, Oxford R...

[Question 2] What are the data retention requirements?
----------------------------------------------------------------------
Answer: Ye

In [68]:
!pip install pandas pyyaml plotly

import pandas as pd
import yaml
import pickle
from pathlib import Path

print("=" * 50)
print("COMPLIANCE AUDIT SYSTEM")
print("=" * 50)

try:
    with open('vectorstore.pkl', 'rb') as f:
        vectorstore = pickle.load(f)
    print("‚úì Vector store loaded")
except:
    print("‚úó Vector store not found")
    vectorstore = None

try:
    with open('compliance_rules.yaml', 'r') as f:
        rules = yaml.safe_load(f)['rules']
    print(f"‚úì Loaded {len(rules)} rules")
except:
    print("‚úó Rules file not found")
    rules = []

if vectorstore and rules:
    print("\nRunning audit...\n")
    results = []

    for i, rule in enumerate(rules):
        print(f"Checking {i+1}/{len(rules)}: {rule['name']}")
        query = f"{rule['name']} {rule['description']}"
        docs = vectorstore.similarity_search(query, k=5)
        evidence = len(docs)

        status = 'COMPLIANT' if evidence >= 3 else 'PARTIAL' if evidence >= 1 else 'NON_COMPLIANT'

        results.append({
            'Rule ID': rule['id'],
            'Rule Name': rule['name'],
            'Status': status,
            'Evidence': evidence
        })

    df = pd.DataFrame(results)
    print("\n" + "=" * 50)
    print("AUDIT RESULTS")
    print("=" * 50)
    print(f"Compliant: {len(df[df['Status']=='COMPLIANT'])}")
    print(f"Partial: {len(df[df['Status']=='PARTIAL'])}")
    print(f"Non-Compliant: {len(df[df['Status']=='NON_COMPLIANT'])}")
    print("\n")
    display(df)

    df.to_csv('audit_report.csv', index=False)
    print("\n‚úì Report saved as 'audit_report.csv'")
else:
    print("\n Cannot run audit. Upload vectorstore.pkl and compliance_rules.yaml")

COMPLIANCE AUDIT SYSTEM
‚úì Vector store loaded
‚úì Loaded 15 rules

Running audit...

Checking 1/15: Personal Data Encryption
Checking 2/15: Data Retention Policy
Checking 3/15: Multi-Factor Authentication
Checking 4/15: Role-Based Access Control
Checking 5/15: Breach Notification Timeline
Checking 6/15: Incident Response Plan
Checking 7/15: Background Verification
Checking 8/15: Security Training
Checking 9/15: Regular Security Audits
Checking 10/15: Audit Log Retention
Checking 11/15: Vendor Security Assessment
Checking 12/15: Data Processing Agreements
Checking 13/15: Data Subject Rights
Checking 14/15: Privacy Notice
Checking 15/15: Vulnerability Management

AUDIT RESULTS
Compliant: 15
Partial: 0
Non-Compliant: 0




Unnamed: 0,Rule ID,Rule Name,Status,Evidence
0,RULE_001,Personal Data Encryption,COMPLIANT,5
1,RULE_002,Data Retention Policy,COMPLIANT,5
2,RULE_003,Multi-Factor Authentication,COMPLIANT,5
3,RULE_004,Role-Based Access Control,COMPLIANT,5
4,RULE_005,Breach Notification Timeline,COMPLIANT,5
5,RULE_006,Incident Response Plan,COMPLIANT,5
6,RULE_007,Background Verification,COMPLIANT,5
7,RULE_008,Security Training,COMPLIANT,5
8,RULE_009,Regular Security Audits,COMPLIANT,5
9,RULE_010,Audit Log Retention,COMPLIANT,5



‚úì Report saved as 'audit_report.csv'


In [79]:
%%writefile app.py
import streamlit as st
import pandas as pd
import os
from datetime import datetime
import plotly.express as px
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document
import pypdf
import tempfile

st.set_page_config(page_title="Compliance Checker", layout="wide")

if "results_df" not in st.session_state:
    st.session_state.results_df = None

st.title("Policy Compliance Checker")
st.markdown("---")

with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input("Google API Key", type="password")
    uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
    run_audit = st.button("Run Audit", type="primary", use_container_width=True)

col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Rules", "15")
col2.metric("Documents", len(uploaded_files) if uploaded_files else 0)

if st.session_state.results_df is not None:
    df = st.session_state.results_df
    compliant = len(df[df["Status"] == "COMPLIANT"])
    col3.metric("Compliance Rate", f"{compliant/len(df)*100:.1f}%")
    col4.metric("Issues", len(df[df["Status"] == "NON_COMPLIANT"]))
else:
    col3.metric("Compliance Rate", "--")
    col4.metric("Issues", "--")

st.markdown("---")

tab1, tab2 = st.tabs(["Results", "Analytics"])

with tab1:
    if st.session_state.results_df is not None:
        st.dataframe(st.session_state.results_df, use_container_width=True, hide_index=True)
        csv = st.session_state.results_df.to_csv(index=False).encode("utf-8")
        st.download_button("Download Report", csv, f"report_{datetime.now().strftime('%Y%m%d')}.csv")
    else:
        st.info("Upload documents and run audit to see results")

with tab2:
    if st.session_state.results_df is not None:
        col1, col2 = st.columns(2)
        with col1:
            category_data = st.session_state.results_df.groupby(["Category", "Status"]).size().reset_index(name="Count")
            fig = px.bar(category_data, x="Category", y="Count", color="Status", title="By Category")
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            severity_data = st.session_state.results_df.groupby(["Severity", "Status"]).size().reset_index(name="Count")
            fig = px.bar(severity_data, x="Severity", y="Count", color="Status", title="By Severity")
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Run audit to see analytics")

if run_audit:
    if not api_key:
        st.error("Enter API key")
    elif not uploaded_files:
        st.error("Upload PDFs")
    else:
        with st.spinner("Processing..."):
            try:
                os.environ["GOOGLE_API_KEY"] = api_key

                with tempfile.TemporaryDirectory() as temp_dir:
                    documents = []
                    for uploaded_file in uploaded_files:
                        temp_path = os.path.join(temp_dir, uploaded_file.name)
                        with open(temp_path, "wb") as f:
                            f.write(uploaded_file.getbuffer())

                        with open(temp_path, "rb") as file:
                            pdf_reader = pypdf.PdfReader(file)
                            for page_num, page in enumerate(pdf_reader.pages):
                                text = page.extract_text()
                                if text.strip():
                                    documents.append(Document(
                                        page_content=text,
                                        metadata={"source": uploaded_file.name, "page": page_num + 1}
                                    ))

                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                    chunks = text_splitter.split_documents(documents)

                    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
                    vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=temp_dir)

                    rules = [
                        {"id": "RULE_001", "category": "Data Protection", "name": "Personal Data Encryption", "description": "encrypted data rest transit", "severity": "CRITICAL"},
                        {"id": "RULE_002", "category": "Data Protection", "name": "Data Retention Policy", "description": "retention periods years", "severity": "HIGH"},
                        {"id": "RULE_003", "category": "Access Control", "name": "Multi-Factor Authentication", "description": "MFA mandatory users", "severity": "CRITICAL"},
                        {"id": "RULE_004", "category": "Access Control", "name": "Role-Based Access Control", "description": "RBAC least privilege", "severity": "HIGH"},
                        {"id": "RULE_005", "category": "Incident Response", "name": "Breach Notification", "description": "breaches reported 72 hours", "severity": "CRITICAL"},
                        {"id": "RULE_006", "category": "Incident Response", "name": "Response Plan", "description": "incident response procedures", "severity": "HIGH"},
                        {"id": "RULE_007", "category": "Employee Management", "name": "Background Verification", "description": "background checks employees", "severity": "MEDIUM"},
                        {"id": "RULE_008", "category": "Employee Management", "name": "Security Training", "description": "annual training employees", "severity": "MEDIUM"},
                        {"id": "RULE_009", "category": "Audit and Compliance", "name": "Security Audits", "description": "audits annually", "severity": "HIGH"},
                        {"id": "RULE_010", "category": "Audit and Compliance", "name": "Log Retention", "description": "logs retained 12 months", "severity": "HIGH"},
                        {"id": "RULE_011", "category": "Third-Party", "name": "Vendor Assessment", "description": "vendors security assessment", "severity": "HIGH"},
                        {"id": "RULE_012", "category": "Third-Party", "name": "Data Processing Agreements", "description": "DPAs third parties", "severity": "CRITICAL"},
                        {"id": "RULE_013", "category": "Privacy Rights", "name": "Data Subject Rights", "description": "data access deletion 30 days", "severity": "CRITICAL"},
                        {"id": "RULE_014", "category": "Privacy Rights", "name": "Privacy Notice", "description": "privacy notice collection", "severity": "HIGH"},
                        {"id": "RULE_015", "category": "System Security", "name": "Vulnerability Management", "description": "vulnerabilities patched 30 days", "severity": "CRITICAL"}
                    ]

                    results = []
                    progress = st.progress(0)

                    for i, rule in enumerate(rules):
                        docs = vectorstore.similarity_search(rule["description"], k=5)
                        keywords = rule["description"].split()
                        evidence = sum(1 for doc in docs if any(kw in doc.page_content.lower() for kw in keywords))

                        status = "COMPLIANT" if evidence >= 3 else "PARTIAL" if evidence >= 1 else "NON_COMPLIANT"
                        confidence = 0.8 if evidence >= 3 else 0.5 if evidence >= 1 else 0.3

                        results.append({
                            "Rule ID": rule["id"],
                            "Rule Name": rule["name"],
                            "Category": rule["category"],
                            "Severity": rule["severity"],
                            "Status": status,
                            "Confidence": f"{confidence:.0%}",
                            "Evidence": evidence
                        })
                        progress.progress((i + 1) / len(rules))

                    st.session_state.results_df = pd.DataFrame(results)
                    progress.empty()
                    st.success("Audit complete!")
                    st.rerun()

            except Exception as e:
                st.error(f"Error: {str(e)}")

st.markdown("---")
st.markdown("<div style='text-align: center; color: gray;'>Compliance Checker v1.0</div>", unsafe_allow_html=True)

Overwriting app.py


In [78]:
%%writefile requirements.txt

requirements = '''streamlit
langchain==0.1.0
langchain-google-genai==0.0.6
langchain-community==0.0.13
chromadb==0.4.22
pypdf==3.17.4
pandas
plotly
pyyaml'''


Overwriting requirements.txt
