<a href="https://colab.research.google.com/github/deeception/REG_AI/blob/main/pipeline1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 0: Create config JSON (edit this file only to port to new org/country)
import json, os

CONFIG_PATH = "/content/config_sources.json"
# Single source of truth for the pipeline. Later steps read this dict back from
# CONFIG_PATH, so porting to another organisation/country means editing only
# the values below.
config = {
  "meta": {
    "project": "RegIntel-POC",
    "version": "1.1.0",
    "notes": "Edit this JSON to change sources/scope; pipeline code need not change."
  },
  "sources": {
    # Regulator sites: crawled with the larger page/PDF budgets.
    "primary": {
      "name": "RBI/SEBI",
      "seeds": [
        "https://rbi.org.in/Scripts/PressReleases.aspx",
        "https://www.sebi.gov.in/sebiweb/home/HomeAction.do?doListing=yes&sid=1&ssid=1&smid=0"
      ],
      "allowed_domains": ["rbi.org.in", "www.rbi.org.in", "sebi.gov.in", "www.sebi.gov.in"],
      "high_value_patterns": ["press","enforcement","penalty","circular","order","notification","monetary","imposed"],
      "max_pages_total": 120,
      "max_pdfs_total": 60,
      "per_domain_page_cap": 60,
      "max_depth": 3
    },
    # Media / government press sources used alongside the primary set.
    "secondary": {
      "name": "Trusted Media/Agencies",
      "seeds": [
        "https://www.business-standard.com/category/economy-policy-102.html",
        "https://economictimes.indiatimes.com/news/economy/policy",
        "https://www.livemint.com/market/regulation",
        "https://www.cnbctv18.com/economy",
        "https://www.moneycontrol.com/news/business/markets",
        "https://pib.gov.in/PressReleseDetailm.aspx?Ministry=Finance",
        "https://dea.gov.in/press-releases"
      ],
      "allowed_domains": [
        "www.business-standard.com", "business-standard.com",
        "economictimes.indiatimes.com", "www.livemint.com",
        "www.cnbctv18.com", "www.moneycontrol.com",
        "pib.gov.in", "dea.gov.in"
      ],
      "high_value_patterns": ["regulation","penalty","enforcement","order","directive","circular","rbi","sebi","pmla","kyc","fema"],
      "max_pages_total": 80,
      "max_pdfs_total": 10,
      "per_domain_page_cap": 25,
      "max_depth": 3
    }
  },
  "crawler": {
    # NOTE: despite the key name, this value is the User-Agent string only.
    "headers": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36",
    "min_delay_seconds": 0.8,
    "skip_extensions": ["css","js","svg","ico","gif","png","jpg","jpeg","mp4","webm","zip","xlsx","xls","docx","pptx"]
  },
  "nlp": {
    "zero_shot_model": "facebook/bart-large-mnli",
    "sbert_model": "sentence-transformers/all-MiniLM-L6-v2",
    "spacy_model": "en_core_web_trf",
    "labels": [
      "Monetary Penalty",
      "Regulatory/Compliance Update",
      "Customer Impact / Reputational",
      "Operational / Systems / Process"
    ],
    "bank_profile": "Retail & corporate banking in India; AML/KYC; payments; credit cards; collections; outsourcing; operational risk; data governance; RBI/SEBI compliance"
  },
  "scoring": {
    "structural_url_patterns": ["/topic/", "/tags/", "/author/", "/search?"],
    # This key used to be listed twice. Python keeps only the LAST value for a
    # duplicated dict key, so the earlier, shorter list (which also contained
    # "about us", "list of" and "index of") never took effect. Only the
    # effective list is kept, so runtime behaviour is unchanged.
    "structural_title_cues": ["sitemap","legal framework","regulated entities","holidays","contact us","help","faq","faqs","disclaimer","privacy","terms","feedback","grievance","complaint","complaints","reporting portal","registration"],
    # Weights of the additive risk score; they sum to 1.0.
    "risk_weights": {
      "severity": 0.40,
      "recency": 0.15,
      "jurisdiction": 0.10,
      "relevance_embed": 0.15,
      "relevance_ml": 0.15,
      "authority": 0.05
    },
    "authority": {
      "primary_pdf": 1.0,
      "primary_html": 0.85,
      "secondary_pdf": 0.80,
      "secondary_html": 0.70
    },
    "no_event_cap": 0.35,
    "polarity_adjust": {"Negative": 0.04, "Neutral": 0.0, "Positive": -0.02}
  },
  "labeled_files": {
    "relevance": "/content/relevance_labels.csv",
    "impact": "/content/impact_labels.csv",
    "polarity": "/content/polarity_labels.csv"
  },
  "outputs": {
    "data_dir": "/content/data",
    "csv": "/content/rbi_risk_digest_with_news.csv",
    "xlsx": "/content/rbi_risk_digest_with_news.xlsx"
  }
}

# Persist the config to disk so the later steps can load it from CONFIG_PATH.
config_dir = os.path.dirname(CONFIG_PATH)
os.makedirs(config_dir, exist_ok=True)
with open(CONFIG_PATH, "w") as config_file:
    json.dump(config, config_file, indent=2)
print("Wrote config to", CONFIG_PATH)


Wrote config to /content/config_sources.json


In [None]:
# Step 1: Install dependencies
!pip -q install requests beautifulsoup4 pandas numpy regex tqdm python-dateutil \
                   pdfplumber pymupdf sentence-transformers transformers scikit-learn \
                   openpyxl tldextract spacy

# Optional (NER quality; can skip for speed)
!python -m spacy download en_core_web_trf

# Step 1b: Imports
import os, re, json, time, hashlib, math, random, datetime as dt
from dateutil import parser as dateparser
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm

import fitz  # PyMuPDF
import pdfplumber

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

import spacy


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-an

In [None]:
# Step 2: Load config JSON and setup paths
with open("/content/config_sources.json") as f:
    CFG = json.load(f)

# Short aliases for the config sections used throughout the notebook.
PRIMARY = CFG["sources"]["primary"]
SECONDARY = CFG["sources"]["secondary"]
CRAWL = CFG["crawler"]
NLP_CFG = CFG["nlp"]
SCORE_CFG = CFG["scoring"]
LABELED = CFG["labeled_files"]
OUTPUTS = CFG["outputs"]

# Working directories: raw downloads (PDF cache) and processed artefacts.
DATA_DIR = OUTPUTS["data_dir"]
os.makedirs(f"{DATA_DIR}/raw", exist_ok=True)
os.makedirs(f"{DATA_DIR}/processed", exist_ok=True)

# Matches URLs whose path ends in a skipped extension (optionally followed by a
# query string). Each extension is escaped so that a config entry containing a
# regex metacharacter (e.g. "c++") is matched literally instead of breaking or
# changing the pattern.
SKIP_EXT_RE = re.compile(
    r"\.(" + "|".join(re.escape(ext) for ext in CRAWL["skip_extensions"]) + r")($|\?)",
    re.I,
)
HEADERS = {"User-Agent": CRAWL["headers"]}
MIN_DELAY = float(CRAWL["min_delay_seconds"])

ZS_LABELS = NLP_CFG["labels"]


In [None]:
# Step 3: Utilities
def sha1(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

def normalize_url(u: str) -> str:
    """Return ``u`` re-serialised without its ``#fragment`` part."""
    parts = urlparse(u)
    without_fragment = parts._replace(fragment="")
    return without_fragment.geturl()

def domain(hosted_url: str) -> str:
    """Return the lower-cased network location (host[:port]) of ``hosted_url``."""
    netloc = urlparse(hosted_url).netloc
    return netloc.lower()

# Wall-clock time of the most recent request per host, used for rate limiting.
LAST_FETCH = {}
def domain_throttle(url: str):
    """Sleep as needed so requests to one host are at least MIN_DELAY seconds apart."""
    host = domain(url)
    now = time.time()
    elapsed = now - LAST_FETCH.get(host, 0.0)
    if elapsed < MIN_DELAY:
        time.sleep(MIN_DELAY - elapsed)
    # Record the time after any sleep so the next call measures from here.
    LAST_FETCH[host] = time.time()

def get_with_retry(url, timeout=20, retries=2, backoff=1.5):
    """GET ``url`` with per-host throttling, retrying on request errors.

    Up to ``retries + 1`` attempts are made. After a failed attempt ``i``
    (0-based) the function sleeps ``backoff ** i`` seconds; the exception from
    the final attempt is re-raised. HTTP error statuses count as failures
    because ``raise_for_status`` is called on every response.
    """
    attempt = 0
    while attempt <= retries:
        try:
            domain_throttle(url)
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(backoff ** attempt)
        attempt += 1

def clean_text(t: str) -> str:
    """Collapse whitespace and re-join words split by a hyphen plus whitespace.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if t:
        collapsed = re.sub(r"\s+", " ", t).strip()
        # "regu- lation" -> "regulation" (line-break hyphenation left by extraction)
        return re.sub(r"(\w)-\s+(\w)", r"\1\2", collapsed)
    return ""

def parse_date_from_text(t: str):
    """Best-effort extraction of a publication date from free text.

    Looks first for a "DD Month YYYY" style date and returns it as an ISO
    ``YYYY-MM-DD`` string. If none parses, falls back to the first standalone
    four-digit year in the range 2010-2029 and returns January 1st of that
    year. Returns ``None`` when ``t`` is empty or nothing matches.
    """
    if not t:
        return None
    m = re.search(r"(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})", t)
    if m:
        try:
            return dateparser.parse(m.group(1)).date().isoformat()
        except (ValueError, OverflowError):
            # Not a real date (e.g. "12 crores 2024"); fall through to the
            # year-only heuristic instead of hiding every possible exception.
            pass
    # Digit guards stop the year from matching inside a longer number such as
    # an amount or reference id ("1202512"), while still accepting "FY2024".
    m = re.search(r"(?<!\d)(20[12]\d)(?!\d)", t)
    if m:
        return f"{m.group(1)}-01-01"
    return None


In [None]:
# Step 4: Initialize models
# Sentence encoder plus the embedding of the bank profile text that documents
# are compared against for relevance.
sbert = SentenceTransformer(NLP_CFG["sbert_model"])
bank_vec = sbert.encode(NLP_CFG["bank_profile"], normalize_embeddings=True)

zero_shot = pipeline("zero-shot-classification", model=NLP_CFG["zero_shot_model"], device_map="auto")

try:
    nlp = spacy.load(NLP_CFG["spacy_model"])
except Exception as exc:
    # Keep the pipeline runnable without the (optional) spaCy model, but say so:
    # a blank pipeline has no components, so no named entities are produced.
    print(
        f"spaCy model {NLP_CFG['spacy_model']!r} could not be loaded ({exc}); "
        "falling back to a blank English pipeline without NER."
    )
    nlp = spacy.blank("en")  # fallback minimal


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
# Step 5: Batch helpers for GPU efficiency
# Default batch sizes for zero-shot classification and embedding calls.
BATCH_SIZE_ZS = 16
BATCH_SIZE_EMB = 64

def batched(iterable, n):
    """Yield ``(chunk, start_index)`` pairs of up to ``n`` items each.

    ``iterable`` must support ``len()`` and slicing (list, str, ...).
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        yield iterable[start:stop], start

def zero_shot_batch(texts, candidate_labels, threshold=0.35, batch_size=BATCH_SIZE_ZS):
    """Run multi-label zero-shot classification over ``texts`` in batches.

    Returns one list of labels per input text: every label whose score is at
    least ``threshold``, or, if none qualifies, the first label of the result
    (presumably the top-scoring one; the classifier is expected to return
    labels ordered by score).
    """
    outputs = []
    for chunk, _start in batched(texts, batch_size):
        for result in zero_shot(chunk, candidate_labels, multi_label=True):
            kept = [
                label
                for label, score in zip(result["labels"], result["scores"])
                if score >= threshold
            ]
            if not kept:
                kept = [result["labels"][0]]
            outputs.append(kept)
    return outputs

def sbert_encode_batch(texts, normalize=True, batch_size=BATCH_SIZE_EMB):
    """Encode ``texts`` with the sentence encoder in batches and stack the rows.

    Returns a 2-D array with one row per text. For empty input a ``(0, 384)``
    array is returned. NOTE(review): 384 is hard-coded and presumably matches
    the configured MiniLM model; confirm if ``sbert_model`` changes.
    """
    encoded = [
        sbert.encode(chunk, normalize_embeddings=normalize)
        for chunk, _start in batched(texts, batch_size)
    ]
    if not encoded:
        return np.zeros((0, 384))
    return np.vstack(encoded)


In [None]:
# Step 6: ML-guided resilient crawling for any source spec in config
def is_high_value(url: str, anchor_text: str, patterns: list) -> bool:
    """Return True if any pattern is a substring of the URL or the anchor text.

    The URL and anchor text are lower-cased before matching; the patterns are
    used as given, so they should be lower-case.
    """
    haystacks = ((url or "").lower(), (anchor_text or "").lower())
    for haystack in haystacks:
        for pattern in patterns:
            if pattern in haystack:
                return True
    return False

def autodiscover(source_cfg: dict):
    """Crawl outward from the configured seeds and collect page and PDF URLs.

    ``source_cfg`` is one entry of ``config["sources"]`` and must provide
    ``seeds``, ``allowed_domains``, ``high_value_patterns``,
    ``max_pages_total``, ``max_pdfs_total``, ``max_depth``,
    ``per_domain_page_cap`` and ``name``.

    Returns ``(pages, pdfs)`` as two lists of normalised URLs. Seed URLs are
    fetched for their links but are not themselves added to ``pages``.
    """
    seeds = source_cfg["seeds"]
    allowed = set([d.lower() for d in source_cfg["allowed_domains"]])
    patterns = source_cfg["high_value_patterns"]
    max_pages = source_cfg["max_pages_total"]
    max_pdfs = source_cfg["max_pdfs_total"]
    max_depth = source_cfg["max_depth"]
    per_domain_cap = source_cfg["per_domain_page_cap"]

    pages, pdfs = set(), set()
    visited = set()          # every anchor URL already considered (accepted or not)
    per_domain_counts = {}   # host -> number of pages accepted from that host
    queue = deque()          # (url, depth) pairs still to fetch

    for s in seeds:
        s_norm = normalize_url(s)
        if s_norm not in visited:
            visited.add(s_norm)
            queue.append((s_norm, 0))

    pbar = tqdm(total=max_pages + max_pdfs, desc=f"Discovering: {source_cfg['name']}")
    # Keep crawling while there is work queued and at least one of the two
    # budgets (pages, PDFs) still has room.
    while queue and (len(pages) < max_pages or len(pdfs) < max_pdfs):
        url, depth = queue.popleft()
        if depth >= max_depth:
            continue
        try:
            r = get_with_retry(url)
            soup = BeautifulSoup(r.text, "html.parser")
            # NOTE: parent_title is computed but not used anywhere below.
            parent_title = soup.title.get_text(strip=True) if soup.title else ""

            # PDFs embedded via <iframe src=...> or <object data=...>.
            # These are not added to `visited`.
            for tag in soup.find_all(["iframe","object"]):
                src = tag.get("src") or tag.get("data")
                if not src: continue
                href = normalize_url(urljoin(url, src))
                if href.lower().endswith(".pdf") and href not in pdfs and len(pdfs) < max_pdfs:
                    pdfs.add(href)
                    if pbar.n < pbar.total: pbar.update(1)

            for a in soup.find_all("a", href=True):
                href_raw = a["href"]
                link_text = a.get_text(" ", strip=True)
                href = normalize_url(urljoin(url, href_raw))
                if href in visited:
                    continue
                if not href.startswith("http"):
                    continue
                if SKIP_EXT_RE.search(href):
                    continue

                # Marked as visited before the checks below, so a link that is
                # rejected here is never reconsidered later in the crawl.
                visited.add(href)

                # PDFs are accepted from any domain (no allowed_domains check),
                # limited only by the PDF budget.
                if href.lower().endswith(".pdf"):
                    if len(pdfs) < max_pdfs:
                        pdfs.add(href)
                        if pbar.n < pbar.total: pbar.update(1)
                    continue

                # Non-PDF pages must be on an allowed domain.
                if domain(href) not in allowed:
                    continue

                host = domain(href)
                if per_domain_counts.get(host, 0) >= per_domain_cap:
                    continue

                # Only links whose URL or anchor text matches a high-value
                # pattern are kept as pages (and possibly crawled further).
                if len(pages) < max_pages and is_high_value(href, link_text, patterns):
                    pages.add(href)
                    per_domain_counts[host] = per_domain_counts.get(host, 0) + 1
                    if pbar.n < pbar.total: pbar.update(1)
                    if depth + 1 < max_depth:
                        # Pushed to the FRONT of the queue, so the most
                        # recently found high-value link is fetched next.
                        queue.appendleft((href, depth + 1))
        except requests.RequestException:
            # A page that cannot be fetched (after retries) is skipped.
            continue

    pbar.close()
    return list(pages), list(pdfs)

# Run discovery for both source groups defined in the config.
print("Step 6: Discovering primary...")
p_pages, p_pdfs = autodiscover(PRIMARY)
print("Primary pages:", len(p_pages), "pdfs:", len(p_pdfs))

print("Step 6: Discovering secondary...")
s_pages, s_pdfs = autodiscover(SECONDARY)
print("Secondary pages:", len(s_pages), "pdfs:", len(s_pdfs))


Step 6: Discovering primary...


Discovering: RBI/SEBI: 100%|██████████| 180/180 [01:56<00:00,  1.55it/s]


Primary pages: 120 pdfs: 60
Step 6: Discovering secondary...


Discovering: Trusted Media/Agencies:  77%|███████▋  | 69/90 [01:34<00:28,  1.36s/it]

Secondary pages: 59 pdfs: 10





In [None]:
# Step 7: Extraction
def html_to_text(url):
    """Fetch an HTML page and return ``(title, main_text)``.

    Returns ``(None, None)`` if the page cannot be fetched. The main text is
    the concatenation of the three highest-scoring container elements under a
    simple density heuristic (many words, few links, some digits); if no
    container has at least 50 words, all paragraph-like elements are joined.
    """
    try:
        response = get_with_retry(url)
    except Exception:
        return None, None
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""
    # Prefer an on-page heading over <title> when one is present.
    try:
        heading = soup.select_one("#ctl00_ContentPlaceHolder1_lblHeading, h1, .pressRelease, .main-title")
        if heading:
            heading_text = heading.get_text(" ", strip=True)
            if heading_text:
                title = heading_text
    except Exception:
        pass

    # Score each candidate container; longer, link-poor, digit-rich blocks win.
    ranked = []
    for node in soup.find_all(["article","section","div","main"]):
        body = node.get_text(" ", strip=True)
        if not body:
            continue
        tokens = body.split()
        if len(tokens) < 50:
            continue
        n_links = len(node.find_all("a"))
        link_ratio = n_links / max(1, len(tokens))
        digit_ratio = sum(ch.isdigit() for ch in body) / max(1, len(body))
        density = len(tokens) / (1 + 10*link_ratio) + 50*digit_ratio
        ranked.append((density, body))

    if ranked:
        ranked.sort(key=lambda pair: -pair[0])
        text = " ".join(body for _, body in ranked[:3])
    else:
        fragments = soup.find_all(["p","li","div","span"])
        text = " ".join(frag.get_text(" ", strip=True) for frag in fragments)

    return title, clean_text(text)

def pdf_to_text(url):
    """Download (or reuse a cached copy of) a PDF and return ``(filename, text)``.

    The file is cached under ``DATA_DIR/raw`` using the SHA-1 of the URL as its
    name. Text is extracted with PyMuPDF; if the result is very short or has
    many non-ASCII characters in its first 2000 characters, pdfplumber is tried
    and its output is used when it is longer. Any failure is printed and
    reported as ``(None, None)``.
    """
    local_path = f"{DATA_DIR}/raw/{sha1(url)}.pdf"
    try:
        if not os.path.exists(local_path):
            with get_with_retry(url) as response:
                with open(local_path, "wb") as out_file:
                    out_file.write(response.content)
        with fitz.open(local_path) as doc:
            text = "".join(page.get_text("text") or "" for page in doc)
        looks_empty = len(text.strip()) < 120
        if looks_empty or (sum(ord(c) > 127 for c in text[:2000]) > 200):
            with pdfplumber.open(local_path) as pdf:
                fallback = "".join(page.extract_text() or "" for page in pdf.pages)
            if len(fallback.strip()) > len(text.strip()):
                text = fallback
        return os.path.basename(local_path), clean_text(text)
    except Exception as e:
        print("PDF failed:", url, e)
        return None, None


In [None]:
# Step 8: Build datasets
def record_from_url(url, doctype, title, text, source_name):
    """Build one dataset row (a dict) for a fetched document.

    The id is the SHA-1 of the URL; ``published_date`` is whatever
    ``parse_date_from_text`` finds in ``text`` or an empty string.
    """
    record = {
        "id": sha1(url),
        "source": source_name,
        "section": "Press/Enforcement (Hybrid)",
        "url": url,
        "title": title or "",
    }
    record["published_date"] = parse_date_from_text(text) or ""
    record["doctype"] = doctype
    record["text"] = text
    return record

def ingest(pages, pdfs, source_name):
    """Extract text from the given page and PDF URLs into a DataFrame.

    Documents whose extracted text is missing or at most 150 characters long
    are dropped. For PDFs the cached file name is used as the title.
    """
    rows = []
    for page_url in tqdm(pages, desc=f"HTML ingest: {source_name}"):
        title, body = html_to_text(page_url)
        if not body or len(body) <= 150:
            continue
        rows.append(record_from_url(page_url, "HTML", title, body, source_name))
    for pdf_url in tqdm(pdfs, desc=f"PDF ingest: {source_name}"):
        file_name, body = pdf_to_text(pdf_url)
        if not body or len(body) <= 150:
            continue
        rows.append(record_from_url(pdf_url, "PDF", file_name, body, source_name))
    return pd.DataFrame(rows)

# Build one DataFrame per source group.
df_p = ingest(p_pages, p_pdfs, "RBI/SEBI")
df_s = ingest(s_pages, s_pdfs, "News/Secondary")

print("Primary DF:", df_p.shape, "Secondary DF:", df_s.shape)


HTML ingest: RBI/SEBI: 100%|██████████| 120/120 [02:26<00:00,  1.22s/it]
PDF ingest: RBI/SEBI: 100%|██████████| 60/60 [00:04<00:00, 14.89it/s]
HTML ingest: News/Secondary: 100%|██████████| 59/59 [00:38<00:00,  1.53it/s]
PDF ingest: News/Secondary: 100%|██████████| 10/10 [01:42<00:00, 10.27s/it]

Primary DF: (180, 8) Secondary DF: (64, 8)





In [None]:
# Step 9: Structural filtering
# Case-insensitive alternation of the configured structural title cues.
STRUCT_CUES = re.compile(
    r"(" + "|".join(map(re.escape, SCORE_CFG["structural_title_cues"])) + r")",
    re.I,
)

def is_structural(title, text):
    """Return True for navigation/boilerplate pages.

    A page is structural when its title contains a configured cue or when its
    text is missing or shorter than 120 characters.
    """
    return bool(STRUCT_CUES.search(title or "")) or not text or len(text) < 120

def filter_structural(df):
    """Split ``df`` into ``(content_rows, structural_rows)``.

    Side effect: an ``is_structural`` column is added to ``df`` itself. The
    content frame is returned without that column; the structural frame keeps it.
    """
    if df.empty:
        return df.copy(), df.copy()
    flags = df.apply(lambda row: is_structural(row["title"], row["text"]), axis=1)
    df["is_structural"] = flags
    content = df[~df["is_structural"]].drop(columns=["is_structural"]).copy()
    structural = df[df["is_structural"]].copy()
    return content, structural

df_p_content, df_p_struct = filter_structural(df_p)
df_s_content, df_s_struct = filter_structural(df_s)

print("Primary content:", len(df_p_content), "Primary structural:", len(df_p_struct))
print("Secondary content:", len(df_s_content), "Secondary structural:", len(df_s_struct))


Primary content: 177 Primary structural: 3
Secondary content: 63 Secondary structural: 1


In [None]:
# =====================================================================
# Step 10: Zero-shot and rules (batched) - CORRECTED
# =====================================================================

def apply_labels(df):
    """Add a ``labels_zero_shot`` column (list of labels per row) to ``df``.

    Only the first 2000 characters of each text are classified. ``df`` is
    modified in place and also returned.
    """
    if not df.empty:
        snippets = df["text"].fillna("").str.slice(0,2000).tolist()
        df["labels_zero_shot"] = zero_shot_batch(snippets, ZS_LABELS, threshold=0.35)
        return df
    df["labels_zero_shot"] = []
    return df

# Define the regex patterns once
AMOUNT_PAT = re.compile(
    r"(?:(₹|INR|Rs\.?|Rupees)\s*)"
    r"([\d]{1,3}(?:[,]\d{2,3})+|\d+(?:\.\d+)?)"
    r"(?:\s*(crore|cr|lakh|lac))?",
    re.IGNORECASE
)
PENALTY_CUES = re.compile(r"\b(impose[ds]?|levy|penalt(y|ies)|fine[sd]?|monetary\s+penalty)\b", re.IGNORECASE)

# --- START OF THE FIX ---

# 1. This function is the renamed version of your old `parse_amount_in_inr`.
#    The leading underscore signifies it's an internal helper function.
def _parse_amount_from_string(text_snippet: str):
    """Helper function to extract a number from a small string."""
    if not text_snippet: return None
    m = AMOUNT_PAT.search(text_snippet)
    if not m: return None
    num_str = (m.group(2) or "").strip().replace(",", "")
    try:
        amount = float(num_str)
    except ValueError:
        return None
    unit = (m.group(3) or "").lower()
    if unit in ("crore","cr"): amount *= 1e7
    elif unit in ("lakh","lac"): amount *= 1e5
    return amount if amount > 0 else None

# 2. This is the new, primary function that provides context.
#    It now correctly calls the helper function defined above.
def extract_fine_from_context(text: str):
    """
    Finds a monetary fine only if it appears in the same sentence as a penalty keyword.
    """
    if not text: return None
    sentences = re.split(r'(?<=[.!?])\s+', text)
    for sent in sentences:
        if PENALTY_CUES.search(sent):
            amount = _parse_amount_from_string(sent) # This call now works correctly
            if amount:
                return amount
    return None

# --- END OF THE FIX ---

def has_penalty_cue(text: str):
    return bool(PENALTY_CUES.search(text or ""))

def apply_rules_and_embeddings(df):
    """Add rule-based and embedding-based features to ``df`` (in place).

    Columns added: ``fine_amount_in_inr`` (amount found next to a penalty cue),
    ``penalty_cue`` (bool) and ``relevance_embed`` (dot product of the text
    embedding, over the first 1500 characters, with the bank-profile embedding).
    """
    if df.empty:
        df["fine_amount_in_inr"] = None
        df["penalty_cue"] = False
        df["relevance_embed"] = 0.0
        return df

    # Rule-based signals
    df["fine_amount_in_inr"] = df["text"].apply(extract_fine_from_context)
    df["penalty_cue"] = df["text"].apply(has_penalty_cue)

    # Embedding relevance against the bank profile
    snippets = df["text"].fillna("").str.slice(0,1500).tolist()
    doc_vecs = sbert_encode_batch(snippets, normalize=True, batch_size=64)
    profile_col = bank_vec.reshape(1,-1).T
    df["relevance_embed"] = (doc_vecs @ profile_col).ravel().astype(float)
    return df

# --- Execution ---
df_p_content = apply_labels(df_p_content)
df_p_content = apply_rules_and_embeddings(df_p_content)

df_s_content = apply_labels(df_s_content)
df_s_content = apply_rules_and_embeddings(df_s_content)

print("Step 10 successfully executed with context-aware fine extraction.")

Step 10 successfully executed with context-aware fine extraction.


In [None]:
# Step 11: Entities
BANK_KEYWORDS = re.compile(r"\b(Bank|Cooperative Bank|NBFC|Housing Finance|Credit|Payment Bank|Small Finance Bank)\b", re.I)
# Word boundaries keep the short acronyms from matching inside ordinary words
# ("seamless" contains "aml", "female" contains "fema"). One capture group only,
# so findall() returns plain strings.
LAW_CUES = re.compile(r"\b(Banking Regulation Act|RBI Act|Section\s+\d+[A-Za-z]?|KYC|AML|FEMA|PMLA)\b", re.I)

def extract_entities_ml(text):
    """Extract bank names and law references from ``text``.

    Returns ``{"banks": [...], "laws": [...], "amounts": []}`` with at most 10
    de-duplicated entries per list, in order of first appearance. Banks are
    ORG/PRODUCT entities (from the first 8000 characters) whose text contains a
    bank keyword; they are only produced when the spaCy pipeline has
    components. ``amounts`` is always empty here.
    """
    body = text or ""
    banks = []
    if getattr(nlp, "pipe_names", None):
        doc = nlp(body[:8000])
        banks = [
            ent.text
            for ent in doc.ents
            if ent.label_ in ("ORG", "PRODUCT") and BANK_KEYWORDS.search(ent.text)
        ]
    laws = LAW_CUES.findall(body)
    return {
        "banks": list(dict.fromkeys(banks))[:10],
        "laws": list(dict.fromkeys(laws))[:10],
        "amounts": [],
    }

def merge_entities(df):
    """Add an ``entities`` column combining extracted banks/laws with the fine amount.

    Each value is ``{"banks": [...], "laws": [...], "amounts": [...]}`` where
    ``amounts`` holds the row's positive ``fine_amount_in_inr``, if any. The
    returned frame does not contain the temporary ``entities_ml`` column.
    """
    if df.empty:
        df["entities"] = {}
        return df
    df["entities_ml"] = df["text"].apply(extract_entities_ml)

    def merge_row(row):
        extracted = row.get("entities_ml", {}) or {}
        fine = row.get("fine_amount_in_inr")
        amounts = [fine] if isinstance(fine, (int,float)) and fine > 0 else []
        return {
            "banks": extracted.get("banks", []),
            "laws": extracted.get("laws", []),
            "amounts": amounts,
        }

    df["entities"] = df.apply(merge_row, axis=1)
    return df.drop(columns=["entities_ml"])

df_p_content = merge_entities(df_p_content)
df_s_content = merge_entities(df_s_content)


In [None]:
# Step 12: Relevance refinement (clamp generic pages, boost event signals)
ACTION_CUES = re.compile(r"\b(imposed|imposes|imposition|levied|ordered|order|directed|directions|penalty|contravention|non[- ]compliance|violat(ed|ion))\b", re.IGNORECASE)
LAW_TOKENS  = re.compile(r"\b(section\s+\d+[A-Za-z]?|banking regulation act|rbi act|kyc|aml|pmla|fema)\b", re.IGNORECASE)
BANK_NAME_HINT = re.compile(r"\b(bank|nbfc|cooperative bank|small finance bank|payment bank|housing finance|credit)\b", re.IGNORECASE)

def event_signal(text):
    """Score how event-like ``text`` is.

    1.0 when it has both an action cue and a law reference, 0.7 for an action
    cue only, 0.5 for a law reference only, otherwise 0.0.
    """
    body = text or ""
    action_hit = ACTION_CUES.search(body) is not None
    law_hit = LAW_TOKENS.search(body) is not None
    if action_hit:
        return 1.0 if law_hit else 0.7
    return 0.5 if law_hit else 0.0

def bank_mention_signal(row):
    """Return 1.0 if the row names a bank (entity list or keyword in text), else 0.0."""
    entities = row.get("entities") or {}
    if isinstance(entities, dict) and entities.get("banks", []):
        return 1.0
    if BANK_NAME_HINT.search((row.get("text") or "")):
        return 1.0
    return 0.0

def refine_relevance(row):
    """Combine embedding relevance with event/bank signals into one score.

    Rows with neither an event signal nor a bank mention are capped at 0.25;
    all others blend the embedding score (used for both the embedding and the
    ML term, as no trained relevance model is plugged in) with the event
    signal, capped at 1.0. The result is rounded to three decimals.
    """
    emb_score = float(row.get("relevance_embed") or 0.0)
    # Placeholder for a trained relevance model: reuse the embedding score.
    ml_score = emb_score
    event = event_signal(row.get("text"))
    bank = bank_mention_signal(row)
    if event != 0.0 or bank != 0.0:
        blended = min(1.0, 0.4 * emb_score + 0.4 * ml_score + 0.2 * event)
    else:
        blended = min(0.25, 0.5 * emb_score + 0.5 * ml_score)
    return round(blended, 3)

# Score both content frames; the column is added in place.
for frame in (df_p_content, df_s_content):
    frame["relevance_prob_ml"] = frame.apply(refine_relevance, axis=1)


In [None]:
# Step 13: Mark secondary corroboration vs primary
def mark_corroboration(df_primary, df_secondary, threshold=0.90):
    """Flag secondary rows whose text closely matches some primary row.

    A secondary row gets ``is_corroboration = True`` when the highest dot
    product between its embedding and any primary embedding (first 1200
    characters of each text, normalised vectors) is at least ``threshold``.
    An empty secondary frame is returned unchanged; with no primary rows every
    secondary row is flagged False. ``df_secondary`` is modified in place.
    """
    if df_secondary.empty:
        return df_secondary
    if df_primary.empty:
        df_secondary["is_corroboration"] = False
        return df_secondary
    primary_vecs = sbert_encode_batch(df_primary["text"].str.slice(0,1200).tolist(), normalize=True, batch_size=64)
    secondary_vecs = sbert_encode_batch(df_secondary["text"].str.slice(0,1200).tolist(), normalize=True, batch_size=64)
    df_secondary["is_corroboration"] = [
        float((primary_vecs @ vec).max()) >= threshold for vec in secondary_vecs
    ]
    return df_secondary

df_s_content = mark_corroboration(df_p_content, df_s_content, threshold=0.90)


In [None]:
# Step 14: Severity channels
REG_ACTION_CUES = re.compile(r"\b(show[- ]cause|censure|admonish(ed|ment)?|cease and desist|prohibit(ed)?|restrain(ed)?|order(ed)?|direction(s)?|non[- ]compliance|contravention|violat(ed|ion))\b", re.IGNORECASE)
CUSTOMER_IMPACT_CUES = re.compile(r"\b(outage|downtime|service disruption|transaction failure|refund|chargeback surge|system incident|breach|cyber|compromise|data leak|security incident|phishing)\b", re.IGNORECASE)
REPUTATION_CUES = re.compile(r"\b(advisory issued|public notice|warning to customers|misconduct|malpractice|mis-selling|investigation initiated)\b", re.IGNORECASE)

def regulatory_action_severity(text: str) -> float:
    """Severity from regulatory-action wording in the first 8000 characters.

    0.45 for an action cue plus a law reference, 0.30 for an action cue only,
    0.15 for a law reference only, otherwise 0.0.
    """
    if not text:
        return 0.0
    window = text[:8000]
    action_hit = REG_ACTION_CUES.search(window) is not None
    law_hit = LAW_TOKENS.search(window) is not None
    if action_hit:
        return 0.45 if law_hit else 0.30
    return 0.15 if law_hit else 0.0

def customer_impact_severity(text: str) -> float:
    """Severity from customer-impact wording: 0.35 base, +0.10 for scale words, max 0.55."""
    if not text:
        return 0.0
    window = text[:8000]
    if not CUSTOMER_IMPACT_CUES.search(window):
        return 0.0
    magnitude = 0.35
    # Duration / reach words suggest a larger incident.
    if re.search(r"\b(hours|days|nationwide|across India|millions|lakhs)\b", window, re.I):
        magnitude += 0.10
    return min(0.55, magnitude)

def reputational_severity(text: str) -> float:
    """Return 0.25 if the first 8000 characters contain a reputational cue, else 0.0."""
    if not text:
        return 0.0
    window = text[:8000]
    if REPUTATION_CUES.search(window):
        return 0.25
    return 0.0

def monetary_severity(amount, penalty_cue=False) -> float:
    """Log-scaled severity of a fine: ``log10(1 + amount) / 8`` capped at 1.0.

    Without a positive amount, returns 0.05 when a penalty cue was seen and
    0.0 otherwise.
    """
    if not (amount and amount > 0):
        return 0.05 if penalty_cue else 0.0
    return min(1.0, math.log10(1 + float(amount)) / 8.0)

def composite_severity(row) -> dict:
    """Compute the four severity channels for ``row`` and their combined value.

    The combined ``severity`` is the maximum of the monetary channel and the
    down-weighted regulatory (0.9), customer (0.85) and reputational (0.7)
    channels. All values are rounded to three decimals.
    """
    body = row.get("text") or ""
    monetary = monetary_severity(row.get("fine_amount_in_inr"), bool(row.get("penalty_cue")))
    regulatory = regulatory_action_severity(body)
    customer = customer_impact_severity(body)
    reputational = reputational_severity(body)

    combined = max(monetary, 0.9*regulatory, 0.85*customer, 0.7*reputational)
    return {
        "sev_monetary": round(monetary,3),
        "sev_regulatory": round(regulatory,3),
        "sev_customer": round(customer,3),
        "sev_reputational": round(reputational,3),
        "severity": round(combined,3)
    }


In [None]:
# Step 15: Risk scoring
# Scoring parameters pulled from the config.
POL_ADJ = SCORE_CFG["polarity_adjust"]
W = SCORE_CFG["risk_weights"]
AUTH_W = SCORE_CFG["authority"]
NO_EVENT_CAP = float(SCORE_CFG["no_event_cap"])
STRUCT_URL_PATTERNS = SCORE_CFG["structural_url_patterns"]
# Case-insensitive alternation of the structural title cues. Both names are
# kept; they refer to the same pattern.
STRUCT_TITLE_CUES = re.compile(
    r"(" + "|".join(map(re.escape, SCORE_CFG["structural_title_cues"])) + r")",
    re.I,
)
STRUCT_CUES_TITLE = STRUCT_TITLE_CUES
def recency_decay(iso_date):
    """Exponential recency weight in (0, 1] with a 90-day time constant.

    Returns 0.5 (neutral) when the date is missing or cannot be parsed. Dates
    in the future are treated as "today" (weight 1.0): a negative age would
    otherwise give a weight above 1, growing without bound.
    """
    if not iso_date:
        return 0.5
    try:
        try:
            # Fast path for plain ISO dates.
            published = dt.date.fromisoformat(iso_date)
        except ValueError:
            # Fall back to the lenient parser for any other format.
            published = dateparser.parse(iso_date).date()
    except (TypeError, ValueError, OverflowError):
        return 0.5
    age_days = max(0, (dt.date.today() - published).days)
    return math.exp(-age_days / 90.0)

def jurisdiction_weight(row):
    """Return 1.0 when the row's source name mentions RBI or SEBI, else 0.8."""
    source = (row.get("source") or "").lower()
    for regulator in ("rbi", "sebi"):
        if regulator in source:
            return 1.0
    return 0.8

def doc_authority(row):
    """Look up the authority weight for the row's source tier and document type.

    Rows whose source starts with "news/secondary" (case-insensitive) use the
    secondary weights, all others the primary weights; ``doctype == "PDF"``
    selects the PDF weight, anything else the HTML weight.
    """
    source = (row.get("source") or "").lower()
    is_pdf = row.get("doctype")=="PDF"
    if source.startswith("news/secondary"):
        key = "secondary_pdf" if is_pdf else "secondary_html"
    else:
        key = "primary_pdf" if is_pdf else "primary_html"
    return AUTH_W[key]

def is_structural_row(row):
    """Return True if the row's URL or title marks it as a structural page.

    The URL is checked first for any configured structural substring; only if
    none matches is the title searched for a structural cue.
    """
    url = row.get("url", "")
    title = row.get("title", "")
    url_hit = any(pattern in url for pattern in STRUCT_URL_PATTERNS)
    return url_hit or bool(STRUCT_TITLE_CUES.search(title))

def compute_risk(row):
    """Return ``(risk_score, factors)`` for one document row.

    Structural pages get a fixed 0.05. Otherwise the score is the weighted sum
    of severity, recency, jurisdiction, both relevance terms and authority,
    plus the polarity adjustment, clamped to ``[0, cap]`` where ``cap`` is
    ``NO_EVENT_CAP`` for rows with zero severity and 1.0 for all others.
    ``factors`` records every component for QA.
    """
    # Demote structural pages outright
    if is_structural_row(row):
        return 0.05, {"note": "structural_page_demoted"}

    severity_parts = composite_severity(row)
    severity = severity_parts["severity"]

    recency = recency_decay(row.get("published_date"))
    jurisdiction = jurisdiction_weight(row)
    embed_rel = max(0.0, min(1.0, float(row.get("relevance_embed") or 0.0)))
    ml_rel = max(0.0, min(1.0, float(row.get("relevance_prob_ml") or 0.0)))
    authority = doc_authority(row)

    polarity_shift = POL_ADJ.get(row.get("impact_polarity", "Neutral"), 0.0)

    # Secondary coverage that closely matches a primary document gets a small
    # recency bump.
    from_secondary = (row.get("source") or "").lower().startswith("news/secondary")
    if from_secondary and row.get("is_corroboration"):
        recency = min(1.0, recency + 0.05)

    # Rows with no severity signal at all cannot exceed the no-event cap.
    cap = NO_EVENT_CAP if severity == 0.0 else 1.0

    raw_score = (
        W["severity"]*severity +
        W["recency"]*recency +
        W["jurisdiction"]*jurisdiction +
        W["relevance_embed"]*embed_rel +
        W["relevance_ml"]*ml_rel +
        W["authority"]*authority +
        polarity_shift
    )
    score = max(0.0, min(cap, raw_score))

    factors = dict(severity_parts)
    factors.update({
        "recency": round(recency,3),
        "jurisdiction": round(jurisdiction,3),
        "relevance_embed": round(embed_rel,3),
        "relevance_ml": round(ml_rel,3),
        "authority": round(authority,2),
        "polarity_adj": polarity_shift,
        "no_event_cap": cap,
        "corroboration": bool(row.get("is_corroboration", False))
    })
    return round(score,3), factors


In [None]:
# Step 16: Analyze, merge, score
def analyze(df):
    """Add polarity, risk score, risk factors and a short summary to ``df``.

    ``df`` is modified in place and returned. Polarity is currently a constant
    "Neutral". The summary is the three sentences (among the first 40) most
    similar to a fixed question about action, amount, authority and reason;
    texts with three or fewer sentences are summarised by their first 600
    characters.
    """
    if df.empty:
        df["impact_polarity"] = "Neutral"
        df["risk_score"] = 0.0
        df["risk_factors"] = {}
        df["summary"] = ""
        return df
    # Polarity simple default
    df["impact_polarity"] = "Neutral"
    # Risk
    df["risk_score"], df["risk_factors"] = zip(*df.apply(compute_risk, axis=1))

    # The query is the same for every document, so embed it once here instead
    # of once per row inside the summariser.
    query_vec = sbert.encode(
        "What is the action, amount, authority, and reason?",
        normalize_embeddings=True,
    )

    # Simple extractive summary
    def extractive_summary(text, n=3):
        sents = re.split(r"(?<=[.!?])\s+", (text or "").strip())
        if len(sents) <= n:
            return (text or "")[:600]
        cand = [
            (util.cos_sim(query_vec, sbert.encode(s[:512], normalize_embeddings=True)).item(), s)
            for s in sents[:40]
        ]
        return " ".join([s for _, s in sorted(cand, key=lambda x: -x[0])[:n]])

    df["summary"] = df["text"].apply(lambda t: extractive_summary(t, n=3))
    return df

# Score both frames, then stack them into one table.
df_p_an = analyze(df_p_content)
df_s_an = analyze(df_s_content)

df_all = pd.concat([df_p_an, df_s_an], ignore_index=True)
print("Merged analyzed rows:", len(df_all))

# QA: Top-N panel (15 highest risk scores)
qa_cols = ["source","doctype","published_date","title","fine_amount_in_inr","penalty_cue","relevance_prob_ml","risk_score","url"]
top_rows = df_all.sort_values("risk_score", ascending=False).head(15)
print(top_rows[qa_cols].to_string(index=False))

# Export
cols = ["source","section","title","url","published_date","doctype",
        "labels_zero_shot","penalty_cue","fine_amount_in_inr",
        "relevance_embed","relevance_prob_ml","impact_polarity",
        "is_corroboration","risk_score","risk_factors","summary"]

# Ensure columns exist even for empty branches
for col in cols:
    if col not in df_all.columns:
        df_all[col] = None

# Highest risk first; ties broken by the most recent published_date.
out = df_all[cols].sort_values(["risk_score","published_date"], ascending=[False, False])
out.to_csv(OUTPUTS["csv"], index=False)
out.to_excel(OUTPUTS["xlsx"], index=False)
print("Exported:", len(out), "rows ->", OUTPUTS["csv"], "|", OUTPUTS["xlsx"])


Merged analyzed rows: 240
  source doctype published_date                                        title  fine_amount_in_inr  penalty_cue  relevance_prob_ml  risk_score                                                                                          url
RBI/SEBI    HTML     2025-01-01                               Press Releases        3.600000e+11         True              0.566       0.707                                       https://rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx
RBI/SEBI     PDF     2025-01-01 f0e51f4ade02656d855d07181a33c840e51cb8d3.pdf        5.000000e+05         True              0.590       0.607 https://rbidocs.rbi.org.in/rdocs/PressRelease/PDFs/PR913B249BA47101C47D983A6DB7BCF55C2BC.PDF
RBI/SEBI     PDF     2025-01-01 6740dc5fc166b49921f0e8d34f7b543562e4248b.pdf        4.200000e+05         True              0.596       0.606 https://rbidocs.rbi.org.in/rdocs/PressRelease/PDFs/PR9109D0BEA4645754F099B0CBFC383ECA055.PDF
RBI/SEBI    HTML     2025-01-01       