In [5]:
import os
import random
import pandas as pd
import numpy as np

# Fix the seeds of Python's `random` module and NumPy's legacy global RNG so
# that any stochastic step in later cells is repeatable across kernel restarts.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Input / output locations. These are relative paths, so they resolve against
# the kernel's current working directory.
CSV_PATH = "updated_data.csv"
# NOTE(review): TRAIN_PATH and TEST_PATH are not referenced in the cells shown
# here -- presumably used by a train/test split elsewhere; confirm or remove.
TRAIN_PATH = "train_updated.csv"
TEST_PATH = "test_updated.csv"

# Load the raw dataset and report its size as a quick sanity check.
df = pd.read_csv(CSV_PATH)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")


Dataset loaded: 3400 rows, 11 columns


In [6]:
# Preprocessing: build the combined, lower-cased text field used for TF-IDF
def preprocess(df):
    """Return a copy of ``df`` with a combined, lower-cased ``text_for_nlp`` column.

    The text is the concatenation, separated by ". ", of the columns
    ``scheme_name``, ``details``, ``benefits``, ``eligibility``,
    ``application``, ``documents`` and ``tags`` (in that order).

    Robustness notes:
      * A column that is absent from ``df`` contributes an empty string
        instead of raising (the previous ``df.get(col, "").astype(str)``
        failed with ``AttributeError`` because the fallback was a plain str).
      * Missing values (NaN/None) contribute an empty string instead of the
        literal token "nan", which would otherwise leak into the vocabulary.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the extra ``text_for_nlp`` column.
    """
    df = df.copy()
    text_columns = (
        "scheme_name", "details", "benefits", "eligibility",
        "application", "documents", "tags",
    )

    parts = []
    for col in text_columns:
        if col in df.columns:
            parts.append(df[col].fillna("").astype(str))
        else:
            # Placeholder aligned on df's index so concatenation stays row-wise.
            parts.append(pd.Series("", index=df.index, dtype=object))

    combined = parts[0]
    for part in parts[1:]:
        combined = combined + ". " + part

    df["text_for_nlp"] = combined.str.lower()
    return df

# Prepare the dataset for vectorization: rebind `df` to the preprocessed copy
# that carries the extra `text_for_nlp` column, then report the row count.
df = preprocess(df)
print(f"Text prepared for {len(df)} records.")

Text prepared for 3400 records.


In [7]:
# Train TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, vocabulary capped at 30000 features,
# with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(max_features=30000, ngram_range=(1, 2), stop_words='english')
# Fit on the combined text column; row i of the matrix corresponds to row i of `df`.
tfidf_matrix = vectorizer.fit_transform(df["text_for_nlp"])
# Snapshot of the frame the matrix was built from, so that positional lookups
# (`df_full.iloc[i]`) stay aligned with matrix rows even if `df` is rebound later.
df_full = df.copy()
print("Vectorizer trained and matrix built.")

Vectorizer trained and matrix built.


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import re


def _norm(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Topic facets: facet name -> list of trigger terms/phrases.
# Terms are lower-case and punctuation-free (e.g. 'co op', 'women led'), i.e.
# written the way they look after `_norm()` has been applied to the text.
CATEGORIES = {
    'education': ['education', 'educational', 'school', 'college', 'university', 'universities', 'scholarship', 'scholarships', 'tuition', 'higher education'],
    'business': ['business', 'enterprise', 'entrepreneur', 'entrepreneurship', 'startup', 'msme', 'microenterprise', 'small business', 'company', 'firm'],
    'agriculture': ['agriculture', 'agricultural', 'farm', 'farming', 'farmer', 'farmers', 'horticulture', 'dairy', 'livestock', 'fisheries', 'aquaculture', 'agro'],
    'health': ['health', 'healthcare', 'hospital', 'medical', 'medicine'],
    'women': ['women', 'woman', 'female', 'girls', 'girl', 'women entrepreneurs', 'women led'],
    'youth': ['youth', 'young', 'young entrepreneur', 'youth led', 'student', 'students'],
    'loan': ['loan', 'credit', 'finance', 'funding', 'loan facility', 'soft loan', 'subsidy', 'grant'],
    'market': ['market', 'marketing', 'market access', 'value chain', 'supply chain', 'cold chain', 'processing', 'retail'],
    'cooperative': ['cooperative', 'co op', 'cooperative society', 'self help group', 'shg'],
}

# Geographic facets: geography name -> list of trigger terms/phrases
# (same normalised form as CATEGORIES).
GEO = {
    'rural': ['rural','village','village level','rural area','rural development','remote','tribal','underserved'],
    'urban': ['urban','city','town','municipal','metropolitan']
}

# Terms that mark a query or document as policy/regulation related.
POLICY = ['policy','policies','guideline','guidelines','rule','rules','act','framework','regulation','regulations']

# Generic words to ignore when working with tags.
# NOTE(review): not referenced by any function in the visible part of this
# notebook -- presumably consumed elsewhere; confirm it is still needed.
TAG_STOP = {'scheme','schemes','india','indian','government','govt','department','state','central','na','n a','-'}


def _present_facets(query: str):
    """Detect which topic facets, geographies and policy terms a query mentions.

    The query is normalised with ``_norm`` and each trigger term is matched as
    a whole word or phrase, optionally followed by a plain "s"/"es" plural.
    Raw substring matching is deliberately avoided: it made "electricity"
    trigger the urban term "city", "impact" trigger the policy term "act",
    "confirm" trigger the business term "firm", and so on, which in turn
    switched on filters and penalties the user never asked for.

    Args:
        query: Free-text user query.

    Returns:
        Tuple ``(facets, geos, policy)`` where ``facets`` is a set of
        CATEGORIES keys, ``geos`` a set of GEO keys and ``policy`` a bool.
    """
    qn = _norm(query)

    def _mentions(term: str) -> bool:
        # The normalised text only contains [a-z0-9 ], so "not preceded /
        # followed by an alphanumeric" is an exact word-boundary test here.
        pattern = rf"(?<![a-z0-9]){re.escape(term)}(?:s|es)?(?![a-z0-9])"
        return re.search(pattern, qn) is not None

    facets = {cat for cat, syns in CATEGORIES.items() if any(_mentions(w) for w in syns)}
    geos = {g for g, syns in GEO.items() if any(_mentions(w) for w in syns)}
    policy = any(_mentions(w) for w in POLICY)
    return facets, geos, policy


def _row_norm_fields(row):
    """Return the normalised searchable fields of one record.

    ``row`` only needs a ``.get(key, default)`` method. The result maps each
    of 'tags', 'scheme_name', 'details', 'benefits' and 'eligibility' (in that
    order) to its ``_norm``-ed text; a missing key is treated as ''.
    """
    field_names = ('tags', 'scheme_name', 'details', 'benefits', 'eligibility')
    return {name: _norm(row.get(name, '')) for name in field_names}


def _row_has_any(row_fields: dict, terms: list) -> bool:
    corpus = " ".join(row_fields.values())
    return any(t in corpus for t in terms)


def _doc_has_policy(row_fields: dict) -> bool:
    """Tell whether the record's joined text contains any POLICY term.

    Matching is plain substring containment on the space-joined field values.
    """
    text = " ".join(row_fields.values())
    for term in POLICY:
        if term in text:
            return True
    return False


def _doc_covers_all_facets(row_fields: dict, facets: set) -> bool:
    if not facets:
        return True
    corpus = " ".join(row_fields.values())
    for facet in facets:
        if not any(w in corpus for w in CATEGORIES.get(facet, [])):
            return False
    return True


def _collect_terms(keys, mapping):
    seen, out = set(), []
    for k in keys:
        for t in mapping.get(k, []):
            if t not in seen:
                seen.add(t)
                out.append(t)
    return out


def _rerank(indices, sims, tokens, phrases, present_facets=None):
    boosts = {'tags': 0.35, 'scheme_name': 0.35, 'details': 0.15, 'benefits': 0.10, 'eligibility': 0.10}
    phrase_boosts = {'scheme_name': 0.70, 'tags': 0.55, 'details': 0.40, 'benefits': 0.30, 'eligibility': 0.30}
    wanted = present_facets or set()
    scored = []
    for idx in indices:
        row = df_full.iloc[idx]
        f = _row_norm_fields(row)
        token_b = 0.0
        for t in tokens:
            if t in f['tags']: token_b += boosts['tags']
            if t in f['scheme_name']: token_b += boosts['scheme_name']
            if t in f['details']: token_b += boosts['details']
            if t in f['benefits']: token_b += boosts['benefits']
            if t in f['eligibility']: token_b += boosts['eligibility']
        phrase_b = 0.0
        for p in phrases:
            if p in f['scheme_name']: phrase_b += phrase_boosts['scheme_name']
            if p in f['tags']: phrase_b += phrase_boosts['tags']
            if p in f['details']: phrase_b += phrase_boosts['details']
            if p in f['benefits']: phrase_b += phrase_boosts['benefits']
            if p in f['eligibility']: phrase_b += phrase_boosts['eligibility']
        # penalty, to avoid off-topic docs
        penalty = 0.0
        if wanted:
            for cat, syns in CATEGORIES.items():
                if cat in wanted:
                    continue
                if _row_has_any(f, syns):
                    penalty += 0.40
        base = float(sims[idx])
        final = base + token_b + phrase_b - penalty
        scored.append((idx, final, base, token_b, phrase_b, penalty))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored


def answer_query(query: str, top_k: int = 5, candidate_multiplier: int = 8):
    """Retrieve, filter, re-rank and print the best-matching schemes for a query.

    Pipeline:
      1. Detect facets / geographies / policy intent in the query.
      2. Rank all rows by TF-IDF cosine similarity and keep the top
         ``max(top_k * candidate_multiplier, 100)`` candidates.
      3. Apply soft filters (geo, policy, multi-facet coverage); a filter is
         dropped again if it would leave too few candidates.
      4. Re-rank the survivors with ``_rerank`` and print the top ``top_k``.

    Args:
        query: Free-text user query.
        top_k: Number of results to print; values <= 0 print no results.
        candidate_multiplier: Widens the candidate pool relative to ``top_k``.

    Returns:
        None. Results are printed to stdout.
    """
    present_facets, present_geos, policy_in_query = _present_facets(query)
    normalized_query = _norm(query)

    header = f"\n🔎 Query: {query}\nFacets: {sorted(list(present_facets)) or 'none'} | Geo: {sorted(list(present_geos)) or 'none'} | Policy: {policy_in_query}\n"
    no_match_msg = "No matching results. Tips: check spelling, reduce strict filters, or broaden query."

    # An empty (or punctuation-only) query has a zero TF-IDF vector: every
    # similarity is 0 and the "top" rows would be arbitrary. Report no match.
    if not normalized_query:
        print(header)
        print(no_match_msg)
        return

    # NOTE(review): `phrases` is never populated, so the phrase boosts in
    # `_rerank` are currently inactive (phrase score is always 0).
    phrases = []
    tokens = set(normalized_query.split())
    tokens.update(_collect_terms(present_facets, CATEGORIES))
    tokens.update(_collect_terms(present_geos, GEO))
    tokens = list(tokens)

    qv = vectorizer.transform([normalized_query])
    sims = cosine_similarity(qv, tfidf_matrix).flatten()

    cand = min(len(sims), max(top_k * candidate_multiplier, 100))
    cand_idx = list(sims.argsort()[::-1][:cand])

    # Normalising a row is the expensive part of filtering; do it once per
    # candidate and share the result between the filters below.
    fields_cache = {}

    def _fields(i):
        if i not in fields_cache:
            fields_cache[i] = _row_norm_fields(df_full.iloc[i])
        return fields_cache[i]

    #  geo filter (falls back to the unfiltered list if nothing survives)
    if present_geos:
        geo_terms = _collect_terms(present_geos, GEO)
        tmp = [i for i in cand_idx if _row_has_any(_fields(i), geo_terms)]
        cand_idx = tmp or cand_idx

    # policy filter (same fallback rule)
    if policy_in_query:
        tmp = [i for i in cand_idx if _doc_has_policy(_fields(i))]
        cand_idx = tmp or cand_idx

    # multi-facet filter: only applied when enough candidates cover every facet
    if len(present_facets) >= 2:
        tmp = [i for i in cand_idx if _doc_covers_all_facets(_fields(i), present_facets)]
        if len(tmp) >= max(top_k*2, 10):
            cand_idx = tmp

    ranked = _rerank(cand_idx, sims, tokens, phrases, present_facets=present_facets)

    print(header)
    # Clamp so a negative top_k cannot slice off the tail of the ranking.
    results = ranked[:max(top_k, 0)]
    if not results:
        print(no_match_msg)
        return
    for rank, (idx, final, base, token_b, phrase_b, penalty) in enumerate(results, start=1):
        row = df_full.iloc[idx]
        print(f"{rank}. {row.get('scheme_name','N/A')} | score={final:.3f} (sim={base:.3f}, token={token_b:.3f}, phrase={phrase_b:.3f}, penalty={penalty:.2f})")
        print(f"   📌 Tags       : {row.get('tags','N/A')}")
        print(f"   📌 Benefits   : {row.get('benefits','N/A')}")
        print(f"   📌 Eligibility: {row.get('eligibility','N/A')}\n")

# Example: print the three best matches for a health-related search
answer_query("health scheme", top_k=3)



🔎 Query: health scheme
Facets: ['health'] | Geo: none | Policy: False

1. Special Health Scheme for Primitive Tribes | score=3.042 (sim=0.292, token=2.750, phrase=0.000, penalty=0.00)
   📌 Tags       : Medical, Financial Assistance, Primitive Tribal, Healthcare
   📌 Benefits   : Focused healthcare services for primitive tribal communities. Distribution of funds to districts such as Dumka, Sahebganj, Pakur, and Godda. Utilization for better medical facilities, treatments, and preventive care programs. Priority to regions with high tribal populations and pressing health needs.
   📌 Eligibility: The applicant should belong to a primitive tribal community. The applicant should reside in districts covered under the scheme, including Dumka, Sahebganj, Pakur, and Godda.

2. Chief Minister's Comprehensive Health Insurance Scheme | score=3.001 (sim=0.101, token=3.300, phrase=0.000, penalty=0.40)
   📌 Tags       : Cashless Medical Treatment, Health Insurance, Economically-weaker Sections, Criti