In [29]:
# === FINAL SCHEME RECOMMENDER (5 RESULTS, NO HEALTH/AGRI/OTHER-STATE JUNK) ===

import pandas as pd
import re
from dataclasses import dataclass
from typing import List
from urllib.parse import urlparse

# ---------------- 1. Load data ----------------
DRIVE_FILE_ID = "1zCcZjIIjDWmNDfSTZ1eeBD6z98uOaSw7"
CSV_URL = f"https://drive.google.com/uc?id={DRIVE_FILE_ID}"

REQUIRED_COLS = [
    "Scheme_Name","Min_Age","Max_Age","Gender_Eligibility","Min_Education",
    "Area","State","Target_Group","Application_Link","Summary"
]

def load_dataset(url: str) -> pd.DataFrame:
    df = pd.read_csv(url)
    for c in REQUIRED_COLS:
        if c not in df.columns:
            df[c] = ""
    df["Min_Age"] = pd.to_numeric(df["Min_Age"], errors="coerce")
    df["Max_Age"] = pd.to_numeric(df["Max_Age"], errors="coerce")
    return df

DF = load_dataset(CSV_URL)

# ---------------- 2. Fix missing defaults ----------------
def fix_defaults(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Min_Age"] = df["Min_Age"].fillna(0)
    df["Max_Age"] = df["Max_Age"].replace(0, None).fillna(200)
    df["Gender_Eligibility"] = df["Gender_Eligibility"].replace("", "any")
    df["Min_Education"] = df["Min_Education"].replace("", "any")
    df["Area"] = df["Area"].replace("", "any")
    df["State"] = df["State"].replace("", "any")
    df["Target_Group"] = df["Target_Group"].fillna("")
    df["Summary"] = df["Summary"].fillna("")
    df["Application_Link"] = df["Application_Link"].fillna("")
    df["Scheme_Name"] = df["Scheme_Name"].fillna("")
    return df

DF = fix_defaults(DF)

# ---------------- 3. Config ----------------
EDU_ORDER = ["class 8","class 10","class 12","graduate","postgraduate","phd","any"]
EDU_RANK  = {e:i for i,e in enumerate(EDU_ORDER)}

TAG_KEYWORDS = {
    "Student":      ["student","scholar","nsp","inspire","school","college"],
    "Unemployed":   ["unemployed","rojgar","ncs","employment"],
    "Youth":        ["youth","skill","pmkvy","apprentice","internship"],
    "Women":        ["women","woman","mahila","beti","kanya","ladli","girl"],
    "Entrepreneur": ["startup","entrepreneur","mudra","pmegp","odop","udyam","pm-fme","msme"],
    "Farmer":       ["farmer","kisan","agri","agriculture","crop","horticulture","irrigation","dairy","bamboo","rythu"]
}

# list of Indian states for name-based state matching
INDIAN_STATES = [
    "andhra pradesh","arunachal pradesh","assam","bihar","chhattisgarh","goa",
    "gujarat","haryana","himachal pradesh","jharkhand","karnataka","kerala",
    "madhya pradesh","maharashtra","manipur","meghalaya","mizoram","nagaland",
    "odisha","orissa","punjab","rajasthan","sikkim","tamil nadu","telangana",
    "tripura","uttar pradesh","uttarakhand","west bengal","delhi","ladakh",
    "jammu","kashmir","jammu and kashmir"
]

@dataclass
class Profile:
    age: int
    gender: str
    education: str
    area: str
    state: str
    tags: List[str]

# ---------------- 4. Helpers ----------------
def norm(x) -> str:
    try:
        return str(x).lower().strip()
    except:
        return ""

def is_central(state: str) -> bool:
    s = norm(state)
    return s in ("", "any") or "all india" in s or "pan india" in s or "pan-india" in s

def edu_ok(user_ed: str, req_ed: str) -> bool:
    req = norm(req_ed)
    if req not in EDU_RANK:
        return True
    return EDU_RANK.get(norm(user_ed), -1) >= EDU_RANK[req]

def gender_ok(row_gender: str, user_gender: str) -> bool:
    g = norm(row_gender)
    return g in ("", "any") or g == norm(user_gender)

def link_score(link: str) -> float:
    host = urlparse(str(link)).netloc.lower()
    if host.endswith(".gov.in") or host.endswith(".nic.in"):
        return 2.0
    if host.endswith(".gov") or host.endswith(".org"):
        return 1.0
    return 0.3

def combined_text(row: pd.Series) -> str:
    text = norm(row.get("Scheme_Name","")) + " " + norm(row.get("Summary","")) + " " + norm(row.get("Target_Group",""))
    return re.sub(r"[^a-z]", "", text)

def is_health_scheme(row: pd.Series) -> bool:
    text = combined_text(row)
    health_words = [
        "health","disease","hospital","mental","tobacco","virus",
        "cancer","vector","medical","ayush","covid"
    ]
    return any(w in text for w in health_words)

def is_agri_scheme(row: pd.Series) -> bool:
    text = combined_text(row)
    agri_words = [
        "kisan","farmer","agri","agriculture","crop","pmksy","rkvy",
        "oilseed","horticulture","irrigation","livestock","dairy","bamboo","rythu"
    ]
    return any(w in text for w in agri_words)

def is_other_state_specific(row: pd.Series, u: Profile) -> bool:
    """
    Block schemes that clearly mention some other state in their NAME,
    e.g. 'Rythu Bandhu (Telangana)' when user_state = 'bihar'.
    """
    name = norm(row.get("Scheme_Name",""))
    user_state = norm(u.state)
    # if scheme name contains some state and it is not the user's state, block it
    mentioned_states = [s for s in INDIAN_STATES if s in name]
    if not mentioned_states:
        return False
    # if at least one mentioned state is not user's state -> treat as other-state
    for st in mentioned_states:
        if st not in user_state:  # simple check
            # but also: if user_state is not that state
            if user_state not in st:
                return True
    return False

def tag_hit(row: pd.Series, u: Profile) -> bool:
    text = norm(row.get("Scheme_Name","")) + " " + norm(row.get("Target_Group","")) + " " + norm(row.get("Summary",""))
    for t in u.tags:
        for kw in TAG_KEYWORDS.get(t, []):
            if kw in text:
                return True
    return False

# ---------------- 5. Filters ----------------
def passes_core_filters(row: pd.Series, u: Profile) -> bool:
    if not is_central(row["State"]) and norm(u.state) not in norm(row["State"]):
        return False
    if not (row["Min_Age"] <= u.age <= row["Max_Age"]):
        return False
    if not edu_ok(u.education, row["Min_Education"]):
        return False
    if not gender_ok(row["Gender_Eligibility"], u.gender):
        return False
    return True

def allowed_category(row: pd.Series, u: Profile) -> bool:
    # block clearly health schemes unless special health-related tags
    if is_health_scheme(row):
        if ("Pregnant Women" not in u.tags and "Senior Citizen" not in u.tags):
            return False
    # block agriculture schemes unless Farmer tag
    if is_agri_scheme(row):
        if "Farmer" not in u.tags:
            return False
    # block schemes that explicitly mention a different state in the name
    if is_other_state_specific(row, u):
        return False
    return True

def strong_match(row: pd.Series, u: Profile) -> bool:
    return tag_hit(row, u)

def mild_match(row: pd.Series, u: Profile) -> bool:
    if tag_hit(row, u):
        return True
    if gender_ok(row["Gender_Eligibility"], u.gender):
        return True
    if is_central(row["State"]) or norm(u.state) in norm(row["State"]):
        return True
    return False

def score_row(row: pd.Series, u: Profile) -> float:
    s = 0.0
    if gender_ok(row["Gender_Eligibility"], u.gender):
        s += 3.0
    if is_central(row["State"]):
        s += 3.0
    if tag_hit(row, u):
        s += 2.0
    s += link_score(row["Application_Link"])
    return s

# ---------------- 6. Recommender ----------------
def recommend_schemes(df: pd.DataFrame, u: Profile, k: int = 5) -> pd.DataFrame:
    df = df.copy()

    # Step 1: strong matches (core + allowed + tag-based)
    mask_strong = df.apply(
        lambda r: passes_core_filters(r, u) and allowed_category(r, u) and strong_match(r, u),
        axis=1
    )
    strong = df[mask_strong].copy()
    if not strong.empty:
        strong["__score"] = strong.apply(lambda r: score_row(r, u), axis=1)
        strong = strong.sort_values("__score", ascending=False)
    base = strong

    # Step 2: mild matches if we need more
    need = k - len(base)
    if need > 0:
        mask_mild = df.apply(
            lambda r: passes_core_filters(r, u) and allowed_category(r, u) and mild_match(r, u),
            axis=1
        )
        mild = df[mask_mild].copy()
        if not mild.empty:
            mild = mild[~mild["Scheme_Name"].isin(base["Scheme_Name"])]
            if not mild.empty:
                mild["__score"] = mild.apply(lambda r: score_row(r, u), axis=1)
                mild = mild.sort_values("__score", ascending=False).head(need)
                base = pd.concat([base, mild], ignore_index=True)

    # Step 3: if still not enough, relax to core+allowed only
    need = k - len(base)
    if need > 0:
        mask_core = df.apply(
            lambda r: passes_core_filters(r, u) and allowed_category(r, u),
            axis=1
        )
        core_extra = df[mask_core].copy()
        if not core_extra.empty:
            core_extra = core_extra[~core_extra["Scheme_Name"].isin(base["Scheme_Name"])]
            if not core_extra.empty:
                core_extra["__score"] = core_extra.apply(lambda r: score_row(r, u), axis=1)
                core_extra = core_extra.sort_values("__score", ascending=False).head(need)
                base = pd.concat([base, core_extra], ignore_index=True)

    if base.empty:
        return base

    if "__score" in base.columns:
        base = base.drop(columns=["__score"])
    # try to return up to k; if less, it's because everything else is blocked
    return base.head(k)

# ---------------- 7. CLI Input ----------------
def ask(msg: str) -> str:
    try:
        return input(msg + ": ").strip()
    except EOFError:
        return ""

# Age
while True:
    a = ask("Enter Age")
    if a.isdigit() and 0 < int(a) < 150:
        age = int(a)
        break
    print("Invalid age. Please enter a number between 1â€“149.")

# Gender
while True:
    gender = ask("Gender (Male/Female/Other)").lower()
    if gender in ("male","female","other"):
        break
    print("Invalid gender. Type exactly: Male / Female / Other.")

# Education
while True:
    education = ask("Education (Class 8/Class 10/Class 12/Graduate/Postgraduate/PhD/Any)").lower()
    if education in EDU_ORDER:
        break
    print("Invalid education. Choose exactly from the list.")

# Area
while True:
    area = ask("Area (Any/Rural/Urban/Coastal/Tribal)").lower()
    if area in ("any","rural","urban","coastal","tribal"):
        break
    print("Invalid area. Choose: Any, Rural, Urban, Coastal, Tribal.")

# State
while True:
    state = ask("State (e.g., Bihar, Maharashtra)").strip()
    if state.replace(" ","").isalpha():
        state = state.lower()
        break
    print("Invalid state. Only letters allowed.")

print("\nAllowed Tags:")
print(", ".join(TAG_KEYWORDS.keys()))

while True:
    raw = ask("Tags (comma separated; optional)")
    if raw.strip() == "":
        tags = []
        break
    items = [t.strip().title() for t in raw.split(",") if t.strip()]
    valid = [x for x in items if x.lower() in [y.lower() for y in TAG_KEYWORDS.keys()]]
    if valid:
        tags = valid
        break
    print("One or more tags are invalid. Use only from the allowed list.")

user_profile = Profile(
    age=age,
    gender=gender,
    education=education,
    area=area,
    state=state,
    tags=tags
)

top = recommend_schemes(DF, user_profile, k=5)

print("\n=== Recommended Schemes ===\n")
if top.empty:
    print("No relevant schemes found for this profile.")
else:
    for _, r in top.iterrows():
        print(r["Scheme_Name"])
        print(r["Application_Link"])
        print()


Enter Age: 19
Gender (Male/Female/Other): malw
Invalid gender. Type exactly: Male / Female / Other.
Gender (Male/Female/Other): male
Education (Class 8/Class 10/Class 12/Graduate/Postgraduate/PhD/Any): class 12
Area (Any/Rural/Urban/Coastal/Tribal): Urban
State (e.g., Bihar, Maharashtra): Uttar Pradesh

Allowed Tags:
Student, Unemployed, Youth, Women, Entrepreneur, Farmer
Tags (comma separated; optional): Student, Youth

=== Recommended Schemes ===

Skill India Mission (MSDE)
https://www.skillindiadigital.gov.in/

INSPIRE Scholarship (SHE)
https://online-inspire.gov.in/

PM DAKSH (DoSJE)
https://pmdaksh.dosje.gov.in/

Prime Minister's Research Fellowship (PMRF)
https://www.pmrf.in/

National e-Governance Plan (NeGP)
https://www.meity.gov.in/divisions/national-e-governance-plan

