<a href="https://colab.research.google.com/github/cn8972/Echo-Bot/blob/main/University_Recommender_Final2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ================== University Recommender — Colab + KaggleHub + Gradio (Single Cell) ==================
# 1) Installs kagglehub and gradio
# 2) Auto-downloads and loads Kaggle dataset: nitishabharathi/university-recommendation
# 3) Builds an interactive Gradio UI with a public share URL
# 4) Recognizes `univName` / `univname` / `univ_name` for university names
# =======================================================================================================

# -------- Install --------
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "gradio>=4.0.0", "pandas", "numpy", "kagglehub[pandas-datasets]"])

# -------- Imports --------
import os
import io
import glob
import typing as T
import numpy as np
import pandas as pd
import gradio as gr

# KaggleHub
import kagglehub
from kagglehub import KaggleDatasetAdapter

# ---------------------
# Column aliases (includes `univName`)
# ---------------------
COLUMN_MAP = {
    "university_name": [
        "University", "university", "University Name", "UNIVERSITY NAME",
        "College", "college", "College Name", "college_name",
        "Institution", "institution", "Institution Name", "inst_name",
        "University_Name", "university_name", "univ", "u_name", "name",
        "univName", "univname", "univ_name"
    ],
    "university_id": ["university_id", "uni_id", "college_id", "univ_id"],
    "rank": ["Rank", "ranking", "University Rating", "University_Rating", "Rank*", "rank", "rank_percentile"],
    "region": ["Region", "State", "Location", "Country", "region", "state", "location", "country"],
    "gre": ["GRE", "GRE Score", "GreScore", "GRE_Score", "GRE_New", "gre", "gre_score", "gre_new"],
    "toefl": ["TOEFL", "TOEFL Score", "TOEFL_Score", "toefl", "toefl_score", "ielts"],
    "gpa": ["GPA", "CGPA", "cgpa", "GPA_Score", "gpa", "c_gpa", "undergrad_gpa"],
    "sop": ["SOP", "Statement of Purpose", "SOP_Score", "sop", "sop_score"],
    "lor": ["LOR", "LOR ", "Letter of Recommendation", "LOR_Score", "lor", "lor_score"],
    "research": ["Research", "Research_Flag", "research", "research_flag"],
    "label": ["admit", "admission", "admission_decision", "decision", "accepted", "target", "label"]
}

WEIGHTS = {"rank": -1.0, "gre": 0.40, "toefl": 0.20, "gpa": 0.40, "sop": 0.10, "lor": 0.10, "research": 0.15}
DEFAULT_BANDS = {"reach": 0.40, "target": 0.70}
TOP_N_DEFAULT = 5

# ---------------------
# Utilities
# ---------------------
def _lc(s: str) -> str:
    return s.strip().lower().replace("\u00a0", " ")

def build_column_index(df: pd.DataFrame) -> dict:
    return {_lc(c): c for c in df.columns}

def find_col_ci(df: pd.DataFrame, candidates: T.List[str]) -> T.Optional[str]:
    idx = build_column_index(df)
    for cand in candidates:
        key = _lc(cand)
        if key in idx:
            return idx[key]
    return None

def first_existing(df: pd.DataFrame, aliases: T.List[str]) -> T.Optional[str]:
    return find_col_ci(df, aliases)

def minmax(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    if s.notna().sum() == 0 or s.max() == s.min():
        return pd.Series(np.zeros(len(s)), index=s.index)
    return (s - s.min()) / (s.max() - s.min())

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def normalize_user_prior(key: str, val: float) -> float:
    if val is None:
        return 0.0
    v = float(val)
    ranges = {"gre": (260.0, 340.0), "toefl": (0.0, 120.0), "gpa": (0.0, 4.0), "sop": (1.0, 5.0), "lor": (1.0, 5.0)}
    if key in ranges:
        lo, hi = ranges[key]
        if hi <= lo:
            return 0.5
        return float(np.clip((v - lo) / (hi - lo), 0.0, 1.0))
    if key == "research":
        return 1.0 if v > 0 else 0.0
    return 0.5

def validate_dataset(df: pd.DataFrame) -> None:
    has_name = first_existing(df, COLUMN_MAP["university_name"]) is not None
    has_uid  = first_existing(df, COLUMN_MAP["university_id"])   is not None
    if not (has_name or has_uid):
        raise ValueError(
            "University identifier not found. Include a name column (e.g., 'univName') or a 'university_id' column."
        )

def _is_numeric_scalar(v) -> bool:
    return isinstance(v, (int, float, np.integer, np.floating)) and pd.notna(v)

def make_university_display(df: pd.DataFrame, col_univ: T.Optional[str], col_uid: T.Optional[str]) -> pd.Series:
    if col_univ is not None:
        return df[col_univ].astype(str)
    if col_uid is not None:
        uid = pd.to_numeric(df[col_uid], errors="coerce")
        def _fmt(v):
            if _is_numeric_scalar(v):
                try:
                    return f"University #{int(v)}"
                except Exception:
                    return f"University {v}"
            return f"University {v}"
        return uid.map(_fmt)
    return pd.Series([f"University @row[{i}]" for i in df.index], index=df.index)

def build_features(df: pd.DataFrame, cols: dict, user_inputs: dict) -> T.Tuple[pd.DataFrame, dict]:
    feats = pd.DataFrame(index=df.index)
    detail = {"applied": {}}
    if cols.get("rank") is not None:
        rnorm = 1.0 - minmax(df[cols["rank"]])
        feats["rank_util"] = rnorm
        detail["applied"]["rank"] = {"kind": "item_only", "col": cols["rank"]}
    for key in ["gre", "toefl", "gpa", "sop", "lor", "research"]:
        user_val = user_inputs.get(key, None)
        colname = cols.get(key, None)
        if user_val is None:
            continue
        if key == "research":
            if colname is not None:
                item = pd.to_numeric(df[colname], errors="coerce").fillna(0)
                util = (item > 0).astype(float) * (1.0 if float(user_val) > 0 else 0.0)
            else:
                util = np.full(len(df), normalize_user_prior(key, user_val), dtype=float)
        else:
            if colname is not None:
                item = pd.to_numeric(df[colname], errors="coerce")
                item_norm = minmax(item)
                umin, umax = item.min(skipna=True), item.max(skipna=True)
                if pd.isna(umin) or pd.isna(umax) or umax == umin:
                    user_norm = 0.5
                else:
                    user_norm = float(np.clip((float(user_val) - umin) / ((umax - umin) + 1e-9), 0.0, 1.0))
                util = 1.0 - np.abs(item_norm - user_norm)
            else:
                util = np.full(len(df), normalize_user_prior(key, user_val), dtype=float)
        feats[f"{key}_util"] = util
        detail["applied"][key] = {"kind": "match" if colname is not None else "prior", "col": colname}
    return feats, detail

def score_items(feats: pd.DataFrame) -> T.Tuple[pd.Series, dict]:
    util = pd.Series(np.zeros(len(feats), dtype=float), index=feats.index)
    contribs = {}
    for key, w in WEIGHTS.items():
        col = "rank_util" if key == "rank" else f"{key}_util"
        if col in feats:
            term = w * feats[col].astype(float)
            util = util + term
            contribs[key] = term
    p = pd.Series(sigmoid(util.values), index=feats.index, name="prob")
    return p, contribs

def explain_top_contributors(contribs: dict, idx: int, top_k: int = 3) -> T.List[str]:
    rows = []
    for k, series in contribs.items():
        val = float(series.loc[idx])
        rows.append((k, val, abs(val)))
    rows.sort(key=lambda x: x[2], reverse=True)
    out = []
    for k, val, _ in rows[:top_k]:
        direction = "increased" if val >= 0 else "decreased"
        out.append(f"{k.upper()} {direction} the score ({val:+.3f}).")
    return out

def load_and_prepare(df: pd.DataFrame):
    validate_dataset(df)
    cols = {
        "university_name": first_existing(df, COLUMN_MAP["university_name"]),
        "university_id": first_existing(df, COLUMN_MAP["university_id"]),
        "rank": first_existing(df, COLUMN_MAP["rank"]),
        "region": first_existing(df, COLUMN_MAP["region"]),
        "gre": first_existing(df, COLUMN_MAP["gre"]),
        "toefl": first_existing(df, COLUMN_MAP["toefl"]),
        "gpa": first_existing(df, COLUMN_MAP["gpa"]),
        "sop": first_existing(df, COLUMN_MAP["sop"]),
        "lor": first_existing(df, COLUMN_MAP["lor"]),
        "research": first_existing(df, COLUMN_MAP["research"]),
        "label": first_existing(df, COLUMN_MAP["label"]),
    }
    uni_display = make_university_display(df, cols["university_name"], cols["university_id"])
    return df, cols, uni_display

def template_1_topn(df: pd.DataFrame, gre: float, toefl: float, gpa: float, sop: float, lor: float, research: int, top_n: int = TOP_N_DEFAULT):
    df, cols, uni_display = load_and_prepare(df)
    user = {"gre": gre, "toefl": toefl, "gpa": gpa, "sop": sop, "lor": lor, "research": research}
    feats, _ = build_features(df, cols, user)
    probs, contribs = score_items(feats)
    top_idx = probs.sort_values(ascending=False).head(int(top_n)).index
    rows = []
    for i, idx in enumerate(top_idx, 1):
        drivers = "; ".join(explain_top_contributors(contribs, idx, top_k=3))
        rows.append({
            "Rank #": i,
            "University": uni_display.loc[idx],
            "Predicted Admit Probability": float(probs.loc[idx]),
            "Drivers (Top 3)": drivers
        })
    out_df = pd.DataFrame(rows)
    names_only = "\n".join(f"- {u}" for u in out_df["University"].astype(str).tolist())
    return out_df, names_only

# ---------------------
# Kaggle dataset loader
# ---------------------
KAGGLE_DATASET_SLUG = "nitishabharathi/university-recommendation"

def _load_kaggle_df() -> tuple[pd.DataFrame, str]:
    """
    Download the Kaggle dataset locally and load a CSV.
    Returns (DataFrame, loaded_csv_path).
    Tries to locate a CSV automatically; prefers names containing 'original' if present.
    """
    # Download latest dataset locally (directory path)
    try:
        local_dir = kagglehub.dataset_download(KAGGLE_DATASET_SLUG)
    except Exception:
        # Fallback: attempt using the Pandas adapter with common file names
        for cand in ["Original.csv", "original.csv", "Admission_Predict.csv", "Admission_Predict_Ver1.1.csv"]:
            try:
                df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, KAGGLE_DATASET_SLUG, cand)
                return df, f"[adapter]{cand}"
            except Exception:
                pass
        raise RuntimeError("Could not download or load a CSV from the Kaggle dataset.")

    # Search for CSV files inside local_dir
    csvs = glob.glob(os.path.join(local_dir, "**", "*.csv"), recursive=True)
    if not csvs:
        raise RuntimeError(f"No CSV files found in the Kaggle dataset at: {local_dir}")

    # Prefer file names containing 'original', else first CSV
    preferred = [p for p in csvs if "original" in os.path.basename(p).lower()]
    chosen = preferred[0] if preferred else csvs[0]

    # Robust encoding attempts
    last_err = None
    for enc in (None, "utf-8", "utf-8-sig", "latin-1"):
        try:
            df = pd.read_csv(chosen, encoding=enc)
            return df, chosen
        except Exception as e:
            last_err = e
    raise RuntimeError(f"Failed to read CSV {chosen}: {type(last_err).__name__}: {last_err}")

# Cache the Kaggle DataFrame on first use
_cached_df = None
_cached_path = None

def get_default_df() -> tuple[pd.DataFrame, str]:
    global _cached_df, _cached_path
    if _cached_df is None:
        _cached_df, _cached_path = _load_kaggle_df()
    return _cached_df, _cached_path

# ---------------------
# Gradio helpers
# ---------------------
def _coerce_to_df(file_obj) -> pd.DataFrame:
    """Accepts gr.File input and returns a DataFrame with robust encoding handling."""
    if file_obj is None:
        raise ValueError("Please upload a CSV file or choose 'Kaggle (auto)'.")
    path = None
    if isinstance(file_obj, (str, os.PathLike)):
        path = str(file_obj)
    elif hasattr(file_obj, "name"):
        path = str(file_obj.name)
    elif isinstance(file_obj, dict) and "name" in file_obj:
        path = str(file_obj["name"])
    else:
        # Fallback: try bytes
        try:
            content = file_obj.read()
            for enc in (None, "utf-8", "utf-8-sig", "latin-1"):
                try:
                    return pd.read_csv(io.BytesIO(content), encoding=enc)
                except Exception:
                    pass
            raise ValueError("Could not decode uploaded CSV bytes.")
        except Exception:
            pass
    if path is None or not os.path.exists(path):
        raise FileNotFoundError("Could not resolve the uploaded file path.")
    # Try multiple encodings
    last_err = None
    for enc in (None, "utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception as e:
            last_err = e
    raise RuntimeError(f"Failed to read CSV: {type(last_err).__name__}: {last_err}")

def ui_topn(source, file_obj, gre, toefl, gpa, sop, lor, research, top_n):
    # Resolve dataset source
    if source == "Kaggle (auto)":
        try:
            df_raw, path_used = get_default_df()
            loaded_from = f"Kaggle: {KAGGLE_DATASET_SLUG}\nFile: {path_used}"
        except Exception as e:
            msg = f"Error loading Kaggle dataset: {type(e).__name__}: {e}"
            return pd.DataFrame([{"Status": msg}]), msg, ""
    else:
        try:
            df_raw = _coerce_to_df(file_obj)
            loaded_from = "Uploaded CSV"
        except Exception as e:
            msg = f"Error reading uploaded CSV: {type(e).__name__}: {e}"
            return pd.DataFrame([{"Status": msg}]), msg, ""

    # Run recommender
    try:
        out_df, names_only = template_1_topn(
            df=df_raw,
            gre=float(gre), toefl=float(toefl), gpa=float(gpa),
            sop=float(sop), lor=float(lor), research=int(research),
            top_n=int(top_n)
        )
        return out_df, names_only, loaded_from
    except Exception as e:
        msg = f"Error during recommendation: {type(e).__name__}: {e}"
        return pd.DataFrame([{"Status": msg}]), msg, loaded_from

# ---------------------
# Gradio app (with Kaggle source option)
# ---------------------
with gr.Blocks(title="University Recommender — Top-N") as demo:
    gr.Markdown("## University Recommender — Top-N")
    gr.Markdown("Choose data source, set the applicant profile, and get Top-N universities. Defaults to **Kaggle (auto)**.")

    with gr.Row():
        source = gr.Radio(choices=["Kaggle (auto)", "Upload CSV"], value="Kaggle (auto)", label="Data Source")
        dataset = gr.File(label="Upload CSV (only used if 'Upload CSV' selected)", file_types=[".csv"])
        top_n = gr.Number(value=5, precision=0, label="Top-N")

    with gr.Row():
        gre = gr.Number(value=320, label="GRE")
        toefl = gr.Number(value=105, label="TOEFL")
        gpa = gr.Number(value=3.6, label="GPA")

    with gr.Row():
        sop = gr.Number(value=4.0, label="SOP (1–5)")
        lor = gr.Number(value=4.0, label="LOR (1–5)")
        research = gr.Dropdown(choices=[0, 1], value=1, label="Research (0/1)")

    run_btn = gr.Button("Recommend")

    out_table = gr.Dataframe(interactive=False, label="Top-N Table")
    out_names = gr.Textbox(label="University Names Only", lines=8)
    out_src = gr.Textbox(label="Loaded From", interactive=False)

    run_btn.click(
        fn=ui_topn,
        inputs=[source, dataset, gre, toefl, gpa, sop, lor, research, top_n],
        outputs=[out_table, out_names, out_src]
    )

# Launch and print share URL for Colab
launch_info = demo.launch(share=True, inbrowser=False, prevent_thread_lock=True)

def _extract_share_url(obj):
    # Compatible across Gradio versions (obj may be object or tuple/list)
    url = getattr(obj, "share_url", None)
    if url:
        return url
    if isinstance(obj, (list, tuple)):
        # Newer gradio often returns (app, local_url, share_url)
        if len(obj) >= 3 and isinstance(obj[2], str):
            return obj[2]
        if len(obj) >= 2 and isinstance(obj[1], str) and obj[1].startswith("http"):
            return obj[1]
    return None

share_url = _extract_share_url(launch_info)
print("\n================ Gradio URL ================")
print(f" Public: {share_url or 'N/A'}")
try:
    local_url = getattr(launch_info, "local_url", None)
except Exception:
    local_url = None
print(f"  Local:  {local_url or 'N/A'}")
print("============================================\n")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://981e0287e54ac73a8c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



 Public: https://981e0287e54ac73a8c.gradio.live
  Local:  N/A

