In [23]:
import os, json, re, hashlib, numpy as np
import pandas as pd
from datetime import datetime, timezone, timedelta
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from difflib import SequenceMatcher
from functools import lru_cache


with open("../data/school_info/old_data/dmu_documents_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df_json = pd.DataFrame(data)
df_json = df_json.dropna(subset=["content"])
df_json["title"] = df_json["title"].fillna("")
df_json["url"] = df_json["url"].fillna("")
df_json["source"] = "main"

dept_dir = "../data/dept"
dept_files = [f for f in os.listdir(dept_dir) if f.endswith("_notices.csv")]

df_list = []
for file in dept_files:
    file_path = os.path.join(dept_dir, file)
    try:
        df_dept = pd.read_csv(file_path)
        df_dept["title"] = df_dept["제목"].fillna("")
        df_dept["url"] = df_dept["링크"].fillna("")
        df_dept["content"] = (
            "부서: " + df_dept["부서"].fillna("없음") + "\n"
            + "작성자: " + df_dept["작성자"].fillna("없음") + "\n"
            + "작성일: " + df_dept["작성일"].astype(str).fillna("없음")
        )
        df_dept["source"] = file.replace("_notices.csv", "")
        df_list.append(df_dept[["title", "content", "url", "source"]])
    except Exception as e:
        print(f"⚠️ 파일 오류: {file_path} - {e}")

df_csv = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame(columns=["title", "content", "url", "source"])
df = pd.concat([df_json[["title","content","url","source"]], df_csv], ignore_index=True)
df = df.dropna(subset=["title", "content"]).reset_index(drop=True)


def normalize_text(s): 
    return re.sub(r"\s+", " ", str(s)).strip()


df["title"]   = df["title"].apply(normalize_text)
df["content"] = df["content"].apply(normalize_text)
df["fulltext"] = (df["title"] + " " + df["content"]).apply(normalize_text)
df["content_hash"] = df["fulltext"].apply(lambda x: hashlib.sha1(x.encode("utf-8")).hexdigest())
df = df.drop_duplicates(subset=["url"]).drop_duplicates(subset=["content_hash"]).reset_index(drop=True)

KST = timezone(timedelta(hours=9))
KST_NOW = datetime.now(KST)
df["notice_flag"] = (df["source"] != "main").astype(int)


def try_parse_date(x):
    x = str(x)
    fmts = ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d", "%Y%m%d")
    for fmt in fmts:
        try:
            return datetime.strptime(x[:10], fmt).replace(tzinfo=KST)
        except:
            pass
    m = re.search(r"(\d{4})\s*년\s*(\d{1,2})\s*월\s*(\d{1,2})\s*일", x)
    if m:
        y, mo, d = map(int, m.groups())
        return datetime(y, mo, d, tzinfo=KST)
    return None


date_patterns = [
    re.compile(r"(?:작성일|등록일|게시일)\s*[:\-]?\s*(\d{4}[./-]\d{1,2}[./-]\d{1,2})"),
    re.compile(r"(?:작성일|등록일|게시일)\s*[:\-]?\s*(\d{4}\s*년\s*\d{1,2}\s*월\s*\d{1,2}\s*일)")
]


def extract_date_from_content(txt):
    if not isinstance(txt, str):
        return None
    for pat in date_patterns:
        m = pat.search(txt)
        if m:
            return try_parse_date(m.group(1))
    return None

df["updated_at"] = df.apply(
    lambda r: extract_date_from_content(r["content"]) if r["notice_flag"]==1 else None, axis=1
)


try:
    import kss
    def sent_split(text):
        text = str(text).strip()
        return [s for s in kss.split_sentences(text) if s.strip()]
except Exception:
    SENT_PAT = re.compile(r'.+?(?:다\.|요\.|[.!?])(?=\s+|$)')
    def sent_split(text):
        text = ' '.join(str(text).split())
        if not text:
            return []
        sents = SENT_PAT.findall(text)
        return sents if sents else [text]


def chunk_text(text, max_tokens=400, overlap=0.15):
    sents = [s for s in sent_split(text) if s.strip()]
    if not sents:
        return []
    chunks, cur, tok = [], [], 0
    ovl = max(int(max_tokens * overlap), 0)
    for s in sents:
        t = len(s.split())
        if t >= max_tokens and not cur:
            chunks.append(s.strip()); continue
        if tok + t <= max_tokens:
            cur.append(s); tok += t
        else:
            if cur:
                blob = " ".join(cur).strip()
                chunks.append(blob)
                if ovl > 0:
                    tail_tokens = blob.split()[-ovl:]
                    tail = " ".join(tail_tokens)
                    cur = [tail, s]
                    tok = len(tail_tokens) + t
                else:
                    cur = [s]; tok = t
            else:
                chunks.append(s.strip()); cur, tok = [], 0
    if cur:
        chunks.append(" ".join(cur).strip())
    return chunks

rows = []
for i, r in df.iterrows():
    parts = chunk_text(r["fulltext"], max_tokens=400, overlap=0.15)
    for j, ch in enumerate(parts):
        rows.append({
            "doc_id": i,
            "chunk_id": f"{i}-{j}",
            "title": r["title"],
            "url": r["url"],
            "source": r["source"],
            "notice_flag": r["notice_flag"],
            "updated_at": r["updated_at"],
            "text": ch
        })

chunk_df = pd.DataFrame(rows).reset_index(drop=True)


try:
    from konlpy.tag import Okt
    okt = Okt()
    def tokenize_kor(s):
        return [w for w, pos in okt.pos(s, norm=True, stem=True) 
                if pos not in ("Josa","Punctuation","Foreign")]
except Exception:
    def tokenize_kor(s):
        return re.findall(r"[가-힣A-Za-z0-9]+", s)


tokenized_corpus = [tokenize_kor(t) for t in chunk_df["text"].tolist()]
bm25 = BM25Okapi(tokenized_corpus)


model = SentenceTransformer("intfloat/multilingual-e5-base")


def embed_passages(texts):
    return model.encode([f"passage: {t}" for t in texts], 
                        normalize_embeddings=True, show_progress_bar=True)


@lru_cache(maxsize=256)
def embed_query(q: str):
    return model.encode([f"query: {q}"], normalize_embeddings=True)[0].astype(np.float32)

embeddings = embed_passages(chunk_df["text"].tolist()).astype(np.float32)


PHONE_RE = re.compile(r"\b0\d{1,2}[-\s]?\d{3,4}[-\s]?\d{4}\b")
EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+")
UNIT_RE  = re.compile(r"([가-힣A-Za-z·\s\-/()]{2,40}?(?:팀|센터|처|단|과|부|본부|위원회|연구소|지원실|실|연대|학부|학과))")


def last_seg(title: str):
    return title.split("/")[-1].strip() if isinstance(title, str) else ""


def clean_name(s: str) -> str:
    s = re.sub(r"\([^)]*\)", "", str(s))
    s = re.sub(r"[\s\-/·]+", "", s)
    return s.lower()


def extract_units_and_contacts(row):
    text = row["content"] if isinstance(row["content"], str) else ""
    title = row["title"]; url = row["url"]; source = row["source"]
    out = []

    for m in UNIT_RE.finditer(text):
        unit = m.group(1).strip()
        win = 140
        start, end = max(0, m.start()-win), min(len(text), m.end()+win)
        near = text[start:end]
        phones = PHONE_RE.findall(near)
        emails = EMAIL_RE.findall(near)
        out.append({
            "unit": unit,
            "phone": phones[-1] if phones else "없음",
            "email": emails[-1] if emails else "없음",
            "title": title, "url": url, "source": source, "how": "content"
        })


    hint = last_seg(title)
    mh = UNIT_RE.search(hint)
    if mh:
        phones = PHONE_RE.findall(text)
        emails = EMAIL_RE.findall(text)
        out.append({
            "unit": mh.group(1).strip(),
            "phone": phones[-1] if phones else "없음",
            "email": emails[-1] if emails else "없음",
            "title": title, "url": url, "source": source, "how": "title"
        })


    if not out:
        phones = PHONE_RE.findall(text)
        emails = EMAIL_RE.findall(text)
        if phones or emails:
            cand = hint if hint else "미상"
            out.append({
                "unit": cand,
                "phone": phones[-1] if phones else "없음",
                "email": emails[-1] if emails else "없음",
                "title": title, "url": url, "source": source, "how": "fallback"
            })
    return out

contact_cands = []
for _, r in df.iterrows():
    contact_cands.extend(extract_units_and_contacts(r))


contacts_raw = pd.DataFrame(contact_cands)
if contacts_raw.empty:
    contacts_df = pd.DataFrame(columns=[
        "unit","phone","email","title","url","source","unit_norm","cluster_id",
        "unit_canonical","search_text"
    ])
else:
    contacts_raw["unit_norm"] = contacts_raw["unit"].apply(clean_name)

    cluster_map = {}; reps = []
    for name in contacts_raw["unit_norm"].unique():
        assigned = False
        for cid, rep in enumerate(reps):
            if SequenceMatcher(None, name, rep).ratio() >= 0.92:
                cluster_map[name] = cid; assigned = True; break
        if not assigned:
            cid = len(reps); reps.append(name); cluster_map[name] = cid

    contacts_raw["cluster_id"] = contacts_raw["unit_norm"].map(cluster_map.get)

    rep_map = {}
    for cid in set(cluster_map.values()):
        members = contacts_raw[contacts_raw["cluster_id"] == cid]
        rep_map[cid] = members["unit"].value_counts().idxmax()
    contacts_raw["unit_canonical"] = contacts_raw["cluster_id"].map(rep_map.get)

    contacts_df = (contacts_raw
                .sort_values(["cluster_id","url"])
                .drop_duplicates(subset=["cluster_id","phone","email"])
                .reset_index(drop=True))

    contacts_df["search_text"] = (
        contacts_df["unit_canonical"].fillna("") + " " +
        contacts_df["unit"].fillna("") + " " +
        contacts_df["title"].fillna("")
    ).str.replace(r"\s+", " ", regex=True).str.strip()


if not contacts_df.empty:
    bm25_contacts = BM25Okapi([tokenize_kor(t) for t in contacts_df["search_text"].tolist()])
    emb_contacts = model.encode([f"passage: {t}" for t in contacts_df["search_text"].tolist()],
                                normalize_embeddings=True, show_progress_bar=False).astype(np.float32)

    cluster_df = contacts_df.groupby("cluster_id").agg({
        "unit_canonical": "first",
        "search_text": "first"
    }).reset_index()
    cluster_df["rep_norm"] = cluster_df["unit_canonical"].apply(lambda s: re.sub(r"[\s\-/·]+","", str(s)).lower())

    cluster_rep_emb = model.encode(
        [f"passage: {t}" for t in cluster_df["search_text"].tolist()],
        normalize_embeddings=True, show_progress_bar=False
    ).astype(np.float32)

    cluster_members = contacts_df.groupby("cluster_id").indices

else:
    bm25_contacts, emb_contacts = None, None
    cluster_df = pd.DataFrame(columns=["cluster_id","unit_canonical","search_text","rep_norm"])
    cluster_rep_emb = np.zeros((0, 768), dtype=np.float32)
    cluster_members = {}


NOTICE_KEYWORDS = ["공지","안내","모집","변경","연장","일정","신청","발표","채용","장학","기간","마감","등록","수강","정정"]


def _minmax(x):
    x = np.asarray(x, dtype=float)
    lo, hi = np.nanmin(x), np.nanmax(x)
    if hi - lo < 1e-9:
        return np.zeros_like(x)
    return (x - lo) / (hi - lo + 1e-9)


def recency_score(ts, now=KST_NOW, half_life_days=30):
    if pd.isna(ts) or ts is None:
        return 0.0
    days = max((now - ts).days, 0)
    return float(np.exp(-np.log(2) * days / half_life_days))


def hybrid_search_docs(query, top_k=10, alpha=0.6, half_life_days=30, recency_weight=0.45, notice_boost=0.20):
    qv = embed_query(query)
    dense_raw = embeddings @ qv
    bm25_raw  = bm25.get_scores(tokenize_kor(query))

    dense = _minmax(dense_raw); bm25s = _minmax(bm25_raw)
    base = alpha * dense + (1 - alpha) * bm25s

    rec = np.asarray([recency_score(ts, half_life_days=half_life_days) for ts in chunk_df["updated_at"]])
    rec = _minmax(rec)
    bonus = recency_weight * rec

    is_notice_q = any(k in query for k in NOTICE_KEYWORDS)
    if is_notice_q:
        bonus = bonus + notice_boost * chunk_df["notice_flag"].to_numpy()

    scores = base + bonus

    tmp = chunk_df.copy()
    tmp["score"] = scores
    best_idx = tmp.groupby("url")["score"].idxmax()
    doc_top = tmp.loc[best_idx].copy()
    doc_top.sort_values(by=["score","updated_at"], ascending=[False, False], inplace=True, na_position="last")
    return doc_top.head(top_k)[["title","url","score","updated_at","notice_flag","text"]].reset_index(drop=True)


CONTACT_KWS = ["연락처","전화","전화번호","이메일","메일","담당자","상담","교직원","직원검색","문의","contact","email","phone"]


def is_contact_query(q: str) -> bool:
    return any(k in q for k in CONTACT_KWS)


def match_units_for_query(query, topn=10):
    if contacts_df.empty or cluster_df.empty:
        return []
    q_clean = re.sub(r"[\s\-/·]+","", str(query)).lower()

    sim_str = cluster_df["rep_norm"].apply(lambda x: SequenceMatcher(None, q_clean, x).ratio()).to_numpy()

    qv = embed_query(query)
    sim_emb = cluster_rep_emb @ qv


    def _mm_vec(v):
        lo, hi = float(np.min(v)), float(np.max(v))
        return np.zeros_like(v, dtype=np.float32) if hi - lo < 1e-9 else (v - lo) / (hi - lo + 1e-9)
    
    hybrid = 0.5*_mm_vec(sim_str.astype(np.float32)) + 0.5*_mm_vec(sim_emb.astype(np.float32))
    order = np.argsort(-hybrid)[:topn]

    return cluster_df.loc[order, "cluster_id"].tolist()


def hybrid_search_contacts(query, top_k=5, alpha=0.6):
    if contacts_df.empty:
        return pd.DataFrame(columns=["unit_canonical","phone","email","title","url","score"])

    cand_clusters = match_units_for_query(query, topn=20)

    if cand_clusters:
        cand_idx = sorted({i for cid in cand_clusters for i in cluster_members.get(cid, [])})
    else:
        cand_idx = list(range(len(contacts_df)))

    q_tokens = tokenize_kor(query)
    qv = embed_query(query)

    dense_all = emb_contacts @ qv
    bm25_all  = bm25_contacts.get_scores(q_tokens)

    dense = dense_all[cand_idx]
    bm25s = bm25_all[cand_idx]


    def _mm(x):
        lo, hi = float(np.min(x)), float(np.max(x))
        return np.zeros_like(x, dtype=np.float32) if hi - lo < 1e-9 else (x - lo) / (hi - lo + 1e-9)
    base = alpha * _mm(dense.astype(np.float32)) + (1 - alpha) * _mm(bm25s.astype(np.float32))
    sub = contacts_df.iloc[cand_idx].copy()
    title_hint = sub["title"].str.contains(r"(교직원|직원검색|연락처|전화)", regex=True, na=False).astype(int).to_numpy()
    is_board   = sub["url"].str.contains(r"/bbs/").fillna(False).astype(int).to_numpy()
    q_norm     = re.sub(r"[\s\-/·]+","", query.lower())
    contains   = sub["unit"].apply(lambda d: int(re.sub(r"[\s\-/·]+","", str(d)).lower() in q_norm or q_norm in re.sub(r"[\s\-/·]+","", str(d)).lower())).to_numpy()

    scores = base + 0.15*title_hint - 0.20*is_board + 0.25*contains

    sub["score"] = scores
    sub.sort_values("score", ascending=False, inplace=True)

    return sub.head(top_k)[["unit_canonical","phone","email","title","url","score"]].reset_index(drop=True)


def search(query, top_k=8):
    if is_contact_query(query):
        cands = hybrid_search_contacts(query, top_k=max(5, top_k//2))
        if not cands.empty and ((cands["phone"]!="없음") | (cands["email"]!="없음")).any():
            return {"type":"contact", "results": cands.to_dict(orient="records")}
        docs = hybrid_search_docs(query, top_k=top_k)
        return {"type":"docs", "results": docs.to_dict(orient="records")}
    else:
        docs = hybrid_search_docs(query, top_k=top_k)
        return {"type":"docs", "results": docs.to_dict(orient="records")}



os.makedirs("../model/artifacts", exist_ok=True)
np.save("../model/artifacts/embeddings.npy", embeddings)
chunk_df.to_parquet("../model/artifacts/chunk_df.parquet", index=False)
if 'contacts_df' in locals() and not contacts_df.empty:
    contacts_df.to_csv("../model/artifacts/contacts.csv", index=False, encoding="utf-8-sig")

Batches:   0%|          | 0/402 [00:00<?, ?it/s]

  if _pandas_api.is_sparse(col):


In [None]:
hybrid_search_contacts("학생상담센터 이메일", top_k=5)

  title_hint = sub["title"].str.contains(r"(교직원|직원검색|연락처|전화)", regex=True, na=False).astype(int).to_numpy()


Unnamed: 0,unit_canonical,phone,email,title,url,score
0,컴퓨터소프트웨어공학과,02-2610-1988,없음,학사안내/현장실습,https://www.dongyang.ac.kr/dmu/4756/subview.do,1.208546
1,컴퓨터소프트웨어공학과,02-2610-1843,min101933@dongyan,학부ㆍ학과/컴퓨터소프트웨어공학과,https://www.dongyang.ac.kr/dmu/4573/subview.do,1.133818
2,컴퓨터소프트웨어공학과,02-2610-1843,min101933@dongyang.ac.kr,학부ㆍ학과/컴퓨터소프트웨어공학과,https://www.dongyang.ac.kr/dmu/4573/subview.do,1.133818
3,컴퓨터소프트웨어공학과,없음,없음,대학소개/대학기구,https://www.dongyang.ac.kr/dmu/4361/subview.do,1.035099
4,학년도 컴퓨터소프트웨어공학과,없음,없음,[컴소] 2023학년도 컴퓨터소프트웨어공학과 대학교육과정 SQF 인정,https://www.dongyang.ac.kr/combBbs/dmu/84/320/...,0.848281


In [28]:
search("컴소과 시간표", top_k=3)

{'type': 'docs',
 'results': [{'title': '[컴소] 2025학년도 1학기 컴퓨터소프트웨어공학과 기말고사 시간표 안내',
   'url': 'https://www.dongyang.ac.kr/combBbs/dmu/84/320/249060/view.do?layout=unknown',
   'score': 1.1944537606676444,
   'updated_at': Timestamp('2025-06-10 00:00:00+0900', tz='UTC+09:00'),
   'notice_flag': 1,
   'text': '[컴소] 2025학년도 1학기 컴퓨터소프트웨어공학과 기말고사 시간표 안내 부서: 학과공지 작성자: 민** 작성일: 2025.06.10'},
  {'title': '[컴소] 2023-1 컴퓨터소프트웨어공학과 강의시간표 및 수강신청 안내(03/09)',
   'url': 'https://www.dongyang.ac.kr/combBbs/dmu/84/320/121464/view.do?layout=unknown',
   'score': 0.9781315294119632,
   'updated_at': Timestamp('2023-02-01 00:00:00+0900', tz='UTC+09:00'),
   'notice_flag': 1,
   'text': '[컴소] 2023-1 컴퓨터소프트웨어공학과 강의시간표 및 수강신청 안내(03/09) 부서: 학과공지 작성자: 오** 작성일: 2023.02.01'},
  {'title': '[컴소] 2025-1학기 컴퓨터소프트웨어공학과 강의시간표 및 수강신청 안내(02/28 수정)',
   'url': 'https://www.dongyang.ac.kr/combBbs/dmu/84/320/130183/view.do?layout=unknown',
   'score': 0.9739198661061367,
   'updated_at': Timestamp('2025-02-05 00:00:00+0900