In [8]:
import os
import json
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# Read the cleaned school-info JSON export into memory.
with open("../data/school_info/dmu_documents_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the frame in one chain: discard records whose "content" is missing,
# then label every remaining row with the "main" source tag.
df_json = (
    pd.DataFrame(data)
    .dropna(subset=["content"])
    .assign(source="main")
)

# Gather every "<name>_notices.csv" file from the department directory.
dept_dir = "../data/dept"
# Sorted so that the resulting row order is reproducible between runs;
# os.listdir makes no ordering guarantee.
dept_files = sorted(f for f in os.listdir(dept_dir) if f.endswith("_notices.csv"))

df_list = []
for file in dept_files:
    file_path = os.path.join(dept_dir, file)
    try:
        df_dept = pd.read_csv(file_path)
        df_dept["title"] = df_dept["제목"].fillna("")
        df_dept["url"] = df_dept["링크"].fillna("")

        # Fill missing values BEFORE casting to str. Casting first can turn NaN
        # into the literal text "nan", after which fillna has nothing to replace.
        # The cast also keeps the string concatenation valid if a column was
        # parsed as a non-string dtype.
        df_dept["content"] = (
            "부서: " + df_dept["부서"].fillna("없음").astype(str) + "\n"
            + "작성자: " + df_dept["작성자"].fillna("없음").astype(str) + "\n"
            + "작성일: " + df_dept["작성일"].fillna("없음").astype(str)
        )
        # The source tag is the file name without its "_notices.csv" suffix.
        df_dept["source"] = file.replace("_notices.csv", "")
        df_list.append(df_dept[["title", "content", "url", "source"]])

    except Exception as e:
        # Best effort: report the unreadable/malformed file and keep going.
        print(f"⚠️ 파일 오류: {file_path} - {e}")

# Fall back to an empty frame with the expected columns when nothing loaded.
df_csv = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame(columns=["title", "content", "url", "source"])

# Merge both document sources into a single corpus frame.
df = pd.concat([df_json, df_csv], ignore_index=True)
# Rows without a title or body cannot be indexed. Reset the index afterwards so
# that index labels equal row positions: the BM25 corpus, the embedding matrix
# and any positional lookup (df.iloc) then all refer to the same row numbers.
df = df.dropna(subset=["title", "content"]).reset_index(drop=True)
df["combined_text"] = df["title"] + " " + df["content"]
# Collapse every whitespace run (including newlines) into a single space.
df["fulltext"] = df["combined_text"].apply(lambda x: re.sub(r"\s+", " ", x.strip()))

# Lexical index: whitespace-tokenised documents fed to BM25.
tokenized_corpus = [doc.split() for doc in df["fulltext"]]
bm25 = BM25Okapi(tokenized_corpus)

# Dense index: one sentence embedding per document, in the same row order as df.
model = SentenceTransformer("jhgan/ko-sroberta-multitask")
embeddings = model.encode(df["fulltext"].tolist(), convert_to_tensor=True)

In [9]:
def extract_contact_info(text):
    """Extract one phone number and one e-mail address from free text.

    Strategy:
      1. Return the phone/e-mail pair from the first line that contains both.
      2. Otherwise return the last phone number and the last e-mail address
         found anywhere in the text, each defaulting to '없음' when absent.

    A phone number is treated as a fax number, and ignored in both steps,
    when the text between the previous phone number on the same line (or the
    start of the line) and this number contains '팩스' or 'fax' in any letter
    case. Filtering per number rather than per line keeps the real phone
    number usable when phone and fax share one line, and prevents the fallback
    step from handing back a fax number.

    Args:
        text: The text to scan. Anything that is not a ``str`` is treated as
            containing no contact details.

    Returns:
        dict with the keys '전화번호' and '이메일'.
    """
    phone_re = re.compile(r'\b0\d{1,2}[-\s]?\d{3,4}[-\s]?\d{4}\b')
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    missing = '없음'

    if not isinstance(text, str):
        return {'전화번호': missing, '이메일': missing}

    def _non_fax_phones(line):
        # Phone numbers on this line whose preceding label does not mention fax.
        kept = []
        prev_end = 0
        for match in phone_re.finditer(line):
            label = line[prev_end:match.start()]
            prev_end = match.end()
            if '팩스' in label or 'fax' in label.lower():
                continue
            kept.append(match.group())
        return kept

    phones = []
    emails = []
    for line in text.splitlines():
        line_phones = _non_fax_phones(line)
        line_emails = email_re.findall(line)
        # Step 1: a line carrying both is taken as the contact line.
        if line_phones and line_emails:
            return {'전화번호': line_phones[0], '이메일': line_emails[0]}
        phones.extend(line_phones)
        emails.extend(line_emails)

    # Step 2: no line had both, so fall back to the last of each kind.
    return {
        '전화번호': phones[-1] if phones else missing,
        '이메일': emails[-1] if emails else missing,
    }

In [10]:
def hybrid_search(query, top_k=10, alpha=0.99, threshold=0.75):
    """Rank the indexed documents with a blend of embedding and BM25 scores.

    The score of each document is
    ``alpha * cosine + (1 - alpha) * bm25 - penalty``, where ``cosine`` is the
    cosine similarity between the query embedding and the document embedding
    and ``bm25`` is the raw BM25 score of the whitespace-tokenised query.

    Relies on the module-level ``model``, ``embeddings``, ``bm25`` and ``df``;
    all of them must describe the same documents in the same row order.

    Args:
        query: Search string.
        top_k: Maximum number of rows to return.
        alpha: Weight of the cosine score; ``1 - alpha`` weights BM25.
        threshold: Minimum blended score a document needs to be returned.

    Returns:
        DataFrame with columns ``title``, ``url``, ``fulltext`` and ``score``,
        sorted by descending score, with a fresh 0..n-1 index. Empty when no
        document reaches ``threshold``.
    """
    q_embedding = model.encode([query], convert_to_tensor=True)
    cosine_scores = cosine_similarity(q_embedding.cpu().numpy(), embeddings.cpu().numpy())[0]
    bm25_scores = bm25.get_scores(query.split())

    # Convert to a plain ndarray so the blended scores stay positional. Keeping
    # a pandas Series here would make hybrid_scores a Series carrying df's index
    # labels, and integer lookups on it would then be label-based while the
    # df.iloc lookup below is positional, which breaks whenever df's index is
    # not exactly 0..n-1.
    # NOTE(review): the penalty applies to every row whose "source" is not
    # null; if every row has a source this is a uniform -0.15 shift. Confirm
    # whether only some sources were meant to be penalised.
    penalty = df["source"].notnull().to_numpy(dtype=float) * 0.15
    hybrid_scores = alpha * cosine_scores + (1 - alpha) * bm25_scores - penalty

    candidates = [
        (pos, score) for pos, score in enumerate(hybrid_scores) if score >= threshold
    ]
    sorted_candidates = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_k]

    top_idx = [pos for pos, _ in sorted_candidates]
    results = df.iloc[top_idx][["title", "url", "fulltext"]].copy()
    results["score"] = [score for _, score in sorted_candidates]
    return results.reset_index(drop=True)

In [11]:
query = "컴공 학과 공지"
results = hybrid_search(query, top_k=5, threshold=0.4)

# Show each hit as a small card: title, link, score, then whatever contact
# details can be pulled out of the document text, followed by a divider.
for rank, row in results.iterrows():
    contact = extract_contact_info(row["fulltext"])
    print(
        f"📌 제목: {row['title']}",
        f"🔗 URL: {row['url']}",
        f"📈 점수: {row['score']:.4f}",
        f"📞 전화번호: {contact['전화번호']}, 📧 이메일: {contact['이메일']}",
        "-" * 50,
        sep="\n",
    )

📌 제목: 2017-1학기 컴퓨터공학부 사물함 배부 공지
🔗 URL: https://www.dongyang.ac.kr/combBbs/dmu/86/321/71558/view.do
📈 점수: 0.5609
📞 전화번호: 없음, 📧 이메일: 없음
--------------------------------------------------
📌 제목: (통신) 정보통신공학과 PCB설계 J2 강좌 휴보강 일정 공지
🔗 URL: https://www.dongyang.ac.kr/combBbs/dmu/90/314/248950/view.do
📈 점수: 0.5517
📞 전화번호: 없음, 📧 이메일: 없음
--------------------------------------------------
📌 제목: 수업 시간표 공지
🔗 URL: https://www.dongyang.ac.kr/combBbs/dmu/98/309/70470/view.do
📈 점수: 0.5478
📞 전화번호: 없음, 📧 이메일: 없음
--------------------------------------------------
📌 제목: 반도체전자공학과 프로그래밍언어 H2 강좌 휴보강 일정 공지
🔗 URL: https://www.dongyang.ac.kr/combBbs/dmu/92/315/248777/view.do
📈 점수: 0.5465
📞 전화번호: 없음, 📧 이메일: 없음
--------------------------------------------------
📌 제목: [컴정] 2022-1 컴퓨터정보공학과 강의시간표 및 수강신청 안내(03/10)
🔗 URL: https://www.dongyang.ac.kr/combBbs/dmu/86/321/118519/view.do
📈 점수: 0.5437
📞 전화번호: 없음, 📧 이메일: 없음
--------------------------------------------------
