In [None]:
import os
import re
from getpass import getpass

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

# 🔑 Environment setup
# Never hardcode the API key in the notebook. Reuse OPENAI_API_KEY when it is
# already exported, and only prompt for it (without echo) when it is missing,
# so a real key in the environment is not overwritten by a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
# Vector store backed by the local ./chroma_db directory; it is configured
# with the `embedding` model defined in the setup cell.
db3 = Chroma(embedding_function=embedding, persist_directory="./chroma_db")

In [16]:
# ✅ Korean → English professor-name lookup.
# Keys are Hangul names as they appear in user questions; values are the
# romanized names that process_question compares against the `professor`
# metadata field of stored documents.
professor_name_map = dict(
    [
        ("노맹석", "Maengseok Noh"),
        ("문형빈", "HyungBin Moon"),
        ("하지환", "Jihwan Ha"),
        ("지준화", "Junhwa Chi"),
    ]
)

In [17]:
# ✅ Translation
def translate_with_gpt(text, source_lang="ko", target_lang="en") -> str:
    """Translate ``text`` with a single gpt-4 chat completion.

    Args:
        text: The text to translate.
        source_lang: Language code placed in the instruction as the source
            language (default ``"ko"``).
        target_lang: Language code placed in the instruction as the target
            language (default ``"en"``).

    Returns:
        The content of the first completion choice, with surrounding
        whitespace removed.
    """
    instruction = f"Translate this from {source_lang} to {target_lang}:\n\n{text}"
    chat_messages = [{"role": "user", "content": instruction}]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=chat_messages,
        temperature=0.3,
    )
    translated = completion.choices[0].message.content
    return translated.strip()

In [18]:
# ✅ Question-type classification
def classify_question_type(question_ko: str) -> str:
    """Classify a Korean question into one of the supported question types.

    The question is sent to gpt-4o (temperature 0) with an instruction to pick
    one of ``논문_목록``, ``논문_요약`` or ``연구_흐름``.

    Args:
        question_ko: The user's question in Korean.

    Returns:
        One of the three labels whenever the reply contains one (the label
        that appears first wins). If the reply contains none of them, the
        stripped reply text is returned unchanged so the caller can decide
        what to do with it.
    """
    valid_labels = ("논문_목록", "논문_요약", "연구_흐름")

    prompt = f"""
다음 질문의 유형을 아래 중 하나로 분류해 주세요:
- 논문_목록
- 논문_요약
- 연구_흐름

질문: {question_ko}
질문 유형:"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt.strip()}],
        temperature=0
    )
    # `content` may be None; treat that as an empty reply instead of crashing.
    raw_label = (response.choices[0].message.content or "").strip()

    # Callers compare the result with `==`, so a reply such as '"논문 목록"' or
    # '질문 유형: 논문_목록' must be reduced to the bare label.
    normalized = raw_label.replace(" ", "_")
    hits = [(normalized.find(label), label) for label in valid_labels if label in normalized]
    if hits:
        return min(hits)[1]
    return raw_label

In [19]:
# Smoke test for the translator; running this cell sends a request via translate_with_gpt.
translate_with_gpt("문형빈 교수님의 논문 목록은")

"The list of Professor Moon Hyung-bin's papers is"

In [20]:
# ✅ 교수명 추출
def extract_professor_name(question: str) -> str | None:
    match = re.search(r"([가-힣]{2,4})\s*교수", question)
    return match.group(1) if match else None

In [27]:
def get_first_page_summary(doc: "Document", max_lines: int = 2) -> str:
    """Return a short text preview made of the first lines of a document.

    Args:
        doc: Object exposing the text in ``page_content``.
        max_lines: How many leading lines of the stripped text to keep
            (default 2).

    Returns:
        ``"📄 "`` followed by the kept lines joined with newlines.
    """
    leading_lines = doc.page_content.strip().split("\n")[:max_lines]
    # Join the lines: interpolating the list itself would leak a Python list
    # repr (brackets, quotes, escapes) into the prompt context.
    return "📄 " + "\n".join(leading_lines)

In [30]:
# ✅ Document formatting (handles metadata that carries only the professor name)
def format_doc_with_metadata(doc: Document) -> str:
    """Render a document as a small text card: professor name plus a body preview.

    Args:
        doc: Document whose ``metadata`` may contain a ``professor`` entry and
            whose ``page_content`` holds the text.

    Returns:
        A multi-line string with the professor (or a "no professor info"
        placeholder) and the text, cut to 500 characters followed by "..."
        when it is longer than that.
    """
    author = doc.metadata.get("professor", "교수 정보 없음")
    body = doc.page_content
    if len(body) > 500:
        body = body[:500] + "..."
    return f"""🧑‍🏫 교수: {author}
📄 내용 요약:
{body}
"""

# ✅ Prompt templates
# One PromptTemplate per question type; the keys are the same labels that
# classify_question_type asks the model to choose from, and process_question
# looks templates up by those keys. Every template instructs the model to
# answer in Korean.
prompt_templates = {
    # Paper list: needs both the collected paper text and the user's question.
    "논문_목록": PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are provided with a collection of academic papers written by a professor. 
Based on the following user request, list the key papers along with:

1. The title of each paper (📌 Please keep the title in English)  
2. The publication year (if available)  
3. A few core keywords representing the main topic (in Korean)  
4. The author(s) of each paper (in Korean)

User question:
{question}

Paper content:
{context}

📌 Please write your response in Korean using a respectful and organized tone, **but keep the paper titles in English**.

논문 목록 요약 (in Korean):"""
    ),
    # Paper summary: the only template that takes `context` alone (no question).
    "논문_요약": PromptTemplate(
        input_variables=["context"],
        template="""
You are a research summarization assistant. Based on the following academic paper, provide a clear and concise summary including the following elements:

1. Research subject (what or who is being studied)  
2. Research method (how it was studied)  
3. Research findings (what was discovered)  
4. Suggestions or implications (recommendations or conclusions)

Paper content:
{context}

📌 Please write your summary in Korean, using a polite and professional tone.

논문 요약 (in Korean):"""
    ),
    # Research flow: chronological evolution of one professor's research topics.
    "연구_흐름": PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are an academic assistant. Given a collection of research papers written by a single professor, analyze how the research topics or areas of interest have evolved over time. 
Identify key shifts, trends, or patterns chronologically based on the publication content.

Context:
{context}

Question:
{question}

Now, summarize the chronological progression of the professor’s research focus. 
📌 Please write your answer in Korean using a clear and respectful tone.

연구 흐름 요약 (한국어로):"""
    )
}

In [31]:
def process_question(question_ko: str):
    """Answer a Korean question about a professor's papers and print the result.

    Steps: classify the question, extract the professor's name, load that
    professor's documents from the Chroma store, build the context that fits
    the question type, and run the matching prompt through gpt-4o.

    Args:
        question_ko: The user's question in Korean,
            e.g. "노맹석 교수님의 논문 정리해줘".

    Returns:
        None. The question type and the model's answer are printed.

    Raises:
        ValueError: If a paper-list or research-flow question names no
            professor present in ``professor_name_map``, or if no stored
            document matches the requested professor.
    """
    summary_types = ("논문_목록", "연구_흐름")
    separator = "\n\n---\n\n"

    # 1. Classify the question.
    question_type = classify_question_type(question_ko)

    # 2. Resolve the professor (Korean name in the question -> English name
    #    used in document metadata).
    target_author_ko = extract_professor_name(question_ko)
    target_author_en = professor_name_map.get(target_author_ko) if target_author_ko else None

    if question_type in summary_types and not target_author_en:
        raise ValueError("질문에서 유효한 교수 이름을 찾을 수 없습니다.")

    # No translation step: retrieval is a metadata filter and the prompts
    # receive the Korean question as-is, so an English version is never needed.

    # 3. Retrieve documents through the public Chroma API. When a professor
    #    is known the equality filter is pushed down to the store, so the
    #    whole collection is not loaded into memory.
    where = {"professor": target_author_en} if target_author_en else None
    records = db3.get(where=where, include=["metadatas", "documents"])

    # With no professor named there is nothing to push down; the check below
    # then selects documents that carry no `professor` field. Metadata entries
    # may be None, hence the `or {}`.
    docs = [
        Document(page_content=text, metadata=meta or {})
        for text, meta in zip(records["documents"], records["metadatas"])
        if (meta or {}).get("professor") == target_author_en
    ]
    if not docs:
        # An empty context would make the model invent papers.
        raise ValueError("조건에 맞는 논문 문서를 찾을 수 없습니다.")

    # 4. Pick the prompt and build the context once. List/flow questions use
    #    short first-page previews; anything else is treated as a summary
    #    request and gets the full text.
    if question_type in summary_types:
        prompt = prompt_templates[question_type]
        context_text = separator.join(get_first_page_summary(doc) for doc in docs)
    else:
        prompt = prompt_templates["논문_요약"]
        context_text = separator.join(doc.page_content for doc in docs)

    # 5. Run the chain; only pass `question` to templates that declare it.
    chain = prompt | ChatOpenAI(model="gpt-4o")
    inputs = {"context": context_text}
    if "question" in prompt.input_variables:
        inputs["question"] = question_ko

    result = chain.invoke(inputs)

    print(f"[{question_type.upper()}]")
    print(result.content)


In [32]:
# ✅ Example run
# process_question prints the detected question type followed by the model's answer.
question_ko = "노맹석 교수님의 논문 정리해줘"
process_question(question_ko)

[논문_목록]
노맹석 교수님의 논문을 다음과 같이 정리하였습니다:

1. **Title**: "STATISTICS IN MEDICINE"
   - **Publication Year**: 2006
   - **Core Keywords**: 의료 통계, frailty 모델, 이질성
   - **Authors**: 노맹석, 하인득, 이영조

2. **Title**: "Robust estimation of dropout models using hierarchical likelihood"
   - **Publication Year**: 2011
   - **Core Keywords**: 결측 데이터, robust estimation, 계층적 우도
   - **Authors**: 노맹석, 이영조, 켄워드 마이클 G.

3. **Title**: "Multicomponent Variance Estimation for Binary Traits"
   - **Publication Year**: 2006
   - **Core Keywords**: 이분형 형질, 분산 추정, 유전 통계
   - **Authors**: 노맹석, 이영조

4. **Title**: "Double hierarchical generalized linear models (with discussion)"
   - **Publication Year**: 2006
   - **Core Keywords**: 이중 계층적 일반화 선형 모델, 통계 모형, 분산 구성 요소
   - **Authors**: 이영조, 넬더 J.A.

5. **Title**: "Journal of Multivariate Analysis"
   - **Publication Year**: 2007
   - **Core Keywords**: 다변량 분석, 통계적 효율성, REML 절차
   - **Authors**: 노맹석, 이영조

이 목록은 노맹석 교수님의 대표적인 논문들로 구성되어 있으며, 각 논문은 특정 통계적 방법론이나 데이터 분석 기술을 