In [2]:
!pip install sentence-transformers
!pip install sentence-transformers PyPDF2 nltk




In [43]:
import os
import glob
import PyPDF2
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import nltk

# Run once: download the NLTK sentence-tokenizer data used by sent_tokenize.
# NOTE(review): recent NLTK releases look for "punkt_tab" instead of "punkt" —
# confirm against the installed NLTK version.
nltk.download("punkt")

# ✅ 1. Configuration
DATA_DIR = "data/guidelines"  # must contain the source PDF and TXT files
OUTPUT_DIR = "data/processed/rag_chunks"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ✅ 2. Load files
documents = []
doc_names = []

# Load PDF files. glob returns paths in arbitrary order, so sort them to keep the
# document order (and therefore the chunk/embedding order) reproducible.
for file_path in sorted(glob.glob(os.path.join(DATA_DIR, "*.pdf"))):
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        page_texts = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # pages without extractable text are skipped
                page_texts.append(page_text)
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    text = "\n".join(page_texts)
    documents.append(text)
    doc_names.append(os.path.basename(file_path))

# Load TXT files (same deterministic ordering as above).
for file_path in sorted(glob.glob(os.path.join(DATA_DIR, "*.txt"))):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    documents.append(text)
    doc_names.append(os.path.basename(file_path))

print(f"📄 총 {len(documents)}개의 문서 로딩 완료")

# ✅ 3. Text augmentation (lower-casing as an example)
# NOTE(review): lower-casing happens before sentence splitting, which may weaken the
# tokenizer's sentence-boundary detection and drops casing from the saved chunk
# text — confirm this is intended.
augmented_docs = []
augmented_sources = []

for doc, name in zip(documents, doc_names):
    lowercased = doc.lower()
    augmented_docs.append(lowercased)
    augmented_sources.append(name)

# ✅ 4. Chunking (groups of 2 sentences)
def chunk_text(text, chunk_size=2):
    """Split `text` into sentences and group them into space-joined chunks.

    Args:
        text: The document text to split.
        chunk_size: Number of consecutive sentences per chunk; the final chunk
            holds whatever sentences remain.

    Returns:
        A list of chunk strings in document order.
    """
    sentences = sent_tokenize(text)
    chunks = []
    for start in range(0, len(sentences), chunk_size):
        group = sentences[start:start + chunk_size]
        chunks.append(' '.join(group))
    return chunks

# Flatten every document into chunks, remembering which file each chunk came from.
all_chunks = []
chunk_sources = []

for doc, name in zip(augmented_docs, augmented_sources):
    chunks = chunk_text(doc, chunk_size=2)
    all_chunks += chunks
    chunk_sources += [name] * len(chunks)

print(f"✂️ 총 {len(all_chunks)}개의 청크 생성 완료")

# ✅ 5. Create embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(all_chunks, show_progress_bar=True)

# ✅ 6. Save
# Embeddings go to a .npy file; chunk texts go to a text file where each chunk is
# preceded by a "--- <source> | Chunk <n> ---" header (n is 1-based).
embeddings_path = os.path.join(OUTPUT_DIR, "chunk_embeddings.npy")
texts_path = os.path.join(OUTPUT_DIR, "chunk_texts.txt")

np.save(embeddings_path, embeddings)
with open(texts_path, "w", encoding="utf-8") as f:
    for number, (source, chunk) in enumerate(zip(chunk_sources, all_chunks), start=1):
        f.write(f"--- {source} | Chunk {number} ---\n{chunk}\n\n")

print("✅ 임베딩 및 청크 저장 완료")


[nltk_data] Downloading package punkt to /Users/cocoxoxo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📄 총 3개의 문서 로딩 완료
✂️ 총 6271개의 청크 생성 완료


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

✅ 임베딩 및 청크 저장 완료


In [6]:
#!pip install sentence-transformers openai python-dotenv torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [55]:
import os
import json
import torch
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai

# Load environment variables (the API key is read from the .env file)
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # None when the variable is not set

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-1.5-pro")

# Load the embedding model (SentenceTransformer)
# NOTE(review): presumably the same model that produced the stored chunk
# embeddings — confirm, since query and chunk vectors must be comparable.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Chunk data loading
CHUNK_DATA = []  # placeholder; reassigned from load_chunk_data() later in this cell
def load_chunk_data(chunk_folder="data/processed/guidelines"):
    """Load chunk texts and their embeddings from `chunk_folder`.

    Every `<name>_chunks.txt` file is paired with `<name>_embeddings.npy` in the
    same folder. The text file is split on blank lines and piece i is paired with
    embedding row i; pairing stops at the shorter of the two, and pieces that are
    empty or whitespace-only are dropped.

    Args:
        chunk_folder: Directory containing the `*_chunks.txt` / `*_embeddings.npy` pairs.

    Returns:
        A list of dicts with keys "text", "embedding" and "source"
        (the file name without the `_chunks.txt` suffix).
    """
    import glob

    records = []
    pattern = os.path.join(chunk_folder, "*_chunks.txt")

    for text_file in glob.glob(pattern):
        source_name = os.path.basename(text_file).replace("_chunks.txt", "")
        vector_file = text_file.replace("_chunks.txt", "_embeddings.npy")

        with open(text_file, encoding="utf-8") as handle:
            pieces = handle.read().split("\n\n")
        vectors = np.load(vector_file)

        # zip stops at the shorter sequence, i.e. min(len(pieces), len(vectors)) pairs.
        for piece, vector in zip(pieces, vectors):
            if not piece.strip():
                continue
            records.append({
                "text": piece,
                "embedding": vector,
                "source": source_name
            })

    print(f"✅ 총 {len(records)}개의 청크 로딩 완료")
    return records

CHUNK_DATA = load_chunk_data()

# ✅ Similar-chunk search
def search_chunks(query, top_k=10):
    """Return the chunks in CHUNK_DATA most similar to `query` by cosine similarity.

    Args:
        query: Text to embed and compare against the stored chunk embeddings.
        top_k: Maximum number of chunks to return. It is capped at the number of
            available chunks; a non-positive value yields an empty list.

    Returns:
        A list of dicts with keys "text", "source" and "score" (a Python float),
        ordered from most to least similar.
    """
    # topk raises when k exceeds the number of scores, so cap it up front; this also
    # covers an empty CHUNK_DATA, which cannot be stacked into a matrix.
    k = min(top_k, len(CHUNK_DATA))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True).cpu()
    # Stack into a single ndarray first: building a tensor straight from a list of
    # ndarrays is the slow path in PyTorch.
    chunk_embeddings = torch.from_numpy(np.stack([c["embedding"] for c in CHUNK_DATA]))
    chunk_embeddings = chunk_embeddings.to(query_embedding.dtype)
    scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
    top_results = scores.topk(k=k)

    # Convert to plain Python numbers so list indexing does not depend on tensor objects.
    return [{
        "text": CHUNK_DATA[idx]["text"],
        "source": CHUNK_DATA[idx]["source"],
        "score": score
    } for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist())]

# ✅ Answer generation with Gemini
def generate_answer_with_gemini(query, top_k=10):
    """Answer `query` with Gemini, grounded in the most similar guideline chunks.

    The prompt contains both the query text and the retrieved chunks. It no longer
    reads an undefined `summary` variable, and the retrieved context is actually
    sent to the model.

    Args:
        query: Patient information and/or question, as free text.
        top_k: Number of guideline chunks to retrieve via search_chunks.

    Returns:
        A dict with keys "query" (the input), "answer" (stripped model text) and
        "sources" (the retrieved chunk dicts from search_chunks).
    """
    chunks = search_chunks(query, top_k=top_k)
    context = "\n\n".join([f"📘 Source {i+1} ({c['source']}):\n{c['text']}" for i, c in enumerate(chunks)])

    prompt = f"""
You are a highly qualified medical assistant.

Below is the patient information and question:

{query}

Relevant excerpts retrieved from obesity treatment guidelines:

{context}

Based on these obesity treatment guidelines, please provide a personalized treatment strategy for this patient.
Be specific and cover multiple aspects such as:
- Dietary modifications
- Physical activity recommendations
- Behavioral therapy
- Pharmacological treatment
- Surgical options (if necessary)

Respond in clear and structured English.
"""

    response = gemini_model.generate_content(prompt)
    return {
        "query": query,
        "answer": response.text.strip(),
        "sources": chunks
    }

✅ 총 6551개의 청크 로딩 완료


In [63]:
# 7. Test run: ask one sample patient question and show the answer plus its sources.
if __name__ == "__main__":
    patient_info = """
    Patient Info:
    - Age: 52
    - Gender: Male
    - BMI: 38.2
    - Medical History: Type 2 Diabetes, Hypertension
    - Current Medications: Metformin 1000mg, Lisinopril 10mg

    Question:
    What is the recommended treatment strategy for this patient based on obesity clinical guidelines?
    """

    result = generate_answer_with_gemini(patient_info, top_k=10)
    answer_text = result["answer"]
    retrieved = result["sources"]

    print("🧠 Gemini Answer:\n")
    print(answer_text)

    print("\n📚 Sources Used:")
    # Only the first 500 characters of each source are printed to keep the output short.
    for rank, src in enumerate(retrieved, start=1):
        preview = src['text'][:500]
        print(f"\n--- Source {rank} ({src['source']}) ---\n{preview}")


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 9
}
]

✅ 네 파이프라인 순서 요약:
📄 PDF/TXT 문서 로딩 →

✂️ 문장을 2개씩 청크로 나눔

🔍 청크 임베딩 생성 및 저장

🧠 질문 입력 시 쿼리 임베딩 생성

🔎 가장 유사한 청크 top-k 검색

✨ 검색된 청크 + 질문으로 Gemini에게 응답 생성 요청

💬 결과 반환 + 출처까지 포함

In [67]:
#!pip install pandas numpy torch sentence-transformers bert_score google-generativeai

python(46554) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m543.3 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [6]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from bert_score import score as bert_score
from dotenv import load_dotenv
import google.generativeai as genai

# ✅ Environment setup
load_dotenv()
# The API key is read from the GEMINI_API_KEY environment variable (None if unset).
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
llm = genai.GenerativeModel("gemini-1.5-pro")
# NOTE(review): presumably the same model that produced the stored chunk
# embeddings — confirm, since query and chunk vectors must be comparable.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ 청크 로딩
def load_chunk_data(folder="data/processed/guidelines"):
    import glob
    chunk_data = []
    for chunk_file in glob.glob(os.path.join(folder, "*_chunks.txt")):
        name = os.path.basename(chunk_file).replace("_chunks.txt", "")
        emb_file = chunk_file.replace("_chunks.txt", "_embeddings.npy")

        with open(chunk_file, encoding="utf-8") as f:
            chunks = f.read().split("\n\n")
        embeddings = np.load(emb_file)

        for i in range(min(len(chunks), len(embeddings))):
            if chunks[i].strip():
                chunk_data.append({
                    "text": chunks[i],
                    "embedding": embeddings[i],
                    "source": name
                })
    return chunk_data

CHUNK_DATA = load_chunk_data()

# ✅ Shared prompt builder
def make_prompt(info):
    """Build the non-RAG treatment-strategy prompt around `info`.

    Args:
        info: Patient or cluster description to embed in the prompt; it is
            formatted into the text as-is.

    Returns:
        The prompt string, which starts and ends with a newline.
    """
    prompt_lines = [
        "",
        "You are a medical assistant.",
        "",
        "Below is the patient cluster summary or patient information:",
        "",
        f"{info}",
        "",
        "Based on obesity clinical guidelines, provide a personalized treatment strategy including:",
        "- Diet",
        "- Exercise",
        "- Medication",
        "- Counseling",
        "- Other interventions (if needed)",
        "",
        "Respond in clear and structured English.",
        "",
    ]
    return "\n".join(prompt_lines)

# ✅ Shared Gemini call with retry (used by both entry points below)
def _generate_with_retry(prompt, label, max_retries, delay_seconds):
    """Send `prompt` to the module-level `llm`, retrying on any exception.

    Args:
        prompt: Full prompt text.
        label: Short name used in the log and error messages ("Gemini" or "RAG").
        max_retries: Total number of attempts; must be at least 1.
        delay_seconds: Pause between failed attempts.

    Returns:
        The stripped response text of the first successful attempt.

    Raises:
        ValueError: If `max_retries` is below 1 (previously this silently returned None).
        RuntimeError: If every attempt failed; the last error is chained as the cause.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        try:
            response = llm.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            print(f"⚠️ {label} retry {attempt}/{max_retries} failed: {e}")
            if attempt == max_retries:
                # Chain the original error so the real cause stays visible in the traceback.
                raise RuntimeError(f"❌ Max retries exceeded for {label}.") from e
            time.sleep(delay_seconds)


# ✅ Plain Gemini call (retries, e.g. when the quota is exceeded)
def generate_response_with_gemini(info_text, max_retries=3, delay_seconds=60):
    """Generate a treatment strategy for `info_text` without retrieval.

    Args:
        info_text: Patient or cluster description passed to make_prompt.
        max_retries: Total number of attempts (at least 1).
        delay_seconds: Pause between failed attempts.

    Returns:
        The stripped model response text.

    Raises:
        ValueError: If `max_retries` is below 1.
        RuntimeError: If all attempts fail.
    """
    prompt = make_prompt(info_text)
    return _generate_with_retry(prompt, "Gemini", max_retries, delay_seconds)


# ✅ RAG-based Gemini call (retries, e.g. when the quota is exceeded)
def generate_response_with_rag(info_text, top_k=10, max_retries=3, delay_seconds=60):
    """Generate a treatment plan for `info_text` using the most similar guideline chunks.

    Args:
        info_text: Patient or cluster description; it is also the retrieval query.
        top_k: Number of chunks from CHUNK_DATA to include as context. Zero or a
            negative value includes no chunks.
        max_retries: Total number of attempts (at least 1).
        delay_seconds: Pause between failed attempts.

    Returns:
        The stripped model response text.

    Raises:
        ValueError: If CHUNK_DATA is empty or `max_retries` is below 1.
        RuntimeError: If all attempts fail.
    """
    if not CHUNK_DATA:
        raise ValueError("CHUNK_DATA is empty; load the guideline chunks before calling generate_response_with_rag.")

    query_embedding = embedding_model.encode([info_text])[0]
    chunk_embeddings = np.array([c["embedding"] for c in CHUNK_DATA])
    sims = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # Reverse first, then truncate: `argsort()[-top_k:]` would select every chunk
    # when top_k == 0 because `[-0:]` is the whole array.
    top_indices = sims.argsort()[::-1][:max(top_k, 0)]
    chunks = [CHUNK_DATA[i] for i in top_indices]

    context = "\n\n".join([f"📘 Source {i+1} ({c['source']}): {c['text']}" for i, c in enumerate(chunks)])
    prompt = f"""
You are a medical assistant. Use the following context to provide a personalized treatment plan.

Input:
{info_text}

Relevant Guideline Chunks:
{context}

Answer:"""

    return _generate_with_retry(prompt, "RAG", max_retries, delay_seconds)


In [8]:
# Bind the first ten entries of `summaries` to individually named variables.
# NOTE(review): the recorded run of this cell raised NameError because `summaries`
# is only assigned in a later cell; run that cell first or move this cell below it.
# NOTE(review): indices 0-9 are hard-coded, so `summaries` must hold at least ten items.
summary_input_0 = summaries[0]
summary_input_1 = summaries[1]
summary_input_2 = summaries[2]
summary_input_3 = summaries[3]
summary_input_4 = summaries[4]
summary_input_5 = summaries[5]
summary_input_6 = summaries[6]
summary_input_7 = summaries[7]
summary_input_8 = summaries[8]
summary_input_9 = summaries[9]

# Print each summary for a visual check.
for i in range(10):
    print(f"📘 summary_input_{i}:\n{summaries[i]}\n")


NameError: name 'summaries' is not defined

In [60]:
# ✅ Load cluster information
data = pd.read_csv("data/processed/obesity_data_with_clusters.csv")
cluster_means = pd.read_csv("data/processed/cluster_means.csv")


def _describe_cluster(label, means):
    """Render one row of cluster means as 'Cluster <label>: <col> is <value>, ...'."""
    return f"Cluster {label}: " + ", ".join(f"{k} is {v}" for k, v in means.items())


# ✅ Build one summary sentence per row of cluster_means
# NOTE(review): the list is indexed by row position below, while the text uses the
# row's index label — presumably both equal the cluster id; verify against the CSV.
summaries = [_describe_cluster(i, row) for i, row in cluster_means.iterrows()]

# ✅ Test with cluster number n
cluster_id = 1
summary_input = summaries[cluster_id]
summary_answer = generate_response_with_rag(summary_input)
print(f"\n📘 Cluster Summary RAG-based Answer:\n{summary_answer}")


📘 Cluster Summary RAG-based Answer:
Treatment Plan for Patient:

Based on the provided information, the patient presents with a BMI of 34.56, classifying them as obese (Class I).  This necessitates a comprehensive weight management plan focusing on lifestyle modifications and addressing potential comorbidities.  The patient's relatively young age (27.52) makes early intervention crucial to prevent long-term health complications.

**Goals:**

* Achieve and maintain a healthier weight, aiming for a BMI within the healthy range (18.5-24.9 kg/m²).  Realistic initial goals might include a 5-10% weight loss within the first six months.
* Improve dietary habits, focusing on a balanced, nutrient-rich diet.
* Increase physical activity levels.
* Address any underlying psychological or social factors influencing eating habits.
* Monitor and manage potential obesity-related comorbidities (e.g., hypertension, dyslipidemia, OSA).

**Interventions:**

* **Dietary Counseling:**  A registered dietiti

In [46]:
# ✅ Draw 5 sample rows from the selected cluster
# Feature columns are every column of cluster_means except the "cluster" label.
selected_features = [col for col in cluster_means.columns if col != "cluster"]
# A fixed random_state keeps the draw repeatable.
# NOTE(review): `cluster_id` is whatever an earlier cell last assigned; the recorded
# output of this cell shows rows whose cluster value is 9.
samples = data[data["cluster"] == cluster_id].sample(n=5, random_state=42)
samples

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad,BMI,cluster
502,21.90012,0,1.843419,165.057269,2,yes,3.0,3.0,no,no,2.406541,1,0.10032,0.479221,2,Public_Transportation,Obesity_Type_III,48.572062,9
1946,23.69484,0,1.637524,113.90506,2,yes,3.0,3.0,no,no,2.495961,1,0.189831,0.652289,2,Public_Transportation,Obesity_Type_III,42.478353,9
1921,25.918524,0,1.621231,104.986792,2,yes,3.0,3.0,no,no,1.653049,1,0.139159,0.711331,2,Public_Transportation,Obesity_Type_III,39.943385,9
1828,26.0,0,1.637725,111.208963,2,yes,3.0,3.0,no,no,2.70914,1,0.0,0.110518,2,Public_Transportation,Obesity_Type_III,41.462724,9
1876,19.725718,0,1.746529,129.363771,2,yes,3.0,3.0,no,no,2.250711,1,0.64235,0.685129,2,Public_Transportation,Obesity_Type_III,42.409296,9


In [48]:
# Keep only the feature columns of the sampled rows.
# NOTE(review): despite the name, the sampled row ids in the recorded output match
# the cluster-9 rows shown by the sampling cell — confirm which cluster this holds.
sample_cluster1 = samples[selected_features]
sample_cluster1

Unnamed: 0,BMI,Weight,Gender,FCVC,Height,Age,NCP,CH2O,TUE,CAEC,family_history_with_overweight,FAF,CALC
502,48.572062,165.057269,0,3.0,1.843419,21.90012,3.0,2.406541,0.479221,2,1,0.10032,2
1946,42.478353,113.90506,0,3.0,1.637524,23.69484,3.0,2.495961,0.652289,2,1,0.189831,2
1921,39.943385,104.986792,0,3.0,1.621231,25.918524,3.0,1.653049,0.711331,2,1,0.139159,2
1828,41.462724,111.208963,0,3.0,1.637725,26.0,3.0,2.70914,0.110518,2,1,0.0,2
1876,42.409296,129.363771,0,3.0,1.746529,19.725718,3.0,2.250711,0.685129,2,1,0.64235,2


In [50]:
# Build one "<col> is <value>, ..." sentence per sampled row (values rounded to 2 decimals).
# This rebinds `summaries`, which other cells use for the cluster-level summaries.
summaries = []
feature_names = list(sample_cluster1.columns)

for idx, row in sample_cluster1.iterrows():
    fragments = [f"{col} is {round(row[col], 2)}" for col in feature_names]
    summary = ", ".join(fragments)
    summaries.append(summary)

# Check
for position, text in enumerate(summaries):
    print(f"Sample {position} ➜ {text}")


Sample 0 ➜ BMI is 48.57, Weight is 165.06, Gender is 0.0, FCVC is 3.0, Height is 1.84, Age is 21.9, NCP is 3.0, CH2O is 2.41, TUE is 0.48, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.1, CALC is 2.0
Sample 1 ➜ BMI is 42.48, Weight is 113.91, Gender is 0.0, FCVC is 3.0, Height is 1.64, Age is 23.69, NCP is 3.0, CH2O is 2.5, TUE is 0.65, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.19, CALC is 2.0
Sample 2 ➜ BMI is 39.94, Weight is 104.99, Gender is 0.0, FCVC is 3.0, Height is 1.62, Age is 25.92, NCP is 3.0, CH2O is 1.65, TUE is 0.71, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.14, CALC is 2.0
Sample 3 ➜ BMI is 41.46, Weight is 111.21, Gender is 0.0, FCVC is 3.0, Height is 1.64, Age is 26.0, NCP is 3.0, CH2O is 2.71, TUE is 0.11, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.0, CALC is 2.0
Sample 4 ➜ BMI is 42.41, Weight is 129.36, Gender is 0.0, FCVC is 3.0, Height is 1.75, Age is 19.73, NCP is 3.0, CH2O is 2.25, TUE is 0.6

In [52]:
summaries

['BMI is 48.57, Weight is 165.06, Gender is 0.0, FCVC is 3.0, Height is 1.84, Age is 21.9, NCP is 3.0, CH2O is 2.41, TUE is 0.48, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.1, CALC is 2.0',
 'BMI is 42.48, Weight is 113.91, Gender is 0.0, FCVC is 3.0, Height is 1.64, Age is 23.69, NCP is 3.0, CH2O is 2.5, TUE is 0.65, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.19, CALC is 2.0',
 'BMI is 39.94, Weight is 104.99, Gender is 0.0, FCVC is 3.0, Height is 1.62, Age is 25.92, NCP is 3.0, CH2O is 1.65, TUE is 0.71, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.14, CALC is 2.0',
 'BMI is 41.46, Weight is 111.21, Gender is 0.0, FCVC is 3.0, Height is 1.64, Age is 26.0, NCP is 3.0, CH2O is 2.71, TUE is 0.11, CAEC is 2.0, family_history_with_overweight is 1.0, FAF is 0.0, CALC is 2.0',
 'BMI is 42.41, Weight is 129.36, Gender is 0.0, FCVC is 3.0, Height is 1.75, Age is 19.73, NCP is 3.0, CH2O is 2.25, TUE is 0.69, CAEC is 2.0, family_history_with_o

In [54]:
# ✅ List that collects one RAG response per summary
rag_answers = []

# ✅ Call the LLM for each summary; a failed call is recorded as the string "ERROR"
# and followed by a 60-second pause before the next summary.
for i, summary in enumerate(summaries):
    print(f"🔄 Processing Sample {i}...")
    try:
        answer = generate_response_with_rag(summary)
    except Exception as e:
        print(f"❌ Error at Sample {i}: {e}")
        rag_answers.append("ERROR")
        time.sleep(60)
    else:
        rag_answers.append(answer)
        print(f"✅ Sample {i} done.\n")


🔄 Processing Sample 0...
✅ Sample 0 done.

🔄 Processing Sample 1...
✅ Sample 1 done.

🔄 Processing Sample 2...
✅ Sample 2 done.

🔄 Processing Sample 3...
✅ Sample 3 done.

🔄 Processing Sample 4...
✅ Sample 4 done.



In [56]:
rag_answers

["Based on the provided information, the patient is a 21.9-year-old (Gender 0.0 likely indicating male) with a BMI of 48.57, classifying them as severely obese.  This is calculated from a weight of 165.06 kg and height of 1.84 m.  Additional lifestyle factors include a frequency of consuming fruits and vegetables 3 times per week (FCVC), 3 meals per day (NCP), 2.41 liters of water daily (CH2O), 0.48 hours of daily screen time (TUE), alcohol consumption twice a month (CAEC), a family history of overweight (family_history_with_overweight), low physical activity (FAF), and normal calcium consumption (CALC).\n\n**Personalized Treatment Plan:**\n\nThis patient requires a comprehensive weight management plan focused on lifestyle modifications, given their severe obesity and young age.\n\n1. **Dietary Modifications:**  A calorie deficit is crucial.  A registered dietitian referral is recommended for personalized meal planning, emphasizing nutrient-dense foods, portion control, and limiting pr

In [58]:
from bert_score import score as bert_score

# Compare every per-sample answer against the same cluster-level answer:
# the single reference is repeated once per candidate.
reference_answers = [summary_answer for _ in rag_answers]
P, R, F1 = bert_score(
    cands=rag_answers,
    refs=reference_answers,
    lang="en",
    verbose=True
)

# Print results
mean_f1 = F1.mean().item()
print(f"\n📊 평균 BERTScore F1: {mean_f1:.4f}")
for sample_no, f1_value in enumerate(F1, start=1):
    print(f"Sample {sample_no} vs Summary: F1 = {f1_value:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.35 seconds, 0.93 sentences/sec

📊 평균 BERTScore F1: 0.8724
Sample 1 vs Summary: F1 = 0.8666
Sample 2 vs Summary: F1 = 0.8771
Sample 3 vs Summary: F1 = 0.8606
Sample 4 vs Summary: F1 = 0.8984
Sample 5 vs Summary: F1 = 0.8591


cluster = 1
done in 5.46 seconds, 0.92 sentences/sec
📊 평균 BERTScore F1: 0.8723
Sample 1 vs Summary: F1 = 0.8596
Sample 2 vs Summary: F1 = 0.8743
Sample 3 vs Summary: F1 = 0.8854
Sample 4 vs Summary: F1 = 0.8690
Sample 5 vs Summary: F1 = 0.8730


📘 Cluster Summary RAG-based Answer:
Treatment Plan for Patient:

Based on the provided information, the patient presents with a BMI of 34.56, classifying them as obese (Class I).  This necessitates a comprehensive weight management plan focusing on lifestyle modifications and addressing potential comorbidities.  The patient's relatively young age (27.52) makes early intervention crucial to prevent long-term health complications.

**Goals:**

* Achieve and maintain a healthier weight, aiming for a BMI within the healthy range (18.5-24.9 kg/m²).  Realistic initial goals might include a 5-10% weight loss within the first six months.
* Improve dietary habits, focusing on a balanced, nutrient-rich diet.
* Increase physical activity levels.
* Address any underlying psychological or social factors influencing eating habits.
* Monitor and manage potential obesity-related comorbidities (e.g., hypertension, dyslipidemia, OSA).

**Interventions:**

* **Dietary Counseling:**  A registered dietitian referral is recommended for personalized dietary guidance. This should include education on portion control, healthy food choices, and meal planning.  The patient's FCVC score of 2.21 suggests a need for increased fruit and vegetable consumption.  Addressing NCP (2.9 meals per day) can help establish regular eating patterns.  Limiting CAEC (1.98 - almost "sometimes") is also important to curb emotional eating patterns.
* **Physical Activity:**  Develop an exercise plan tailored to the patient's current fitness level and preferences. Start with moderate-intensity aerobic exercise (e.g., brisk walking, cycling) for at least 150 minutes per week, gradually increasing intensity and duration.  The low FAF score (1.18) indicates a need for increased physical activity.
* **Behavioral Therapy:** Consider cognitive behavioral therapy (CBT) to address underlying emotional and behavioral factors contributing to overeating. This can help modify unhealthy eating habits and develop coping mechanisms for emotional eating.
* **Support Groups:** Encourage participation in support groups or weight management programs to provide ongoing motivation and social support.
* **Medication:** While lifestyle modifications are the cornerstone of obesity treatment,  pharmacological interventions may be considered in conjunction with lifestyle changes if initial interventions are unsuccessful.  This should be discussed with the physician and is not indicated at this initial stage.
* **Monitoring:**  Regular monitoring of weight, BMI, waist circumference, blood pressure, and lipid profile is essential to assess progress and identify potential complications.  Given the family history of overweight (family_history_with_overweight = 0.99), close monitoring is particularly important.

**Further Assessment:**

* **Sleep Assessment:**  Given the high prevalence of obesity and obstructive sleep apnea (OSA), screening for OSA should be considered (e.g., STOP-BANG questionnaire).
* **Comprehensive Metabolic Panel:** To assess for obesity-related comorbidities such as diabetes, dyslipidemia, and fatty liver disease.

This plan should be reviewed and adjusted regularly based on the patient's progress and individual needs.  Collaboration with the physician and other healthcare professionals (e.g., registered dietitian, mental health professional) is crucial for comprehensive and effective management.

**Disclaimer:**  This treatment plan is based on the provided information and general guidelines. It should not be considered a substitute for professional medical advice.  A physician should be consulted for diagnosis and personalized treatment recommendations.

cluster = 4

done in 5.50 seconds, 0.91 sentences/sec

📊 평균 BERTScore F1: 0.8691
Sample 1 vs Summary: F1 = 0.8655
Sample 2 vs Summary: F1 = 0.8645
Sample 3 vs Summary: F1 = 0.8673
Sample 4 vs Summary: F1 = 0.8869
Sample 5 vs Summary: F1 = 0.8614


📘 Cluster Summary RAG-based Answer:
Based on the provided information, your BMI is 23.62 kg/m², classifying you as having a healthy weight according to the AACE guidelines (Source 7: BMI 25-29.9 kg/m² is overweight, BMI ≥30 kg/m² is obese).  However, it's important to look beyond just BMI.  Your age (20.62 years), reported dietary habits (FCVC: 2.69, NCP: 2.48, CH2O: 2.07), physical activity (FAF: 1.31), and other factors (CAEC: 1.69, TUE: 0.73) will be considered in developing a personalized plan. The family history of overweight (0.61) is also noted.

While your BMI is currently within the healthy range, it's crucial to maintain a healthy lifestyle to prevent future weight gain and associated health risks.

**Personalized Treatment Plan:**

1. **Dietary Assessment and Counseling:** A detailed dietary assessment will help identify areas for improvement in your eating habits.  This may involve keeping a food diary or discussing your typical meals and snacks.  Counseling will focus on promoting a balanced diet rich in fruits, vegetables, and whole grains, while limiting processed foods, sugary drinks, and unhealthy fats.  Specific recommendations will be tailored to your preferences and cultural background.  Your current FCVC, NCP, and CH2O values suggest areas we can discuss for improvement.

2. **Physical Activity Recommendations:**  Regular physical activity is crucial for maintaining a healthy weight and overall well-being.  We will discuss your current activity levels (FAF: 1.31) and develop a personalized exercise plan that you find enjoyable and sustainable.  This may include a combination of aerobic exercise, strength training, and flexibility exercises.

3. **Behavioral Modification Strategies:**  Behavioral strategies can help you adopt and maintain healthy habits. This may include setting realistic goals, self-monitoring, problem-solving, and stress management techniques.  Your CAEC and TUE values suggest we can discuss coping mechanisms and potential triggers for unhealthy behaviors.

4. **Ongoing Monitoring and Support:**  Regular follow-up appointments will be scheduled to monitor your progress, provide support, and make adjustments to the treatment plan as needed.  We will track your weight, BMI, and other relevant health indicators.

5. **Addressing Family History:**  Given your family history of overweight, we will discuss strategies to mitigate your genetic predisposition.  This may involve extra emphasis on healthy lifestyle choices and potentially increased monitoring.

This plan focuses on preventative measures and promoting long-term healthy habits.  Because your BMI is currently in the healthy range, interventions such as medication or surgery are not indicated at this time.  However, we will continue to monitor your progress and adjust the plan as needed.  It is important to remember that maintaining a healthy weight is an ongoing process that requires commitment and lifestyle changes.  We will work together to achieve your health goals.

cluster = 9

done in 5.35 seconds, 0.93 sentences/sec

📊 평균 BERTScore F1: 0.8724
Sample 1 vs Summary: F1 = 0.8666
Sample 2 vs Summary: F1 = 0.8771
Sample 3 vs Summary: F1 = 0.8606
Sample 4 vs Summary: F1 = 0.8984
Sample 5 vs Summary: F1 = 0.8591

📘 Cluster Summary RAG-based Answer:
Treatment Plan for Patient in Cluster 9

This patient is a 23.65-year-old female with a BMI of 42.05, classifying her as having severe obesity (Class III).  Her weight is 120.14 kg and height is 1.69 m.  Additional relevant factors include:

* **FCVC (Frequency of Consumption of Vegetables):** 2.99 (likely on a scale of 1-7, indicating relatively low vegetable consumption)
* **NCP (Number of main meals):** 2.99 (suggests inconsistent meal patterns)
* **CH2O (Water Consumption):** 2.22 (likely inadequate)
* **TUE (Time using electronic devices):** 0.6 (potentially low, but units unclear)
* **CAEC (Consumption of food between meals):** 2.0 (likely moderate snacking)
* **Family History of Overweight:** Yes
* **FAF (Physical Activity Frequency):** 0.65 (likely low physical activity)
* **CALC (Alcohol Consumption):** 2.02 (moderate alcohol consumption)

**Goals:**

* **Weight loss:**  Targeting a healthier BMI range through diet, exercise, and lifestyle modifications.
* **Improved diet:**  Increased consumption of vegetables, consistent meal patterns, and increased water intake.
* **Increased physical activity:** Implementing a regular exercise program tailored to the patient's abilities and preferences.
* **Behavior modification:** Addressing emotional eating, snacking habits, and potential lifestyle factors contributing to obesity.
* **Screening and management of comorbidities:**  Assessing for and managing potential weight-related complications like obstructive sleep apnea (OSA), cardiovascular disease, and type 2 diabetes.


**Action Plan:**

1. **Comprehensive Medical Evaluation:**  A full medical history and physical exam are crucial.  This should include blood pressure, fasting glucose, lipid panel, and liver function tests.  Given the family history of overweight and high BMI, screening for other obesity-related conditions is warranted.

2. **Dietary Counseling:** A registered dietitian should be consulted to create a personalized meal plan focusing on:
    * Calorie reduction appropriate for gradual and sustainable weight loss.
    * Increased fruit and vegetable intake.
    * Portion control.
    * Regular meal timing and minimizing snacking between meals.
    * Limiting processed foods, sugary drinks, and unhealthy fats.

3. **Physical Activity Program:** Begin with moderate-intensity exercise and gradually increase the duration and intensity as tolerated.  The patient should aim for at least 150 minutes of moderate-intensity or 75 minutes of vigorous-intensity aerobic exercise per week.  Incorporating strength training exercises 2-3 times per week is also beneficial.  An exercise physiologist or certified personal trainer can help develop a safe and effective program.

4. **Behavior Modification Therapy:**  This can help identify and address underlying emotional or psychological factors contributing to overeating and unhealthy habits. Cognitive behavioral therapy (CBT) and other behavioral strategies can be effective.

5. **Consider Pharmacotherapy:** If lifestyle modifications are insufficient, weight-loss medications may be discussed with the physician.

6. **Surgical Options:**  Bariatric surgery might be considered in patients with severe obesity (BMI ≥ 40) who have not responded to other treatments. This requires further evaluation and consultation with a bariatric surgeon.

7. **Ongoing Monitoring and Support:** Regular follow-up appointments are essential to monitor progress, provide support, and adjust the treatment plan as needed.  This includes tracking weight, BMI, blood pressure, and other relevant health markers.  Connecting the patient with support groups or online communities can provide additional motivation and encouragement.


**Important Considerations:**

* This treatment plan is based on the provided information and general guidelines.  A personalized plan requires a full medical evaluation and discussion with the patient's physician and other healthcare professionals.
* Patient adherence to the treatment plan is critical for successful weight loss and improved health outcomes.
* Addressing any underlying psychological or social factors contributing to obesity is essential.
* Regular monitoring and support are crucial for long-term success.

This comprehensive approach aims to address the patient's obesity and promote long-term health and well-being.

In [62]:
from bert_score import score as bert_score
import pandas as pd

# Load the clustered patient table and the per-cluster feature means.
data = pd.read_csv("data/processed/obesity_data_with_clusters.csv")
cluster_means = pd.read_csv("data/processed/cluster_means.csv")

# Build one text summary per row of cluster_means:
# "Cluster <row index>: <column> is <value>, <column> is <value>, ..."
summaries = []
for row_label, mean_row in cluster_means.iterrows():
    feature_text = ", ".join(f"{col} is {val}" for col, val in mean_row.items())
    summaries.append(f"Cluster {row_label}: " + feature_text)

# Select clusters 1, 4 and 9.
cluster_ids = [1, 4, 9]
cluster_summaries = [summaries[cid] for cid in cluster_ids]

# Generate one RAG answer per selected cluster summary.
# generate_response_with_rag is not defined in this cell; it must already exist in the kernel.
rag_answers = [generate_response_with_rag(text) for text in cluster_summaries]

# Every answer is scored against the same reference text: the summary of the
# first selected cluster, repeated once per candidate.
reference = cluster_summaries[0]
P, R, F1 = bert_score(
    cands=rag_answers,
    refs=[reference] * len(rag_answers),
    lang="en",
    verbose=True,
)

# Report the mean F1 and the per-answer F1.
print(f"\n📊 평균 BERTScore F1: {F1.mean().item():.4f}")
for sample_no, f1_value in enumerate(F1, start=1):
    print(f"Sample {sample_no} vs Summary: F1 = {f1_value:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.63 seconds, 0.53 sentences/sec

📊 평균 BERTScore F1: 0.8204
Sample 1 vs Summary: F1 = 0.8221
Sample 2 vs Summary: F1 = 0.8202
Sample 3 vs Summary: F1 = 0.8190


In [2]:
# ✅ Evaluate one cluster per run (change `cluster_id` to pick the cluster).
#
# NOTE: this cell uses names it does not define: `summaries`, `data`,
# `selected_features`, `label_encoders`, `generate_response_with_rag`,
# `bert_score` and `results`. They must already exist in the kernel, otherwise
# the cell stops with a NameError.
import time  # used for the waits between API calls below

cluster_id = 7

print(f"\n🔍 Evaluating Cluster {cluster_id}...")
summary_input = summaries[cluster_id]
summary_answer = generate_response_with_rag(summary_input)
time.sleep(60)  # wait between calls to stay under the API quota

# Draw up to 5 rows from this cluster (fewer if the cluster is smaller, so
# DataFrame.sample does not raise); seeded with the cluster id.
cluster_rows = data[data["cluster"] == cluster_id]
cluster_samples = cluster_rows.sample(n=min(5, len(cluster_rows)), random_state=cluster_id)
cluster_samples = cluster_samples[selected_features]
sample_texts = []


def _format_value(value):
    """Round numeric values to 2 decimals; return anything else (e.g. decoded labels) unchanged."""
    try:
        return round(value, 2)
    except TypeError:
        return value


for _, row in cluster_samples.iterrows():
    # Object dtype so a decoded (non-numeric) label can be stored next to numbers.
    patient_info = row[selected_features].astype(object)
    for col in label_encoders:
        if col not in patient_info.index:
            continue  # encoder for a column that is not part of the selected features
        # Decode the integer code back to its label: classes_[code] is the
        # label at that position. (The previous mapping was keyed by label,
        # so looking it up with the integer code did not decode anything.)
        patient_info[col] = label_encoders[col].classes_[int(patient_info[col])]
    text = ", ".join([f"{k} is {_format_value(patient_info[k])}" for k in selected_features])
    sample_texts.append(text)

# Generate one RAG answer per sampled patient.
sample_answers = []
for i, s in enumerate(sample_texts):
    print(f"  ➤ Sample {i+1} processing...")
    answer = generate_response_with_rag(s)
    sample_answers.append(answer)
    time.sleep(60)  # wait between calls to stay under the API quota

# BERTScore of each sample answer against the cluster-summary answer.
try:
    P, R, F1 = bert_score(
        cands=sample_answers,
        refs=[summary_answer] * len(sample_answers),
        lang="en",
        verbose=False
    )
    mean_f1 = F1.mean().item()
    sample_f1s = [round(f.item(), 4) for f in F1]
except Exception as e:
    print(f"❌ BERTScore Error: {e}")
    # Do not fall through to an undefined (or stale, from an earlier cell) F1.
    mean_f1 = None
    sample_f1s = []

results.append({
    "cluster_id": cluster_id,
    "bert_f1": mean_f1,
    "sample_f1s": sample_f1s
})



🔍 Evaluating Cluster 7...


NameError: name 'summaries' is not defined

In [107]:
# Display the evaluation records accumulated in `results`
# (one dict per evaluated cluster: cluster_id, bert_f1, sample_f1s).
results

[{'cluster_id': 0,
  'bert_f1': 0.8780937194824219,
  'sample_f1s': [0.88, 0.8882, 0.8815, 0.8728, 0.8679]},
 {'cluster_id': 2,
  'bert_f1': 0.8775218725204468,
  'sample_f1s': [0.8708, 0.8901, 0.8768, 0.87, 0.88]}]

In [None]:
# TODO: run the evaluation for cluster 7,
# then compare the means of clusters 0, 2 and 7.

In [6]:
# Feed the 10 cluster summaries directly to the LLM and look at the resulting treatment plans (with RAG)
import os
import time
import glob
import pandas as pd
import numpy as np
import torch
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai

# ✅ 1. Environment setup
# Load environment variables via python-dotenv so the API key is not hard-coded in the notebook.
load_dotenv()
# os.getenv returns None when GEMINI_API_KEY is not set; nothing in this cell checks for that.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# Sentence-embedding model used by search_chunks to embed the query text.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Gemini model used by generate_cluster_treatment to produce the treatment text.
gemini_model = genai.GenerativeModel("gemini-1.5-pro")

# ✅ 2. Load the RAG chunks
def load_chunk_data(folder="data/processed/guidelines"):
    """Load chunk texts and their embeddings from ``folder``.

    For every ``<name>_chunks.txt`` file in ``folder`` the matching
    ``<name>_embeddings.npy`` file is loaded. The text file is split on blank
    lines (``"\\n\\n"``) and chunk ``i`` is paired with embedding row ``i``.

    Files are processed in sorted path order so the returned list is the same
    on every run, whatever order the filesystem lists the directory in.

    Args:
        folder: Directory that holds the ``*_chunks.txt`` / ``*_embeddings.npy`` pairs.

    Returns:
        A list of dicts with keys ``"text"`` (chunk string), ``"embedding"``
        (the matching row of the loaded array) and ``"source"`` (``<name>``).
        Chunks that are empty or whitespace-only are skipped.

    Raises:
        Whatever ``open`` / ``np.load`` raise, e.g. when an embeddings file is missing.
    """
    chunk_suffix = "_chunks.txt"
    embedding_suffix = "_embeddings.npy"

    chunk_data = []
    for chunk_file in sorted(glob.glob(os.path.join(folder, "*" + chunk_suffix))):
        # Strip only the trailing suffix. str.replace on the whole path would
        # also rewrite an identical substring inside a directory name.
        stem = chunk_file[:-len(chunk_suffix)]
        name = os.path.basename(stem)
        emb_file = stem + embedding_suffix

        with open(chunk_file, encoding="utf-8") as f:
            chunks = f.read().split("\n\n")
        embeddings = np.load(emb_file)

        # Pair by position; if the two lengths differ, only the common prefix is used.
        for i in range(min(len(chunks), len(embeddings))):
            if chunks[i].strip():
                chunk_data.append({
                    "text": chunks[i],
                    "embedding": embeddings[i],
                    "source": name
                })
    return chunk_data

# Load all chunks once when this cell runs, using load_chunk_data's default folder;
# search_chunks reads this module-level list.
CHUNK_DATA = load_chunk_data()

# ✅ 3. Retrieve similar chunks
def search_chunks(query, top_k=10):
    """Return the entries of ``CHUNK_DATA`` most similar to ``query``.

    The query is embedded with ``embedding_model`` and compared to every
    stored chunk embedding by cosine similarity.

    Args:
        query: Text to search for.
        top_k: Maximum number of chunks to return. Values larger than the
            number of stored chunks are clamped, because ``topk`` raises when
            ``k`` exceeds the number of scores.

    Returns:
        Up to ``top_k`` dicts from ``CHUNK_DATA``, in the order returned by
        ``topk`` (highest score first). An empty list when ``CHUNK_DATA`` is
        empty or ``top_k`` is not positive.
    """
    if not CHUNK_DATA or top_k <= 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True).cpu()
    # Stack into one ndarray first: building a tensor from a Python list of
    # ndarrays is the slow path.
    chunk_embeddings = torch.tensor(np.stack([c["embedding"] for c in CHUNK_DATA])).cpu()
    scores = util.cos_sim(query_embedding, chunk_embeddings)[0]

    k = min(top_k, len(CHUNK_DATA))
    top_indices = scores.topk(k=k).indices.tolist()
    return [CHUNK_DATA[i] for i in top_indices]

# ✅ 4. Generate the treatment strategy
def generate_cluster_treatment(cluster_index, summary, top_k=3):
    """Ask the Gemini model for a treatment plan for one cluster.

    Retrieves ``top_k`` chunks for ``summary`` with ``search_chunks``, lists
    them (numbered from 1, with their source name) in the prompt together
    with the cluster summary, and returns the model's reply.

    Args:
        cluster_index: Cluster number shown in the prompt.
        summary: Text summary of the cluster, also used as the retrieval query.
        top_k: Number of chunks to retrieve.

    Returns:
        The response text with surrounding whitespace stripped.
    """
    retrieved = search_chunks(summary, top_k=top_k)

    excerpt_lines = []
    for rank, chunk in enumerate(retrieved, start=1):
        excerpt_lines.append(f"📘 {rank} ({chunk['source']}): {chunk['text']}")
    context = "\n\n".join(excerpt_lines)

    prompt = f"""
You are a highly qualified medical assistant.

Below is a summary of the average health profile for patients in Cluster {cluster_index}:

{summary}

Use the retrieved guideline excerpts below to craft a treatment plan:

{context}

Based on obesity treatment guidelines, please provide a personalized treatment strategy for this cluster.
Be specific and cover multiple aspects such as:
- Dietary modifications
- Physical activity recommendations
- Behavioral therapy
- Pharmacological treatment
- Surgical options (if necessary)

Respond in clear and structured English.
"""

    reply = gemini_model.generate_content(prompt)
    return reply.text.strip()

# ✅ 5. Build the cluster summaries
cluster_means = pd.read_csv("data/processed/cluster_means.csv")
# One line per row: "Cluster <row index>: <column> is <value>, ..."
summaries = []
for row_label, mean_row in cluster_means.iterrows():
    feature_text = ", ".join(f"{col} is {val}" for col, val in mean_row.items())
    summaries.append(f"Cluster {row_label}: " + feature_text)

# ✅ 6. Load earlier results so clusters that are already saved are not regenerated
save_path = "data/processed/cluster_treatment_strategies_rag.csv"
if not os.path.exists(save_path):
    existing_df = pd.DataFrame(columns=["cluster", "summary", "treatment_strategy"])
else:
    existing_df = pd.read_csv(save_path)
# An empty frame yields an empty set, so one expression covers both cases.
processed_clusters = set(existing_df["cluster"].tolist())

# ✅ 7. Generate and save
for cluster_no, cluster_summary in enumerate(summaries):
    if cluster_no in processed_clusters:
        print(f"⏩ Cluster {cluster_no} 이미 처리됨")
        continue

    try:
        record = {
            "cluster": cluster_no,
            "summary": cluster_summary,
            "treatment_strategy": generate_cluster_treatment(cluster_no, cluster_summary),
        }
        existing_df = pd.concat([existing_df, pd.DataFrame([record])], ignore_index=True)
        # Write after every cluster so an interrupted run can resume from the CSV.
        existing_df.to_csv(save_path, index=False)
        print(f"✅ Cluster {cluster_no} 완료")
        time.sleep(10)  # pause between calls to avoid exceeding the quota
    except Exception as err:
        # A failed cluster is not retried in this run; it is simply absent from the CSV.
        print(f"❌ Cluster {cluster_no} 실패: {err}")
        print("⏳ 60초 대기 후 다음 클러스터 시도")
        time.sleep(60)

print("📄 cluster_treatment_strategies_rag.csv 저장 완료")


✅ Cluster 0 완료
✅ Cluster 1 완료
✅ Cluster 2 완료
✅ Cluster 3 완료
✅ Cluster 4 완료
✅ Cluster 5 완료
✅ Cluster 6 완료
✅ Cluster 7 완료
✅ Cluster 8 완료
✅ Cluster 9 완료
📄 cluster_treatment_strategies_rag.csv 저장 완료
