In [7]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# ✅ Step 1: Load & Process Dataset
file_path = "Extracted_Medical_Q_A.csv"
df = pd.read_csv(file_path)

# ✅ Debugging: Check Available Columns
print("📊 Available Columns:", df.columns)

# ✅ Ensure "Question" & "Answer" Columns Exist
if "Question" not in df.columns or "Clean_Answer" not in df.columns:
    raise KeyError("❌ Missing 'Question' or 'Clean_Answer' column in dataset.")

# ✅ Step 2: Remove Duplicates & NaNs
df = df.dropna(subset=["Question", "Clean_Answer"]).drop_duplicates(subset=["Question"]).reset_index(drop=True)

# ✅ Debugging: Print Extracted Questions
print("\n📢 Extracted Questions Sample:")
print(df["Question"].head(10))

# ✅ Step 3: Load Pretrained BERT Model
bert_model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight & optimized for Q&A

# ✅ Step 4: Compute BERT Sentence Embeddings for Questions
question_embeddings = bert_model.encode(df["Question"].tolist(), convert_to_tensor=True)

# ✅ Debugging: Check Shape of Embeddings
print("\n📊 Question Embeddings Shape:", question_embeddings.shape)

if question_embeddings.shape[0] == 0:
    raise ValueError("❌ No question embeddings found. Check dataset processing!")

# ✅ Step 5: Improved Token Processing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """
    Tokenizes and preprocesses input text:
    - Removes stopwords
    - Applies lemmatization
    """
    tokens = word_tokenize(text.lower())  # Convert to lowercase & tokenize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(processed_tokens)

# ✅ Step 6: Train TF-IDF Model on Processed Questions
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Question"].apply(preprocess_text))

# ✅ Step 7: Hybrid Retrieval (BERT + TF-IDF)
def retrieve_best_answer(input_question, top_k=3):
    """
    Uses a hybrid approach (TF-IDF + BERT) to retrieve the best-matched question and return the corresponding answer.
    """
    # 🔹 Process input question
    processed_input = preprocess_text(input_question)

    # 🔹 TF-IDF Similarity
    input_vector = vectorizer.transform([processed_input])
    tfidf_scores = np.dot(input_vector, tfidf_matrix.T).toarray().flatten()

    # 🔹 BERT Similarity
    input_embedding = bert_model.encode([input_question], convert_to_tensor=True).view(1, -1)
    bert_scores = util.pytorch_cos_sim(input_embedding, question_embeddings)[0].cpu().numpy()

    # 🔹 Combine Scores (Weighted Sum)
    final_scores = (tfidf_scores * 0.4) + (bert_scores * 0.6)  # Adjust weights for better accuracy

    # 🔹 Get Top Matching Questions
    top_indices = np.argsort(final_scores)[-top_k:][::-1]

    # 🔹 Retrieve Best Matching Answer
    results = []
    for idx in top_indices:
        matched_question = df.iloc[idx]["Question"]
        matched_answer = df.iloc[idx]["Clean_Answer"]
        score = final_scores[idx]
        results.append((matched_question, matched_answer, score))

    return results

# ✅ Step 8: Test the Answer Retrieval
input_question = "What are the symptoms of diabetes?"
retrieved_answers = retrieve_best_answer(input_question, top_k=1)

# ✅ Step 9: Display Retrieved Answer
print("\n🔍 **Input Question:**", input_question)
print("\n🎯 **Best Matched Answer:**")
for idx, (matched_question, answer, score) in enumerate(retrieved_answers, start=1):
    print(f"{idx}. Matched Question: {matched_question} (Score: {score:.4f})")
    print(f"   ✅ Answer: {answer}")





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


📊 Available Columns: Index(['Question', 'Clean_Answer'], dtype='object')

📢 Extracted Questions Sample:
0    What is (are) Polycystic ovary syndrome ? (Als...
1    What causes Polycystic ovary syndrome ? (Also ...
2                        What causes Noonan syndrome ?
3      What are the complications of Noonan syndrome ?
4                     How to prevent Noonan syndrome ?
5    What are the symptoms of Neurofibromatosis-Noo...
6    Is Noonan syndrome inherited ? (Also called: M...
7    What are the treatments for Noonan syndrome ? ...
8    How many people are affected by polycystic kid...
9    What are the treatments for polycystic kidney ...
Name: Question, dtype: object

📊 Question Embeddings Shape: torch.Size([1805, 384])

🔍 **Input Question:** What are the symptoms of diabetes?

🎯 **Best Matched Answer:**
1. Matched Question: What is (are) Diabetes ? (Score: 0.6757)
   ✅ Answer: URL: http://nihseniorhealth.gov/diabetes/toc.html
Answer: Heart disease and stroke are the leading ca

In [11]:
# This piece of CODE gives us three most relevant  matched questions and answers from the data set .
# I haven't made translations into this for now!
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# ✅ Load & Process Dataset
file_path = "Extracted_Medical_Q_A.csv"
df = pd.read_csv(file_path)

# ✅ Check necessary columns
if "Question" not in df.columns or "Clean_Answer" not in df.columns:
    raise KeyError("Missing 'Question' or 'Clean_Answer' column in dataset.")

# ✅ Remove Duplicates & NaNs
df = df.dropna(subset=["Question", "Clean_Answer"]).drop_duplicates(subset=["Question"]).reset_index(drop=True)

# ✅ Load Pretrained BERT Model
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Compute Sentence Embeddings
question_embeddings = bert_model.encode(df["Question"].tolist(), convert_to_tensor=True)

# ✅ Check if embeddings exist
if question_embeddings.shape[0] == 0:
    raise ValueError("No question embeddings found. Check dataset processing!")

# ✅ Preprocessing Functions
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def truncate_answer(answer, max_sentences=3):
    import re
    sentences = re.split(r'(?<=[.!?]) +', answer.strip())
    return ' '.join(sentences[:max_sentences])

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(processed_tokens)

# ✅ Train TF-IDF Model
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Question"].apply(preprocess_text))

# ✅ Hybrid Retrieval
def retrieve_best_answer(input_question, top_k=3, match_threshold=0.90):
    processed_input = preprocess_text(input_question)
    input_vector = vectorizer.transform([processed_input])
    tfidf_scores = np.dot(input_vector, tfidf_matrix.T).toarray().flatten()
    input_embedding = bert_model.encode([input_question], convert_to_tensor=True).view(1, -1)
    bert_scores = util.pytorch_cos_sim(input_embedding, question_embeddings)[0].cpu().numpy()
    final_scores = (tfidf_scores * 0.4) + (bert_scores * 0.6)
    top_indices = np.argsort(final_scores)[-top_k:][::-1]

    best_idx = top_indices[0]
    best_score = final_scores[best_idx]
    matched_question = df.iloc[best_idx]["Question"]
    matched_answer = df.iloc[best_idx]["Clean_Answer"]

    if matched_question.strip().lower() == input_question.strip().lower() or best_score >= match_threshold:
        return [(matched_question, truncate_answer(matched_answer), best_score)]

    results = []
    for idx in top_indices:
        matched_q = df.iloc[idx]["Question"]
        matched_a = df.iloc[idx]["Clean_Answer"]
        score = final_scores[idx]
        results.append((matched_q, truncate_answer(matched_a), score))

    return results

# ✅ Example Run (Only Output Relevant Answers)
# ✅ Example Run (Take Input from User)
while True:
    input_question = input("❓ Enter your medical question (or type 'exit' to quit): ")

    if input_question.strip().lower() == "exit":
        print("👋 Exiting. Stay healthy!")
        break

    retrieved_answers = retrieve_best_answer(input_question, top_k=3)

    print("\n🔍 Top Matching Results:\n")
    for idx, (matched_question, answer, score) in enumerate(retrieved_answers, start=1):
        print(f"{idx}. Matched Question: {matched_question} (Score: {score:.4f})")
        print(f"   ✅ Answer: {answer}\n")



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



🔍 Top Matching Results:

1. Matched Question: Do you have information about Glucose urine test (Also called: Urine sugar test; Urine glucose test; Glucosuria test; Glycosuria test) (Score: 0.8511)
   ✅ Answer: URL: https://www.nlm.nih.gov/medlineplus/ency/article/003581.htm
Answer: Summary : The glucose urine test measures the amount of sugar (glucose) in a urine sample. The presence of glucose in the urine is called glycosuria or glucosuria.  Glucose level can also be measured using a blood test or a cerebrospinal fluid test.

How the Test is Performed : After you provide a urine sample, it is tested right away. The health care provider uses a dipstick made with a color-sensitive pad. The color the dipstick changes to tells the provider the level of glucose in your urine.   If needed,your provider may ask you to collect your urine at home over 24 hours. Your provider will tell you how to do this. Follow instructions exactly so that the results are accurate.

How to Prepare for the Te

KeyboardInterrupt: Interrupted by user

In [15]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.IndicTransToolkit import IndicProcessor

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text

        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [None]:
# Maincode + translation logic -final

In [None]:
# === IMPORTS ===
import os
import pandas as pd
import numpy as np
import torch
import nltk
import scispacy
import spacy
import warnings
import time

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
warnings.filterwarnings('ignore', category=FutureWarning)

# === DOWNLOAD NLTK STUFF ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# === LOAD NLP & MODELS ===
nlp = spacy.load("en_core_sci_sm")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# === LOAD DATASET ===
file_path = "Extracted_Medical_Q_A.csv"
df = pd.read_csv(file_path)

if "Question" not in df.columns or "Clean_Answer" not in df.columns:
    raise KeyError("Missing 'Question' or 'Clean_Answer' column in dataset.")

df = df.dropna(subset=["Question", "Clean_Answer"]).drop_duplicates(subset=["Question"]).reset_index(drop=True)

# === LOAD EMBEDDINGS ===
bert_model = SentenceTransformer("all-MiniLM-L6-v2")
question_embeddings = bert_model.encode(df["Question"].tolist(), convert_to_tensor=True)

# === TF-IDF ===
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(processed_tokens)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Question"].apply(preprocess_text))

# === TRANSLATION MODEL LOADING ===
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# === TRANSLATION WRAPPER ===
def translate_text(text, src_lang, tgt_lang, model, tokenizer, ip):
    try:
        translations = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
        return translations[0] if translations else None
    except Exception as e:
        print(f"Translation Error: {e}")
        return None

# === TRUNCATE LONG ANSWERS ===
def truncate_answer(answer, max_sentences=3):
    import re
    sentences = re.split(r'(?<=[.!?]) +', answer.strip())
    return ' '.join(sentences[:max_sentences])

# === MAIN RETRIEVAL LOGIC ===
def retrieve_best_answer(input_question_en, top_k=3, match_threshold=0.90):
    processed_input = preprocess_text(input_question_en)
    input_vector = vectorizer.transform([processed_input])
    tfidf_scores = np.dot(input_vector, tfidf_matrix.T).toarray().flatten()
    input_embedding = bert_model.encode([input_question_en], convert_to_tensor=True).view(1, -1)
    bert_scores = util.pytorch_cos_sim(input_embedding, question_embeddings)[0].cpu().numpy()
    final_scores = (tfidf_scores * 0.4) + (bert_scores * 0.6)
    top_indices = np.argsort(final_scores)[-top_k:][::-1]

    best_idx = top_indices[0]
    best_score = final_scores[best_idx]
    matched_question = df.iloc[best_idx]["Question"]
    matched_answer = df.iloc[best_idx]["Clean_Answer"]

    if matched_question.strip().lower() == input_question_en.strip().lower() or best_score >= match_threshold:
        return [(matched_question, truncate_answer(matched_answer), best_score)]

    results = []
    for idx in top_indices:
        matched_q = df.iloc[idx]["Question"]
        matched_a = df.iloc[idx]["Clean_Answer"]
        score = final_scores[idx]
        results.append((matched_q, truncate_answer(matched_a), score))

    return results

# === COMBINED RETRIEVAL SYSTEM: PUNJABI ENGLISH PUNJABI ===
def punjabi_medical_qa_clir(punjabi_query, top_k=3):
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Step 1: Translate Punjabi to English
    print("\n🔄 Translating Punjabi query to English...")
    english_query = translate_text(punjabi_query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip)
    if not english_query:
        return ["❌ Translation to English failed. Please try again."]

    print(f"\n🔎 Translated Query (EN): {english_query}")

    # Step 2: Retrieve top answers in English
    print("\n📡 Retrieving top answers in English...")
    results = retrieve_best_answer(english_query, top_k=top_k)

    print("\n🎯 Retrieved answers in English:")
    for idx, (matched_question, answer, score) in enumerate(results, 1):
        print(f"{idx}. {matched_question} \nAnswer: {answer} (Score: {score:.4f})")

    # Step 3: Translate results back to Punjabi
    print("\n🔄 Translating results back to Punjabi...")
    punjabi_outputs = []
    for idx, (matched_question, answer, score) in enumerate(results, 1):
        translated_answer = translate_text(answer, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if not translated_answer:
            translated_answer = "❌ Translation error while returning result."

        punjabi_outputs.append(f"{idx}. {translated_answer} (Score: {score:.4f})")

    return punjabi_outputs

# === INTERACTIVE MODE ===
if __name__ == "__main__":
    while True:
        query = input("\n🤖 ਤੁਹਾਡਾ ਮੈਡੀਕਲ ਸਵਾਲ ਦਰਜ ਕਰੋ (Punjabi) (or type 'exit'): ").strip()
        if query.lower() == "exit":
            print("👋 Exiting. Stay healthy!")
            break

        answers = punjabi_medical_qa_clir(query, top_k=3)
        print("\n📋 ਉਚਿਤ ਜਵਾਬ (Punjabi Summaries):\n")
        for line in answers:
            print(line)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
