In [4]:
pip install pandas torch transformers tqdm scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.IndicTransToolkit import IndicProcessor

# Number of sentences handled per iteration in batch_translate.
BATCH_SIZE = 4
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load a seq2seq checkpoint and its tokenizer, optionally quantized.

    Args:
        ckpt_dir: Model identifier or path handed to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to build a
            ``BitsAndBytesConfig``; any other value (including ``None``)
            loads the model without a quantization config.

    Returns:
        tuple: ``(tokenizer, model)`` with the model switched to eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # Only ``load_in_8bit`` is passed: the double-quant / compute-dtype
        # options are spelled ``bnb_4bit_*`` and have no ``bnb_8bit_*``
        # counterparts among the documented BitsAndBytesConfig arguments.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Unquantized models are moved to DEVICE explicitly and, on CUDA,
    # converted to half precision; quantized models are left as loaded.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate ``input_sentences`` in chunks of ``BATCH_SIZE``.

    Each chunk is preprocessed with ``ip.preprocess_batch``, tokenized,
    run through ``model.generate`` with beam search (5 beams, at most 256
    tokens), decoded, and finally passed through ``ip.postprocess_batch``.

    Returns:
        list: Post-processed translations, in the order of the inputs.
    """
    results = []
    total = len(input_sentences)

    for start in range(0, total, BATCH_SIZE):
        # Preprocess this slice of the input for the given language pair.
        chunk = ip.preprocess_batch(
            input_sentences[start : start + BATCH_SIZE],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        # Encode the chunk as padded tensors and move them to DEVICE.
        encoded = tokenizer(
            chunk,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )
        encoded = encoded.to(DEVICE)

        # Inference only: no gradients are tracked during generation.
        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Turn the generated ids back into text inside the target-tokenizer context.
        id_lists = output_ids.detach().cpu().tolist()
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                id_lists,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the tensor references before asking CUDA to release cached memory.
        del encoded, output_ids
        torch.cuda.empty_cache()

    return results

In [3]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import os
import nltk

# Download the three NLTK packages this cell's imports rely on
# (tokenizer data, stop-word lists, WordNet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load SciSpacy Medical Model
# The loaded pipeline is kept in the module-level name `nlp`.
nlp = spacy.load("en_core_sci_sm")

# Load AI4Bharat Translation Models
def load_translation_models():
    """Load both translation checkpoints (no quantization) and an IndicProcessor.

    Returns:
        tuple: ``((en_indic_tokenizer, en_indic_model),
        (indic_en_tokenizer, indic_en_model), ip)``.
    """
    # English -> Indic is loaded first, then Indic -> English.
    en_indic_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-en-indic-1B", None)
    )
    indic_en_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-indic-en-dist-200M", None)
    )

    processor = IndicProcessor(inference=True)
    return en_indic_pair, indic_en_pair, processor

# Translation Function
def translate_text(text, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate one string via ``batch_translate``.

    Returns:
        The first translation, or ``None`` when nothing came back or any
        exception was raised (the error is printed, not re-raised).
    """
    try:
        outputs = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
        if outputs:
            return outputs[0]
        return None
    except Exception as err:
        print(f"Translation Error: {err}")
        return None

# Text Preprocessing
def preprocess_text(text, language="en"):
    """Tokenize ``text`` and keep lower-cased, alphanumeric, non-stop-word tokens.

    ``language`` is accepted but not consulted; the English stop-word list
    is always the one applied.
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words("english"))

    kept = []
    for raw in raw_tokens:
        lowered = raw.lower()
        if lowered in english_stops:
            continue
        if not raw.isalnum():
            continue
        kept.append(lowered)
    return kept

# Improved Query Expansion (Medical Terms)
def expand_query(tokens):
    """Add hand-picked related medical terms to the query tokens.

    Returns:
        list: The original tokens plus any related terms, without
        duplicates; the order of the list is not defined.
    """
    medication_terms = ["drug", "treatment", "prescription", "medication"]
    # Trigger token -> terms to add when that token is present.
    related_terms = {
        "medications": medication_terms,
        "medicine": medication_terms,
        "medicament": medication_terms,
        "diabetes": ["diabetic", "insulin", "blood sugar"],
        "hypertension": ["high blood pressure", "blood pressure"],
    }

    expanded = set(tokens)
    for token in tokens:
        expanded.update(related_terms.get(token, ()))
    return list(expanded)

# Retrieve Top 2 Documents using TF-IDF and Cosine Similarity
def retrieve_documents(query_tokens, documents, top_k=2, min_score=0.1):
    """Rank ``documents`` against the query with TF-IDF cosine similarity.

    Args:
        query_tokens: Query terms; joined with spaces into one pseudo-document.
        documents: Candidate document strings.
        top_k: Maximum number of documents to consider (default 2).
        min_score: A document is returned only when its similarity is
            strictly greater than this value (default 0.1).

    Returns:
        list: Up to ``top_k`` documents, best match first. The list can be
        shorter than ``top_k`` (or empty) when scores do not exceed
        ``min_score`` or when there are fewer documents.
    """
    if not documents:
        # Nothing to compare the query against.
        return []
    documents = list(documents)

    query_text = " ".join(query_tokens)  # The query is vectorized as one more "document"
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents + [query_text])

    # Similarity between the last row (the query) and every document row
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    ranked_docs = sorted(
        zip(documents, similarity_scores),
        key=lambda pair: pair[1], reverse=True
    )
    top_docs = ranked_docs[:top_k]

    print(f"\nRetrieved Top {top_k} Documents with Scores:")
    for rank, (doc, score) in enumerate(top_docs, 1):
        print(f"{rank}. Score: {score:.4f} | Content: {doc}")

    # The printed list shows the top candidates; only those above the cutoff are returned.
    return [doc for doc, score in top_docs if score > min_score]

# Summarize Documents
def summarize_document(doc_content):
    """Return the first sentence of ``doc_content`` as a one-line summary.

    Sentences are split on ``". "``. A period is appended only when the
    first sentence does not already end in ``.``, ``!`` or ``?``, so a
    single-sentence document is not given a doubled period. Empty or
    whitespace-only input yields an empty string.
    """
    first_sentence = doc_content.split(". ")[0].strip()
    if not first_sentence:
        return ""
    if first_sentence[-1] in ".!?":
        return first_sentence
    return first_sentence + "."

# Main System Function
def punjabi_english_clir(query, documents):
    """Answer a Punjabi query from English documents with Punjabi summaries.

    The query is translated to English, tokenized and expanded, used to
    retrieve documents, and each retrieved document's summary is
    translated back to Punjabi. Translation models are loaded on every call.

    Returns:
        list: Numbered Punjabi summaries, or a single fallback message when
        the query cannot be translated or nothing is retrieved.
    """
    en_indic, indic_en, ip = load_translation_models()
    en_indic_tokenizer, en_indic_model = en_indic
    indic_en_tokenizer, indic_en_model = indic_en

    # Punjabi -> English
    english_query = translate_text(
        query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip
    )
    if not english_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]
    print(f"\nTranslated Query (English): {english_query}")

    # Tokenize / filter the English query
    base_tokens = preprocess_text(english_query, language="en")
    print(f"\nPreprocessed Tokens: {base_tokens}")

    # Add related terms
    search_tokens = expand_query(base_tokens)
    print(f"\nExpanded Tokens: {search_tokens}")

    # Retrieve matching documents
    hits = retrieve_documents(search_tokens, documents)
    if not hits:
        print("\n❌ No documents were retrieved. Check indexing or query matching.")
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Summarize each hit and translate the summary English -> Punjabi
    answers = []
    for rank, hit in enumerate(hits, 1):
        punjabi = translate_text(
            summarize_document(hit), "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip
        )
        answers.append(f"{rank}. {punjabi}" if punjabi else f"{rank}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return answers

# Example Usage
if __name__ == "__main__":
    # Small in-memory English corpus used as the retrieval collection for this demo.
    documents = [
        "Diabetes medications include insulin and oral drugs like Metformin.",
        "People with diabetes should monitor their blood sugar regularly.",
        "A healthy diet and physical activity help manage diabetes effectively.",
        "Doctors recommend prescription drugs for managing diabetes symptoms.",
        "Insulin therapy is essential for some diabetic patients.",
        "The best medications for diabetes treatment include Metformin and Insulin.",
        "Patients with high blood sugar levels should consult a doctor for appropriate medications.",
        "Hypertension medications include beta-blockers and ACE inhibitors.",
        "Metformin is a widely used prescription drug for diabetes management."
    ]

    # Punjabi query; the run recorded below translated it as
    # "What medications can I take for diabetes?".
    punjabi_query = "ਕੀ ਮੈਂ ਸ਼ੂਗਰ ਲਈ ਕਿਹੜੀ ਦਵਾਈ ਲੈ ਸਕਦਾ ਹਾਂ?"
    results = punjabi_english_clir(punjabi_query, documents)

    # Print each returned summary on its own line under a Punjabi heading.
    print("\nਸੰਖੇਪ ਜਵਾਬ (Summaries in Punjabi):")
    for summary in results:
        print(summary)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]



Translated Query (English): What medications can I take for diabetes?

Preprocessed Tokens: ['medications', 'take', 'diabetes']

Expanded Tokens: ['take', 'medications', 'prescription', 'medication', 'diabetic', 'blood sugar', 'diabetes', 'insulin', 'treatment', 'drug']

Retrieved Top 2 Documents with Scores:
1. Score: 0.2813 | Content: The best medications for diabetes treatment include Metformin and Insulin.
2. Score: 0.2332 | Content: Metformin is a widely used prescription drug for diabetes management.


KeyboardInterrupt: 

In [6]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import torch
import os

# Required NLTK data
# The downloads are commented out in this cell, so the data must already be
# present locally (an earlier cell downloads the same three packages).
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load SciSpacy Medical Model
# The loaded pipeline is kept in the module-level name `nlp`.
nlp = spacy.load("en_core_sci_sm")

# Load AI4Bharat Translation Models
def load_translation_models():
    """Load both translation checkpoints (no quantization) and an IndicProcessor.

    Returns:
        tuple: ``((en_indic_tokenizer, en_indic_model),
        (indic_en_tokenizer, indic_en_model), ip)``.
    """
    # English -> Indic is loaded first, then Indic -> English.
    en_indic_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-en-indic-1B", None)
    )
    indic_en_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-indic-en-dist-200M", None)
    )

    processor = IndicProcessor(inference=True)
    return en_indic_pair, indic_en_pair, processor

# Translation function
def translate_text(text, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate one string via ``batch_translate``.

    Returns:
        The first translation, or ``None`` when nothing came back or any
        exception was raised (the error is printed, not re-raised).
    """
    try:
        outputs = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
        if outputs:
            return outputs[0]
        return None
    except Exception as err:
        print(f"Translation Error: {err}")
        return None

# Text preprocessing
def preprocess_text(text):
    """Lower-case ``text``, tokenize it, and keep alphanumeric non-stop-word tokens."""
    candidates = word_tokenize(text.lower())
    english_stops = set(stopwords.words("english"))

    kept = []
    for tok in candidates:
        if not tok.isalnum():
            continue
        if tok in english_stops:
            continue
        kept.append(tok)
    return kept

# ✅ **Improved Query Expansion (More Precise Synonyms)**
def expand_query_with_wordnet(tokens):
    """Expand query tokens with a curated table plus filtered WordNet lemmas.

    Pass 1 adds the listed terms for every token that has an entry in the
    hand-written table. Pass 2 looks up WordNet noun synsets for every token
    and adds a lemma only when it is itself one of a few allow-listed
    medical words.

    Returns:
        list: Original tokens plus added terms, without duplicates; the
        order of the list is not defined.
    """
    curated_terms = {
        "pills": ["tablets", "capsules", "medication", "drug"],
        "diabetes": ["diabetic", "blood sugar", "insulin"],
        "medications": ["drugs", "prescription", "treatment"],
        "treat": ["therapy", "cure", "medicate"]
    }
    allow_list = {"medication", "treatment", "drug", "medicine", "therapy", "disease"}

    expanded = set(tokens)

    # Pass 1: curated table lookups.
    for token in tokens:
        expanded.update(curated_terms.get(token, ()))

    # Pass 2: WordNet noun lemmas, restricted to the allow-list.
    for token in tokens:
        for synset in wn.synsets(token, pos=wn.NOUN):
            lemma_names = (
                lemma.name().replace("_", " ").lower() for lemma in synset.lemmas()
            )
            expanded.update(
                name for name in lemma_names if name.isalnum() and name in allow_list
            )

    return list(expanded)

# ✅ **Retrieve Top 2 Documents (No Score Threshold; Translation Happens in the Caller)**
def retrieve_documents(query_tokens, documents, top_k=2):
    """Rank ``documents`` against the query with TF-IDF cosine similarity.

    No score threshold is applied: the best ``top_k`` documents are returned
    even when their scores are low. This function only retrieves; it does
    not translate anything.

    Args:
        query_tokens: Query terms; joined with spaces into one pseudo-document.
        documents: Candidate document strings.
        top_k: Maximum number of documents to return (default 2).

    Returns:
        list: Up to ``top_k`` documents, best match first; fewer when the
        corpus is smaller, and empty when there are no documents.
    """
    if not documents:
        # Nothing to compare the query against.
        return []
    documents = list(documents)

    all_docs = documents + [" ".join(query_tokens)]
    vectorizer = TfidfVectorizer(norm='l2')  # L2-normalized TF-IDF rows
    tfidf_matrix = vectorizer.fit_transform(all_docs)

    # Similarity between the last row (the query) and every document row
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    ranked_docs = sorted(
        zip(documents, similarity_scores),
        key=lambda pair: pair[1], reverse=True
    )

    top_docs = ranked_docs[:top_k]

    print(f"\nRetrieved Top {top_k} Documents with Scores:")
    for rank, (doc, score) in enumerate(top_docs, 1):
        print(f"{rank}. Score: {score:.4f} | Content: {doc}")

    return [doc for doc, _score in top_docs]

# Summarize documents
def summarize_document(doc_content):
    """Return the first sentence of ``doc_content`` as a one-line summary.

    Sentences are split on ``". "``. A period is appended only when the
    first sentence does not already end in ``.``, ``!`` or ``?``, so a
    single-sentence document is not given a doubled period. Empty or
    whitespace-only input yields an empty string.
    """
    first_sentence = doc_content.split(". ")[0].strip()
    if not first_sentence:
        return ""
    if first_sentence[-1] in ".!?":
        return first_sentence
    return first_sentence + "."

# Main Punjabi-English-Punjabi retrieval system
def punjabi_english_clir(query, documents):
    """Answer a Punjabi query from English documents with Punjabi summaries.

    The query is translated to English, tokenized, expanded via
    ``expand_query_with_wordnet``, used to retrieve documents, and each
    retrieved document's summary is translated back to Punjabi.
    Translation models are loaded on every call.

    Returns:
        list: Numbered Punjabi summaries, or a single fallback message when
        the query cannot be translated.
    """
    en_indic, indic_en, ip = load_translation_models()
    en_indic_tokenizer, en_indic_model = en_indic
    indic_en_tokenizer, indic_en_model = indic_en

    # Punjabi -> English
    english_query = translate_text(
        query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip
    )
    if not english_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]
    print(f"\nTranslated Query (English): {english_query}")

    # Tokenize / filter the English query
    base_tokens = preprocess_text(english_query)
    print(f"\nPreprocessed Tokens: {base_tokens}")

    # Add related terms (curated table + filtered WordNet lemmas)
    search_tokens = expand_query_with_wordnet(base_tokens)
    print(f"\nExpanded Tokens with WordNet: {search_tokens}")

    # Retrieve the best-matching documents
    hits = retrieve_documents(search_tokens, documents)

    # Summarize each hit and translate the summary English -> Punjabi
    answers = []
    for rank, hit in enumerate(hits, 1):
        punjabi = translate_text(
            summarize_document(hit), "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip
        )
        answers.append(f"{rank}. {punjabi}" if punjabi else f"{rank}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return answers

# Example Usage
if __name__ == "__main__":
    # Small in-memory English corpus used as the retrieval collection for this demo.
    documents = [
        "Diabetes medications include insulin and oral drugs like Metformin.",
        "People with diabetes should monitor their blood sugar regularly.",
        "A healthy diet and physical activity help manage diabetes effectively.",
        "Doctors recommend prescription drugs for managing diabetes symptoms.",
        "Insulin therapy is essential for some diabetic patients.",
        "Medications in the form of pills or tablets are often prescribed for diabetes.",
        "Capsules and oral medications like Metformin help control diabetes."
    ]

    # Punjabi query; the run recorded below translated it as
    # "Can I take pills to treat diabetes?".
    punjabi_query = "ਕੀ ਮੈਂ ਗੋਲੀਆਂ ਲਈ ਸਕਦਾ ਹਾਂ ਸ਼ੂਗਰ ਦੇ ਇਲਾਜ ਲਈ?"
    results = punjabi_english_clir(punjabi_query, documents)

    # Print each returned summary on its own line under a Punjabi heading.
    print("\nਸੰਖੇਪ ਜਵਾਬ (Summaries in Punjabi):")
    for summary in results:
        print(summary)



Translated Query (English): Can I take pills to treat diabetes?

Preprocessed Tokens: ['take', 'pills', 'treat', 'diabetes']

Expanded Tokens with WordNet: ['take', 'tablets', 'capsules', 'medication', 'treat', 'therapy', 'cure', 'diabetes', 'medicate', 'diabetic', 'blood sugar', 'insulin', 'pills', 'drug']

Retrieved Top 2 Documents with Scores:
1. Score: 0.2225 | Content: Insulin therapy is essential for some diabetic patients.
2. Score: 0.1705 | Content: People with diabetes should monitor their blood sugar regularly.

ਸੰਖੇਪ ਜਵਾਬ (Summaries in Punjabi):
1. ਸ਼ੂਗਰ ਦੇ ਕੁੱਝ ਮਰੀਜ਼ਾਂ ਲਈ ਇਨਸੁਲਿਨ ਥੈਰੇਪੀ ਜ਼ਰੂਰੀ ਹੈ। 
2. ਸ਼ੂਗਰ ਵਾਲੇ ਲੋਕਾਂ ਨੂੰ ਆਪਣੇ ਬਲੱਡ ਸ਼ੂਗਰ ਦੀ ਨਿਯਮਤ ਤੌਰ'ਤੇ ਨਿਗਰਾਨੀ ਕਰਨੀ ਚਾਹੀਦੀ ਹੈ। 


In [None]:
# Next step: try retrieving documents through web scraping.

In [7]:
import requests
import scispacy
import spacy
import re
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import nltk

# Required NLTK data
# Downloads three NLTK packages; already-present packages are reported as up to date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load SciSpacy Medical Model
# `nlp` is used by summarize_text below to split text into sentences.
nlp = spacy.load("en_core_sci_sm")

# ✅ Function to Perform Google Search
def google_search(query, num_results=5):
    """Scrape a Google results page and return up to ``num_results`` target URLs.

    The result page is parsed for ``/url?q=<target>`` redirect links. That
    markup is not a stable interface, so an empty list is a normal outcome
    and callers must handle it.

    Args:
        query: Search text. Blank queries return ``[]`` without a request.
        num_results: Maximum number of links to return; values <= 0 return ``[]``.

    Returns:
        list: Decoded, de-duplicated http(s) URLs in page order. ``[]`` when
        the request fails or no links are found (the error is printed).
    """
    # Local import: only this helper needs the URL parsing functions.
    from urllib.parse import parse_qs, urlparse

    if not query or not query.strip() or num_results <= 0:
        return []

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Passing ``params`` lets requests percent-encode the query, so spaces,
        # "&", "#" or "?" in the text cannot corrupt the URL.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": query, "num": num_results},
            headers=headers,
            timeout=10,  # do not hang the notebook on an unresponsive server
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Search Error: {exc}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    for anchor in soup.find_all("a", href=True):
        parsed = urlparse(anchor["href"])
        if parsed.path != "/url":
            continue
        # parse_qs also decodes percent-escapes in the target URL.
        targets = parse_qs(parsed.query).get("q")
        if not targets:
            continue
        target = targets[0]
        if target.startswith(("http://", "https://")) and target not in links:
            links.append(target)

    return links[:num_results]

# ✅ Function to Scrape Web Page Content
def scrape_webpage(url, min_length=300, timeout=10):
    """Fetch ``url`` and return the joined text of its ``<p>`` elements.

    Args:
        url: Page address. An empty value returns ``None`` without a request.
        min_length: The text is returned only when it is longer than this
            many characters (default 300).
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        The paragraph text, or ``None`` when the request fails, the status
        is not 200, or the text is too short.
    """
    if not url:
        return None

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page should not abort processing of the other links.
        print(f"Scrape Error ({url}): {exc}")
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Collect each paragraph's text once and skip empty paragraphs.
    paragraph_texts = (para.get_text() for para in soup.find_all("p"))
    text = " ".join(chunk for chunk in paragraph_texts if chunk)

    return text if len(text) > min_length else None

# ✅ NLP-Based Document Ranking
def rank_documents(query, documents):
    """Score ``documents`` against ``query`` with TF-IDF cosine similarity.

    Returns:
        list: At most two ``(document, score)`` pairs, highest score first;
        ``[]`` when ``documents`` is empty.
    """
    if not documents:
        return []

    # The query is vectorized as the final row alongside the documents.
    matrix = TfidfVectorizer().fit_transform(documents + [query])
    query_row, doc_rows = matrix[-1], matrix[:-1]
    scores = cosine_similarity(query_row, doc_rows).flatten()

    scored = [(documents[idx], scores[idx]) for idx in range(len(documents))]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored[:2]

# ✅ Text Summarization (Extractive)
def summarize_text(text, num_sentences=3):
    """Return an extractive summary of at most ``num_sentences`` sentences.

    Sentences come from the spaCy pipeline ``nlp``. When the text has more
    sentences than requested, each sentence is scored by TF-IDF cosine
    similarity to the full text, the highest-scoring ones are kept, and they
    are emitted in their original document order.

    Args:
        text: Text to summarize.
        num_sentences: Maximum number of sentences; values <= 0 give ``""``.

    Returns:
        str: The selected sentences joined with single spaces.
    """
    if num_sentences <= 0:
        return ""

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    if len(sentences) <= num_sentences:
        return " ".join(sentences)

    # Vectorize the sentences together with the whole text (last row) so that
    # every sentence gets exactly one score. Comparing the last sentence with
    # the others instead would yield one score fewer than there are sentences.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences + [text])
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    best_indices = sorted(
        range(len(sentences)), key=lambda idx: scores[idx], reverse=True
    )[:num_sentences]

    # Re-sort the chosen indices so the summary reads in document order.
    return " ".join(sentences[idx] for idx in sorted(best_indices))

# ✅ Load Translation Models
def load_translation_models():
    """Load both translation checkpoints (no quantization) and an IndicProcessor.

    Returns:
        tuple: ``((en_indic_tokenizer, en_indic_model),
        (indic_en_tokenizer, indic_en_model), ip)``.
    """
    # English -> Indic is loaded first, then Indic -> English.
    en_indic_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-en-indic-1B", None)
    )
    indic_en_pair = tuple(
        initialize_model_and_tokenizer("ai4bharat/indictrans2-indic-en-dist-200M", None)
    )

    processor = IndicProcessor(inference=True)
    return en_indic_pair, indic_en_pair, processor

# ✅ Translation Function
def translate_text(text, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate one string via ``batch_translate``.

    Returns:
        The first translation, or ``None`` when nothing came back or any
        exception was raised (the error is printed, not re-raised).
    """
    try:
        outputs = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
        if outputs:
            return outputs[0]
        return None
    except Exception as err:
        print(f"Translation Error: {err}")
        return None

# ✅ Main Function: Web Search → Scraping → NLP Processing → Translation
def web_search_clir(query):
    """Answer a Punjabi query from the web and return a Punjabi summary.

    Pipeline: translate the query to English, search for links, scrape each
    link, rank the scraped texts against the English query, summarize the
    best one, and translate that summary back to Punjabi. Translation
    models are loaded on every call.

    Returns:
        list: A single-element list holding the Punjabi summary or a
        fallback message naming the stage that produced nothing.
    """
    en_indic, indic_en, ip = load_translation_models()
    en_indic_tokenizer, en_indic_model = en_indic
    indic_en_tokenizer, indic_en_model = indic_en

    # Stage 1: Punjabi -> English
    english_query = translate_text(
        query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip
    )
    if not english_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]
    print(f"\n🔍 Translated Query (English): {english_query}")

    # Stage 2: search; the heading is printed even when no links came back
    found_links = google_search(english_query)
    print("\n🔗 Retrieved Web Links:")
    for url in found_links:
        print(url)

    if not found_links:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (No web results found)"]

    # Stage 3: scrape every link, keeping only pages that yielded text
    page_texts = [page for page in map(scrape_webpage, found_links) if page]
    if not page_texts:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (No readable content found)"]

    # Stage 4: rank the scraped pages against the English query
    scored_pages = rank_documents(english_query, page_texts)

    print("\n📄 Top Ranked Documents (Post-Ranking):")
    for position, (_page, relevance) in enumerate(scored_pages, 1):
        print(f"{position}. Score: {relevance:.4f}")

    if not scored_pages:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (No relevant documents found)"]

    # Stage 5: summarize the top-ranked page
    top_page = scored_pages[0][0]
    english_summary = summarize_text(top_page)

    # Stage 6: English -> Punjabi
    punjabi_summary = translate_text(
        english_summary, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip
    )

    return [punjabi_summary or "ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।"]

# ✅ Example Usage
if __name__ == "__main__":
    # Punjabi query; the run recorded below translated it as
    # "What medications can I take for diabetes?".
    punjabi_query = "ਕੀ ਮੈਂ ਸ਼ੂਗਰ ਲਈ ਕਿਹੜੀ ਦਵਾਈ ਲੈ ਸਕਦਾ ਹਾਂ?"
    results = web_search_clir(punjabi_query)

    # Print every returned line (a summary or a fallback message) under the heading.
    print("\n📢 **Final Answer in Punjabi:**")
    for res in results:
        print(res)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



🔍 Translated Query (English): What medications can I take for diabetes?

🔗 Retrieved Web Links:

📢 **Final Answer in Punjabi:**
ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (No web results found)


In [8]:
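# API key removed from the notebook: never store credentials in a cell. Revoke the exposed key and load a new one at runtime, e.g. os.environ["GOOGLE_API_KEY"].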

SyntaxError: invalid syntax (1693684359.py, line 1)