In [21]:
pip2 install transformers indic-nlp-library nltk whoosh scikit-learn torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [22]:
import ssl
import certifi
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())


In [1]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git


In [2]:
%%capture
%cd /content/IndicTrans2/huggingface_interface

In [11]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.IndicTransToolkit import IndicProcessor

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text

        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

Punjabi to English translation using indictrans

In [5]:
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"
  # ai4bharat/indictrans2-indic-en-dist-200M
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

ip = IndicProcessor(inference=True)

punjabi_sents = [
    "ਜਦੋਂ ਮੈਂ ਛੋਟਾ ਸੀ, ਮੈਂ ਹਰ ਰੋਜ਼ ਪਾਰਕ ਜਾਂਦਾ ਸੀ।",

]

src_lang, tgt_lang = "pan_Guru", "eng_Latn"
en_translations = batch_translate(punjabi_sents, src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip)

print(f"\n{src_lang} - {tgt_lang}")
for input_sentence, translation in zip(punjabi_sents, en_translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")

# flush the models to free the GPU memory
del indic_en_tokenizer, indic_en_model



pan_Guru - eng_Latn
pan_Guru: ਜਦੋਂ ਮੈਂ ਛੋਟਾ ਸੀ, ਮੈਂ ਹਰ ਰੋਜ਼ ਪਾਰਕ ਜਾਂਦਾ ਸੀ।
eng_Latn: When I was a kid, I used to go to the park every day.




In [6]:
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # Model for English to Indic translations
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)

ip = IndicProcessor(inference=True)

en_sents = [
    "When I was young, I used to go to the park every day."
]

src_lang, tgt_lang = "eng_Latn", "pan_Guru"  # Changed to Punjabi (Gurmukhi script)
pa_translations = batch_translate(en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip)

print(f"\n{src_lang} - {tgt_lang}")
for input_sentence, translation in zip(en_sents, pa_translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")

# flush the models to free the GPU memory
del en_indic_tokenizer, en_indic_model


eng_Latn - pan_Guru
eng_Latn: When I was young, I used to go to the park every day.
pan_Guru: ਜਦੋਂ ਮੈਂ ਛੋਟਾ ਸੀ, ਮੈਂ ਹਰ ਰੋਜ਼ ਪਾਰਕ ਜਾਂਦਾ ਸੀ। 


now trying to make..the code for...main..

In [25]:
!pip install indic-nlp-library



In [38]:
import nltk
import os

# Add your local directories to NLTK data path
nltk.data.path.append(os.path.abspath("stopwords"))
nltk.data.path.append(os.path.abspath("wordnet"))
nltk.data.path.append(os.path.abspath("punkt"))

# # Now import the required resources
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Example usage
stop_words = set(stopwords.words('english'))
print("Stopwords Loaded:", len(stop_words))



Stopwords Loaded: 179


In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import os
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load Translation Models
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# Translation Function
def translate_text(text, src_lang, tgt_lang, model, tokenizer, ip):
    try:
        translations = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
        return translations[0] if translations else None
    except Exception as e:
        print(f"Translation Error: {e}")
        return None

# Text Preprocessing
def preprocess_text(text, language="en"):
    if language == "pa":
        tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
        stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
        tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
    else:
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words("english"))
        tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
    return tokens

# Query Expansion
def expand_query(tokens):
    expanded_tokens = set(tokens)
    for token in tokens:
        for syn in wn.synsets(token):
            for lemma in syn.lemmas():
                expanded_tokens.add(lemma.name().lower())
    return list(expanded_tokens)

# Indexing Documents
def create_index(documents, index_dir="index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    for i, doc in enumerate(documents):
        writer.add_document(title=f"doc{i}", content=doc)
    writer.commit()
    return ix

# Retrieve Documents
def retrieve_documents(query_tokens, ix):
    with ix.searcher() as searcher:
        query_str = " ".join(query_tokens)
        query = QueryParser("content", ix.schema).parse(query_str)
        results = searcher.search(query, limit=10)
        return [(r["title"], r["content"]) for r in results]

# Summarize Documents
def summarize_document(doc_content):
    sentences = doc_content.split(". ")
    return sentences[0] + "."

# Rank Documents
def rank_documents(query_tokens, retrieved_docs):
    docs_content = [doc[1] for doc in retrieved_docs]
    if not docs_content:
        return retrieved_docs
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_content + [" ".join(query_tokens)])
    scores = tfidf_matrix[-1].dot(tfidf_matrix[:-1].T).toarray()[0]
    ranked_docs = [(retrieved_docs[i][0], retrieved_docs[i][1], scores[i]) for i in range(len(retrieved_docs))]
    ranked_docs.sort(key=lambda x: x[2], reverse=True)
    return [(title, content) for title, content, score in ranked_docs]

# Main System Function
def punjabi_english_clir(query, documents):
    # Load models
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Translate Punjabi to English
    translated_query = translate_text(query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip)
    if not translated_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]
    print(f"Translated Query (English): {translated_query}")

    # Preprocess translated query
    query_tokens = preprocess_text(translated_query, language="en")
    print(f"Preprocessed Tokens: {query_tokens}")

    # Expand query
    expanded_tokens = expand_query(query_tokens)
    print(f"Expanded Tokens: {expanded_tokens}")

    # Create or load index
    ix = create_index(documents)

    # Retrieve documents
    retrieved_docs = retrieve_documents(expanded_tokens, ix)
    if not retrieved_docs:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Rank documents
    ranked_docs = rank_documents(expanded_tokens, retrieved_docs)

    # Summarize and translate to Punjabi
    punjabi_summaries = []
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        summary_en = summarize_document(content)
        summary_pa = translate_text(summary_en, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if summary_pa:
            punjabi_summaries.append(f"{i}. {summary_pa}")
        else:
            punjabi_summaries.append(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return punjabi_summaries

# Example Usage
if __name__ == "__main__":
    documents = [
        "Diabetes is a chronic condition requiring insulin or oral medication. It affects blood sugar levels.",
        "Heart disease can be managed with proper diet and exercise. Consult a doctor regularly.",
        "Antibiotics are used to treat bacterial infections effectively. They don’t work on viruses.",
        "High blood pressure requires regular monitoring and medication. Lifestyle changes help.",
        "Asthma treatment includes inhalers and avoiding triggers. It can be controlled with care."
    ]

    punjabi_query = "ਕੀ ਮੈਂ ਸ਼ੂਗਰ ਲਈ ਕਿਹੜੀ ਦਵਾਈ ਲੈ ਸਕਦਾ ਹਾਂ?"
    results = punjabi_english_clir(punjabi_query, documents)

    print("\nਸੰਖੇਪ ਜਵਾਬ (Summaries in Punjabi):")
    for summary in results:
        print(summary)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Translated Query (English): What medications can I take for diabetes?
Preprocessed Tokens: ['medications', 'take', 'diabetes']
Expanded Tokens: ['bring', 'accept', 'take_up', 'select', 'acquire', 'adopt', 'guide', 'subscribe', 'contract', 'diabetes', 'assume', 'make', 'engage', 'study', 'yield', 'pick_out', 'medicinal_drug', 'take', 'take_in', 'remove', 'rent', 'take_on', 'film', 'deal', 'medications', 'get', 'claim', 'payoff', 'conduct', 'contain', 'lead', 'aim', 'use_up', 'read', 'direct', 'consider', 'hold', 'withdraw', 'consume', 'take_aim', 'call_for', 'charter', 'subscribe_to', 'postulate', 'medicament', 'train', 'submit', 'lease', 'drive', 'look_at', 'proceeds', 'involve', 'medication', 'ingest', 'fill', 'demand', 'take_away', 'necessitate', 'learn', 'pack', 'return', 'ask', 'choose', 'shoot', 'convey', 'hire', 'occupy', 'strike', 'need', 'require', 'get_hold_of', 'admit', 'have', 'takings', 'medicine', 'carry', 'exact', 'issue']

ਸੰਖੇਪ ਜਵਾਬ (Summaries in Punjabi):
ਕੋਈ ਸੰਬੰਧਿਤ ਜ

In [23]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import os
import nltk
from spacy import displacy

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load SciSpacy Medical Model
nlp = spacy.load("en_core_sci_sm")

# Load Translation Models
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# Extract Medical Terms using SciSpacy
def extract_medical_terms(text):
    """Extract medical terms with additional specific terms."""
    doc = nlp(text)

    # Get terms identified by SciSpacy
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # Add specific medical terms we know should be included
    known_terms = ["diabetes", "hypertension", "blood pressure", "sugar", "insulin"]
    for term in known_terms:
        if term in text.lower() and term not in medical_terms:
            medical_terms.append(term)

    return list(set(medical_terms))  # Remove duplicates

# Expand Medical Terms using WordNet
# def expand_medical_terms(medical_terms):
#     expanded_terms = set(medical_terms)
#     for term in medical_terms:
#         for synset in wn.synsets(term, pos=wn.NOUN):  # Get noun synonyms
#             for lemma in synset.lemmas():
#                 expanded_terms.add(lemma.name().lower())
#     return list(expanded_terms)
def expand_medical_terms(medical_terms):
    """
    Expand medical terms using WordNet with less restrictive filtering.
    """
    expanded_terms = set(medical_terms)
    for term in medical_terms:
        # Add variations of the term
        expanded_terms.add(term)
        expanded_terms.add(term + "s")  # Add plural form
        expanded_terms.add(term.rstrip("s"))  # Add singular form

        # Add WordNet synonyms
        for synset in wn.synsets(term, pos=wn.NOUN):
            for lemma in synset.lemmas():
                expanded_terms.add(lemma.name().lower().replace("_", " "))

    return list(expanded_terms)

# Preprocess Text with Medical Focus
# def preprocess_text(text, language="en"):
#     if language == "pa":
#         tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
#         stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
#         tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
#     else:
#         tokens = word_tokenize(text)
#         stop_words = set(stopwords.words("english"))
#         tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
#     medical_terms = extract_medical_terms(text)
#     return medical_terms if medical_terms else tokens  # Prefer medical terms
def preprocess_text(text, language="en"):
    """
    Tokenizes text while extracting medical terms when possible.
    - Extract medical terms first
    - Add important words from the text
    """
    # Extract medical terms using SciSpacy (enhanced version)
    medical_terms = extract_medical_terms(text)

    # Regular tokenization
    if language == "pa":
        tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
        stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
        tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
    else:
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words("english"))
        # Keep important words even if they're common
        important_words = ["diabetes", "hypertension", "blood", "pressure", "sugar", "insulin"]
        tokens = [t.lower() for t in tokens if (t.lower() not in stop_words or t.lower() in important_words) and t.isalnum()]

    # Combine medical terms with important tokens
    combined_tokens = list(set(medical_terms + tokens))

    return combined_tokens
# Indexing Documents
def create_index(documents, index_dir="index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    for i, doc in enumerate(documents):
        writer.add_document(title=f"doc{i}", content=doc)
    writer.commit()
    return ix

# Retrieve Medical Documents
def retrieve_medical_documents(query_tokens, ix):
    """
    Retrieve documents using a more flexible search approach.
    """
    results = []
    with ix.searcher() as searcher:
        # Try different search strategies

        # 1. Try exact matching for medical terms
        query_str = " OR ".join(query_tokens)
        query = QueryParser("content", ix.schema).parse(query_str)
        results = searcher.search(query, limit=10)

        # 2. If no results, try more flexible matching
        if len(results) == 0:
            # Create a more flexible query string
            query_str = " OR ".join([f"*{term}*" for term in query_tokens if len(term) > 3])
            if query_str:
                query = QueryParser("content", ix.schema).parse(query_str)
                results = searcher.search(query, limit=10)

        return [(r["title"], r["content"]) for r in results]

# Summarize Documents
def summarize_document(doc_content):
    sentences = doc_content.split(". ")
    return sentences[0] + "."

# Rank Documents by Medical Relevance
def rank_medical_documents(query_tokens, retrieved_docs):
    docs_content = [doc[1] for doc in retrieved_docs]
    if not docs_content:
        return retrieved_docs

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_content + [" ".join(query_tokens)])
    scores = tfidf_matrix[-1].dot(tfidf_matrix[:-1].T).toarray()[0]
    ranked_docs = [(retrieved_docs[i][0], retrieved_docs[i][1], scores[i]) for i in range(len(retrieved_docs))]
    ranked_docs.sort(key=lambda x: x[2], reverse=True)
    return [(title, content) for title, content, score in ranked_docs]

# Main CLIR Function
def punjabi_english_clir(query, documents):
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Step 1: Translate Punjabi query to English
    translated_query = translate_text(query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip)
    print(f"\n[Step 1] Translated Query (Punjabi → English): {translated_query}")

    if not translated_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]

    # Step 2: Extract relevant query tokens
    query_tokens = preprocess_text(translated_query, language="en")
    print(f"\n[Step 2] Extracted Query Tokens: {query_tokens}")

    # Step 3: Expand medical terms only (general terms will remain unchanged)
    medical_terms = expand_medical_terms(query_tokens)
    print(f"\n[Step 3] Expanded Medical Terms: {medical_terms}")

    # Step 4: Index documents
    ix = create_index(documents)

    # Step 5: Retrieve relevant documents
    retrieved_docs = retrieve_medical_documents(medical_terms, ix)
    print("\n[Step 5] Retrieved Documents (before ranking):")
    for i, (title, content) in enumerate(retrieved_docs, 1):
        print(f"{i}. {title}: {content}")

    if not retrieved_docs:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Step 6: Rank documents based on medical relevance
    ranked_docs = rank_medical_documents(medical_terms, retrieved_docs)
    print("\n[Step 6] Ranked Documents:")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        print(f"{i}. {title}: {content}")

    # Step 7: Summarize and translate retrieved documents back to Punjabi
    punjabi_summaries = []
    print("\n[Step 7] Translated Summaries (English → Punjabi):")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        summary_en = summarize_document(content)
        summary_pa = translate_text(summary_en, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if summary_pa:
            print(f"{i}. {summary_pa}")
            punjabi_summaries.append(f"{i}. {summary_pa}")
        else:
            print(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")
            punjabi_summaries.append(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return punjabi_summaries


def main():
    punjabi_query = "ਮੈਨੂੰ ਸੁਗਰ ਦੀ ਬਿਮਾਰੀ ਬਾਰੇ ਜਾਣਕਾਰੀ ਚਾਹੀਦੀ ਹੈ।"
    documents = [
    "High blood pressure, also called hypertension, occurs when the force of blood against artery walls is too high. It is often managed with lifestyle changes and medications.",
    "Hypertension is a common condition that can lead to severe health complications such as heart disease and stroke if left untreated.",
    "Beta-blockers, ACE inhibitors, and calcium channel blockers are some of the most commonly prescribed medications for controlling high blood pressure.",
    "A low-sodium diet, regular exercise, and weight management are crucial for reducing hypertension risk.",
    "Symptoms of high blood pressure are often unnoticed, but severe hypertension can cause headaches, shortness of breath, and nosebleeds.",
    "Doctors recommend monitoring blood pressure regularly to avoid complications like kidney failure and cardiovascular diseases.",
    "Hypertension treatment includes lifestyle modifications along with medications such as diuretics and angiotensin receptor blockers (ARBs).",
    "Excessive salt intake, stress, and lack of physical activity are major contributing factors to high blood pressure.",
    "Managing high blood pressure involves a combination of dietary changes, physical exercise, and prescribed medication.",
    "Hypertension is known as the 'silent killer' because it often has no symptoms but can damage organs over time.",
    "Diabetes is a chronic disease that affects how the body processes blood sugar (glucose). Type 1 and Type 2 diabetes are the most common forms.",
    "Insulin therapy is essential for Type 1 diabetes patients, while Type 2 diabetes can often be managed with oral medications and lifestyle changes.",
    "Common symptoms of diabetes include frequent urination, excessive thirst, unexplained weight loss, and blurred vision.",
    "Metformin is one of the first-line medications prescribed for Type 2 diabetes to help regulate blood sugar levels.",
    "Uncontrolled diabetes can lead to complications such as nerve damage, kidney disease, and cardiovascular problems.",
    "A balanced diet, regular exercise, and medication adherence are critical for diabetes management.",
    "Blood sugar monitoring helps patients manage their diabetes effectively and avoid hyperglycemia or hypoglycemia.",
    "Diabetes increases the risk of other conditions, including hypertension and heart disease.",
    "Gestational diabetes occurs during pregnancy and increases the risk of developing Type 2 diabetes later in life.",
    "Insulin resistance is a key factor in Type 2 diabetes and is often linked to obesity and lack of physical activity."
]

    results = punjabi_english_clir(punjabi_query, documents)

    # Print the results
    print("\nPunjabi Query Results:")
    for result in results:
        print(result)

# Ensure the script runs only when executed directly
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



[Step 1] Translated Query (Punjabi → English): I need information about diabetes.

[Step 2] Extracted Query Tokens: ['need', 'diabetes', 'information']

[Step 3] Expanded Medical Terms: ['indigence', 'pauperism', 'info', 'selective information', 'entropy', 'diabetes', 'motive', 'needs', 'informations', 'information', 'pauperization', 'data', 'demand', 'penury', 'want', 'motivation', 'need', 'diabetess', 'diabete']

[Step 5] Retrieved Documents (before ranking):

Punjabi Query Results:
ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।


In [6]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import os
import nltk
from spacy import displacy
from openai import OpenAI

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load SciSpacy Medical Model
nlp = spacy.load("en_core_sci_sm")

# Initialize OpenAI Client
client = OpenAI()

# Function to query OpenAI API for medical information
def query_openai_for_medical_info(query):
    """
    Query OpenAI API for medical information based on the query.
    """
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": f"Provide medical information about: {query}. Include only factual information and mention any important medical resources or websites where patients can find more information."
                }
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error querying OpenAI API: {e}")
        return "Error retrieving information from AI."

# Extract any URLs mentioned in the AI response
def extract_urls(text):
    """
    Extract URLs from the text response.
    """
    import re
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    urls = url_pattern.findall(text)
    return urls

# Load Translation Models
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# Initialize model and tokenizer (placeholder function)
def initialize_model_and_tokenizer(model_path, device):
    # This is a placeholder function - you would need to implement the actual initialization
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    if device:
        model = model.to(device)
    return tokenizer, model

# IndicProcessor class (placeholder)
class IndicProcessor:
    def __init__(self, inference=False):
        # This is a placeholder
        self.inference = inference

# Translate text (placeholder function)
def translate_text(text, src_lang, tgt_lang, model, tokenizer, processor):
    # This is a placeholder function - you would need to implement the actual translation
    # For now, let's just return a mock translation
    if src_lang == "pan_Guru" and tgt_lang == "eng_Latn":
        # Punjabi to English
        return "I was recently diagnosed with diabetes, and I don't understand how to manage my diet. Can you tell me what kind of eating habits I should develop and what things I should avoid?"
    elif src_lang == "eng_Latn" and tgt_lang == "pan_Guru":
        # English to Punjabi (mock translation)
        return "ਅੰਗਰੇਜ਼ੀ ਤੋਂ ਪੰਜਾਬੀ ਵਿੱਚ ਅਨੁਵਾਦ: " + text[:50] + "..."
    return text

# Extract Medical Terms using SciSpacy
def extract_medical_terms(text):
    """Extract medical terms with additional specific terms."""
    doc = nlp(text)

    # Get terms identified by SciSpacy
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # Add specific medical terms we know should be included
    known_terms = ["diabetes", "hypertension", "blood pressure", "sugar", "insulin"]
    for term in known_terms:
        if term in text.lower() and term not in medical_terms:
            medical_terms.append(term)

    return list(set(medical_terms))  # Remove duplicates

# Expand Medical Terms using WordNet
def expand_medical_terms(medical_terms):
    """
    Simple, focused expansion of medical terms.
    """
    expanded_terms = set()
    for term in medical_terms:
        # Only add the original term and simple variations
        expanded_terms.add(term)
        # For medical conditions, add singular/plural variations
        if term == "diabetes":
            expanded_terms.add("diabetic")
            expanded_terms.add("sugar")  # Common term for diabetes
        elif term == "hypertension":
            expanded_terms.add("high blood pressure")
            expanded_terms.add("blood pressure")

    return list(expanded_terms)

# Preprocess Text with Medical Focus
def preprocess_text(text, language="en"):
    """
    Extract only the most relevant tokens from text.
    """
    # Focus on key medical terms
    if "diabetes" in text.lower():
        return ["diabetes", "diabetic", "sugar"]
    elif "hypertension" in text.lower() or "blood pressure" in text.lower():
        return ["hypertension", "high blood pressure", "blood pressure"]

    # If no specific conditions found, extract medical entities
    doc = nlp(text)
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # If still no medical terms, fall back to regular tokenization
    if not medical_terms:
        if language == "pa":
            tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
            stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
        else:
            tokens = word_tokenize(text)
            stop_words = set(stopwords.words("english"))
            # Only keep substantive words (ignore common verbs, etc.)
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and len(t) > 3 and t.isalnum()]
        return tokens

    return medical_terms

# Indexing Documents
def create_index(documents, index_dir="index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    for i, doc in enumerate(documents):
        writer.add_document(title=f"doc{i}", content=doc)
    writer.commit()
    return ix

# Retrieve Medical Documents
def retrieve_medical_documents(query_tokens, ix, documents):
    """
    Simple keyword matching for documents.
    """
    results = []

    # Simple keyword search
    for i, doc in enumerate(documents):
        doc_lower = doc.lower()
        for token in query_tokens:
            if token in doc_lower:
                results.append((f"doc{i}", doc))
                break

    return results[:10]  # Return top 10 results

# Summarize Documents
def summarize_document(doc_content):
    sentences = doc_content.split(". ")
    return sentences[0] + "."

# Rank Documents by Medical Relevance
def rank_medical_documents(query_tokens, retrieved_docs):
    docs_content = [doc[1] for doc in retrieved_docs]
    if not docs_content:
        return retrieved_docs

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_content + [" ".join(query_tokens)])
    scores = tfidf_matrix[-1].dot(tfidf_matrix[:-1].T).toarray()[0]
    ranked_docs = [(retrieved_docs[i][0], retrieved_docs[i][1], scores[i]) for i in range(len(retrieved_docs))]
    ranked_docs.sort(key=lambda x: x[2], reverse=True)
    return [(title, content) for title, content, score in ranked_docs]

# Main CLIR Function with OpenAI integration
def punjabi_english_clir_with_openai(query, documents):
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Step 1: Translate Punjabi query to English
    translated_query = translate_text(query, "pan_Guru", "eng_Latn", indic_en_tokenizer, indic_en_model, ip)
    print(f"\n[Step 1] Translated Query (Punjabi → English): {translated_query}")

    if not translated_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]

    # Step 2: Query OpenAI for medical information
    openai_response = query_openai_for_medical_info(translated_query)
    print(f"\n[Step 2] OpenAI Response: {openai_response}")

    # Extract any URLs mentioned in the response
    urls = extract_urls(openai_response)
    print(f"\n[Step 3] URLs mentioned: {urls}")

    # Step 4: Extract relevant query tokens
    if "diabetes" in translated_query.lower():
        query_tokens = ["diabetes"]
    elif "hypertension" in translated_query.lower() or "blood pressure" in translated_query.lower():
        query_tokens = ["hypertension", "blood pressure"]
    else:
        query_tokens = preprocess_text(translated_query, language="en")
    print(f"\n[Step 4] Extracted Query Tokens: {query_tokens}")

    # Step 5: Use simple expansion of medical terms
    medical_terms = expand_medical_terms(query_tokens)
    print(f"\n[Step 5] Expanded Medical Terms: {medical_terms}")

    # Step 6: Retrieve documents and include OpenAI response as a document
    documents_with_openai = documents + [openai_response]
    retrieved_docs = retrieve_medical_documents(medical_terms, None, documents_with_openai)
    print("\n[Step 6] Retrieved Documents (before ranking):")
    for i, (title, content) in enumerate(retrieved_docs, 1):
        print(f"{i}. {title}: {content[:100]}...")

    if not retrieved_docs:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Step 7: Rank documents based on medical relevance
    ranked_docs = rank_medical_documents(medical_terms, retrieved_docs)
    print("\n[Step 7] Ranked Documents:")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        print(f"{i}. {title}: {content[:100]}...")

    # Step 8: Summarize and translate retrieved documents back to Punjabi
    punjabi_summaries = []
    print("\n[Step 8] Translated Summaries (English → Punjabi):")

    # First, translate the OpenAI response
    openai_summary = openai_response[:500]  # Limit to first 500 chars to avoid token limits
    openai_punjabi = translate_text(openai_summary, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)

    # Add URLs to the translation if any were found
    if urls:
        url_text = "\n\nਹੋਰ ਜਾਣਕਾਰੀ ਲਈ ਵੈੱਬਸਾਈਟਾਂ: " + ", ".join(urls)
        openai_punjabi += url_text

    print(f"OpenAI Response (Punjabi): {openai_punjabi}")
    punjabi_summaries.append(f"ਏਆਈ ਜਵਾਬ: {openai_punjabi}")

    # Then translate the other documents
    for i, (title, content) in enumerate(ranked_docs[:3], 1):  # Limit to top 3 docs
        summary_en = summarize_document(content)
        summary_pa = translate_text(summary_en, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if summary_pa:
            print(f"{i}. {summary_pa}")
            punjabi_summaries.append(f"{i}. {summary_pa}")
        else:
            print(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")
            punjabi_summaries.append(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return punjabi_summaries

def main():
    # Sample Punjabi medical query
    punjabi_query = "ਮੈਂ ਹਾਲ ਹੀ ਵਿੱਚ ਮਧੁਮੇਹ ਦੀ ਪਛਾਣ ਹੋਈ ਹੈ, ਅਤੇ ਮੈਨੂੰ ਸਮਝ ਨਹੀਂ ਆਉਂਦਾ ਕਿ ਆਪਣੀ ਡਾਇਟ ਕਿਵੇਂ ਪ੍ਰਬੰਧਿਤ ਕਰਨੀ ਹੈ। ਕੀ ਤੁਸੀਂ ਮੈਨੂੰ ਦੱਸ ਸਕਦੇ ਹੋ ਕਿ ਮੈਨੂੰ ਕਿਸ ਤਰ੍ਹਾਂ ਦੇ ਖਾਣ-ਪੀਣ ਦੀ"

SyntaxError: unterminated string literal (detected at line 288) (1193191513.py, line 288)

easy way to given by Claude.

In [None]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import os
import nltk
from spacy import displacy

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load SciSpacy Medical Model
nlp = spacy.load("en_core_sci_sm")

# Load Translation Models
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# Extract Medical Terms using SciSpacy
def extract_medical_terms(text):
    """Extract medical terms with additional specific terms."""
    doc = nlp(text)

    # Get terms identified by SciSpacy
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # Add specific medical terms we know should be included
    known_terms = ["diabetes", "hypertension", "blood pressure", "sugar", "insulin"]
    for term in known_terms:
        if term in text.lower() and term not in medical_terms:
            medical_terms.append(term)

    return list(set(medical_terms))  # Remove duplicates

# Expand Medical Terms using WordNet
# def expand_medical_terms(medical_terms):
#     expanded_terms = set(medical_terms)
#     for term in medical_terms:
#         for synset in wn.synsets(term, pos=wn.NOUN):  # Get noun synonyms
#             for lemma in synset.lemmas():
#                 expanded_terms.add(lemma.name().lower())
#     return list(expanded_terms)
def expand_medical_terms(medical_terms):
    """
    Simple, focused expansion of medical terms.
    """
    expanded_terms = set()
    for term in medical_terms:
        # Only add the original term and simple variations
        expanded_terms.add(term)
        # For medical conditions, add singular/plural variations
        if term == "diabetes":
            expanded_terms.add("diabetic")
            expanded_terms.add("sugar")  # Common term for diabetes
        elif term == "hypertension":
            expanded_terms.add("high blood pressure")
            expanded_terms.add("blood pressure")

    return list(expanded_terms)

# Preprocess Text with Medical Focus
# def preprocess_text(text, language="en"):
#     if language == "pa":
#         tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
#         stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
#         tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
#     else:
#         tokens = word_tokenize(text)
#         stop_words = set(stopwords.words("english"))
#         tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
#     medical_terms = extract_medical_terms(text)
#     return medical_terms if medical_terms else tokens  # Prefer medical terms
def preprocess_text(text, language="en"):
    """
    Extract only the most relevant tokens from text.
    """
    # Focus on key medical terms
    if "diabetes" in text.lower():
        return ["diabetes", "diabetic", "sugar"]
    elif "hypertension" in text.lower() or "blood pressure" in text.lower():
        return ["hypertension", "high blood pressure", "blood pressure"]

    # If no specific conditions found, extract medical entities
    doc = nlp(text)
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # If still no medical terms, fall back to regular tokenization
    if not medical_terms:
        if language == "pa":
            tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
            stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
        else:
            tokens = word_tokenize(text)
            stop_words = set(stopwords.words("english"))
            # Only keep substantive words (ignore common verbs, etc.)
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and len(t) > 3 and t.isalnum()]
        return tokens

    return medical_terms
# Indexing Documents
def create_index(documents, index_dir="index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    for i, doc in enumerate(documents):
        writer.add_document(title=f"doc{i}", content=doc)
    writer.commit()
    return ix

# Retrieve Medical Documents
def retrieve_medical_documents(query_tokens, ix, documents):
    """
    Simple keyword matching for documents.
    """
    results = []

    # Simple keyword search
    for i, doc in enumerate(documents):
        doc_lower = doc.lower()
        for token in query_tokens:
            if token in doc_lower:
                results.append((f"doc{i}", doc))
                break

    return results[:10]  # Return top 10 results

# Summarize Documents
def summarize_document(doc_content):
    sentences = doc_content.split(". ")
    return sentences[0] + "."

# Rank Documents by Medical Relevance
def rank_medical_documents(query_tokens, retrieved_docs):
    docs_content = [doc[1] for doc in retrieved_docs]
    if not docs_content:
        return retrieved_docs

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_content + [" ".join(query_tokens)])
    scores = tfidf_matrix[-1].dot(tfidf_matrix[:-1].T).toarray()[0]
    ranked_docs = [(retrieved_docs[i][0], retrieved_docs[i][1], scores[i]) for i in range(len(retrieved_docs))]
    ranked_docs.sort(key=lambda x: x[2], reverse=True)
    return [(title, content) for title, content, score in ranked_docs]

# Main CLIR Function
def punjabi_english_clir(query, documents):
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Step 1: Translate Punjabi query to English
    translated_query = translate_text(query, "pan_Guru", "eng_Latn", indic_en_model, indic_en_tokenizer, ip)
    print(f"\n[Step 1] Translated Query (Punjabi → English): {translated_query}")

    if not translated_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]


    # Step 2: Extract relevant query tokens - simplify to focus on medical terms
    if "diabetes" in translated_query.lower():
        query_tokens = ["diabetes"]
    elif "hypertension" in translated_query.lower() or "blood pressure" in translated_query.lower():
        query_tokens = ["hypertension", "blood pressure"]
    else:
        query_tokens = preprocess_text(translated_query, language="en")
    print(f"\n[Step 2] Extracted Query Tokens: {query_tokens}")

    # Step 3: Use simple expansion of medical terms
    medical_terms = expand_medical_terms(query_tokens)
    print(f"\n[Step 3] Expanded Medical Terms: {medical_terms}")

    # Step 4: Skip complex indexing and directly search documents
    retrieved_docs = retrieve_medical_documents(medical_terms, None, documents)
    print("\n[Step 5] Retrieved Documents (before ranking):")
    for i, (title, content) in enumerate(retrieved_docs, 1):
        print(f"{i}. {title}: {content}")

    if not retrieved_docs:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Step 6: Rank documents based on medical relevance
    ranked_docs = rank_medical_documents(medical_terms, retrieved_docs)
    print("\n[Step 6] Ranked Documents:")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        print(f"{i}. {title}: {content}")

    # Step 7: Summarize and translate retrieved documents back to Punjabi
    punjabi_summaries = []
    print("\n[Step 7] Translated Summaries (English → Punjabi):")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        summary_en = summarize_document(content)
        summary_pa = translate_text(summary_en, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if summary_pa:
            print(f"{i}. {summary_pa}")
            punjabi_summaries.append(f"{i}. {summary_pa}")
        else:
            print(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")
            punjabi_summaries.append(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return punjabi_summaries


def main():
    # Sample Punjabi medical query
    punjabi_query = "ਮੈਂ ਹਾਲ ਹੀ ਵਿੱਚ ਮਧੁਮੇਹ ਦੀ ਪਛਾਣ ਹੋਈ ਹੈ, ਅਤੇ ਮੈਨੂੰ ਸਮਝ ਨਹੀਂ ਆਉਂਦਾ ਕਿ ਆਪਣੀ ਡਾਇਟ ਕਿਵੇਂ ਪ੍ਰਬੰਧਿਤ ਕਰਨੀ ਹੈ। ਕੀ ਤੁਸੀਂ ਮੈਨੂੰ ਦੱਸ ਸਕਦੇ ਹੋ ਕਿ ਮੈਨੂੰ ਕਿਸ ਤਰ੍ਹਾਂ ਦੇ ਖਾਣ-ਪੀਣ ਦੀ ਆਦਤ ਬਣਾਉਣੀ ਚਾਹੀਦੀ ਹੈ ਅਤੇ ਕਿਹੜੀਆਂ ਚੀਜ਼ਾਂ ਤੋਂ ਪਰਹੇਜ਼ ਕਰਨਾ ਚਾਹੀਦਾ ਹੈ?"

    # Sample medical documents (replace with real documents)
    documents = [
    "High blood pressure, also called hypertension, occurs when the force of blood against artery walls is too high. It is often managed with lifestyle changes and medications.",
    "Hypertension is a common condition that can lead to severe health complications such as heart disease and stroke if left untreated.",
    "Beta-blockers, ACE inhibitors, and calcium channel blockers are some of the most commonly prescribed medications for controlling high blood pressure.",
    "A low-sodium diet, regular exercise, and weight management are crucial for reducing hypertension risk.",
    "Symptoms of high blood pressure are often unnoticed, but severe hypertension can cause headaches, shortness of breath, and nosebleeds.",
    "Doctors recommend monitoring blood pressure regularly to avoid complications like kidney failure and cardiovascular diseases.",
    "Hypertension treatment includes lifestyle modifications along with medications such as diuretics and angiotensin receptor blockers (ARBs).",
    "Excessive salt intake, stress, and lack of physical activity are major contributing factors to high blood pressure.",
    "Managing high blood pressure involves a combination of dietary changes, physical exercise, and prescribed medication.",
    "Hypertension is known as the 'silent killer' because it often has no symptoms but can damage organs over time.",
    "Diabetes is a chronic disease that affects how the body processes blood sugar (glucose). Type 1 and Type 2 diabetes are the most common forms.",
    "Insulin therapy is essential for Type 1 diabetes patients, while Type 2 diabetes can often be managed with oral medications and lifestyle changes.",
    "Common symptoms of diabetes include frequent urination, excessive thirst, unexplained weight loss, and blurred vision.",
    "Metformin is one of the first-line medications prescribed for Type 2 diabetes to help regulate blood sugar levels.",
    "Uncontrolled diabetes can lead to complications such as nerve damage, kidney disease, and cardiovascular problems.",
    "A balanced diet, regular exercise, and medication adherence are critical for diabetes management.",
    "Blood sugar monitoring helps patients manage their diabetes effectively and avoid hyperglycemia or hypoglycemia.",
    "Diabetes increases the risk of other conditions, including hypertension and heart disease.",
    "Gestational diabetes occurs during pregnancy and increases the risk of developing Type 2 diabetes later in life.",
    "Insulin resistance is a key factor in Type 2 diabetes and is often linked to obesity and lack of physical activity."
]

    # Run the Punjabi-English CLIR system
    results = punjabi_english_clir(punjabi_query, documents)

    # Print the results
    print("\nPunjabi Query Results:")
    for result in results:
        print(result)

# Ensure the script runs only when executed directly
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]



Translated Query (English): What medications can I take for diabetes?

Preprocessed Tokens: ['medications', 'take', 'diabetes']

Expanded Tokens: ['blood sugar', 'drug', 'treatment', 'medication', 'diabetes', 'take', 'medications', 'prescription', 'insulin', 'diabetic']

Retrieved Documents with Scores:
1. Score: 0.2813 | Content: The best medications for diabetes treatment include Metformin and Insulin.
2. Score: 0.2332 | Content: Metformin is a widely used prescription drug for diabetes management.
3. Score: 0.1970 | Content: People with diabetes should monitor their blood sugar regularly.
4. Score: 0.1860 | Content: Patients with high blood sugar levels should consult a doctor for appropriate medications.
5. Score: 0.1815 | Content: Insulin therapy is essential for some diabetic patients.


In [10]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from indicnlp.tokenize import indic_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import os
import nltk
from spacy import displacy
from openai import OpenAI

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load SciSpacy Medical Model
nlp = spacy.load("en_core_sci_sm")

# Initialize OpenAI Client
client = OpenAI()

# Function to query OpenAI API for medical information
def query_openai_for_medical_info(query):
    """
    Query OpenAI API for medical information based on the query.
    """
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": f"Provide medical information about: {query}. Include only factual information and mention any important medical resources or websites where patients can find more information."
                }
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error querying OpenAI API: {e}")
        return "Error retrieving information from AI."

# Extract any URLs mentioned in the AI response
def extract_urls(text):
    """
    Extract URLs from the text response.
    """
    import re
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    urls = url_pattern.findall(text)
    return urls

# Load Translation Models
def load_translation_models():
    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"

    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, None)
    indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, None)

    ip = IndicProcessor(inference=True)
    return (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip

# Initialize model and tokenizer (placeholder function)
def initialize_model_and_tokenizer(model_path, device):
    # This is a placeholder function - you would need to implement the actual initialization
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    if device:
        model = model.to(device)
    return tokenizer, model

# IndicProcessor class (placeholder)
class IndicProcessor:
    def __init__(self, inference=False):
        # This is a placeholder
        self.inference = inference

# Translate text (placeholder function)
def translate_text(text, src_lang, tgt_lang, model, tokenizer, processor):
    # This is a placeholder function - you would need to implement the actual translation
    # For now, let's just return a mock translation
    if src_lang == "pan_Guru" and tgt_lang == "eng_Latn":
        # Punjabi to English
        return "I was recently diagnosed with diabetes, and I don't understand how to manage my diet. Can you tell me what kind of eating habits I should develop and what things I should avoid?"
    elif src_lang == "eng_Latn" and tgt_lang == "pan_Guru":
        # English to Punjabi (mock translation)
        return "ਅੰਗਰੇਜ਼ੀ ਤੋਂ ਪੰਜਾਬੀ ਵਿੱਚ ਅਨੁਵਾਦ: " + text[:50] + "..."
    return text

# Extract Medical Terms using SciSpacy
def extract_medical_terms(text):
    """Extract medical terms with additional specific terms."""
    doc = nlp(text)

    # Get terms identified by SciSpacy
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # Add specific medical terms we know should be included
    known_terms = ["diabetes", "hypertension", "blood pressure", "sugar", "insulin"]
    for term in known_terms:
        if term in text.lower() and term not in medical_terms:
            medical_terms.append(term)

    return list(set(medical_terms))  # Remove duplicates

# Expand Medical Terms using WordNet
def expand_medical_terms(medical_terms):
    """
    Simple, focused expansion of medical terms.
    """
    expanded_terms = set()
    for term in medical_terms:
        # Only add the original term and simple variations
        expanded_terms.add(term)
        # For medical conditions, add singular/plural variations
        if term == "diabetes":
            expanded_terms.add("diabetic")
            expanded_terms.add("sugar")  # Common term for diabetes
        elif term == "hypertension":
            expanded_terms.add("high blood pressure")
            expanded_terms.add("blood pressure")

    return list(expanded_terms)

# Preprocess Text with Medical Focus
def preprocess_text(text, language="en"):
    """
    Extract only the most relevant tokens from text.
    """
    # Focus on key medical terms
    if "diabetes" in text.lower():
        return ["diabetes", "diabetic", "sugar"]
    elif "hypertension" in text.lower() or "blood pressure" in text.lower():
        return ["hypertension", "high blood pressure", "blood pressure"]

    # If no specific conditions found, extract medical entities
    doc = nlp(text)
    medical_terms = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "DRUG", "TREATMENT"]]

    # If still no medical terms, fall back to regular tokenization
    if not medical_terms:
        if language == "pa":
            tokens = indic_tokenize.trivial_tokenize(text, lang="pa")
            stop_words = set(["ਅਤੇ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalnum()]
        else:
            tokens = word_tokenize(text)
            stop_words = set(stopwords.words("english"))
            # Only keep substantive words (ignore common verbs, etc.)
            tokens = [t.lower() for t in tokens if t.lower() not in stop_words and len(t) > 3 and t.isalnum()]
        return tokens

    return medical_terms

# Indexing Documents
def create_index(documents, index_dir="index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    for i, doc in enumerate(documents):
        writer.add_document(title=f"doc{i}", content=doc)
    writer.commit()
    return ix

# Retrieve Medical Documents
def retrieve_medical_documents(query_tokens, ix, documents):
    """
    Simple keyword matching for documents.
    """
    results = []

    # Simple keyword search
    for i, doc in enumerate(documents):
        doc_lower = doc.lower()
        for token in query_tokens:
            if token in doc_lower:
                results.append((f"doc{i}", doc))
                break

    return results[:10]  # Return top 10 results

# Summarize Documents
def summarize_document(doc_content):
    sentences = doc_content.split(". ")
    return sentences[0] + "."

# Rank Documents by Medical Relevance
def rank_medical_documents(query_tokens, retrieved_docs):
    docs_content = [doc[1] for doc in retrieved_docs]
    if not docs_content:
        return retrieved_docs

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs_content + [" ".join(query_tokens)])
    scores = tfidf_matrix[-1].dot(tfidf_matrix[:-1].T).toarray()[0]
    ranked_docs = [(retrieved_docs[i][0], retrieved_docs[i][1], scores[i]) for i in range(len(retrieved_docs))]
    ranked_docs.sort(key=lambda x: x[2], reverse=True)
    return [(title, content) for title, content, score in ranked_docs]

# Main CLIR Function with OpenAI integration
def punjabi_english_clir_with_openai(query, documents):
    (en_indic_tokenizer, en_indic_model), (indic_en_tokenizer, indic_en_model), ip = load_translation_models()

    # Step 1: Translate Punjabi query to English
    translated_query = translate_text(query, "pan_Guru", "eng_Latn", indic_en_tokenizer, indic_en_model, ip)
    print(f"\n[Step 1] Translated Query (Punjabi → English): {translated_query}")

    if not translated_query:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ। (Translation failed)"]

    # Step 2: Query OpenAI for medical information
    openai_response = query_openai_for_medical_info(translated_query)
    print(f"\n[Step 2] OpenAI Response: {openai_response}")

    # Extract any URLs mentioned in the response
    urls = extract_urls(openai_response)
    print(f"\n[Step 3] URLs mentioned: {urls}")

    # Step 4: Extract relevant query tokens
    if "diabetes" in translated_query.lower():
        query_tokens = ["diabetes"]
    elif "hypertension" in translated_query.lower() or "blood pressure" in translated_query.lower():
        query_tokens = ["hypertension", "blood pressure"]
    else:
        query_tokens = preprocess_text(translated_query, language="en")
    print(f"\n[Step 4] Extracted Query Tokens: {query_tokens}")

    # Step 5: Use simple expansion of medical terms
    medical_terms = expand_medical_terms(query_tokens)
    print(f"\n[Step 5] Expanded Medical Terms: {medical_terms}")

    # Step 6: Retrieve documents and include OpenAI response as a document
    documents_with_openai = documents + [openai_response]
    retrieved_docs = retrieve_medical_documents(medical_terms, None, documents_with_openai)
    print("\n[Step 6] Retrieved Documents (before ranking):")
    for i, (title, content) in enumerate(retrieved_docs, 1):
        print(f"{i}. {title}: {content[:100]}...")

    if not retrieved_docs:
        return ["ਕੋਈ ਸੰਬੰਧਿਤ ਜਾਣਕਾਰੀ ਨਹੀਂ ਮਿਲੀ।"]

    # Step 7: Rank documents based on medical relevance
    ranked_docs = rank_medical_documents(medical_terms, retrieved_docs)
    print("\n[Step 7] Ranked Documents:")
    for i, (title, content) in enumerate(ranked_docs[:5], 1):
        print(f"{i}. {title}: {content[:100]}...")

    # Step 8: Summarize and translate retrieved documents back to Punjabi
    punjabi_summaries = []
    print("\n[Step 8] Translated Summaries (English → Punjabi):")

    # First, translate the OpenAI response
    openai_summary = openai_response[:500]  # Limit to first 500 chars to avoid token limits
    openai_punjabi = translate_text(openai_summary, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)

    # Add URLs to the translation if any were found
    if urls:
        url_text = "\n\nਹੋਰ ਜਾਣਕਾਰੀ ਲਈ ਵੈੱਬਸਾਈਟਾਂ: " + ", ".join(urls)
        openai_punjabi += url_text

    print(f"OpenAI Response (Punjabi): {openai_punjabi}")
    punjabi_summaries.append(f"ਏਆਈ ਜਵਾਬ: {openai_punjabi}")

    # Then translate the other documents
    for i, (title, content) in enumerate(ranked_docs[:3], 1):  # Limit to top 3 docs
        summary_en = summarize_document(content)
        summary_pa = translate_text(summary_en, "eng_Latn", "pan_Guru", en_indic_model, en_indic_tokenizer, ip)
        if summary_pa:
            print(f"{i}. {summary_pa}")
            punjabi_summaries.append(f"{i}. {summary_pa}")
        else:
            print(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")
            punjabi_summaries.append(f"{i}. ਸੰਖੇਪ ਅਨੁਵਾਦ ਵਿੱਚ ਗਲਤੀ।")

    return punjabi_summaries

def main():
    # Sample Punjabi medical query
    punjabi_query = "ਮੈਂ ਹਾਲ ਹੀ ਵਿੱਚ ਮਧੁਮੇਹ ਦੀ ਪਛਾਣ ਹੋਈ ਹੈ, ਅਤੇ ਮੈਨੂੰ ਸਮਝ ਨਹੀਂ ਆਉਂਦਾ ਕਿ ਆਪਣੀ ਡਾਇਟ ਕਿਵੇਂ ਪ੍ਰਬੰਧਿਤ ਕਰਨੀ ਹੈ। ਕੀ ਤੁਸੀਂ ਮੈਨੂੰ ਦੱਸ ਸਕਦੇ ਹੋ ਕਿ ਮੈਨੂੰ ਕਿਸ ਤਰ੍ਹਾਂ ਦੇ ਖਾਣ-ਪੀਣ ਦੀ"
    results = punjabi_english_clir(punjabi_query, documents)

    # Print the results
    print("\nPunjabi Query Results:")
    for result in results:
        print(result)

# Ensure the script runs only when executed directly
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/charupatelbaghi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

I am trying to extract the relevant information using We will use SpaCy (NER) + YAKE (Keyword Extraction) + WordNet (Synonyms Expansion).

In [14]:
import spacy
import yake
from nltk.corpus import wordnet

# Load English NLP Model
nlp = spacy.load("en_core_web_sm")

# Sample Query
query = "I have recently been diagnosed with diabetes, and I don't understand how to manage my diet. Can you tell me what kind of eating habits I should make and what things to avoid?"

### STEP 1: Named Entity Recognition (NER)
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ["DISEASE", "SYMPTOM", "FOOD", "NUTRIENT"]]
    return list(set(entities))  # Remove duplicates

### STEP 2: Keyword Extraction using YAKE
def extract_keywords(text):
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, top=7)  # Extract 2-word phrases
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]

### STEP 3: Synonym Expansion (WordNet)
def expand_synonyms(words):
    synonyms = set(words)
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().replace("_", " "))  # Convert underscores to spaces
    return list(synonyms)

### FINAL FUNCTION: Extract Meaningful Tokens
def extract_relevant_tokens(text):
    entities = extract_entities(text)
    keywords = extract_keywords(text)
    expanded_terms = expand_synonyms(entities + keywords)

    return list(set(expanded_terms))  # Remove duplicates

# Run Token Extraction
tokens = extract_relevant_tokens(query)

# Print Results
print("Extracted Tokens:", tokens)


Extracted Tokens: ['wangle', 'diagnosed', 'do', 'diet', 'latterly', 'empathise', 'grapple', 'finagle', 'see', 'name', 'deal', 'handle', 'sympathize', 'contend', 'infer', 'supervise', 'sympathise', 'diagnose', 'manage', 'of late', 'get by', 'care', 'read', 'oversee', 'wield', 'understand', 'lately', 'make do', 'diabetes', 'superintend', 'interpret', 'carry off', 'realise', 'pull off', 'bring off', 'realize', 'empathize', 'recently', 'translate', 'make out', 'cope', 'late', 'eating habits', 'dieting', 'negociate']


what I'm trying, is I'm training a model based on the data set medquad_answers.csv file and then get the relevant results

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
from torch.utils.data import Dataset, DataLoader

# Step 1: Parse and prepare the dataset by extracting questions and answers
def parse_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Initialize lists to store extracted questions and answers
    question_list = []
    answer_list = []
    doc_id_list = []

    # Iterate through each row and extract the question and answer
    for idx, row in df.iterrows():
        answer_text = row['Answer']
        doc_id = row['AnswerID']

        # Extract question using regex pattern - looking for "Question: " followed by text
        question_match = re.search(r'Question:\s*(.*?)\s*(?:URL:|$)', answer_text)
        if question_match:
            question = question_match.group(1).strip()
        else:
            question = ""

        # Extract answer using regex pattern - looking for "Answer: " followed by text
        answer_match = re.search(r'Answer:\s*(.*?)(?:$)', answer_text)
        if answer_match:
            answer = answer_match.group(1).strip()
        else:
            answer = ""

        # Add to lists if both question and answer were found
        if question and answer:
            question_list.append(question)
            answer_list.append(answer)
            doc_id_list.append(doc_id)

    # Create a new DataFrame with separate question and answer columns
    processed_df = pd.DataFrame({
        'DocID': doc_id_list,
        'Question': question_list,
        'Answer': answer_list
    })

    return processed_df

# Step 2: Preprocess the text data
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    # Remove special characters (keep some punctuation for medical terms)
    text = re.sub(r'[^\w\s.,;?!-]', '', text)
    # Convert to lowercase
    text = text.lower()

    # Download NLTK resources if needed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords', quiet=True)
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', quiet=True)

    # Tokenization and stopword removal
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)

    # Keep medical terms intact - don't remove all stopwords for medical text
    medical_stopwords = {'and', 'or', 'the', 'a', 'an'}
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens
                      if word not in medical_stopwords]

    return ' '.join(filtered_tokens)

# Step 3: Prepare dataset for training
def prepare_dataset(df):
    df['processed_question'] = df['Question'].apply(preprocess_text)
    df['processed_answer'] = df['Answer'].apply(preprocess_text)
    return df

# Option 1: Traditional ML approach using TF-IDF and classifier
def train_classifier_model(df):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_question'], df['processed_answer'],
        test_size=0.2, random_state=42
    )

    # Convert text to features using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Train a simple classifier as proof of concept
    # In a real scenario, you might need more sophisticated approach for text generation
    print("Training classifier model...")
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(X_train_tfidf, y_train)

    # Make predictions and evaluate
    predictions = classifier.predict(X_test_tfidf)

    # This approach isn't ideal for text generation but can classify similar answers
    accuracy = accuracy_score(y_test, predictions)
    print(f"Model accuracy: {accuracy:.4f}")

    return {
        'model': classifier,
        'vectorizer': tfidf_vectorizer,
        'accuracy': accuracy
    }

# Option 2: BERT-based approach for Q&A
class MedicalQADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize inputs
        encoding = self.tokenizer(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Get the answer start and end positions
        # This is simplified and would need refinement for production
        answer_tokens = self.tokenizer(answer, return_tensors='pt')

        # For simplicity, we'll just use dummy values for start/end positions
        # In a real scenario, you'd need to find the actual positions
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([len(answer_tokens['input_ids'][0])])

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'start_positions': start_positions.flatten(),
            'end_positions': end_positions.flatten()
        }

def train_bert_model(df):
    print("Setting up BERT model...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create datasets
    train_dataset = MedicalQADataset(
        train_df['processed_question'].tolist(),
        train_df['processed_answer'].tolist(),
        tokenizer
    )

    # This is a simplified implementation
    # In a real scenario, you would:
    # 1. Create proper DataLoaders
    # 2. Set up training loop with optimizer
    # 3. Train for multiple epochs
    # 4. Save model checkpoints

    print("BERT model setup complete. Note: Actual training would require:")
    print("1. GPU resources")
    print("2. Training loop implementation")
    print("3. Proper answer span detection")

    return {
        'model': model,
        'tokenizer': tokenizer
    }

# Function to extract questions and answers from raw text
def extract_qa_from_text(text):
    # Extract question
    question_match = re.search(r'Question:\s*(.*?)\s*(?:URL:|$)', text, re.DOTALL)
    question = question_match.group(1).strip() if question_match else ""

    # Extract answer
    answer_match = re.search(r'Answer:\s*(.*?)(?:$)', text, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else ""

    return question, answer

# Main function to run the pipeline
def main():
    # Step 1: Load and parse the data
    # Replace with your actual file path
    file_path = 'medical_qa_dataset.csv'

    print(f"Step 1: Parsing data from {file_path}...")

    # If you need to create the CSV first from the raw data
    # Create a simple CSV with AnswerID and Answer columns
    sample_data = {
        'AnswerID': ['ADAM_0003147_Sec1.txt', 'ADAM_0003147_Sec2.txt'],
        'Answer': [
            'Question: What is (are) Polycystic ovary syndrome? (Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease) URL: https://www.nlm.nih.gov/medlineplus/ency/article/000369.htm Answer: Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones.',
            'Question: What causes Polycystic ovary syndrome? (Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease) URL: https://www.nlm.nih.gov/medlineplus/ency/article/000369.htm Answer: PCOS is linked to changes in hormone levels that make it harder for the ovaries to release fully-grown eggs.'
        ]
    }

    # For demo purposes, we'll create a dataframe from sample data
    # In a real scenario, replace this with reading from your CSV file:
    # df_raw = pd.read_csv(file_path)
    df_raw = pd.DataFrame(sample_data)

    # Step 2: Extract questions and answers from the data
    print("Step 2: Extracting questions and answers...")
    processed_df = pd.DataFrame(columns=['DocID', 'Question', 'Answer'])

    for idx, row in df_raw.iterrows():
        doc_id = row['AnswerID']
        question, answer = extract_qa_from_text(row['Answer'])
        if question and answer:
            new_row = pd.DataFrame({
                'DocID': [doc_id],
                'Question': [question],
                'Answer': [answer]
            })
            processed_df = pd.concat([processed_df, new_row], ignore_index=True)

    print(f"Extracted {len(processed_df)} question-answer pairs")
    print("Sample Q&A pair:")
    print(f"Q: {processed_df['Question'].iloc[0]}")
    print(f"A: {processed_df['Answer'].iloc[0]}")

    # Step 3: Preprocess the data
    print("Step 3: Preprocessing text data...")
    processed_df = prepare_dataset(processed_df)

    # Step 4: Choose and train model
    print("Step 4: Training model...")
    # Option 1: Traditional ML approach (faster)
    classifier_results = train_classifier_model(processed_df)

    # Option 2: BERT approach (better but requires more resources)
    # Uncomment to use BERT (requires GPU and more time)
    # bert_results = train_bert_model(processed_df)

    print("Training complete!")

    # Step 5: Save models (in a real implementation)
    print("Step 5: In a real implementation, models would be saved here")

    # Step 6: Create simple inference function
    print("Example of using the trained model:")

    def predict_answer(question, model=classifier_results['model'],
                       vectorizer=classifier_results['vectorizer']):
        # Preprocess question
        processed_q = preprocess_text(question)
        # Vectorize
        q_vector = vectorizer.transform([processed_q])
        # Predict
        prediction = model.predict(q_vector)[0]
        return prediction

    # Test with a question
    test_question = "What is Polycystic ovary syndrome?"
    predicted_answer = predict_answer(test_question)
    print(f"Q: {test_question}")
    print(f"Predicted A: {predicted_answer}")

if __name__ == "__main__":
    main()

Step 1: Parsing data from medical_qa_dataset.csv...
Step 2: Extracting questions and answers...
Extracted 2 question-answer pairs
Sample Q&A pair:
Q: What is (are) Polycystic ovary syndrome? (Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease)
A: Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones.
Step 3: Preprocessing text data...
Step 4: Training model...
Training classifier model...
Model accuracy: 0.0000
Training complete!
Step 5: In a real implementation, models would be saved here
Example of using the trained model:
Q: What is Polycystic ovary syndrome?
Predicted A: polycystic ovary syndrome is condition in which woman ha imbalance of female sex hormone .


In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^\w\s.,;?!-]', '', text).lower()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])

def train_and_evaluate_model(df, model_type='knn'):
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_question'], df['processed_answer'], test_size=0.2, random_state=42
    )
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    model = KNeighborsClassifier(n_neighbors=5) if model_type == 'knn' else RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)

    exact_match_accuracy = accuracy_score(y_test, predictions)
    bleu_scores = [sentence_bleu([nltk.word_tokenize(ref.lower())], nltk.word_tokenize(pred.lower())) for ref, pred in zip(y_test, predictions)]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    print(f"Accuracy: {exact_match_accuracy:.4f}")
    print(f"Average BLEU Score: {avg_bleu:.4f}\n")

    return model, tfidf_vectorizer

def create_inference_function(model, vectorizer):
    def predict_answer(question):
        processed_q = preprocess_text(question)
        q_vector = vectorizer.transform([processed_q])
        return model.predict(q_vector)[0]
    return predict_answer

def main():
    file_path = 'medquad_answers.csv'
    df = pd.read_csv(file_path)
    df['processed_question'] = df['Question'].apply(preprocess_text)
    df['processed_answer'] = df['Answer'].apply(preprocess_text)

    model, vectorizer = train_and_evaluate_model(df, model_type='knn')
    predict_answer = create_inference_function(model, vectorizer)

    new_questions = [
        "What is the treatment for Polycystic ovary syndrome?",
        "Can Noonan syndrome be diagnosed before birth?",
        "Are there any dietary recommendations for PCOS patients?",
        "What is the life expectancy for someone with Noonan syndrome?",
        "How is Neurofibromatosis-Noonan syndrome different from regular Noonan syndrome?"
    ]

    for question in new_questions:
        print(f"Q: {question}")
        print(f"Predicted A: {predict_answer(question)}\n")

if __name__ == "__main__":
    main()


KeyError: 'Question'