<a href="https://colab.research.google.com/github/dipakphp/COVID-19-Smoking-Analysis/blob/main/CORD19_AIAgent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ---------------------- Installation of Core dependencies required for the Project ----------------------
!pip install gradio kagglehub llama_index.embeddings.huggingface llama-index llama-index-llms-huggingface transformers pandas tqdm



In [16]:
# ---------------------- import the libraries required for the Project ----------------------
import time
import gradio as gr
from transformers import pipeline
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, Document
import torch
import pandas as pd
import os
import kagglehub
from tqdm import tqdm

In [17]:
# ---------------------- Load Embeddings ----------------------
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence-embedding model used to vectorise the text chunks.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2", device=device)

# Register the embedder as the global default. The LLM is deliberately left
# unset at this point; it is assigned later, once the chat model is loaded.
Settings.embed_model = embed_model
Settings.llm = None

LLM is explicitly disabled. Using MockLLM.


In [18]:
# ---------------------- Load Dataset ----------------------
def load_and_filter_data():
    """Download the CORD-19 dataset metadata and keep smoking-related rows.

    The dataset is fetched with ``kagglehub``; the first CSV file (in sorted
    name order) found in the download directory is read, and only rows whose
    ``description`` mentions one of the smoking-related keywords
    (case-insensitive) are kept.

    Returns:
        pandas.DataFrame: columns ``description`` and ``word_count`` (number
        of whitespace-separated words in the description). The row index of
        the source CSV is preserved.

    Raises:
        FileNotFoundError: if the download directory contains no ``.csv`` file.
    """
    path = kagglehub.dataset_download(handle="googleai/dataset-metadata-for-cord19")

    # os.listdir() order is arbitrary and the directory may hold non-CSV
    # files, so select the CSV explicitly and deterministically.
    csv_names = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))
    if not csv_names:
        raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
    filename = os.path.join(path, csv_names[0])

    df = pd.read_csv(filename)
    keywords = ['smoking', 'tobacco', 'cigarette', 'nicotine', 'vaping']

    # na=False drops rows with a missing description from the mask.
    keyword_mask = df['description'].str.contains('|'.join(keywords), case=False, na=False)

    # .copy() so the new column is added to an independent frame rather than
    # to a view of `df` (avoids pandas' chained-assignment warning).
    df_filtered = df.loc[keyword_mask, ['description']].copy()

    # split() without an argument collapses runs of whitespace, so repeated
    # spaces or newlines are not counted as extra (empty) words.
    df_filtered["word_count"] = df_filtered["description"].str.split().str.len()
    return df_filtered

In [19]:
# ---------------------- Vector Create and Store ----------------------
def create_vector_store(dataframe, chunk_size=150, persist_dir="covid_storage"):
    """Chunk the descriptions, embed them into a vector index and persist it.

    Args:
        dataframe: frame with a ``description`` column; non-string entries
            are skipped.
        chunk_size: maximum number of whitespace-separated words per chunk.
        persist_dir: directory the index's storage context is persisted to.

    Returns:
        The ``VectorStoreIndex`` built from the chunks.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    # A zero step makes range() raise and a negative one silently yields no
    # chunks at all, so reject both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    for text in tqdm(dataframe["description"].values):
        if not isinstance(text, str):
            continue
        words = text.split()
        chunks.extend(
            Document(text=" ".join(words[start:start + chunk_size]))
            for start in range(0, len(words), chunk_size)
        )

    storage_context = StorageContext.from_defaults()
    index = VectorStoreIndex.from_documents(
        chunks,
        storage_context=storage_context,
        embed_model=embed_model,
        show_progress=True
    )
    index.storage_context.persist(persist_dir=persist_dir)
    return index

In [20]:
# ---------------------- Load LLM ----------------------
def load_models(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Load the causal language model and wrap it for llama-index.

    Args:
        model_name: Hugging Face model id used for both the tokenizer and
            the model weights.

    Returns:
        HuggingFaceLLM: wrapper configured with a 2048-token context window,
        up to 256 new tokens per answer and sampling at temperature 0.7.
    """
    use_cuda = torch.cuda.is_available()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if use_cuda else torch.float32
    )
    # The weights are loaded without a device map, so place the model on the
    # device that matches the chosen dtype; half precision is selected only
    # when a GPU is available and is meant to run there.
    model = model.to("cuda" if use_cuda else "cpu")

    answer_llm = HuggingFaceLLM(
        tokenizer=tokenizer,
        model=model,
        context_window=2048,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "do_sample": True},
    )
    return answer_llm

In [21]:
# ---------------------- Initialize Components ----------------------
# Build the retrieval corpus and its vector index.
df_processed = load_and_filter_data()
vector_index = create_vector_store(df_processed)

# Load the generator model and make it the default LLM.
llm = load_models()
Settings.llm = llm

# Context-mode chat engine over the index, with a bounded conversation memory.
_chat_memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
_system_prompt = "You are a medical research assistant specializing in COVID-19 and smoking-related health impacts. Provide evidence-based answers using the CORD-19 dataset."
chat_agent = vector_index.as_chat_engine(
    chat_mode="context",
    memory=_chat_memory,
    system_prompt=_system_prompt,
)

# Sentiment classifier applied to generated answers; device 0 is the first
# GPU, -1 selects the CPU.
if torch.cuda.is_available():
    _pipeline_device = 0
else:
    _pipeline_device = -1
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=_pipeline_device,
)

100%|██████████| 55/55 [00:00<00:00, 3459.30steps/s]
Parsing nodes: 100%|██████████| 91/91 [00:00<00:00, 616.01steps/s]
Generating embeddings: 100%|██████████| 91/91 [00:17<00:00,  5.22steps/s]
Device set to use cpu


In [None]:
# ---------------------- Chat + Analysis ----------------------
# Module-level transcript of (question, formatted summary) pairs.
chat_history = []

def ask_research_assistant(question):
    """Ask the chat agent a question and record a formatted summary.

    The answer is annotated with its sentiment, the tracked keywords it
    mentions, its word count and the response time, then appended to the
    module-level ``chat_history``.

    Args:
        question: user question; ``None``, empty or whitespace-only input is
            ignored.

    Returns:
        tuple: ``("", chat_history)`` - an empty string followed by the
        (possibly extended) history list.
    """
    if not question or not question.strip():
        return "", chat_history

    start = time.time()
    response = chat_agent.chat(question)
    answer = response.response
    elapsed = round(time.time() - start, 1)

    # Only the first 512 characters of the answer are classified.
    sentiment = sentiment_pipeline(answer[:512])[0]

    # Replace every non-letter with a space before splitting so that tokens
    # such as "smoking," or "COVID-19" still match; sorting makes the keyword
    # order deterministic.
    tracked = {'smoking', 'tobacco', 'risk', 'covid', 'lung', 'health'}
    tokens = "".join(ch if ch.isalpha() else " " for ch in answer.lower()).split()
    keywords = sorted(tracked.intersection(tokens))

    response_summary = (
        f"**Answer:**\n{answer}\n\n"
        f"**📊 Sentiment:** {sentiment['label']} ({sentiment['score']:.2f})\n"
        f"**🔑 Keywords:** {', '.join(keywords)}\n"
        f"**📈 Word Count:** {len(answer.split())}  |  🕒 Response Time: {elapsed}s"
    )

    chat_history.append((question, response_summary))
    return "", chat_history

In [32]:
# ---------------------- Dashboard Code Gradio----------------------
custom_theme = gr.themes.Base(
    primary_hue="blue",
    secondary_hue="pink",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_lg,
    font=[gr.themes.GoogleFont("Inter"), "sans-serif"]
)


with gr.Blocks(theme=custom_theme, title="COVID-19 & Smoking Research Assistant") as demo:
    gr.Markdown("# 🧬 COVID-19 Smoking Analysis Assistant")
    gr.Markdown("Ask anything related to **COVID-19 & smoking effects**, powered by the CORD-19 dataset.")

    with gr.Row():
        # Left column: the conversation itself.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="💬 Assistant Chat", height=300)
            user_input = gr.Textbox(label="Type your question here...", placeholder="e.g. Does smoking increase COVID-19 risk?")
            submit_btn = gr.Button("🚀 Submit")

        # Right column: analysis of the latest answer, its sources, static info.
        with gr.Column(scale=2):
            with gr.Accordion("📊 Latest Response Analysis", open=True):
                analysis_output = gr.Markdown()
            with gr.Accordion("📚 Citation Sources", open=False):
                citation_box = gr.Markdown()

            gr.Markdown("""
            ### ℹ️ System Info
            - **Model**: TinyLlama-1.1B
            - **Embeddings**: MiniLM-L6-v2
            - **Dataset**: CORD-19 (Filtered)
            """)

    def update_ui(query, chat_history_internal):
        """Answer `query` and refresh the chat, analysis and citation panels.

        Args:
            query: text from the input box; ``None``, empty or
                whitespace-only input is ignored.
            chat_history_internal: current chatbot history as a sequence of
                (question, answer) pairs, or ``None``.

        Returns:
            tuple: new textbox value, updated history, analysis markdown and
            citation markdown, in the order of the click handler's outputs.
        """
        # Work on a copy so the caller's list is never mutated in place.
        history = list(chat_history_internal or [])
        if not query or not query.strip():
            return gr.update(value=""), history, "", ""

        start = time.time()
        response = chat_agent.chat(query)
        answer = response.response
        elapsed = round(time.time() - start, 1)

        # Only the first 512 characters of the answer are classified.
        sentiment = sentiment_pipeline(answer[:512])[0]

        # Replace every non-letter with a space before splitting so that
        # tokens such as "smoking," or "COVID-19" still match; sorting makes
        # the keyword order deterministic.
        tracked = {'smoking', 'tobacco', 'risk', 'covid', 'lung', 'health'}
        tokens = "".join(ch if ch.isalpha() else " " for ch in answer.lower()).split()
        keywords = sorted(tracked.intersection(tokens))

        citation_lines = []
        for source in getattr(response, "source_nodes", None) or []:
            # A missing or None score cannot take the ".2f" format spec, so
            # fall back to a plain "N/A" label instead of raising.
            score = getattr(source, "score", None)
            score_text = "N/A" if score is None else f"{score:.2f}"
            citation_lines.append(f"- Score: {score_text}, Source: {source.node.text[:150]}...")
        citations = "\n".join(citation_lines) if citation_lines else "No source information available."

        history.append((query, answer))
        analysis_text = (
            f"**Sentiment**: {sentiment['label']} ({sentiment['score']:.2f})  \n"
            f"**Keywords**: {', '.join(keywords)}  \n"
            f"**Word Count**: {len(answer.split())}  \n"
            f"**Response Time**: {elapsed}s"
        )
        return "", history, analysis_text, citations

    # The chatbot component is both input and output, so it carries the
    # history itself; no module-level list from another cell is needed.
    submit_btn.click(update_ui, inputs=[user_input, chatbot],
                     outputs=[user_input, chatbot, analysis_output, citation_box])

demo.launch(debug=True, share=True)

  chatbot = gr.Chatbot(label="💬 Assistant Chat", height=300)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e75728e4a852392dd7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://e75728e4a852392dd7.gradio.live


