In [None]:
!pip install sentence-transformers faiss-cpu transformers feedparser streamlit
!pip install langchain langchain-community langgraph
!pip install crewai
!pip install --upgrade gradio


In [None]:
import ast
import html
import os
import sys

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from crewai import Agent, Task, Crew
from crewai.tools import BaseTool
from langchain.llms import HuggingFacePipeline
from langgraph.graph import StateGraph
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


In [None]:
class MCPServer:
    """In-memory history of processed queries and their results."""

    def __init__(self):
        # Records are appended in call order, so the newest one is always last.
        self.context_history = []

    def save_context(self, query, summary, recommendations):
        """Append one query/summary/recommendations record to the history."""
        record = dict(query=query, summary=summary, recommendations=recommendations)
        self.context_history.append(record)

    def get_latest(self):
        """Return the most recently saved record, or None when nothing was saved."""
        if not self.context_history:
            return None
        return self.context_history[-1]

# Shared module-level instance
mcp_server = MCPServer()

In [None]:
import feedparser
from urllib.parse import quote

def search_arxiv(query, max_results=20):
    """Query the arXiv export API and return the parsed feed entries.

    Args:
        query: Free-text search terms; URL-quoted and sent as ``all:<query>``.
        max_results: Value passed as the API's ``max_results`` parameter.

    Returns:
        The ``entries`` attribute of the feed parsed by ``feedparser``.
    """
    endpoint = 'http://export.arxiv.org/api/query?'
    encoded_terms = quote(query)
    params = f'search_query=all:{encoded_terms}&start=0&max_results={max_results}'
    return feedparser.parse(endpoint + params).entries




In [None]:
from tqdm import tqdm

# Domain -> subdomain taxonomy. The keys populate the "Domain" dropdown and the
# value lists populate the "Subdomain" dropdown in the Gradio UI further down.
# Keep domain structure as is.
# NOTE(review): "AI in Legal Reasoning" appears twice under "Law & Policy", so that
# domain has only nine distinct subdomains — confirm what the tenth was meant to be.
domain_structure = {
    "Artificial Intelligence": [
        "Machine Learning", "Deep Learning", "Reinforcement Learning", "Transfer Learning",
        "Self-Supervised Learning", "Meta-Learning", "Federated Learning", "Explainable AI",
        "Generative AI", "AI Ethics"
    ],
    "Healthcare & Biomedical AI": [
        "Medical Imaging", "Clinical NLP", "Drug Discovery", "Disease Diagnosis Models",
        "Predictive Healthcare", "Public Health Analytics", "Electronic Health Records", "Wearable Health Devices",
        "Mental Health AI", "COVID-19 Research"
    ],
    "Finance & Economics": [
        "Stock Price Prediction", "Fraud Detection", "Algorithmic Trading", "Credit Risk Modeling",
        "Financial Sentiment Analysis", "AI in Insurance", "Economic Forecasting", "Portfolio Optimization",
        "Blockchain & Crypto Analytics", "Regulatory Technology"
    ],
    "Law & Policy": [
        "Legal Document Summarization", "AI in Legal Reasoning", "Contract Analysis & NLP", "Legal QA",
        "Case Law Retrieval", "Judgment Prediction", "Legal Chatbots", "Privacy Law Compliance",
        "Policy Modeling", "AI in Legal Reasoning"
    ],
    "Education & Social Science": [
        "Intelligent Tutoring Systems", "AI in Education", "Student Performance Prediction", "Academic Plagiarism Detection",
        "Exam Question Generation", "Adaptive Learning", "Education Data Mining", "Misinformation in EdTech",
        "Educational NLP", "Conversational Agents in Education"
    ],
    "Sustainability, Industry & Robotics": [
        "Climate Modeling", "Renewable Energy", "Smart Grids", "AI in Agriculture",
        "Precision Farming", "Autonomous Vehicles", "Robotics and Automation", "Predictive Maintenance",
        "Industrial Automation", "Disaster Response"
    ]
}

In [None]:

# New function: fetch papers and summarize using MCP tools
#def fetch_and_summarize_papers(domain_structure, max_results=20):
    #all_papers = []
    #for domain, subdomains in domain_structure.items():
        #for sub in subdomains:
#             print(f"🔍 Fetching papers for: {domain} → {sub}")

#             # Use arxiv_tool to fetch papers
#             papers = arxiv_tool.run(sub)

#             for paper in papers:
#                 abstract_text = paper.get("summary", "")
#                 if abstract_text:
#                     # Use summarizer_tool to summarize abstract
#                     summarized = summarizer_tool.run(abstract_text)
#                 else:
#                     summarized = "No abstract available"

#                 paper['summary'] = summarized
#                 paper['domain'] = domain
#                 paper['subdomain'] = sub
#             all_papers.extend(papers)

#     df_all = pd.DataFrame(all_papers)
#     return df_all



# df_all = fetch_and_summarize_papers(domain_structure)
# df_all.to_csv("arxiv_1200_papers.csv", index=False)
# print("✅ Papers fetched, summarized, and saved to CSV!")

In [None]:
# # Check paper counts per subdomain (optional - uncomment when needed)
# if 'df_all' in globals():
#     subdomain_counts = df_all.groupby(['domain', 'subdomain']).size().reset_index(name='num_papers')
#     print(subdomain_counts.sort_values('num_papers'))
# else:
#     print("⚠️ Dataframe 'df_all' not loaded yet. Run fetch_and_summarize_papers() or load CSV first.")


In [None]:
import pandas as pd

# Load the previously saved papers dataset from the current working directory.
# NOTE(review): the file name carries a " (2)" suffix (looks like a re-download copy);
# confirm it matches the file that is actually uploaded next to the notebook.
df_loaded = pd.read_csv('arxiv_1200_papers (2).csv')
print(f"✅ Loaded CSV with {df_loaded.shape[0]} papers.")
# Preview the first rows to sanity-check the columns.
display(df_loaded.head())

In [None]:
# Load the sentence-embedding model. The same model is used below to build the
# FAISS index and later to encode user queries, so both live in one vector space.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Embedding model loaded!")

# Define MCP Tool for embedding if not already defined
def embed_text(text):
    """Encode one text string with ``embed_model``.

    The text is encoded as a batch of one, so the result is a nested list
    holding a single embedding row.
    """
    single_item_batch = [text]
    vectors = embed_model.encode(single_item_batch)
    return vectors.tolist()

# Build the FAISS index from the 'summary' column of the loaded papers.
# Missing summaries are replaced by empty strings so every row gets an embedding
# and FAISS row ids stay aligned with df_loaded's positional index.
abstract_texts = df_loaded['summary'].fillna("").tolist()
abstract_embeddings = embed_model.encode(abstract_texts, show_progress_bar=True)
# The embeddings are cast to float32 before being added to the index.
abstract_embeddings = np.array(abstract_embeddings).astype('float32')

# Flat L2 index sized to the embedding width.
dimension = abstract_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(abstract_embeddings)
print(f"✅ FAISS index created with {index.ntotal} papers!")


In [None]:
from transformers import pipeline

# Summarization pipeline shared by summarize_text and summarize_input_text below.
summarizer = pipeline("summarization", model="manjunathainti/fine_tuned_t5_summarizer")
print("✅ Summarizer model loaded!")

def summarize_text(text: str) -> str:
    """Summarize ``text`` in a single call to the shared ``summarizer`` pipeline.

    Generation is deterministic (``do_sample=False``) and bounded to 50-200
    tokens. Returns the ``summary_text`` of the first pipeline result.
    """
    outputs = summarizer(text, max_length=200, min_length=50, do_sample=False)
    first_result = outputs[0]
    return first_result["summary_text"]

# For long input with dynamic chunking logic
def summarize_input_text(text: str, chunk_size: int = 300) -> str:
    """Summarize text of any length, chunking long inputs by word count.

    Args:
        text: Raw input text; surrounding whitespace is stripped and newlines
            are replaced by spaces before processing.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        An empty string for blank input; a single summary for inputs under 50
        words; otherwise the per-chunk summaries joined with newlines.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    cleaned = text.strip().replace("\n", " ")
    words = cleaned.split()

    # Nothing to summarize: avoid sending an empty prompt to the model.
    if not words:
        return ""

    if len(words) < 50:
        return summarize_text(cleaned)

    # Break into chunks of at most `chunk_size` words.
    chunks = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    # Summarize each chunk with the same settings as a single-shot summary.
    return "\n".join(summarize_text(chunk) for chunk in chunks)

In [None]:
def find_related_papers(user_input, top_k=5, domain_filter="All", subdomain_filter="All"):
    """Return up to ``top_k`` papers from ``df_loaded`` most similar to ``user_input``.

    Nearest neighbours come from the FAISS ``index``; the optional domain and
    subdomain filters are applied afterwards. Each surviving paper is scored
    by cosine similarity between the query embedding and the embedding of its
    'abstract' text, falling back to 'summary' when 'abstract' is absent or empty.

    Args:
        user_input: Query text.
        top_k: Maximum number of papers to return.
        domain_filter: Exact 'domain' value to keep, or "All" for no filter.
        subdomain_filter: Exact 'subdomain' value to keep, or "All" for no filter.

    Returns:
        A list of row dicts, each with an added 'similarity_score' (clipped to
        [0, 1], rounded to 3 decimals), sorted by descending score.
    """
    if top_k <= 0:
        return []

    user_vec = embed_model.encode([user_input]).astype('float32')

    # Fetch more than top_k so re-scoring has a pool to choose from. Filters
    # run AFTER the search, so with a filter active the whole index is scanned;
    # otherwise a narrow filter could discard every one of the nearest hits.
    pool_size = max(50, top_k)
    filtering = domain_filter != "All" or subdomain_filter != "All"
    search_k = max(pool_size, index.ntotal) if filtering else pool_size
    _, neighbor_ids = index.search(user_vec, search_k)

    candidates = []
    for idx in neighbor_ids[0]:
        # FAISS pads missing results with -1, which would otherwise select the
        # LAST row through negative indexing.
        if idx < 0 or idx >= len(df_loaded):
            continue

        paper = df_loaded.iloc[int(idx)].to_dict()

        if domain_filter != "All" and paper['domain'] != domain_filter:
            continue
        if subdomain_filter != "All" and paper['subdomain'] != subdomain_filter:
            continue

        candidates.append(paper)
        if len(candidates) >= pool_size:
            break

    if not candidates:
        return []

    # Pick the text to score: 'abstract' when usable, else 'summary', else "".
    texts = []
    for paper in candidates:
        text = paper.get('abstract')
        if not isinstance(text, str) or not text.strip():
            text = paper.get('summary')
        texts.append(text if isinstance(text, str) else "")

    # Encode all candidates in one batch and compute cosine similarity on 1-D
    # vectors so each score is a true scalar.
    doc_vecs = np.asarray(embed_model.encode(texts), dtype='float32')
    query_vec = user_vec[0]
    denom = np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec)
    sims = np.divide(doc_vecs @ query_vec, denom,
                     out=np.zeros_like(denom), where=denom > 0)

    for paper, sim in zip(candidates, sims):
        clipped = min(max(float(sim), 0.0), 1.0)  # Clip for safety
        paper['similarity_score'] = round(clipped, 3)

    candidates.sort(key=lambda p: p['similarity_score'], reverse=True)
    return candidates[:top_k]

In [None]:
from transformers import pipeline
from crewai.tools import BaseTool
from pydantic import BaseModel, Field

# Build a local t5-base text2text-generation pipeline and wrap it as a LangChain
# LLM; this object is passed as `llm` to every CrewAI agent defined below.
hf_tokenizer = AutoTokenizer.from_pretrained("t5-base")
hf_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
hf_pipeline = pipeline("text2text-generation", model=hf_model, tokenizer=hf_tokenizer)

hf_llm = HuggingFacePipeline(pipeline=hf_pipeline)

def summarize_text_tool(text: str) -> str:
    """Summarize academic input, switching to chunked mode at 100+ words."""
    is_short = len(text.split()) < 100
    handler = summarize_text if is_short else summarize_input_text
    return handler(text)

# CrewAI tool that exposes summarize_text_tool to agents.
class SummarizerTool(BaseTool):
    name: str = "summarizer_tool"
    description: str = "Summarizes academic input."

    def _run(self, text: str):
        # Delegates to summarize_text_tool, which picks single-shot or chunked mode.
        return summarize_text_tool(text)

# Shared instance; also called directly (via ._run) by the demo cell and the Gradio UI.
summarizer_tool_instance = SummarizerTool()

def embed_text_tool(text: str) -> list:
    """Encode ``text`` with ``embed_model`` and return the result as nested lists.

    The text is encoded as a one-item batch, so the outer list has one row.
    """
    encoded_batch = embed_model.encode([text])
    return encoded_batch.tolist()

# CrewAI tool that exposes embed_text_tool to agents.
class EmbedderTool(BaseTool):
    name: str = "embedder_tool"
    description: str = "Embeds academic input into a vector representation."

    def _run(self, text: str):
        # Returns the nested-list embedding produced by embed_text_tool.
        return embed_text_tool(text)

# Shared instance handed to the embedder agent below.
embedder_tool_instance = EmbedderTool()

# Argument schema for RecommenderTool; the fields mirror the parameters of
# find_related_papers, including their defaults.
class RecommenderToolArgs(BaseModel):
    user_input: str = Field(..., description="User query for finding related papers")
    top_k: int = Field(5, description="Number of papers to return")
    domain_filter: str = Field("All", description="Domain to filter papers")
    subdomain_filter: str = Field("All", description="Subdomain to filter papers")

# CrewAI tool that exposes find_related_papers to agents.
class RecommenderTool(BaseTool):
    name: str = "recommender_tool"
    description: str = "Recommends related papers with similarity scores and metadata"
    # The annotation is required: Pydantic v2 raises a PydanticUserError at class
    # creation when an inherited field is overridden by a non-annotated attribute.
    args_schema: type[BaseModel] = RecommenderToolArgs

    def _run(self, user_input, top_k=5, domain_filter="All", subdomain_filter="All"):
        # Returns the list of paper dicts (with 'similarity_score') unchanged.
        return find_related_papers(user_input, top_k, domain_filter, subdomain_filter)


# Shared instance; also called directly (via ._run) by the demo cell and the Gradio UI.
recommender_tool_instance = RecommenderTool()

# Agent that owns the summarizer tool.
summarizer_agent = Agent(
    role="Academic Summarizer",
    goal="Summarize academic research clearly and concisely.",
    backstory="An NLP agent trained to digest research papers.",
    verbose=True,
    tools=[summarizer_tool_instance], # Pass the tool instance
    llm=hf_llm

)

# Agent that owns the embedding tool.
embedder_agent = Agent(
    role="Embedding Generator",
    goal="Convert academic abstracts into meaningful vector representations.",
    backstory="A specialist agent trained to map academic language to dense vector spaces for similarity search.",
    verbose=True,
    tools=[embedder_tool_instance],
    llm=hf_llm
)

# Agent that owns the recommender tool.
recommender_agent = Agent(
    role="Paper Recommender",
    goal="Find top 5 academic papers based on input relevance.",
    backstory="Specialist in vector similarity and embeddings.",
    verbose=True,
    tools=[recommender_tool_instance],
    llm=hf_llm
)

In [None]:
# Define one task per agent. Each description carries an `{input_text}` placeholder
# — presumably filled from the inputs passed when the crew is kicked off; confirm
# against the caller, since no kickoff call is visible in this notebook.
summarizer_task = Task(
    description="Summarize the following academic abstract: {input_text}",
    expected_output="A concise summary.",
    agent=summarizer_agent
)

embedder_task = Task(
    description="Generate an embedding vector for the input text: {input_text}",
    expected_output="A vector representation of the input.",
    agent=embedder_agent
)

recommender_task = Task(
    description="Recommend 5 relevant papers for: {input_text}",
    expected_output="A list of paper titles.",
    agent=recommender_agent
)

# Multi-agent crew for the combined task; tasks are listed in the order
# summarize -> embed -> recommend.
combined_crew = Crew(
    agents=[summarizer_agent, embedder_agent, recommender_agent],
    tasks=[summarizer_task, embedder_task, recommender_task],
    verbose=True
)


In [None]:
from transformers import pipeline

# Load zero-shot classification pipeline.
# The zero-shot pipeline ranks labels through an NLI "entailment" head, so it
# needs an NLI-finetuned checkpoint. "t5-base" has no such head (its label map has
# no entailment class), which makes the rankings meaningless, so use the pipeline's
# standard MNLI model instead.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)
print(" Zero-shot classifier loaded!")
# Candidate labels are the subdomain names from `domain_structure`, flattened in
# domain order with duplicates removed. Deriving them keeps the classifier's label
# set identical to the values used by the subdomain filter, instead of maintaining
# a second hand-copied list that can drift out of sync.
candidate_labels = list(dict.fromkeys(
    subdomain
    for subdomains in domain_structure.values()
    for subdomain in subdomains
))

# Define classification function
def classify_domain(abstract):
    """Return the first entry of the classifier's 'labels' output for ``abstract``.

    The text is scored against the module-level ``candidate_labels``.
    """
    ranked_labels = classifier(abstract, candidate_labels)['labels']
    return ranked_labels[0]


In [None]:
def generate_csv(recommendations):
    """Save recommended papers to ``recommendations.csv`` and return that path.

    Args:
        recommendations: A list of paper dicts (as returned by
            find_related_papers), a DataFrame, or None.

    Returns:
        The relative path "recommendations.csv".

    The file always has the columns title, summary, link, domain, subdomain,
    plus similarity_score when the input provides it. Columns missing from the
    input are written empty, and an empty input yields a header-only file,
    rather than raising KeyError.
    """
    if recommendations is None:
        df_out = pd.DataFrame()
    elif isinstance(recommendations, list):
        df_out = pd.DataFrame(recommendations)
    else:
        df_out = recommendations

    # Only export similarity_score when the recommendations carry it.
    columns_to_save = ['title', 'summary', 'link', 'domain', 'subdomain']
    if 'similarity_score' in df_out.columns:
        columns_to_save.append('similarity_score')

    # reindex (unlike df[cols]) tolerates missing columns by filling them with NaN.
    df_out = df_out.reindex(columns=columns_to_save)
    df_out.to_csv("recommendations.csv", index=False)
    return "recommendations.csv"

In [None]:
def generate_csv_wrapper(user_input, domain, subdomain, task_choice):
    """Re-run the recommender for the current UI inputs and export the result as CSV.

    Blank or missing ``domain``/``subdomain`` values are treated as "All".
    ``task_choice`` is accepted to match the UI wiring but is not used.
    Returns whatever ``generate_csv`` returns.
    """
    def _as_filter(choice):
        # Unset or whitespace-only selections mean "no filter".
        return choice if (choice and choice.strip() != "") else "All"

    papers = recommender_tool_instance._run(
        user_input=user_input,
        top_k=5,
        domain_filter=_as_filter(domain),
        subdomain_filter=_as_filter(subdomain),
    )
    return generate_csv(papers)


In [None]:
# Sample input used to smoke-test the summarizer and recommender tools below
# (replace with any text you want to test).
user_input_text = """
Federated Learning (FL) has emerged as a promising distributed machine learning paradigm that enables collaborative model training across multiple devices or institutions without sharing raw data. This approach preserves data privacy by transmitting only model updates, which are aggregated centrally to form a global model. By keeping data localized, FL addresses critical concerns about data ownership, confidentiality, and compliance with regulations such as GDPR and HIPAA. This makes it especially valuable in domains like healthcare, finance, and mobile applications where data sensitivity is paramount.

One of the key strengths of FL lies in its ability to leverage heterogeneous and decentralized data sources. Traditional centralized approaches often fail to capture the diversity inherent in real-world datasets, whereas FL can combine knowledge from varied distributions across clients. Techniques such as federated averaging (FedAvg) and secure aggregation have become foundational to address communication efficiency and privacy. However, challenges such as non-independent and identically distributed (non-IID) data, limited bandwidth, and varying client participation still pose significant obstacles to robust model performance.

Recent advances have focused on enhancing FL’s scalability and resilience. Personalization strategies aim to balance the global model’s performance with each client’s specific data distribution, while adaptive communication protocols reduce overhead in large-scale deployments. Additionally, integrating differential privacy and homomorphic encryption into FL pipelines has further strengthened the security guarantees of model updates. These developments demonstrate FL’s growing potential to serve as a practical framework for real-world distributed AI systems.

Despite its promise, FL remains an active area of research. Addressing fairness across clients, mitigating biases from skewed data distributions, and improving convergence speed are open challenges. Moreover, as FL expands into multimodal learning and cross-silo collaborations, designing algorithms that handle diverse data modalities and institutional requirements will be crucial. With continued innovation, Federated Learning could become a foundational building block for privacy-preserving and decentralized artificial intelligence.

"""
# Echo the raw input so the summary printed next can be compared against it.
print(" USER INPUT TEXT")
print(user_input_text)
print("\n==============================\n")

# 1. Summarize input using the summarizer tool
user_summary = summarizer_tool_instance._run(user_input_text)
print(" SUMMARIZED INPUT")
print(user_summary)
print("\n==============================\n")

# 2. Get recommendations using the recommender tool (full metadata)
recommendations = recommender_tool_instance._run(user_input_text)

print(" RECOMMENDED PAPERS")
for rec in recommendations:
    print(f"- Title: {rec.get('title', 'N/A')}")

    # Authors: handle different formats (string, list of strings, list of dicts).
    authors = rec.get('authors')
    if isinstance(authors, str) and authors:
        if authors.startswith("["):
            # The CSV stores lists as their Python repr. Parse them with
            # literal_eval, which only accepts literals, instead of eval, which
            # would execute arbitrary code embedded in the data.
            try:
                authors = ast.literal_eval(authors)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                authors = [authors]
        else:
            authors = [authors]

    if isinstance(authors, list):
        processed_authors = [
            a['name'] if isinstance(a, dict) and 'name' in a else str(a)
            for a in authors
        ]
        if processed_authors:
            print(f"  Authors: {', '.join(processed_authors)}")

    # Domain
    print(f"  Domain: {rec.get('domain', 'N/A')}")

    # Published
    if 'published' in rec:
        print(f"  Published: {rec.get('published', 'N/A')}")

    # Link
    if 'link' in rec:
        print(f"  Link: {rec.get('link', '#')}")

    # Similarity Score
    if 'similarity_score' in rec:
        print(f"  Similarity Score: {rec.get('similarity_score', 0):.2f}")

    print(f"  Summary: {rec.get('summary', 'No summary available')}")
    print("--------------------------------")


In [None]:
def format_apa_citation(title, authors, published, link):
    """Build an APA-style citation as an HTML snippet.

    Args:
        title: Paper title.
        authors: A name string, the repr of a list (as stored in the CSV), a
            list of name strings, a list of dicts with a 'name' key, or
            None/NaN when unknown.
        published: Date string whose text before the first '-' is the year;
            anything else (None, NaN, blank) yields "n.d.".
        link: URL of the paper.

    Returns:
        HTML of the form ``Authors (year). <i>Title</i>.<br>Retrieved from <a ...>``
        listing at most three authors followed by "et al.". All interpolated
        values are HTML-escaped because they come from external metadata.
    """
    # Year: only strings can be split; NaN/None fall back to "n.d.".
    if isinstance(published, str) and published.strip():
        year = published.strip().split("-")[0]
    else:
        year = "n.d."

    # Normalize authors into a list-like of entries.
    if isinstance(authors, str):
        text = authors.strip()
        if text.startswith("["):
            # literal_eval only accepts Python literals, so a crafted authors
            # field cannot execute code the way eval would.
            try:
                parsed_authors = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed_authors = [authors]
        else:
            parsed_authors = [authors] if text else []
    elif authors is None or (isinstance(authors, float) and authors != authors):
        # Missing value (None or NaN from pandas): no authors known.
        parsed_authors = []
    else:
        parsed_authors = authors

    # Flatten list-of-dicts / list-of-strings into plain names.
    authors_list = []
    if isinstance(parsed_authors, (list, tuple)):
        for a in parsed_authors:
            if isinstance(a, dict) and "name" in a:  # dict format
                authors_list.append(str(a["name"]))
            else:
                authors_list.append(str(a))
    else:
        authors_list.append(str(parsed_authors))

    # Format APA author string
    author_str = ", ".join(authors_list[:3])
    if len(authors_list) > 3:
        author_str += ", et al."
    if not author_str:
        author_str = "Unknown author"

    safe_link = html.escape(str(link), quote=True)
    safe_title = html.escape(str(title))
    safe_authors = html.escape(author_str)
    safe_year = html.escape(year)

    # Final APA-style string
    return f"{safe_authors} ({safe_year}). <i>{safe_title}</i>.<br>Retrieved from <a href='{safe_link}' target='_blank'>{safe_link}</a>"



In [None]:
import logging
import sys
# If absl is installed, replace its PythonHandler.close with a version that
# closes the underlying stream but swallows any exception raised while doing so.
try:
    import absl.logging
    from unittest.mock import MagicMock

    def safe_close(self):
        # Best-effort close: ignore errors from the stream's close().
        try:
            if hasattr(self.stream, 'close'):
                self.stream.close()
        except Exception:
            pass
    absl.logging.PythonHandler.close = safe_close
except ImportError:
    # absl not available: nothing to patch.
    pass

# Silence the web-server / HTTP client loggers listed here: disable each one,
# stop propagation to the root logger, and detach handlers already attached.
for name in ['uvicorn', 'uvicorn.access', 'uvicorn.error', 'uvicorn.asgi', 'httpx', 'httpcore', 'asyncio', 'websockets']:
    logger = logging.getLogger(name)
    logger.disabled = True
    logger.propagate = False
    for handler in logger.handlers[:]:  # iterate over a copy while removing
        logger.removeHandler(handler)

# Reconfigure the root logger to emit only ERROR and above on stderr;
# force=True replaces any root handlers configured earlier.
logging.basicConfig(stream=sys.stderr, level=logging.ERROR, force=True)


In [None]:


def update_subdomains(domain):
    """Return a Gradio update for the subdomain dropdown matching ``domain``.

    A known domain yields "All" plus its subdomains with "All" selected; any
    other value yields an empty, unselected dropdown.
    """
    if domain not in domain_structure:
        return gr.update(choices=[], value=None)
    options = ["All"] + domain_structure[domain]
    return gr.update(choices=options, value="All")

def _author_names(authors):
    """Return author display names from a string, list-repr string, or list."""
    if isinstance(authors, str):
        text = authors.strip()
        if not text:
            return []
        if text.startswith("["):
            # Parse the stored list repr with literal_eval (literals only);
            # eval would execute arbitrary code embedded in the data.
            try:
                authors = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                authors = [authors]
        else:
            authors = [authors]
    if not isinstance(authors, (list, tuple)):
        return []
    return [
        str(a["name"]) if isinstance(a, dict) and "name" in a else str(a)
        for a in authors
    ]


def run_agent_ui(user_input, domain, subdomain, task):
    """Gradio callback: run the selected task and return ``(summary, html)``.

    Args:
        user_input: Abstract text from the textbox.
        domain: Selected domain, or None/blank for no filter.
        subdomain: Selected subdomain, or None/blank for no filter.
        task: One of "Summarize", "Recommend Papers",
            "Summarize + Recommend Papers".

    Returns:
        A 2-tuple of the summary text and an HTML string for the
        recommendations panel. Missing input, an unselected task and
        processing errors are reported through the same two outputs
        instead of being raised.
    """
    if not user_input or not user_input.strip():
        return "⚠️ Please enter an abstract.", "<p style='color:red;'>No papers found.</p>"

    # The task dropdown starts unselected; say so instead of silently
    # reporting that no papers were found.
    if task not in ("Summarize", "Recommend Papers", "Summarize + Recommend Papers"):
        return "⚠️ Please select a task.", "<p style='color:red;'>No task selected.</p>"

    # Handle filters
    domain_filter = domain if domain and domain.strip() != "" else "All"
    subdomain_filter = subdomain if subdomain and subdomain.strip() != "" else "All"

    try:
        # 🟢 Only Summarize
        if task == "Summarize":
            summary = summarizer_tool_instance._run(user_input)
            return summary, "📄 Only summarizer will run. Recommender not requested."

        if task == "Recommend Papers":
            summary = "📚 Only recommender will run. Summary not requested."
        else:
            summary = summarizer_tool_instance._run(user_input)

        recommendations = recommender_tool_instance._run(
            user_input=user_input,
            top_k=5,
            domain_filter=domain_filter,
            subdomain_filter=subdomain_filter
        )

        # 🧠 Format recommended papers
        if not recommendations:
            return summary, "<p>No recommended papers found for this input.</p>"

        papers_html = "<h3>Recommended Papers</h3>"
        for rec in recommendations:
            try:
                apa = format_apa_citation(
                    rec.get('title', 'N/A'),
                    rec.get('authors', []),
                    rec.get('published', 'n.d.'),
                    rec.get('link', '#')
                )
            except Exception as apa_error:
                apa = f"APA citation error: {str(apa_error)}"

            # Paper metadata is external text: escape it before embedding in HTML.
            names = _author_names(rec.get('authors'))
            authors_text = html.escape(", ".join(names)) if names else "N/A"
            title_text = html.escape(str(rec.get('title', 'N/A')))
            domain_text = html.escape(str(rec.get('domain', 'N/A')))
            subdomain_text = html.escape(str(rec.get('subdomain', 'N/A')))
            published_text = html.escape(str(rec.get('published', 'N/A')))
            link_href = html.escape(str(rec.get('link', '#')), quote=True)
            summary_text = html.escape(str(rec.get('summary', 'No summary available')))

            papers_html += f"""
            <div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
                <b>Title:</b> {title_text}<br>
                <b>Authors:</b> {authors_text}<br>
                <b>Domain:</b> {domain_text}<br>
                <b>Subdomain:</b> {subdomain_text}<br>
                <b>Published:</b> {published_text}<br>
                <b>Link:</b> <a href="{link_href}" target="_blank">View Paper</a><br>
                <b>Similarity Score:</b> {rec.get('similarity_score', 0):.2f}<br>
                <b>Summary:</b> {summary_text}<br><br>

                <details>
                    <summary><b>Show APA Citation</b></summary>
                    {apa}
                </details>
            </div>
            """

        return summary, papers_html

    except Exception as e:
        # UI boundary: surface the failure in both outputs rather than crashing.
        error_msg = f"❌ Error processing request: {str(e)}"
        return error_msg, error_msg

# Build the Gradio Interface: inputs in the left column, results in the right.
with gr.Blocks(title="🧠 Summarizer + Recommender Agent") as demo:
    gr.Markdown("## 🧠 Summarizer + Recommender Agent")

    with gr.Row():
        with gr.Column():
            user_input = gr.Textbox(label="Input Abstract", lines=8, placeholder="Paste abstract here...")
            # Both dropdowns start unselected; the callbacks treat that as "All".
            domain = gr.Dropdown(choices=["All"] + list(domain_structure.keys()), label="Domain (Optional)", value=None)
            subdomain = gr.Dropdown(choices=[], label="Subdomain (Optional)", value=None)

            # Dynamic update of subdomains whenever the domain selection changes
            domain.change(fn=update_subdomains, inputs=domain, outputs=subdomain)

            # No task is preselected (value=None).
            task_choice = gr.Dropdown(
              choices=["Summarize", "Recommend Papers", "Summarize + Recommend Papers"],
              label="Select Task",
              value=None
            )

            submit_btn = gr.Button("▶️ Start Agent", variant="primary")

        with gr.Column():
            summary_output = gr.Textbox(label="Summary of Input Abstract", lines=6)
            recommendations_output = gr.HTML(label="Recommended Papers")
            download_btn = gr.Button("📥 Download Recommended Papers as CSV")


    # Connect button: run_agent_ui fills the summary box and the HTML panel
    submit_btn.click(fn=run_agent_ui,
                     inputs=[user_input, domain, subdomain, task_choice],
                     outputs=[summary_output, recommendations_output])

    # The download button calls generate_csv_wrapper with the current inputs and
    # shows the returned CSV path in a File component created here.
    download_btn.click(
      fn=generate_csv_wrapper,
      inputs=[user_input, domain, subdomain, task_choice],
      outputs=gr.File(label="Download Recommendations CSV")
    )


def run_gradio_safe():
    """Launch the Gradio app with a public share link, printing any launch error."""
    print("🚀 Starting Gradio safely for Colab...")
    try:
        import os
        # Sets the server bind address to all interfaces via the environment.
        os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
        # Drop every handler from the root logger before launching.
        logging.getLogger().handlers.clear()
        # share=True requests a public share link; show_error=True enables error display.
        demo.launch(share=True, show_error=True)
    except Exception as e:
        # A failed launch is reported instead of raised.
        print(f"❌ Gradio launch failed: {e}")
# Launch immediately when this cell runs.
run_gradio_safe()