In [1]:
import os
import io
import base64
import sqlite3
import tempfile
from typing import List, Optional, Dict, Any
from datetime import datetime
import re
import warnings

# Third-party imports
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown, display, HTML
import PyPDF2 
import docx  
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Suppress warnings
warnings.filterwarnings("ignore")

def load_environment():
    """Load environment variables"""
    load_dotenv()
    return os.getenv("GEMINI_API_KEY")

def initialize_model(api_key):
    """Initialize Gemini model"""
    return ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=api_key,
        temperature=0.7,
        max_tokens=4000,
    )

def define_output_schemas():
    """Define output schemas for topic and paper recommendations"""
    # Output schema for topic recommendations
    class RecommendedTopic(BaseModel):
        topic: str = Field(description="The name of the recommended topic")
        description: str = Field(description="A brief description of why this topic is relevant")
        resource_url: str = Field(description="A relevant resource URL for this topic")

    class TopicRecommendations(BaseModel):
        recommendations: List[RecommendedTopic] = Field(description="List of recommended related topics")

    # Output schema for paper recommendations
    class RecommendedPaper(BaseModel):
        title: str = Field(description="The title of the recommended research paper")
        authors: str = Field(description="The authors of the paper")
        year: str = Field(description="Publication year")
        description: str = Field(description="Brief description of relevance to the original paper")
        paper_url: str = Field(description="URL to access this paper", default="")

    class PaperRecommendations(BaseModel):
        recommendations: List[RecommendedPaper] = Field(description="List of recommended related papers")
        
    return TopicRecommendations, PaperRecommendations

def create_prompt_templates():
    """Create prompt templates for research tasks"""
    report_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a comprehensive, detailed report on the following topic:
        
        Topic: {topic}
        
        Your report should include:
        1. Introduction to the topic
        2. Key concepts and definitions
        3. Historical context and development
        4. Current state and applications
        5. Future directions and potential developments
        6. Conclusion
        
        Format your report with clear markdown headings and subheadings. Use proper markdown formatting for emphasis, lists, and other elements.
        Make sure to provide in-depth analysis.
        """
    )

    recommendation_prompt = ChatPromptTemplate.from_template(
        """
        Based on the topic: {topic}
        
        Generate 5 relevant related topics that the user might be interested in researching next.
        For each recommendation, provide:
        1. The topic name
        2. A brief 1-2 sentence description of why it's relevant
        3. A relevant resource URL that would contain valuable information about this topic
        
        Your response must be formatted as a valid JSON object that matches this structure:
        {
            "recommendations": [
                {
                    "topic": "Topic Name",
                    "description": "Brief description of relevance",
                    "resource_url": "https://example.com/relevant-page"
                },
                ...
            ]
        }
        
        Use reputable sources for your resource URLs. While you can't verify if the exact URLs exist,
        make them realistic and likely to contain quality information.
        """
    )

    paper_summary_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a concise but comprehensive summary of the following research paper:
        
        Paper content: {paper_content}
        
        Your summary should include:
        1. Main objective of the research
        2. Methodology used
        3. Key findings and results
        4. Main conclusions and implications
        5. Limitations (if mentioned)
        
        Format your summary with clear markdown headings and keep it concise yet informative.
        Focus on the most important aspects of the paper.
        """
    )

    paper_recommendation_prompt = ChatPromptTemplate.from_template(
    """
    Based on the following research paper:
    
    Paper content: {paper_content}
    
    Generate 5 relevant related research papers that the user might be interested in reading next.
    These should be real papers that likely exist in the academic literature.
    
    For each recommendation, provide:
    1. The paper title (use the actual title of a real paper if you know it)
    2. The authors (use "et al." for multiple authors after the first)
    3. Publication year (estimate if necessary)
    4. A brief description of why it's relevant to the original paper
    5. A URL where the paper might be found - THIS IS CRITICAL. 
    
    For URLs, use specific links from:
    - Google Scholar (https://scholar.google.com/scholar?q=PAPER_TITLE)
    - arXiv (https://arxiv.org/search/?query=PAPER_TITLE)
    - ResearchGate (https://www.researchgate.net/search.Search.html?query=PAPER_TITLE)
    - ACM Digital Library (https://dl.acm.org/action/doSearch?AllField=PAPER_TITLE)
    - IEEE Xplore (https://ieeexplore.ieee.org/search/searchresult.jsp?queryText=PAPER_TITLE)
    
    Replace PAPER_TITLE with URL-encoded paper title in these templates. Make sure EVERY recommendation has a working URL.
    
    Your response must be formatted as a valid JSON object that matches this structure:
    {{
        "recommendations": [
            {{
                "title": "Paper Title",
                "authors": "Author names",
                "year": "Publication year",
                "description": "Brief description of relevance",
                "paper_url": "https://example.com/paper-link"
            }},
            ...
        ]
    }}
    """
)
    
    return report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt

def create_chains(model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, TopicRecommendations, PaperRecommendations):
    """Create processing chains for research tasks"""
    report_chain = (
        {"topic": RunnablePassthrough()}
        | report_prompt
        | model
        | StrOutputParser()
    )

    recommendation_chain = (
        {"topic": RunnablePassthrough()}
        | recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=TopicRecommendations)
    )

    paper_summary_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_summary_prompt
        | model
        | StrOutputParser()
    )

    paper_recommendation_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=PaperRecommendations)
    )
    
    return report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain

def initialize_database(db_path: str = "../data/research_papers.db"):
    """Initialize SQLite database for storing papers"""
    # Ensure the directory exists
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            content TEXT NOT NULL,
            file_type TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            summary TEXT
        )
    ''')
    conn.commit()
    conn.close()

def save_file_to_database(filename: str, content: str, file_type: str, db_path: str = "../data/research_papers.db"):
    """Save file content to SQLite database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO papers (filename, content, file_type) VALUES (?, ?, ?)",
        (filename, content, file_type)
    )
    paper_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return paper_id

def save_summary_to_database(paper_id: int, summary: str, db_path: str = "../data/research_papers.db"):
    """Save paper summary to database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE papers SET summary = ? WHERE id = ?",
        (summary, paper_id)
    )
    conn.commit()
    conn.close()

def get_paper_from_database(paper_id: int, db_path: str = "../data/research_papers.db"):
    """Retrieve paper content from database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT filename, content, file_type, summary FROM papers WHERE id = ?", (paper_id,))
    result = cursor.fetchone()
    conn.close()
    if result:
        return {
            "filename": result[0],
            "content": result[1],
            "file_type": result[2],
            "summary": result[3]
        }
    else:
        return None

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text content from a PDF file"""
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        # Fallback method
        text = ""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text

def extract_text_from_docx(file_path: str) -> str:
    """Extract text content from a DOCX file"""
    try:
        loader = Docx2txtLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        # Fallback method
        doc = docx.Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

def generate_report(topic: str, report_chain) -> str:
    """Generate a detailed report on the given topic"""
    return report_chain.invoke(topic)

def generate_recommendations(topic: str, recommendation_chain, model) -> str:
    """Generate relevant topic recommendations using Gemini API"""
    try:
        recommendations_data = recommendation_chain.invoke(topic)
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        for i, rec in enumerate(recommendations_data.recommendations, 1):
            formatted_recommendations += f"## {i}. {rec.topic}\n"
            formatted_recommendations += f"{rec.description}\n"
            formatted_recommendations += f"[Learn more]({rec.resource_url})\n\n"
        return formatted_recommendations
    except Exception as e:
        backup_prompt = ChatPromptTemplate.from_template(
            """
            Based on the topic: {topic}
            
            Provide 5 relevant related topics that the user might be interested in researching next.
            For each recommendation, provide:
            1. The topic name
            2. A brief description of why it's relevant
            3. A relevant resource link
            
            Format your response as a markdown list.
            """
        )
        backup_chain = backup_prompt | model | StrOutputParser()
        return backup_chain.invoke({"topic": topic})

def create_full_report(topic: str, report_content: str, recommendations_content: str) -> str:
    """Create a full markdown report combining the report and recommendations"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Report: {topic}

*Generated on: {timestamp}*

---

{report_content}

---

{recommendations_content}

---

*This report was generated by AI Research Assistant using Gemini API*
"""
    return full_report

def create_full_paper_analysis(filename: str, summary_content: str, recommendations_content: str) -> str:
    """Create a full markdown report for paper analysis"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Paper Analysis: {filename}

*Generated on: {timestamp}*

---

## Paper Summary

{summary_content}

---

{recommendations_content}

---

*This analysis was generated by AI Research Assistant using Gemini API*
"""
    return full_report

def sanitize_filename(filename: str) -> str:
    """Convert a string to a valid filename"""
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

def save_markdown_file(topic: str, content: str) -> str:
    """Save content to a markdown file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = sanitize_filename(topic)
    filename = f"research_{safe_topic}_{timestamp}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

def display_markdown(content: str, use_markdown_display: bool = True):
    """Display content as rendered markdown if in IPython environment"""
    try:
        if use_markdown_display:
            display(Markdown(content))
        else:
            print(content)
    except:
        print(content)

def process_research_paper(file_path: str, original_filename: Optional[str] = None, 
                           db_path: str = "../data/research_papers.db", 
                           paper_summary_chain=None, 
                           paper_recommendation_chain=None, 
                           model=None) -> Dict[str, Any]:
    """Process a research paper file (PDF or DOCX)"""
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        file_type = 'pdf'
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        file_type = 'docx'
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=12000,
        chunk_overlap=2000
    )
    chunks = text_splitter.split_text(text_content)
    processing_text = chunks[0] if len(chunks) > 0 else text_content
    paper_id = save_file_to_database(original_filename, text_content, file_type, db_path=db_path)
    try:
        print("- Generating research paper summary...")
        summary = paper_summary_chain.invoke(processing_text)
        save_summary_to_database(paper_id, summary, db_path=db_path)
        print("- Finding related research papers with access links...")
        try:
            recommendations_data = paper_recommendation_chain.invoke(processing_text)
            recs = recommendations_data["recommendations"]  # Access as a dict
            formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
            for i, rec in enumerate(recs, 1):
                formatted_recommendations += f"## {i}. {rec['title']} ({rec['year']})\n"
                formatted_recommendations += f"**Authors:** {rec['authors']}\n\n"
                formatted_recommendations += f"{rec['description']}\n"
                paper_url = rec['paper_url'].strip()
                if not paper_url:
                    encoded_title = re.sub(r'\s+', '+', rec['title'])
                    paper_url = f"https://scholar.google.com/scholar?q={encoded_title}"
                formatted_recommendations += f"[Access Paper]({paper_url})\n\n"

        except Exception as e:
            print(f"Error generating paper recommendations: {str(e)}")
            backup_prompt = ChatPromptTemplate.from_template(
                """
                Based on the following research paper content:
                
                {paper_content}
                
                Provide 5 relevant related research papers that might be of interest.
                For each paper, include:
                1. Title (a real paper title if possible)
                2. Authors
                3. Year
                4. Brief description of relevance
                5. MOST IMPORTANTLY: A direct URL to access the paper (use Google Scholar, arXiv, or ResearchGate)
                
                Format your response in markdown with clear headings and clickable links.
                Make sure every recommendation has a working URL.
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            formatted_recommendations = backup_chain.invoke({"paper_content": processing_text})
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "summary": summary,
            "recommendations": formatted_recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error processing research paper: {str(e)}")
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "error": str(e),
            "success": False
        }

def run_research(topic: str, use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                report_chain=None, recommendation_chain=None, model=None) -> Optional[Dict[str, Any]]:
    """Perform research on a specific topic"""
    if not topic.strip():
        print("Please enter a valid topic.")
        return
    print(f"\nResearching '{topic}'... This may take a moment.")
    try:
        initialize_database(db_path)
        print("- Generating detailed report...")
        report = generate_report(topic, report_chain)
        print("- Finding related topics...")
        recommendations = generate_recommendations(topic, recommendation_chain, model)
        full_report = create_full_report(topic, report, recommendations)
        filename = save_markdown_file(topic, full_report)
        print("\n" + "="*50)
        print(f"RESEARCH REPORT: {topic.upper()}")
        print("="*50 + "\n")
        display_markdown(report, use_markdown_display)
        print("\n" + "="*50)
        print("RECOMMENDED RELATED TOPICS")
        print("="*50 + "\n")
        display_markdown(recommendations, use_markdown_display)
        print("\n" + "="*50)
        print(f"Full report saved to: {filename}")
        print("="*50)
        return {
            "report": report,
            "recommendations": recommendations,
            "full_report": full_report,
            "filename": filename
        }
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def run_paper_analysis(file_path: str, original_filename: Optional[str] = None, 
                      use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                      paper_summary_chain=None, paper_recommendation_chain=None, model=None) -> Optional[Dict[str, Any]]:
    """Analyze a research paper file"""
    try:
        initialize_database(db_path)
        result = process_research_paper(file_path, original_filename, db_path, 
                                       paper_summary_chain, paper_recommendation_chain, model)
        if result["success"]:
            print("\n" + "="*50)
            print(f"PAPER ANALYSIS: {result['filename']}")
            print("="*50 + "\n")
            display_markdown(result["summary"], use_markdown_display)
            print("\n" + "="*50)
            print("RECOMMENDED RELATED PAPERS")
            print("="*50 + "\n")
            display_markdown(result["recommendations"], use_markdown_display)
            full_analysis = create_full_paper_analysis(
                result["filename"],
                result["summary"],
                result["recommendations"]
            )
            analysis_filename = save_markdown_file(f"paper_analysis_{sanitize_filename(result['filename'])}", full_analysis)
            print("\n" + "="*50)
            print(f"Full analysis saved to: {analysis_filename}")
            print("="*50)
            return {
                "summary": result["summary"],
                "recommendations": result["recommendations"],
                "full_analysis": full_analysis,
                "filename": analysis_filename
            }
        else:
            print(f"Failed to process paper: {result.get('error', 'Unknown error')}")
            return None
    except Exception as e:
        print(f"An error occurred while analyzing the paper: {str(e)}")
        return None

def run_web_interface(use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                     report_chain=None, recommendation_chain=None, 
                     paper_summary_chain=None, paper_recommendation_chain=None, model=None):
    """Run a web interface using IPython widgets"""
    try:
        from ipywidgets import widgets
        from IPython.display import display, clear_output
        output = widgets.Output()
        topic_input = widgets.Text(description='Topic:', placeholder='Enter research topic')
        search_button = widgets.Button(description='Research Topic')
        file_upload = widgets.FileUpload(
            accept='.pdf,.docx,.doc',
            multiple=False,
            description='Upload Paper'
        )
        analyze_button = widgets.Button(description='Analyze Paper')
        def on_search_click(b):
            with output:
                clear_output()
                run_research(topic_input.value, use_markdown_display, db_path, 
                           report_chain, recommendation_chain, model)
        def on_analyze_click(b):
            with output:
                clear_output()
                if not file_upload.value:
                    print("Please upload a research paper file (PDF or DOCX).")
                    return
                file_data = next(iter(file_upload.value.values()))
                file_name = next(iter(file_upload.value.keys()))
                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1]) as temp_file:
                    temp_file.write(file_data['content'])
                    temp_path = temp_file.name
                try:
                    run_paper_analysis(temp_path, file_name, use_markdown_display, db_path,
                                     paper_summary_chain, paper_recommendation_chain, model)
                finally:
                    os.unlink(temp_path)
        search_button.on_click(on_search_click)
        analyze_button.on_click(on_analyze_click)
        tab1 = widgets.VBox([topic_input, search_button])
        tab2 = widgets.VBox([file_upload, analyze_button])
        tabs = widgets.Tab(children=[tab1, tab2])
        tabs.set_title(0, 'Topic Research')
        tabs.set_title(1, 'Paper Analysis')
        display(tabs)
        display(output)
    except ImportError:
        print("This function requires ipywidgets. Please install with: pip install ipywidgets")
        print("Running in command line mode instead.")
        run()

def research_topic(topic: str, report_chain=None, recommendation_chain=None, model=None):
    """Helper function to research a topic directly from a Jupyter notebook"""
    return run_research(topic, use_markdown_display=True, 
                      report_chain=report_chain, recommendation_chain=recommendation_chain, model=model)

def analyze_paper(file_path: str, paper_summary_chain=None, paper_recommendation_chain=None, model=None):
    """Helper function to analyze a paper directly from a Jupyter notebook"""
    return run_paper_analysis(file_path, use_markdown_display=True,
                            paper_summary_chain=paper_summary_chain, 
                            paper_recommendation_chain=paper_recommendation_chain, model=model)

def run():
    """Main agent loop with support for both topic research and paper analysis"""
    # Initialize components
    api_key = load_environment()
    model = initialize_model(api_key)
    TopicRecommendations, PaperRecommendations = define_output_schemas()
    report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt = create_prompt_templates()
    report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain = create_chains(
        model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt,
        TopicRecommendations, PaperRecommendations
    )
    
    initialize_database()
    print("🔍 AI Research Assistant Agent 🔍")
    print("--------------------------------")
    print("I can help you research topics and analyze research papers.")
    while True:
        print("\nWhat would you like to do?")
        print("1. Research a topic")
        print("2. Analyze a research paper")
        print("3. Exit")
        choice = input("Enter your choice (1-3): ")
        if choice == '1':
            topic = input("\nWhat topic would you like to research? ")
            if topic.strip():
                run_research(topic, report_chain=report_chain, 
                           recommendation_chain=recommendation_chain, model=model)
            else:
                print("Please enter a valid topic.")
        elif choice == '2':
            file_path = input("\nEnter the path to the research paper file (PDF or DOCX): ")
            if os.path.exists(file_path):
                run_paper_analysis(file_path, paper_summary_chain=paper_summary_chain, 
                                 paper_recommendation_chain=paper_recommendation_chain, model=model)
            else:
                print(f"File not found: {file_path}")
        elif choice == '3':
            print("Thank you for using the AI Research Assistant. Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1, 2, or 3.")

if __name__ == "__main__":
    run()


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


🔍 AI Research Assistant Agent 🔍
--------------------------------
I can help you research topics and analyze research papers.

What would you like to do?
1. Research a topic
2. Analyze a research paper
3. Exit

Researching 'Large Language Model'... This may take a moment.
- Generating detailed report...
- Finding related topics...

RESEARCH REPORT: LARGE LANGUAGE MODEL



# Large Language Models: A Comprehensive Overview

## 1. Introduction

Large Language Models (LLMs) represent a significant leap forward in the field of Artificial Intelligence, specifically within Natural Language Processing (NLP). These models, typically based on deep learning architectures, are trained on vast amounts of text data, enabling them to understand, generate, and manipulate human language with impressive fluency. Their ability to perform diverse tasks, from translation and summarization to creative content generation and question answering, has made them a focal point of research and development across various industries. This report will provide a comprehensive overview of LLMs, covering their key concepts, historical context, current state, applications, and potential future directions.

## 2. Key Concepts and Definitions

Understanding LLMs requires familiarity with several core concepts:

*   **Language Model (LM):** At its core, a language model is a probabilistic model that assigns a probability to a sequence of words. It aims to predict the likelihood of the next word in a sequence given the preceding words.  Mathematically, an LM estimates P(w1, w2, ..., wn), the probability of a sequence of words.

*   **Neural Networks:** LLMs are primarily built upon neural networks, specifically deep learning architectures. These networks consist of interconnected layers of nodes (neurons) that learn complex patterns from data.

*   **Deep Learning:** A subfield of machine learning that utilizes neural networks with multiple layers (deep neural networks) to analyze data with hierarchical representations. The "depth" of the network allows it to learn more abstract and intricate features.

*   **Transformer Architecture:** The dominant architecture for modern LLMs. Introduced in the "Attention is All You Need" paper (Vaswani et al., 2017), transformers rely heavily on the "attention mechanism" to weigh the importance of different parts of the input sequence when processing information.  They excel at capturing long-range dependencies in text, overcoming limitations of earlier recurrent neural network (RNN) based models.

*   **Attention Mechanism:** A core component of the transformer architecture. It allows the model to focus on the most relevant parts of the input sequence when processing each word.  Different types of attention exist, including self-attention (where the model attends to different parts of the same input sequence) and cross-attention (where the model attends to a different sequence, such as in machine translation).

*   **Self-Supervised Learning:**  A training paradigm where the model learns from unlabeled data by creating its own supervision signals. For instance, an LLM might be trained to predict masked words in a sentence (masked language modeling) or to predict the next sentence in a document. This allows LLMs to leverage the vast amounts of text data available on the internet without requiring manual annotation.

*   **Pre-training and Fine-tuning:** A common training strategy for LLMs.  The model is first pre-trained on a massive dataset of text to learn general language representations.  Then, it is fine-tuned on a smaller, task-specific dataset to optimize its performance for a particular application (e.g., question answering, text summarization).

*   **Tokenization:** The process of breaking down text into individual units (tokens), such as words or sub-word units (e.g., using Byte-Pair Encoding (BPE)).  Tokenization is necessary for LLMs to process text numerically.

*   **Embedding:** A numerical representation of words or tokens in a high-dimensional vector space. Embeddings capture semantic relationships between words, allowing the model to understand their meaning and context.

*   **Context Window:** The maximum length of the input sequence that the LLM can process at once.  A larger context window allows the model to consider more information when making predictions.

*   **Parameters:** The adjustable weights and biases within the neural network. The number of parameters is often used as a measure of the model's size and complexity.  LLMs are characterized by their large number of parameters, often in the billions or even trillions.

*   **Emergent Abilities:** Unexpected and often surprising abilities that emerge in LLMs as they scale in size (number of parameters and training data). These abilities may not be explicitly programmed but arise from the complex interactions within the network. Examples include in-context learning, where the model can perform new tasks based on a few examples provided in the input prompt.

## 3. Historical Context and Development

The development of LLMs has been a gradual process, building upon decades of research in NLP and machine learning:

*   **Early Language Models (pre-2010):** Early language models were primarily based on statistical methods, such as n-gram models. These models counted the frequency of word sequences and used these counts to estimate probabilities. They were limited by their inability to capture long-range dependencies and their reliance on manually engineered features.

*   **Recurrent Neural Networks (RNNs) (2010s):** RNNs, particularly LSTMs (Long Short-Term Memory) and GRUs (Gated Recurrent Units), offered improvements over n-gram models by allowing the model to retain information about previous words in the sequence. However, RNNs struggled with long sequences due to the vanishing gradient problem.

*   **Sequence-to-Sequence Models (2014):** Models like Seq2Seq with attention mechanisms, used for machine translation, were a step forward, but still suffered from limitations with long-range dependencies and parallelization.

*   **The Transformer Revolution (2017-present):** The introduction of the transformer architecture in 2017 marked a turning point.  The attention mechanism allowed the model to effectively capture long-range dependencies and enabled parallel processing, leading to significant improvements in performance.

    *   **BERT (Bidirectional Encoder Representations from Transformers) (2018):** BERT, developed by Google, was a groundbreaking model that used a transformer encoder to learn contextualized word embeddings. It was pre-trained on a massive corpus of text using masked language modeling and next sentence prediction.

    *   **GPT (Generative Pre-trained Transformer) (2018):** GPT, developed by OpenAI, was another important model that used a transformer decoder to generate text. It was pre-trained on a large corpus of text using a causal language modeling objective (predicting the next word in a sequence).  GPT models have been iteratively improved with larger sizes and more sophisticated training techniques (GPT-2, GPT-3, GPT-4).

    *   **T5 (Text-to-Text Transfer Transformer) (2019):** T5, also developed by Google, framed all NLP tasks as text-to-text problems, allowing for a unified approach to training and fine-tuning.

    *   **Scaling Laws (2020):** Research on scaling laws revealed that the performance of LLMs generally improves predictably as the model size, dataset size, and compute used for training increase. This encouraged the development of even larger models.

    *   **PaLM (Pathways Language Model) (2022):** Google's PaLM demonstrated advanced reasoning and few-shot learning capabilities.

    *   **LLaMA (Large Language Model Meta AI) (2023):** Meta's LLaMA demonstrated strong performance with a relatively smaller size, emphasizing the importance of training data quality and efficient architectures.

    *   **GPT-4 (2023):** OpenAI's GPT-4 is a multimodal model capable of processing both text and images, further expanding the capabilities of LLMs.

## 4. Current State and Applications

LLMs are currently deployed in a wide range of applications, transforming various industries:

*   **Natural Language Understanding (NLU):**
    *   **Sentiment Analysis:** Determining the emotional tone of text.
    *   **Text Classification:** Categorizing text into predefined categories.
    *   **Named Entity Recognition (NER):** Identifying and classifying named entities in text (e.g., people, organizations, locations).
    *   **Question Answering:** Answering questions posed in natural language.
    *   **Reading Comprehension:** Understanding and extracting information from text passages.

*   **Natural Language Generation (NLG):**
    *   **Text Summarization:** Generating concise summaries of longer texts.
    *   **Machine Translation:** Translating text from one language to another.
    *   **Content Creation:** Generating articles, blog posts, scripts, and other creative content.
    *   **Chatbots and Virtual Assistants:** Creating conversational agents that can interact with users in natural language.
    *   **Code Generation:** Generating code from natural language descriptions.

*   **Search and Information Retrieval:**
    *   **Improving Search Relevance:** Understanding the intent behind search queries and providing more relevant results.
    *   **Knowledge Graph Construction:** Automatically extracting information from text to build knowledge graphs.

*   **Healthcare:**
    *   **Medical Diagnosis:** Assisting doctors in diagnosing diseases by analyzing patient records and medical literature.
    *   **Drug Discovery:** Identifying potential drug candidates by analyzing scientific publications and databases.
    *   **Personalized Medicine:** Tailoring treatment plans to individual patients based on their genetic information and medical history.

*   **Finance:**
    *   **Fraud Detection:** Identifying fraudulent transactions by analyzing financial data and news articles.
    *   **Risk Management:** Assessing and managing financial risks by analyzing market trends and economic indicators.
    *   **Algorithmic Trading:** Developing automated trading strategies based on market analysis.

*   **Education:**
    *   **Personalized Learning:** Tailoring educational content to individual students' needs and learning styles.
    *   **Automated Grading:** Automating the grading of essays and other written assignments.
    *   **Tutoring Systems:** Providing students with personalized tutoring and feedback.

*   **Legal:**
     *   **Legal Document Review:** Analyzing large volumes of legal documents to identify relevant information.
     *   **Contract Drafting:** Assisting lawyers in drafting contracts and other legal documents.
     *   **Legal Research:** Conducting legal research by searching and analyzing legal databases.

## 5. Future Directions and Potential Developments

The field of LLMs is rapidly evolving, with several promising directions for future research and development:

*   **Improved Reasoning and Common Sense:** Current LLMs often struggle with tasks that require reasoning and common sense. Future research will focus on improving these capabilities by incorporating knowledge graphs, symbolic reasoning, and other techniques.

*   **Multimodal Learning:** Integrating information from multiple modalities, such as text, images, audio, and video, to create more powerful and versatile models.  GPT-4 is a significant step in this direction.

*   **Explainability and Interpretability:** Making LLMs more transparent and understandable, so that users can understand how they arrive at their predictions. This is crucial for building trust and ensuring responsible use.

*   **Reducing Bias and Ensuring Fairness:** Addressing biases in training data and model architectures to ensure that LLMs are fair and equitable.

*   **Efficient Training and Deployment:** Developing more efficient training algorithms and model architectures to reduce the computational cost and energy consumption of LLMs. This is essential for making LLMs accessible to a wider range of users and organizations. Quantization and pruning techniques are being explored.

*   **Longer Context Windows:** Expanding the context window of LLMs to allow them to process longer sequences of text. This will enable them to handle more complex tasks, such as summarizing entire books or analyzing long conversations.  Architectural innovations are needed to maintain efficiency as context window size increases.

*   **Continual Learning:** Enabling LLMs to continuously learn and adapt to new information without forgetting what they have already learned.

*   **Personalized LLMs:** Creating LLMs that are tailored to individual users' needs and preferences. This could involve fine-tuning the model on a user's personal data or allowing the user to customize the model's behavior.

*   **Agent-Based LLMs:** Using LLMs as the core component of autonomous agents that can interact with the world and perform complex tasks.  This involves equipping LLMs with tools and APIs that allow them to access and manipulate external resources.

*   **Edge Computing with LLMs:** Deploying smaller, more efficient LLMs on edge devices (e.g., smartphones, wearables) to enable real-time processing of natural language data without relying on cloud connectivity.

## 6. Conclusion

Large Language Models represent a transformative technology with the potential to revolutionize how we interact with computers and information.  Their impressive capabilities in understanding, generating, and manipulating human language have already led to a wide range of applications across various industries.  While challenges remain in areas such as reasoning, bias, and explainability, ongoing research and development are rapidly pushing the boundaries of what is possible. As LLMs continue to evolve, they are poised to play an increasingly important role in shaping the future of AI and our relationship with technology. Their ethical implications, particularly concerning misinformation and job displacement, also require careful consideration and proactive mitigation strategies.


RECOMMENDED RELATED TOPICS



Okay, here are 5 related topics to Large Language Models (LLMs) that a user might find interesting, along with descriptions and relevant resource links:

*   **1. Topic Name:** Transformer Networks

    *   **Description:** LLMs are primarily built upon the Transformer architecture. Understanding the mechanics of Transformers, including self-attention, encoder-decoder structures, and positional encoding, is crucial for comprehending how LLMs function and are trained. It allows you to dive deeper into the core of what makes LLMs tick.
    *   **Relevant Resource Link:** [Attention is All You Need](https://arxiv.org/abs/1706.03762) - The original paper introducing the Transformer architecture.

*   **2. Topic Name:** Fine-tuning LLMs

    *   **Description:** Fine-tuning involves taking a pre-trained LLM and training it further on a smaller, task-specific dataset. This is a key technique for adapting general-purpose LLMs to specific applications, like sentiment analysis, question answering in a particular domain, or code generation. Understanding fine-tuning methods is essential for practically deploying LLMs.
    *   **Relevant Resource Link:** [Hugging Face Transformers Documentation on Fine-tuning](https://huggingface.co/docs/transformers/training) - Hugging Face provides comprehensive documentation and tutorials on fine-tuning various LLMs.

*   **3. Topic Name:** Prompt Engineering

    *   **Description:** Prompt engineering is the art and science of crafting effective prompts (input text) to elicit desired responses from LLMs. It involves understanding how different prompts can influence the model's behavior and optimizing prompts to achieve specific goals, such as generating creative content, summarizing text, or translating languages.
    *   **Relevant Resource Link:** [Prompt Engineering Guide](https://www.promptingguide.ai/) - A comprehensive guide covering various prompting techniques and best practices.

*   **4. Topic Name:** Ethical Considerations of LLMs

    *   **Description:** LLMs raise numerous ethical concerns, including bias amplification, misinformation generation, privacy violations, and job displacement. Exploring these ethical considerations is crucial for responsible development and deployment of LLMs. This includes understanding fairness, accountability, transparency, and societal impact.
    *   **Relevant Resource Link:** [AI Ethics Resources from the Partnership on AI](https://www.partnershiponai.org/ethical-frameworks-resources/) - Offers reports and resources on the ethical implications of AI, including LLMs.

*   **5. Topic Name:** Evaluation Metrics for LLMs

    *   **Description:** Evaluating the performance of LLMs is a complex task. Understanding the various metrics used to assess LLMs, such as perplexity, BLEU score, ROUGE score, and human evaluation metrics, is essential for comparing different models and understanding their strengths and weaknesses. It also includes understanding the limitations of these metrics.
    *   **Relevant Resource Link:** [Google's Research on Evaluating Large Language Models](https://ai.googleblog.com/2022/07/measuring-emergent-abilities-of-large.html) - A blog post discussing the challenges and approaches to evaluating LLMs.


Full report saved to: research_Large Language Model_20250321_110246.md

What would you like to do?
1. Research a topic
2. Analyze a research paper
3. Exit
Thank you for using the AI Research Assistant. Goodbye!


In [2]:
import os
import io
import base64
import sqlite3
import tempfile
import re
import warnings
from datetime import datetime
from typing import List, Optional, Dict, Any
from collections import Counter
import string

# Third-party imports
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown, display, HTML
import PyPDF2 
import docx  
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# For analytics dashboard
import dash
from dash import html, dcc, Input, Output
import plotly.express as px

# Suppress warnings
warnings.filterwarnings("ignore")

#############################
# Existing Functionality
#############################

def load_environment():
    """Load environment variables"""
    load_dotenv()
    return os.getenv("GEMINI_API_KEY")

def initialize_model(api_key):
    """Initialize Gemini model"""
    return ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=api_key,
        temperature=0.7,
        max_tokens=4000,
    )

def define_output_schemas():
    """Define output schemas for topic and paper recommendations"""
    # Output schema for topic recommendations
    class RecommendedTopic(BaseModel):
        topic: str = Field(description="The name of the recommended topic")
        description: str = Field(description="A brief description of why this topic is relevant")
        resource_url: str = Field(description="A relevant resource URL for this topic")

    class TopicRecommendations(BaseModel):
        recommendations: List[RecommendedTopic] = Field(description="List of recommended related topics")

    # Output schema for paper recommendations
    class RecommendedPaper(BaseModel):
        title: str = Field(description="The title of the recommended research paper")
        authors: str = Field(description="The authors of the paper")
        year: str = Field(description="Publication year")
        description: str = Field(description="Brief description of relevance to the original paper")
        paper_url: str = Field(description="URL to access this paper", default="")

    class PaperRecommendations(BaseModel):
        recommendations: List[RecommendedPaper] = Field(description="List of recommended related papers")
        
    return TopicRecommendations, PaperRecommendations

def create_prompt_templates():
    """Create prompt templates for research tasks"""
    report_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a comprehensive, detailed report on the following topic:
        
        Topic: {topic}
        
        Your report should include:
        1. Introduction to the topic
        2. Key concepts and definitions
        3. Historical context and development
        4. Current state and applications
        5. Future directions and potential developments
        6. Conclusion
        
        Format your report with clear markdown headings and subheadings. Use proper markdown formatting for emphasis, lists, and other elements.
        Make sure to provide in-depth analysis.
        """
    )

    recommendation_prompt = ChatPromptTemplate.from_template(
        """
        Based on the topic: {topic}
        
        Generate 5 relevant related topics that the user might be interested in researching next.
        For each recommendation, provide:
        1. The topic name
        2. A brief 1-2 sentence description of why it's relevant
        3. A relevant resource URL that would contain valuable information about this topic
        
        Your response must be formatted as a valid JSON object that matches this structure:
        {
            "recommendations": [
                {
                    "topic": "Topic Name",
                    "description": "Brief description of relevance",
                    "resource_url": "https://example.com/relevant-page"
                },
                ...
            ]
        }
        
        Use reputable sources for your resource URLs. While you can't verify if the exact URLs exist,
        make them realistic and likely to contain quality information.
        """
    )

    paper_summary_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a concise but comprehensive summary of the following research paper:
        
        Paper content: {paper_content}
        
        Your summary should include:
        1. Main objective of the research
        2. Methodology used
        3. Key findings and results
        4. Main conclusions and implications
        5. Limitations (if mentioned)
        
        Format your summary with clear markdown headings and keep it concise yet informative.
        Focus on the most important aspects of the paper.
        """
    )

    paper_recommendation_prompt = ChatPromptTemplate.from_template(
    """
    Based on the following research paper:
    
    Paper content: {paper_content}
    
    Generate 5 relevant related research papers that the user might be interested in reading next.
    These should be real papers that likely exist in the academic literature.
    
    For each recommendation, provide:
    1. The paper title (use the actual title of a real paper if you know it)
    2. The authors (use "et al." for multiple authors after the first)
    3. Publication year (estimate if necessary)
    4. A brief description of why it's relevant to the original paper
    5. A URL where the paper might be found - THIS IS CRITICAL. 
    
    For URLs, use specific links from:
    - Google Scholar (https://scholar.google.com/scholar?q=PAPER_TITLE)
    - arXiv (https://arxiv.org/search/?query=PAPER_TITLE)
    - ResearchGate (https://www.researchgate.net/search.Search.html?query=PAPER_TITLE)
    - ACM Digital Library (https://dl.acm.org/action/doSearch?AllField=PAPER_TITLE)
    - IEEE Xplore (https://ieeexplore.ieee.org/search/searchresult.jsp?queryText=PAPER_TITLE)
    
    Replace PAPER_TITLE with URL-encoded paper title in these templates. Make sure EVERY recommendation has a working URL.
    
    Your response must be formatted as a valid JSON object that matches this structure:
    {{
        "recommendations": [
            {{
                "title": "Paper Title",
                "authors": "Author names",
                "year": "Publication year",
                "description": "Brief description of relevance",
                "paper_url": "https://example.com/paper-link"
            }},
            ...
        ]
    }}
    """
)
    
    return report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt

def create_chains(model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, TopicRecommendations, PaperRecommendations):
    """Create processing chains for research tasks"""
    report_chain = (
        {"topic": RunnablePassthrough()}
        | report_prompt
        | model
        | StrOutputParser()
    )

    recommendation_chain = (
        {"topic": RunnablePassthrough()}
        | recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=TopicRecommendations)
    )

    paper_summary_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_summary_prompt
        | model
        | StrOutputParser()
    )

    paper_recommendation_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=PaperRecommendations)
    )
    
    return report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain

def initialize_database(db_path: str = "../data/research_papers.db"):
    """Initialize SQLite database for storing papers"""
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            content TEXT NOT NULL,
            file_type TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            summary TEXT
        )
    ''')
    conn.commit()
    conn.close()

def save_file_to_database(filename: str, content: str, file_type: str, db_path: str = "../data/research_papers.db"):
    """Save file content to SQLite database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO papers (filename, content, file_type) VALUES (?, ?, ?)",
        (filename, content, file_type)
    )
    paper_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return paper_id

def save_summary_to_database(paper_id: int, summary: str, db_path: str = "../data/research_papers.db"):
    """Save paper summary to database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE papers SET summary = ? WHERE id = ?",
        (summary, paper_id)
    )
    conn.commit()
    conn.close()

def get_paper_from_database(paper_id: int, db_path: str = "../data/research_papers.db"):
    """Retrieve paper content from database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT filename, content, file_type, summary FROM papers WHERE id = ?", (paper_id,))
    result = cursor.fetchone()
    conn.close()
    if result:
        return {
            "filename": result[0],
            "content": result[1],
            "file_type": result[2],
            "summary": result[3]
        }
    else:
        return None

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text content from a PDF file"""
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        text = ""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text

def extract_text_from_docx(file_path: str) -> str:
    """Extract text content from a DOCX file"""
    try:
        loader = Docx2txtLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        doc = docx.Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

def generate_report(topic: str, report_chain) -> str:
    """Generate a detailed report on the given topic"""
    return report_chain.invoke(topic)

def generate_recommendations(topic: str, recommendation_chain, model) -> str:
    """Generate relevant topic recommendations using Gemini API"""
    try:
        recommendations_data = recommendation_chain.invoke(topic)
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        for i, rec in enumerate(recommendations_data.recommendations, 1):
            formatted_recommendations += f"## {i}. {rec.topic}\n"
            formatted_recommendations += f"{rec.description}\n"
            formatted_recommendations += f"[Learn more]({rec.resource_url})\n\n"
        return formatted_recommendations
    except Exception as e:
        backup_prompt = ChatPromptTemplate.from_template(
            """
            Based on the topic: {topic}
            
            Provide 5 relevant related topics that the user might be interested in researching next.
            For each recommendation, provide:
            1. The topic name
            2. A brief description of why it's relevant
            3. A relevant resource link
            
            Format your response as a markdown list.
            """
        )
        backup_chain = backup_prompt | model | StrOutputParser()
        return backup_chain.invoke({"topic": topic})

def create_full_report(topic: str, report_content: str, recommendations_content: str) -> str:
    """Create a full markdown report combining the report and recommendations"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Report: {topic}

*Generated on: {timestamp}*

---

{report_content}

---

{recommendations_content}

---

*This report was generated by AI Research Assistant using Gemini API*
"""
    return full_report

def create_full_paper_analysis(filename: str, summary_content: str, recommendations_content: str) -> str:
    """Create a full markdown report for paper analysis"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Paper Analysis: {filename}

*Generated on: {timestamp}*

---

## Paper Summary

{summary_content}

---

{recommendations_content}

---

*This analysis was generated by AI Research Assistant using Gemini API*
"""
    return full_report

def sanitize_filename(filename: str) -> str:
    """Convert a string to a valid filename"""
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

def save_markdown_file(topic: str, content: str) -> str:
    """Save content to a markdown file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = sanitize_filename(topic)
    filename = f"research_{safe_topic}_{timestamp}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

def display_markdown(content: str, use_markdown_display: bool = True):
    """Display content as rendered markdown if in IPython environment"""
    try:
        if use_markdown_display:
            display(Markdown(content))
        else:
            print(content)
    except:
        print(content)

def process_research_paper(file_path: str, original_filename: Optional[str] = None, 
                           db_path: str = "../data/research_papers.db", 
                           paper_summary_chain=None, 
                           paper_recommendation_chain=None, 
                           model=None) -> Dict[str, Any]:
    """Process a research paper file (PDF or DOCX)"""
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        file_type = 'pdf'
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        file_type = 'docx'
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=12000,
        chunk_overlap=2000
    )
    chunks = text_splitter.split_text(text_content)
    processing_text = chunks[0] if len(chunks) > 0 else text_content
    paper_id = save_file_to_database(original_filename, text_content, file_type, db_path=db_path)
    try:
        print("- Generating research paper summary...")
        summary = paper_summary_chain.invoke(processing_text)
        save_summary_to_database(paper_id, summary, db_path=db_path)
        print("- Finding related research papers with access links...")
        try:
            recommendations_data = paper_recommendation_chain.invoke(processing_text)
            recs = recommendations_data["recommendations"]  # Access as a dict
            formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
            for i, rec in enumerate(recs, 1):
                formatted_recommendations += f"## {i}. {rec['title']} ({rec['year']})\n"
                formatted_recommendations += f"**Authors:** {rec['authors']}\n\n"
                formatted_recommendations += f"{rec['description']}\n"
                paper_url = rec['paper_url'].strip()
                if not paper_url:
                    encoded_title = re.sub(r'\s+', '+', rec['title'])
                    paper_url = f"https://scholar.google.com/scholar?q={encoded_title}"
                formatted_recommendations += f"[Access Paper]({paper_url})\n\n"
        except Exception as e:
            print(f"Error generating paper recommendations: {str(e)}")
            backup_prompt = ChatPromptTemplate.from_template(
                """
                Based on the following research paper content:
                
                {paper_content}
                
                Provide 5 relevant related research papers that might be of interest.
                For each paper, include:
                1. Title (a real paper title if possible)
                2. Authors
                3. Year
                4. Brief description of relevance
                5. MOST IMPORTANTLY: A direct URL to access the paper (use Google Scholar, arXiv, or ResearchGate)
                
                Format your response in markdown with clear headings and clickable links.
                Make sure every recommendation has a working URL.
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            formatted_recommendations = backup_chain.invoke({"paper_content": processing_text})
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "summary": summary,
            "recommendations": formatted_recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error processing research paper: {str(e)}")
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "error": str(e),
            "success": False
        }

def run_research(topic: str, use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                report_chain=None, recommendation_chain=None, model=None) -> Optional[Dict[str, Any]]:
    """Perform research on a specific topic"""
    if not topic.strip():
        print("Please enter a valid topic.")
        return
    print(f"\nResearching '{topic}'... This may take a moment.")
    try:
        initialize_database(db_path)
        print("- Generating detailed report...")
        report = generate_report(topic, report_chain)
        print("- Finding related topics...")
        recommendations = generate_recommendations(topic, recommendation_chain, model)
        full_report = create_full_report(topic, report, recommendations)
        filename = save_markdown_file(topic, full_report)
        print("\n" + "="*50)
        print(f"RESEARCH REPORT: {topic.upper()}")
        print("="*50 + "\n")
        display_markdown(report, use_markdown_display)
        print("\n" + "="*50)
        print("RECOMMENDED RELATED TOPICS")
        print("="*50 + "\n")
        display_markdown(recommendations, use_markdown_display)
        print("\n" + "="*50)
        print(f"Full report saved to: {filename}")
        print("="*50)
        return {
            "report": report,
            "recommendations": recommendations,
            "full_report": full_report,
            "filename": filename
        }
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def run_paper_analysis(file_path: str, original_filename: Optional[str] = None, 
                      use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                      paper_summary_chain=None, paper_recommendation_chain=None, model=None) -> Optional[Dict[str, Any]]:
    """Analyze a research paper file"""
    try:
        initialize_database(db_path)
        result = process_research_paper(file_path, original_filename, db_path, 
                                       paper_summary_chain, paper_recommendation_chain, model)
        if result["success"]:
            print("\n" + "="*50)
            print(f"PAPER ANALYSIS: {result['filename']}")
            print("="*50 + "\n")
            display_markdown(result["summary"], use_markdown_display)
            print("\n" + "="*50)
            print("RECOMMENDED RELATED PAPERS")
            print("="*50 + "\n")
            display_markdown(result["recommendations"], use_markdown_display)
            full_analysis = create_full_paper_analysis(
                result["filename"],
                result["summary"],
                result["recommendations"]
            )
            analysis_filename = save_markdown_file(f"paper_analysis_{sanitize_filename(result['filename'])}", full_analysis)
            print("\n" + "="*50)
            print(f"Full analysis saved to: {analysis_filename}")
            print("="*50)
            return {
                "summary": result["summary"],
                "recommendations": result["recommendations"],
                "full_analysis": full_analysis,
                "filename": analysis_filename
            }
        else:
            print(f"Failed to process paper: {result.get('error', 'Unknown error')}")
            return None
    except Exception as e:
        print(f"An error occurred while analyzing the paper: {str(e)}")
        return None

def research_topic(topic: str, report_chain=None, recommendation_chain=None, model=None):
    """Helper function to research a topic directly from a Jupyter notebook"""
    return run_research(topic, use_markdown_display=True, 
                      report_chain=report_chain, recommendation_chain=recommendation_chain, model=model)

def analyze_paper(file_path: str, paper_summary_chain=None, paper_recommendation_chain=None, model=None):
    """Helper function to analyze a paper directly from a Jupyter notebook"""
    return run_paper_analysis(file_path, use_markdown_display=True,
                            paper_summary_chain=paper_summary_chain, 
                            paper_recommendation_chain=paper_recommendation_chain, model=model)

#############################
# New Analytics Dashboard Code
#############################

def analyze_paper_for_dashboard(file_path: str, original_filename: Optional[str] = None) -> Dict[str, Any]:
    """
    Process the uploaded paper and extract analytics insights:
     - Metadata extraction (for PDFs)
     - Abstract extraction (simple regex)
     - Keyword frequency (after removing stop words)
     - Section breakdown (using simple heuristics)
     - Citation count (number of 'References' occurrences)
     - Visual elements (counts of "Figure" and "Table")
     - Readability (word count and average sentence length)
     - Future work / Limitations (attempt to extract the 'Future Work' section)
    """
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        # Try to extract metadata using PyPDF2
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                metadata = reader.metadata
                metadata_dict = {
                    "Title": metadata.title if metadata.title else "N/A",
                    "Author": metadata.author if metadata.author else "N/A",
                    "CreationDate": metadata.creation_date if metadata.creation_date else "N/A",
                    "Producer": metadata.producer if metadata.producer else "N/A",
                    "Subject": metadata.subject if metadata.subject else "N/A"
                }
        except Exception as e:
            metadata_dict = {}
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        metadata_dict = {}  # DOCX metadata extraction can be added if needed
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    # Basic abstract extraction: look for the word "Abstract" and take the following 300 words.
    abstract = ""
    match = re.search(r'Abstract\s*[:\-]?\s*(.*)', text_content, re.IGNORECASE | re.DOTALL)
    if match:
        abstract = " ".join(match.group(1).split()[:300])
    else:
        abstract = "Abstract not found."

    # Keyword extraction: simple frequency analysis excluding common stopwords
    stopwords = set(["the", "and", "is", "in", "of", "to", "a", "for", "with", "that", "as", "on", "by", "an", "are"])
    words = [word.strip(string.punctuation).lower() for word in text_content.split()]
    filtered_words = [w for w in words if w and w not in stopwords and len(w) > 3]
    word_counts = Counter(filtered_words)
    most_common = word_counts.most_common(10)
    keywords = [{"word": word, "count": count} for word, count in most_common]

    # Section breakdown: try to count common sections (heuristic based on headings)
    sections = {}
    for sec in ["Introduction", "Methods", "Results", "Discussion", "Conclusion"]:
        pattern = re.compile(rf'{sec}', re.IGNORECASE)
        count = len(pattern.findall(text_content))
        sections[sec] = count

    # Citation analysis: count the occurrence of the word "Reference" (as a proxy)
    citations = text_content.lower().count("reference")

    # Visual elements: count "Figure" and "Table"
    figures = text_content.lower().count("figure")
    tables = text_content.lower().count("table")

    # Readability & text analytics
    words_total = len(words)
    sentences = re.split(r'[.!?]+', text_content)
    sentences = [s for s in sentences if s.strip()]
    avg_sentence_length = words_total / len(sentences) if sentences else 0

    # Future work / limitations: simple extraction if exists
    future_work = ""
    match_future = re.search(r'(Future Work|Limitations)\s*[:\-]?\s*(.*)', text_content, re.IGNORECASE | re.DOTALL)
    if match_future:
        future_work = " ".join(match_future.group(2).split()[:100])
    else:
        future_work = "Not clearly mentioned."

    analytics_data = {
        "metadata": metadata_dict,
        "abstract": abstract,
        "keywords": keywords,
        "sections": sections,
        "citations": citations,
        "figures": figures,
        "tables": tables,
        "word_count": words_total,
        "avg_sentence_length": avg_sentence_length,
        "future_work": future_work
    }
    return analytics_data

def create_analytics_dashboard_app(analytics_data: Dict[str, Any]) -> dash.Dash:
    """Create a Dash app to visualize the analytics data."""
    app = dash.Dash(__name__)
    # Metadata table as a simple HTML list
    metadata_items = [html.Li(f"{k}: {v}") for k, v in analytics_data["metadata"].items()] if analytics_data["metadata"] else [html.Li("No metadata available")]
    # Keyword bar chart
    kw_df = {"Word": [item["word"] for item in analytics_data["keywords"]],
             "Count": [item["count"] for item in analytics_data["keywords"]]}
    fig_keywords = px.bar(kw_df, x="Word", y="Count", title="Top Keywords")
    # Section breakdown pie chart
    sec_df = {"Section": list(analytics_data["sections"].keys()),
              "Count": list(analytics_data["sections"].values())}
    fig_sections = px.pie(sec_df, names="Section", values="Count", title="Section Occurrences")
    # Visual elements summary
    visual_summary = f"Figures: {analytics_data['figures']} | Tables: {analytics_data['tables']}"
    # Layout
    app.layout = html.Div([
        html.H1("Research Paper Analytics Dashboard"),
        html.H2("Metadata Overview"),
        html.Ul(metadata_items),
        html.H2("Abstract & Summary Analysis"),
        html.P(analytics_data["abstract"]),
        html.H2("Keyword & Topic Extraction"),
        dcc.Graph(figure=fig_keywords),
        html.H2("Section Breakdown"),
        dcc.Graph(figure=fig_sections),
        html.H2("Citation & Bibliometric Analysis"),
        html.P(f"Total References (proxy count): {analytics_data['citations']}"),
        html.H2("Visual Elements Extraction"),
        html.P(visual_summary),
        html.H2("Readability & Text Analytics"),
        html.P(f"Word Count: {analytics_data['word_count']}"),
        html.P(f"Average Sentence Length: {analytics_data['avg_sentence_length']:.2f} words"),
        html.H2("Future Work & Limitations"),
        html.P(analytics_data["future_work"])
    ])
    return app

def run_analytics_dashboard(file_path: str, original_filename: Optional[str] = None):
    """Process the uploaded paper and launch the analytics dashboard."""
    print("Analyzing paper for dashboard insights...")
    analytics_data = analyze_paper_for_dashboard(file_path, original_filename)
    app = create_analytics_dashboard_app(analytics_data)
    print("Launching analytics dashboard. Close the browser window to return.")
    # Running on a local server; adjust debug and port as needed.
    app.run(debug=True, port=8050)


#############################
# Web Interface with 3 Tabs
#############################

def run_web_interface(use_markdown_display: bool = True, db_path: str = "../data/research_papers.db",
                     report_chain=None, recommendation_chain=None, 
                     paper_summary_chain=None, paper_recommendation_chain=None, model=None):
    """Run a web interface using IPython widgets with three tabs: Topic Research, Paper Analysis, and Analytics Dashboard."""
    try:
        from ipywidgets import widgets
        from IPython.display import display, clear_output
        output = widgets.Output()

        # Tab 1: Topic Research
        topic_input = widgets.Text(description='Topic:', placeholder='Enter research topic')
        search_button = widgets.Button(description='Research Topic')
        def on_search_click(b):
            with output:
                clear_output()
                run_research(topic_input.value, use_markdown_display, db_path, 
                           report_chain, recommendation_chain, model)
        search_button.on_click(on_search_click)
        tab1 = widgets.VBox([topic_input, search_button])
        
        # Tab 2: Paper Analysis
        file_upload_analysis = widgets.FileUpload(
            accept='.pdf,.docx,.doc',
            multiple=False,
            description='Upload Paper'
        )
        analyze_button = widgets.Button(description='Analyze Paper')
        def on_analyze_click(b):
            with output:
                clear_output()
                if not file_upload_analysis.value:
                    print("Please upload a research paper file (PDF or DOCX).")
                    return
                file_data = next(iter(file_upload_analysis.value.values()))
                file_name = next(iter(file_upload_analysis.value.keys()))
                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1]) as temp_file:
                    temp_file.write(file_data['content'])
                    temp_path = temp_file.name
                try:
                    run_paper_analysis(temp_path, file_name, use_markdown_display, db_path,
                                     paper_summary_chain, paper_recommendation_chain, model)
                finally:
                    os.unlink(temp_path)
        analyze_button.on_click(on_analyze_click)
        tab2 = widgets.VBox([file_upload_analysis, analyze_button])
        
        # Tab 3: Analytics Dashboard
        file_upload_dashboard = widgets.FileUpload(
            accept='.pdf,.docx,.doc',
            multiple=False,
            description='Upload Paper'
        )
        dashboard_button = widgets.Button(description='Show Analytics Dashboard')
        def on_dashboard_click(b):
            with output:
                clear_output()
                if not file_upload_dashboard.value:
                    print("Please upload a research paper file (PDF or DOCX) for analytics.")
                    return
                file_data = next(iter(file_upload_dashboard.value.values()))
                file_name = next(iter(file_upload_dashboard.value.keys()))
                with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_name)[1]) as temp_file:
                    temp_file.write(file_data['content'])
                    temp_path = temp_file.name
                try:
                    # This function will start a Dash server and open the dashboard in your browser.
                    run_analytics_dashboard(temp_path, file_name)
                finally:
                    os.unlink(temp_path)
        dashboard_button.on_click(on_dashboard_click)
        tab3 = widgets.VBox([file_upload_dashboard, dashboard_button])
        
        tabs = widgets.Tab(children=[tab1, tab2, tab3])
        tabs.set_title(0, 'Topic Research')
        tabs.set_title(1, 'Paper Analysis')
        tabs.set_title(2, 'Analytics Dashboard')
        display(tabs)
        display(output)
    except ImportError:
        print("This function requires ipywidgets. Please install with: pip install ipywidgets")
        print("Running in command line mode instead.")
        run()

def run():
    """Main agent loop with support for topic research, paper analysis, and analytics dashboard."""
    # Initialize components
    api_key = load_environment()
    model = initialize_model(api_key)
    TopicRecommendations, PaperRecommendations = define_output_schemas()
    report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt = create_prompt_templates()
    report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain = create_chains(
        model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt,
        TopicRecommendations, PaperRecommendations
    )
    
    initialize_database()
    print("🔍 AI Research Assistant Agent 🔍")
    print("--------------------------------")
    print("I can help you research topics, analyze research papers, and now provide an analytics dashboard for deeper insights.")
    while True:
        print("\nWhat would you like to do?")
        print("1. Research a topic")
        print("2. Analyze a research paper")
        print("3. Launch Analytics Dashboard for a research paper")
        print("4. Exit")
        choice = input("Enter your choice (1-4): ")
        if choice == '1':
            topic = input("\nWhat topic would you like to research? ")
            if topic.strip():
                run_research(topic, report_chain=report_chain, 
                           recommendation_chain=recommendation_chain, model=model)
            else:
                print("Please enter a valid topic.")
        elif choice == '2':
            file_path = input("\nEnter the path to the research paper file (PDF or DOCX): ")
            if os.path.exists(file_path):
                run_paper_analysis(file_path, paper_summary_chain=paper_summary_chain, 
                                 paper_recommendation_chain=paper_recommendation_chain, model=model)
            else:
                print(f"File not found: {file_path}")
        elif choice == '3':
            file_path = input("\nEnter the path to the research paper file (PDF or DOCX) for analytics: ")
            if os.path.exists(file_path):
                run_analytics_dashboard(file_path)
            else:
                print(f"File not found: {file_path}")
        elif choice == '4':
            print("Thank you for using the AI Research Assistant. Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1, 2, 3, or 4.")

if __name__ == "__main__":
    run()


🔍 AI Research Assistant Agent 🔍
--------------------------------
I can help you research topics, analyze research papers, and now provide an analytics dashboard for deeper insights.

What would you like to do?
1. Research a topic
2. Analyze a research paper
3. Launch Analytics Dashboard for a research paper
4. Exit
Analyzing paper for dashboard insights...
Launching analytics dashboard. Close the browser window to return.



What would you like to do?
1. Research a topic
2. Analyze a research paper
3. Launch Analytics Dashboard for a research paper
4. Exit
Thank you for using the AI Research Assistant. Goodbye!
