In [None]:
import os
import io
import base64
import sqlite3
import tempfile
from typing import List, Optional, Dict, Any
from datetime import datetime
import re
import warnings
import json
from urllib.parse import quote_plus
import time
import random

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown, display, HTML
import PyPDF2 
import docx  
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import concurrent.futures

warnings.filterwarnings("ignore")

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

gemini_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=GEMINI_API_KEY,
    temperature=0.7,
    max_tokens=4000,
)

# Create output schema for topic recommendations
class RecommendedTopic(BaseModel):
    topic: str = Field(description="The name of the recommended topic")
    description: str = Field(description="A brief description of why this topic is relevant")
    resource_url: str = Field(description="A relevant resource URL for this topic")

class TopicRecommendations(BaseModel):
    recommendations: List[RecommendedTopic] = Field(description="List of recommended related topics")

# Create output schema for paper recommendations
class RecommendedPaper(BaseModel):
    title: str = Field(description="The title of the recommended research paper")
    authors: str = Field(description="The authors of the paper")
    year: str = Field(description="Publication year")
    description: str = Field(description="Brief description of relevance to the original paper")
    paper_url: str = Field(description="URL to access this paper", default="")

class PaperRecommendations(BaseModel):
    recommendations: List[RecommendedPaper] = Field(description="List of recommended related papers")

# Create prompt templates
report_prompt = ChatPromptTemplate.from_template(
    """
    You are an AI research assistant. Create a comprehensive, detailed report on the following topic:
    
    Topic: {topic}
    
    Your report should include:
    1. Introduction to the topic
    2. Key concepts and definitions
    3. Historical context and development
    4. Current state and applications
    5. Future directions and potential developments
    6. Conclusion
    
    Format your report with clear markdown headings and subheadings. Use proper markdown formatting for emphasis, lists, and other elements.
    Make sure to provide in-depth analysis.
    """
)

paper_summary_prompt = ChatPromptTemplate.from_template(
    """
    You are an AI research assistant. Create a concise but comprehensive summary of the following research paper:
    
    Paper content: {paper_content}
    
    Your summary should include:
    1. Main objective of the research
    2. Methodology used
    3. Key findings and results
    4. Main conclusions and implications
    5. Limitations (if mentioned)
    
    Format your summary with clear markdown headings and keep it concise yet informative.
    Focus on the most important aspects of the paper.
    """
)

# Create chains
report_chain = (
    {"topic": RunnablePassthrough()}
    | report_prompt
    | gemini_model
    | StrOutputParser()
)

paper_summary_chain = (
    {"paper_content": RunnablePassthrough()}
    | paper_summary_prompt
    | gemini_model
    | StrOutputParser()
)

# --------------------- Web Crawling Functions ---------------------
def get_user_agent():
    """Get a random user agent to avoid detection"""
    try:
        ua = UserAgent()
        return ua.random
    except:
        # Fallback user agents if fake_useragent fails
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]
        return random.choice(user_agents)

def make_request(url, max_retries=3, timeout=10):
    """Make a request to a URL with retries and random delays"""
    headers = {"User-Agent": get_user_agent()}
    for attempt in range(max_retries):
        try:
            # Add a small random delay to avoid rate limiting
            time.sleep(random.uniform(0.5, 2.0))
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(f"Failed to retrieve {url}: {e}")
                return None
            # Exponential backoff
            time.sleep(2 ** attempt)
    return None

def extract_text_from_soup(soup):
    """Extract clean text from BeautifulSoup object"""
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Get text
    text = soup.get_text(separator='\n', strip=True)
    
    # Clean text
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    return text

def extract_metadata_from_soup(soup):
    """Extract metadata like title, description from BeautifulSoup object"""
    metadata = {}
    
    # Extract title
    title_tag = soup.find('title')
    if title_tag:
        metadata['title'] = title_tag.text.strip()
    
    # Extract meta description
    description_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
    if description_tag and description_tag.get('content'):
        metadata['description'] = description_tag.get('content').strip()
    
    return metadata

def crawl_website(url):
    """Crawl a website and return text content and metadata"""
    response = make_request(url)
    if not response:
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    text = extract_text_from_soup(soup)
    metadata = extract_metadata_from_soup(soup)
    
    return {
        'url': url,
        'text': text,
        'metadata': metadata
    }

def search_google(query, num_results=10):
    """Search Google and return a list of search result URLs"""
    # Use Google Search API or scrape Google Search results
    # This is a placeholder implementation using direct Google search
    # In a production environment, you might want to use a proper API
    
    # Encode the query for URL
    encoded_query = quote_plus(query)
    search_url = f"https://www.google.com/search?q={encoded_query}"
    
    response = make_request(search_url)
    if not response:
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract links from search results
    results = []
    for g in soup.find_all('div', class_='g'):
        link_elements = g.find_all('a')
        for link in link_elements:
            href = link.get('href')
            if href and href.startswith('http') and not href.startswith('https://www.google.com'):
                results.append(href)
                if len(results) >= num_results:
                    return results
    
    return results

def search_academic_sources(query, num_results=10):
    """Search academic sources like Google Scholar and return paper URLs"""
    # Encode the query for URL
    encoded_query = quote_plus(query)
    
    # Search Google Scholar
    scholar_url = f"https://scholar.google.com/scholar?q={encoded_query}"
    response = make_request(scholar_url)
    
    if not response:
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract paper information
    papers = []
    for result in soup.find_all('div', class_='gs_ri'):
        try:
            title_element = result.find('h3', class_='gs_rt')
            title = title_element.text if title_element else "Unknown Title"
            
            authors_year_element = result.find('div', class_='gs_a')
            authors_year_text = authors_year_element.text if authors_year_element else ""
            
            # Extract authors and year
            authors = authors_year_text.split('-')[0].strip() if '-' in authors_year_text else authors_year_text
            year_match = re.search(r'\b(19|20)\d{2}\b', authors_year_text)
            year = year_match.group(0) if year_match else "Unknown"
            
            # Get the paper URL
            link_element = title_element.find('a') if title_element else None
            url = link_element.get('href') if link_element else f"https://scholar.google.com/scholar?q={quote_plus(title)}"
            
            papers.append({
                'title': title,
                'authors': authors,
                'year': year,
                'url': url
            })
            
            if len(papers) >= num_results:
                break
        except Exception as e:
            print(f"Error parsing Google Scholar result: {e}")
            continue
    
    # If we didn't get enough results, try other sources
    if len(papers) < num_results:
        # Try arXiv
        arxiv_url = f"https://arxiv.org/search/?query={encoded_query}&searchtype=all"
        response = make_request(arxiv_url)
        
        if response:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            for result in soup.find_all('li', class_='arxiv-result'):
                try:
                    title_element = result.find('p', class_='title')
                    title = title_element.text.strip() if title_element else "Unknown Title"
                    
                    authors_element = result.find('p', class_='authors')
                    authors = authors_element.text.replace('Authors:', '').strip() if authors_element else "Unknown Authors"
                    
                    # Extract year from the submission date
                    submitted_element = result.find('p', class_='submitted')
                    year = "Unknown"
                    if submitted_element:
                        year_match = re.search(r'\b(19|20)\d{2}\b', submitted_element.text)
                        year = year_match.group(0) if year_match else "Unknown"
                    
                    # Get the paper URL
                    link_element = result.find('a', title='Abstract')
                    url = "https://arxiv.org" + link_element.get('href') if link_element else f"https://arxiv.org/search/?query={quote_plus(title)}"
                    
                    papers.append({
                        'title': title,
                        'authors': authors,
                        'year': year,
                        'url': url
                    })
                    
                    if len(papers) >= num_results:
                        break
                except Exception as e:
                    print(f"Error parsing arXiv result: {e}")
                    continue
    
    return papers

def get_topic_recommendations(topic, num_recommendations=5):
    """Use web crawling to find related topics and resources"""
    # First, search for the topic
    search_results = search_google(f"{topic} related topics", num_results=15)
    
    # Crawl the top results to find related topics
    crawled_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(crawl_website, url): url for url in search_results[:10]}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                data = future.result()
                if data:
                    crawled_data.append(data)
            except Exception as e:
                print(f"Error crawling website: {e}")
    
    # If we didn't get enough data, try another search
    if len(crawled_data) < 3:
        alternative_search = search_google(f"{topic} guide tutorial", num_results=10)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {executor.submit(crawl_website, url): url for url in alternative_search[:5]}
            for future in concurrent.futures.as_completed(future_to_url):
                try:
                    data = future.result()
                    if data:
                        crawled_data.append(data)
                except Exception as e:
                    print(f"Error crawling website: {e}")
    
    # Extract potential related topics from the crawled data
    related_topics = []
    for data in crawled_data:
        url = data['url']
        site_title = data['metadata'].get('title', '')
        site_description = data['metadata'].get('description', '')
        
        # Skip if the site is not relevant
        if not any(keyword in site_title.lower() or keyword in site_description.lower() for keyword in topic.lower().split()):
            continue
        
        # Create a recommendation
        topic_name = site_title.replace(" - ", " ").split(" | ")[0].strip()
        
        # Avoid duplicate topics
        if any(topic_name.lower() == existing['topic'].lower() for existing in related_topics):
            continue
        
        recommendation = {
            'topic': topic_name,
            'description': site_description[:150] + "..." if len(site_description) > 150 else site_description,
            'resource_url': url
        }
        
        related_topics.append(recommendation)
        
        if len(related_topics) >= num_recommendations:
            break
    
    # If we didn't get enough recommendations, create some from the topic itself
    while len(related_topics) < num_recommendations:
        # Use the topic name to generate a fake related topic
        fake_topic = f"{topic} {['applications', 'implementations', 'examples', 'tutorials', 'case studies'][len(related_topics) % 5]}"
        related_topics.append({
            'topic': fake_topic,
            'description': f"Learn more about {fake_topic} and how it relates to {topic}.",
            'resource_url': f"https://www.google.com/search?q={quote_plus(fake_topic)}"
        })
    
    return related_topics

def get_paper_recommendations(paper_content, num_recommendations=5):
    """Use web crawling to find related research papers"""
    # Extract key terms from the paper content
    key_terms = extract_key_terms(paper_content)
    
    # Search for related papers
    papers = []
    for term in key_terms[:3]:  # Use the top 3 key terms
        term_papers = search_academic_sources(term, num_results=5)
        papers.extend(term_papers)
        
        if len(papers) >= num_recommendations * 2:  # Get more than needed to filter
            break
    
    # Remove duplicates based on title
    unique_papers = []
    seen_titles = set()
    for paper in papers:
        title_lower = paper['title'].lower()
        if title_lower not in seen_titles:
            seen_titles.add(title_lower)
            unique_papers.append(paper)
    
    # Take the top N papers
    recommended_papers = unique_papers[:num_recommendations]
    
    # Generate descriptions for the papers
    for paper in recommended_papers:
        paper['description'] = generate_paper_description(paper['title'], key_terms)
    
    return recommended_papers

def extract_key_terms(text):
    """Extract key terms from text using simple frequency analysis"""
    # Remove common words and punctuation
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    
    # Filter out common stop words
    stop_words = set(['the', 'and', 'is', 'in', 'to', 'of', 'for', 'a', 'with', 'as', 'an', 'by', 'on', 'are', 'that', 'this', 'it', 'from', 'be', 'was', 'were', 'we', 'they', 'our', 'their', 'these', 'those', 'has', 'have', 'had', 'not', 'can', 'will', 'should', 'would', 'could', 'may', 'might', 'must', 'shall', 'which', 'what', 'who', 'whom', 'whose', 'when', 'where', 'why', 'how'])
    filtered_words = [word for word in words if word not in stop_words]
    
    # Count word frequencies
    word_counts = {}
    for word in filtered_words:
        word_counts[word] = word_counts.get(word, 0) + 1
    
    # Sort by frequency
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    
    # Extract multi-word terms
    bigrams = []
    for i in range(len(filtered_words) - 1):
        bigrams.append(filtered_words[i] + " " + filtered_words[i+1])
    
    # Count bigram frequencies
    bigram_counts = {}
    for bigram in bigrams:
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
    
    # Sort by frequency
    sorted_bigrams = sorted(bigram_counts.items(), key=lambda x: x[1], reverse=True)
    
    # Combine top single words and bigrams
    key_terms = [word for word, _ in sorted_words[:10]]
    key_terms.extend([bigram for bigram, _ in sorted_bigrams[:5]])
    
    return key_terms

def generate_paper_description(title, key_terms):
    """Generate a description for a related paper based on the key terms"""
    # Check if the title contains any of the key terms
    matching_terms = [term for term in key_terms if term.lower() in title.lower()]
    
    if matching_terms:
        term = matching_terms[0]
        return f"This paper explores {term} which is directly related to the core concepts in your research. It offers additional insights and perspectives that could complement your work."
    else:
        return f"This paper provides complementary research that relates to your work. It discusses concepts and methodologies that could enhance your understanding of the subject matter."

# --------------------- Database Functions ---------------------
def initialize_database(db_path: str = "research_papers.db"):
    """Initialize SQLite database for storing papers"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            content TEXT NOT NULL,
            file_type TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            summary TEXT
        )
    ''')
    conn.commit()
    conn.close()

def save_file_to_database(filename: str, content: str, file_type: str, db_path: str = "research_papers.db"):
    """Save file content to SQLite database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO papers (filename, content, file_type) VALUES (?, ?, ?)",
        (filename, content, file_type)
    )
    paper_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return paper_id

def save_summary_to_database(paper_id: int, summary: str, db_path: str = "research_papers.db"):
    """Save paper summary to database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE papers SET summary = ? WHERE id = ?",
        (summary, paper_id)
    )
    conn.commit()
    conn.close()

def get_paper_from_database(paper_id: int, db_path: str = "research_papers.db"):
    """Retrieve paper content from database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT filename, content, file_type, summary FROM papers WHERE id = ?", (paper_id,))
    result = cursor.fetchone()
    conn.close()
    if result:
        return {
            "filename": result[0],
            "content": result[1],
            "file_type": result[2],
            "summary": result[3]
        }
    else:
        return None

# --------------------- Text Extraction Functions ---------------------
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text content from a PDF file"""
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        # Fallback method
        text = ""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text

def extract_text_from_docx(file_path: str) -> str:
    """Extract text content from a DOCX file"""
    try:
        loader = Docx2txtLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        # Fallback method
        doc = docx.Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

# --------------------- Report and Recommendation Functions ---------------------
def generate_report(topic: str) -> str:
    """Generate a detailed report on the given topic"""
    return report_chain.invoke(topic)

def generate_recommendations(topic: str) -> str:
    """Generate relevant topic recommendations using web crawling"""
    try:
        recommendations = get_topic_recommendations(topic)
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        for i, rec in enumerate(recommendations, 1):
            formatted_recommendations += f"## {i}. {rec['topic']}\n"
            formatted_recommendations += f"{rec['description']}\n"
            formatted_recommendations += f"[Learn more]({rec['resource_url']})\n\n"
        return formatted_recommendations
    except Exception as e:
        print(f"Error generating recommendations: {str(e)}")
        # Fallback to basic recommendations
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        for i in range(1, 6):
            formatted_recommendations += f"## {i}. {topic} {'applications' if i == 1 else 'examples' if i == 2 else 'tutorials' if i == 3 else 'case studies' if i == 4 else 'best practices'}\n"
            formatted_recommendations += f"Learn more about {topic} and related concepts.\n"
            formatted_recommendations += f"[Learn more](https://www.google.com/search?q={quote_plus(topic + ' ' + ('applications' if i == 1 else 'examples' if i == 2 else 'tutorials' if i == 3 else 'case studies' if i == 4 else 'best practices'))})\n\n"
        return formatted_recommendations

def create_full_report(topic: str, report_content: str, recommendations_content: str) -> str:
    """Create a full markdown report combining the report and recommendations"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Report: {topic}

*Generated on: {timestamp}*

---

{report_content}

---

{recommendations_content}

---

*This report was generated by AI Research Assistant using web crawling for recommendations*
"""
    return full_report

def generate_paper_recommendations_content(paper_content: str) -> str:
    """Generate paper recommendations content using web crawling"""
    try:
        recommendations = get_paper_recommendations(paper_content)
        formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
        for i, rec in enumerate(recommendations, 1):
            formatted_recommendations += f"## {i}. {rec['title']} ({rec['year']})\n"
            formatted_recommendations += f"**Authors:** {rec['authors']}\n\n"
            formatted_recommendations += f"{rec['description']}\n"
            formatted_recommendations += f"[Access Paper]({rec['url']})\n\n"
        return formatted_recommendations
    except Exception as e:
        print(f"Error generating paper recommendations: {str(e)}")
        # Fallback to basic recommendations
        key_terms = extract_key_terms(paper_content)
        formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
        for i in range(1, 6):
            term = key_terms[i % len(key_terms)] if key_terms else "research"
            encoded_term = quote_plus(term)
            formatted_recommendations += f"## {i}. Recent advances in {term} research\n"
            formatted_recommendations += f"**Authors:** Various researchers\n\n"
            formatted_recommendations += f"This paper explores {term} which is related to concepts in your research.\n"
            formatted_recommendations += f"[Access Paper](https://scholar.google.com/scholar?q={encoded_term})\n\n"
        return formatted_recommendations

def create_full_paper_analysis(filename: str, summary_content: str, recommendations_content: str) -> str:
    """Create a full markdown report for paper analysis"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Paper Analysis: {filename}

*Generated on: {timestamp}*

---

## Paper Summary

{summary_content}

---

{recommendations_content}

---

*This analysis was generated by AI Research Assistant using web crawling for recommendations*
"""
    return full_report

# --------------------- File and Display Functions ---------------------
def sanitize_filename(filename: str) -> str:
    """Convert a string to a valid filename"""
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

def save_markdown_file(topic: str, content: str) -> str:
    """Save content to a markdown file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = sanitize_filename(topic)
    filename = f"research_{safe_topic}_{timestamp}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

def display_markdown(content: str, use_markdown_display: bool = True):
    """Display content as rendered markdown if in IPython environment"""
    try:
        if use_markdown_display:
            display(Markdown(content))
        else:
            print(content)
    except:
        print(content)

# --------------------- Paper Processing Functions ---------------------
def process_research_paper(file_path: str, original_filename: Optional[str] = None, db_path: str = "research_papers.db") -> Dict[str, Any]:
    """Process a research paper file (PDF or DOCX)"""
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        file_type = 'pdf'
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        file_type = 'docx'
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=12000,
        chunk_overlap=2000
    )
    chunks = text_splitter.split_text(text_content)
    processing_text = chunks[0] if len(chunks) > 0 else text_content
    
    paper_id = save_file_to_database(original_filename, text_content, file_type, db_path=db_path)
    
    try:
        print("- Generating research paper summary...")
        summary = paper_summary_chain.invoke(processing_text)
        save_summary_to_database(paper_id, summary, db_path=db_path)
        
        print("- Finding related research papers using web crawling...")
        formatted_recommendations = generate_paper_recommendations_content(processing_text)
        
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "summary": summary,
            "recommendations": formatted_recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error processing research paper: {str(e)}")
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "error": str(e),
            "success": False
        }

# --------------------- Main Task Functions ---------------------
def run_research(topic: str, use_markdown_display: bool = True, db_path: str = "research_papers.db") -> Optional[Dict[str, Any]]:
    """Perform research on a specific topic"""
    if not topic.strip():
        print("Please enter a valid topic.")
        return
    
    print(f"\nResearching '{topic}'... This may take a moment.")
    
    try:
        initialize_database(db_path)
        
        print("- Generating detailed report...")
        report = generate_report(topic)
        
        print("- Finding related topics using web crawling...")
        recommendations = generate_recommendations(topic)
        
        full_report = create_full_report(topic, report, recommendations)
        filename = save_markdown_file(topic, full_report)
        
        print(f"\nResearch completed! Report saved as {filename}\n")
        
        # Display the report if requested
        if use_markdown_display:
            display_markdown(full_report, use_markdown_display)
        
        return {
            "topic": topic,
            "filename": filename,
            "report": report,
            "recommendations": recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error performing research: {str(e)}")
        return {
            "topic": topic,
            "error": str(e),
            "success": False
        }

def analyze_research_paper(file_content, filename: str, use_markdown_display: bool = True, db_path: str = "research_papers.db") -> Optional[Dict[str, Any]]:
    """Analyze a research paper from uploaded file content"""
    if not file_content or not filename:
        print("Please provide valid file content and filename.")
        return None
    
    print(f"\nAnalyzing research paper '{filename}'... This may take a moment.")
    
    try:
        initialize_database(db_path)
        
        # Save file content to temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(file_content)
        
        result = process_research_paper(temp_file_path, filename, db_path)
        
        # Clean up temporary file
        try:
            os.unlink(temp_file_path)
        except:
            pass
        
        if result["success"]:
            full_analysis = create_full_paper_analysis(
                filename, 
                result["summary"], 
                result["recommendations"]
            )
            
            # Save analysis to markdown file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_filename = sanitize_filename(os.path.splitext(filename)[0])
            output_filename = f"analysis_{safe_filename}_{timestamp}.md"
            
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(full_analysis)
            
            print(f"\nAnalysis completed! Report saved as {output_filename}\n")
            
            # Display the analysis if requested
            if use_markdown_display:
                display_markdown(full_analysis, use_markdown_display)
            
            result["output_filename"] = output_filename
            result["full_analysis"] = full_analysis
        
        return result
    except Exception as e:
        print(f"Error analyzing research paper: {str(e)}")
        return {
            "filename": filename,
            "error": str(e),
            "success": False
        }

def analyze_paper_by_id(paper_id: int, use_markdown_display: bool = True, db_path: str = "research_papers.db") -> Optional[Dict[str, Any]]:
    """Analyze a research paper by its ID in the database"""
    try:
        paper_data = get_paper_from_database(paper_id, db_path)
        if not paper_data:
            print(f"No paper found with ID {paper_id}")
            return None
        
        print(f"\nAnalyzing research paper ID {paper_id}... This may take a moment.")
        
        # Generate summary if not already present
        if not paper_data.get("summary"):
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=12000,
                chunk_overlap=2000
            )
            chunks = text_splitter.split_text(paper_data["content"])
            processing_text = chunks[0] if len(chunks) > 0 else paper_data["content"]
            
            print("- Generating research paper summary...")
            summary = paper_summary_chain.invoke(processing_text)
            save_summary_to_database(paper_id, summary, db_path)
            paper_data["summary"] = summary
        
        print("- Finding related research papers using web crawling...")
        recommendations = generate_paper_recommendations_content(paper_data["content"])
        
        full_analysis = create_full_paper_analysis(
            paper_data["filename"], 
            paper_data["summary"], 
            recommendations
        )
        
        # Save analysis to markdown file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = sanitize_filename(os.path.splitext(paper_data["filename"])[0])
        output_filename = f"analysis_{safe_filename}_{timestamp}.md"
        
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(full_analysis)
        
        print(f"\nAnalysis completed! Report saved as {output_filename}\n")
        
        # Display the analysis if requested
        if use_markdown_display:
            display_markdown(full_analysis, use_markdown_display)
        
        return {
            "paper_id": paper_id,
            "filename": paper_data["filename"],
            "summary": paper_data["summary"],
            "recommendations": recommendations,
            "output_filename": output_filename,
            "full_analysis": full_analysis,
            "success": True
        }
    except Exception as e:
        print(f"Error analyzing paper from database: {str(e)}")
        return {
            "paper_id": paper_id,
            "error": str(e),
            "success": False
        }

# --------------------- File Upload Helpers ---------------------
def encode_file_to_base64(file_path: str) -> str:
    """Encode file content to base64 for display"""
    with open(file_path, "rb") as file:
        file_content = file.read()
        return base64.b64encode(file_content).decode('utf-8')

def process_base64_file(base64_data: str, filename: str) -> Dict[str, Any]:
    """Process file from base64 encoded data"""
    try:
        # Extract content type and actual base64 data if it's a data URL
        if base64_data.startswith('data:'):
            format_spec, base64_data = base64_data.split(';base64,', 1)
        
        # Decode base64 data
        file_content = base64.b64decode(base64_data)
        
        # Save to temporary file and process
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(file_content)
        
        result = process_research_paper(temp_file_path, filename)
        
        # Clean up temporary file
        try:
            os.unlink(temp_file_path)
        except:
            pass
        
        return result
    except Exception as e:
        print(f"Error processing base64 file: {str(e)}")
        return {
            "filename": filename,
            "error": str(e),
            "success": False
        }

# --------------------- HTML Display Functions ---------------------
def get_html_report_display(report_content: str) -> str:
    """Convert markdown report to HTML for display"""
    try:
        from markdown import markdown
        html_content = markdown(report_content)
        return f"""
        <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; line-height: 1.6;">
            {html_content}
        </div>
        """
    except:
        # Simple fallback conversion if markdown module is not available
        html_content = report_content.replace("\n\n", "<br><br>")
        html_content = html_content.replace("# ", "<h1>").replace("\n## ", "</h1><h2>")
        html_content = html_content.replace("## ", "<h2>").replace("\n### ", "</h2><h3>")
        html_content = html_content.replace("### ", "<h3>").replace("\n#### ", "</h3><h4>")
        html_content = html_content.replace("**", "<strong>").replace("**", "</strong>")
        html_content = html_content.replace("*", "<em>").replace("*", "</em>")
        return f"""
        <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; line-height: 1.6;">
            {html_content}
        </div>
        """

def display_html_report(report_content: str):
    """Display report as HTML in IPython environment"""
    try:
        html_content = get_html_report_display(report_content)
        display(HTML(html_content))
    except:
        print(report_content)

# --------------------- Command Line Interface ---------------------
def main():
    """Command line interface for the research assistant"""
    import argparse
    
    parser = argparse.ArgumentParser(description="AI Research Assistant")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")
    
    # Research topic command
    research_parser = subparsers.add_parser("research", help="Research a topic")
    research_parser.add_argument("topic", help="Topic to research")
    
    # Analyze paper command
    analyze_parser = subparsers.add_parser("analyze", help="Analyze a research paper")
    analyze_parser.add_argument("file_path", help="Path to research paper file (PDF or DOCX)")
    
    # Process database entry command
    db_parser = subparsers.add_parser("paper", help="Process a paper from the database")
    db_parser.add_argument("paper_id", type=int, help="ID of the paper in the database")
    
    args = parser.parse_args()
    
    if args.command == "research":
        run_research(args.topic, use_markdown_display=False)
    elif args.command == "analyze":
        with open(args.file_path, "rb") as file:
            file_content = file.read()
            analyze_research_paper(file_content, os.path.basename(args.file_path), use_markdown_display=False)
    elif args.command == "paper":
        analyze_paper_by_id(args.paper_id, use_markdown_display=False)
    else:
        parser.print_help()

if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] {research,analyze,paper} ...
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\fahim\AppData\Roaming\jupyter\runtime\kernel-v3b5ad2c945842d6cf304a5d0565d67860bed15d7a.json


SystemExit: 2

In [1]:
import os
import io
import base64
import sqlite3
import tempfile
from typing import List, Optional, Dict, Any
from datetime import datetime
import re
import warnings
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Third-party imports
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown, display, HTML
import PyPDF2 
import docx  
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Suppress warnings
warnings.filterwarnings("ignore")

def load_environment():
    """Load environment variables"""
    load_dotenv()
    return os.getenv("GEMINI_API_KEY")

def initialize_model(api_key):
    """Initialize Gemini model"""
    return ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=api_key,
        temperature=0.7,
        max_tokens=4000,
    )

def define_output_schemas():
    """Define output schemas for topic and paper recommendations"""
    # Output schema for topic recommendations
    class RecommendedTopic(BaseModel):
        topic: str = Field(description="The name of the recommended topic")
        description: str = Field(description="A brief description of why this topic is relevant")
        resource_url: str = Field(description="A relevant resource URL for this topic")

    class TopicRecommendations(BaseModel):
        recommendations: List[RecommendedTopic] = Field(description="List of recommended related topics")

    # Output schema for paper recommendations
    class RecommendedPaper(BaseModel):
        title: str = Field(description="The title of the recommended research paper")
        authors: str = Field(description="The authors of the paper")
        year: str = Field(description="Publication year")
        description: str = Field(description="Brief description of relevance to the original paper")
        paper_url: str = Field(description="URL to access this paper", default="")

    class PaperRecommendations(BaseModel):
        recommendations: List[RecommendedPaper] = Field(description="List of recommended related papers")
        
    return TopicRecommendations, PaperRecommendations

def create_prompt_templates():
    """Create prompt templates for research tasks"""
    report_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a comprehensive, detailed report on the following topic:
        
        Topic: {topic}
        
        Your report should include:
        1. Introduction to the topic
        2. Key concepts and definitions
        3. Historical context and development
        4. Current state and applications
        5. Future directions and potential developments
        6. Conclusion
        
        Format your report with clear markdown headings and subheadings. Use proper markdown formatting for emphasis, lists, and other elements.
        Make sure to provide in-depth analysis.
        """
    )

    recommendation_prompt = ChatPromptTemplate.from_template(
        """
        Based on the topic: {topic}
        
        Generate 5 relevant related topics that the user might be interested in researching next.
        For each recommendation, provide:
        1. The topic name
        2. A brief 1-2 sentence description of why it's relevant
        3. A relevant resource URL that would contain valuable information about this topic
        
        Your response must be formatted as a valid JSON object that matches this structure:
        {
            "recommendations": [
                {
                    "topic": "Topic Name",
                    "description": "Brief description of relevance",
                    "resource_url": "https://example.com/relevant-page"
                },
                ...
            ]
        }
        
        Use reputable sources for your resource URLs. While you can't verify if the exact URLs exist,
        make them realistic and likely to contain quality information.
        """
    )

    paper_summary_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a concise but comprehensive summary of the following research paper:
        
        Paper content: {paper_content}
        
        Your summary should include:
        1. Main objective of the research
        2. Methodology used
        3. Key findings and results
        4. Main conclusions and implications
        5. Limitations (if mentioned)
        
        Format your summary with clear markdown headings and keep it concise yet informative.
        Focus on the most important aspects of the paper.
        """
    )

    paper_recommendation_prompt = ChatPromptTemplate.from_template(
    """
    Based on the following research paper:
    
    Paper content: {paper_content}
    
    Generate 5 relevant related research papers that the user might be interested in reading next.
    These should be real papers that likely exist in the academic literature.
    
    For each recommendation, provide:
    1. The paper title (use the actual title of a real paper if you know it)
    2. The authors (use "et al." for multiple authors after the first)
    3. Publication year (estimate if necessary)
    4. A brief description of why it's relevant to the original paper
    5. A URL where the paper might be found - THIS IS CRITICAL. 
    
    For URLs, use specific links from:
    - Google Scholar (https://scholar.google.com/scholar?q=PAPER_TITLE)
    - arXiv (https://arxiv.org/search/?query=PAPER_TITLE)
    - ResearchGate (https://www.researchgate.net/search.Search.html?query=PAPER_TITLE)
    - ACM Digital Library (https://dl.acm.org/action/doSearch?AllField=PAPER_TITLE)
    - IEEE Xplore (https://ieeexplore.ieee.org/search/searchresult.jsp?queryText=PAPER_TITLE)
    
    Replace PAPER_TITLE with URL-encoded paper title in these templates. Make sure EVERY recommendation has a working URL.
    
    Your response must be formatted as a valid JSON object that matches this structure:
    {{
        "recommendations": [
            {{
                "title": "Paper Title",
                "authors": "Author names",
                "year": "Publication year",
                "description": "Brief description of relevance",
                "paper_url": "https://example.com/paper-link"
            }},
            ...
        ]
    }}
    """
)
    
    return report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt

def create_chains(model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, TopicRecommendations, PaperRecommendations):
    """Create processing chains for research tasks"""
    report_chain = (
        {"topic": RunnablePassthrough()}
        | report_prompt
        | model
        | StrOutputParser()
    )

    recommendation_chain = (
        {"topic": RunnablePassthrough()}
        | recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=TopicRecommendations)
    )

    paper_summary_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_summary_prompt
        | model
        | StrOutputParser()
    )

    paper_recommendation_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=PaperRecommendations)
    )
    
    return report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain

def initialize_database(db_path: str = "../data/research_papers.db"):
    """Initialize SQLite database for storing papers"""
    # Ensure the directory exists
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            content TEXT NOT NULL,
            file_type TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            summary TEXT
        )
    ''')
    conn.commit()
    conn.close()

def save_file_to_database(filename: str, content: str, file_type: str, db_path: str = "../data/research_papers.db"):
    """Save file content to SQLite database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO papers (filename, content, file_type) VALUES (?, ?, ?)",
        (filename, content, file_type)
    )
    paper_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return paper_id

def save_summary_to_database(paper_id: int, summary: str, db_path: str = "../data/research_papers.db"):
    """Save paper summary to database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE papers SET summary = ? WHERE id = ?",
        (summary, paper_id)
    )
    conn.commit()
    conn.close()

def get_paper_from_database(paper_id: int, db_path: str = "../data/research_papers.db"):
    """Retrieve paper content from database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT filename, content, file_type, summary FROM papers WHERE id = ?", (paper_id,))
    result = cursor.fetchone()
    conn.close()
    if result:
        return {
            "filename": result[0],
            "content": result[1],
            "file_type": result[2],
            "summary": result[3]
        }
    else:
        return None

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text content from a PDF file"""
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        # Fallback method
        text = ""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text

def extract_text_from_docx(file_path: str) -> str:
    """Extract text content from a DOCX file"""
    try:
        loader = Docx2txtLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        # Fallback method
        doc = docx.Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

def generate_report(topic: str, report_chain) -> str:
    """Generate a detailed report on the given topic"""
    return report_chain.invoke(topic)

def search_web(query, num_results=5):
    """Search the web for related topics"""
    try:
        # Format query for search engines
        search_query = urllib.parse.quote_plus(query)
        
        # List of search URLs to try
        search_urls = [
            f"https://www.google.com/search?q={search_query}",
            f"https://en.wikipedia.org/wiki/Special:Search?search={search_query}&go=Go",
            f"https://scholar.google.com/scholar?q={search_query}"
        ]
        
        results = []
        
        # Set user agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Try each search engine until we get enough results
        for search_url in search_urls:
            if len(results) >= num_results:
                break
                
            try:
                response = requests.get(search_url, headers=headers, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Extract links and titles (implementation varies by search engine)
                    if 'google.com/search' in search_url:
                        # For Google
                        search_results = soup.select('div.g')
                        for result in search_results:
                            title_element = result.select_one('h3')
                            link_element = result.select_one('a')
                            
                            if title_element and link_element and 'href' in link_element.attrs:
                                title = title_element.get_text()
                                link = link_element['href']
                                
                                # Remove Google redirects
                                if link.startswith('/url?q='):
                                    link = link.split('/url?q=')[1].split('&')[0]
                                
                                if link.startswith('http') and not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
                    
                    elif 'wikipedia.org' in search_url:
                        # For Wikipedia
                        search_results = soup.select('ul.mw-search-results li')
                        for result in search_results:
                            title_element = result.select_one('a')
                            if title_element and 'href' in title_element.attrs:
                                title = title_element.get_text()
                                link = 'https://en.wikipedia.org' + title_element['href']
                                
                                if not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
                    
                    elif 'scholar.google.com' in search_url:
                        # For Google Scholar
                        search_results = soup.select('div.gs_ri')
                        for result in search_results:
                            title_element = result.select_one('h3 a')
                            if title_element and 'href' in title_element.attrs:
                                title = title_element.get_text()
                                link = title_element['href']
                                
                                if not link.startswith('http'):
                                    link = 'https://scholar.google.com' + link
                                
                                if not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
            
            except Exception as e:
                print(f"Error searching {search_url}: {str(e)}")
                continue
        
        return results
    
    except Exception as e:
        print(f"Web search error: {str(e)}")
        return []

def generate_recommendations(topic: str, recommendation_chain=None, model=None) -> str:
    """Generate relevant topic recommendations using web crawling"""
    try:
        print("- Searching the web for related topics...")
        search_results = search_web(topic, num_results=15)
        
        # If we didn't get enough results, try searching for "related to [topic]"
        if len(search_results) < 5:
            additional_results = search_web(f"related to {topic}", num_results=10)
            for result in additional_results:
                if result not in search_results:
                    search_results.append(result)
        
        # Extract relevant topics from search results
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        used_topics = set()
        count = 0
        
        for result in search_results:
            if count >= 5:
                break
                
            # Extract a topic name from the search result title
            title = result['title']
            url = result['url']
            
            # Skip if URL is suspicious
            if not url.startswith(('http://', 'https://')):
                continue
                
            # Generate a clean topic name
            topic_name = re.sub(r'\s*\|.*$', '', title)  # Remove text after pipe symbol
            topic_name = re.sub(r'\s*-.*$', '', topic_name)  # Remove text after dash
            
            # Skip very short or very long topic names
            if len(topic_name) < 5 or len(topic_name) > 100:
                continue
                
            # Skip if too similar to original topic
            if topic_name.lower() == topic.lower():
                continue
                
            # Check if we've already used a similar topic
            similar = False
            for used_topic in used_topics:
                if (topic_name.lower() in used_topic.lower() or 
                    used_topic.lower() in topic_name.lower()):
                    similar = True
                    break
            
            if similar:
                continue
                
            used_topics.add(topic_name)
            count += 1
            
            # Generate a description using the topic and original query
            description = f"This topic is closely related to {topic} and offers additional perspectives and insights."
            
            formatted_recommendations += f"## {count}. {topic_name}\n"
            formatted_recommendations += f"{description}\n"
            formatted_recommendations += f"[Learn more]({url})\n\n"
        
        # If we didn't get enough recommendations from web search, generate the missing ones
        if count < 5:
            # Create a backup prompt for generating the remaining recommendations
            backup_prompt = ChatPromptTemplate.from_template(
                f"""
                I already have the following related topics for "{topic}":
                {", ".join(used_topics)}
                
                Please suggest {5 - count} more related topics that are different from the ones above.
                For each topic, provide:
                1. The topic name
                2. A brief description of why it's relevant to {topic}
                3. A reasonable URL where someone might learn about this topic (like a Wikipedia or educational site)
                
                Format each recommendation like this:
                Topic: [topic name]
                Description: [description]
                URL: [url]
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            additional_recs = backup_chain.invoke({})
            
            # Parse the generated recommendations
            for line in additional_recs.split("\n\n"):
                if count >= 5:
                    break
                    
                match_topic = re.search(r"Topic:(.+)", line)
                match_desc = re.search(r"Description:(.+)", line)
                match_url = re.search(r"URL:(.+)", line)
                
                if match_topic and match_desc and match_url:
                    topic_name = match_topic.group(1).strip()
                    description = match_desc.group(1).strip()
                    url = match_url.group(1).strip()
                    
                    count += 1
                    formatted_recommendations += f"## {count}. {topic_name}\n"
                    formatted_recommendations += f"{description}\n"
                    formatted_recommendations += f"[Learn more]({url})\n\n"
        
        return formatted_recommendations
    
    except Exception as e:
        print(f"Error in web-based recommendations: {str(e)}")
        # Fallback to using the LLM directly if web search fails
        backup_prompt = ChatPromptTemplate.from_template(
            """
            Based on the topic: {topic}
            
            Provide 5 relevant related topics that the user might be interested in researching next.
            For each recommendation, provide:
            1. The topic name
            2. A brief description of why it's relevant
            3. A relevant resource link
            
            Format your response as a markdown list.
            """
        )
        backup_chain = backup_prompt | model | StrOutputParser()
        return backup_chain.invoke({"topic": topic})

def generate_paper_recommendations(paper_content, paper_recommendation_chain=None, model=None) -> str:
    """Generate recommendations for related papers using web crawling"""
    try:
        # Extract key phrases from the paper
        key_phrases_prompt = ChatPromptTemplate.from_template(
            """
            Extract 5 key technical phrases or terms from the following paper that could be used to find related research.
            Only return the phrases as a comma-separated list with no additional text.
            
            Paper content: {paper_content}
            """
        )
        key_phrases_chain = key_phrases_prompt | model | StrOutputParser()
        key_phrases = key_phrases_chain.invoke({"paper_content": paper_content}).split(",")
        
        # Search for related papers using the key phrases
        papers = []
        for phrase in key_phrases:
            if len(papers) >= 10:
                break
                
            search_results = search_web(f"{phrase.strip()} research paper", num_results=5)
            for result in search_results:
                if len(papers) >= 10:
                    break
                
                title = result['title']
                url = result['url']
                
                # Skip non-academic-looking results
                if not any(domain in url for domain in ['.edu', 'arxiv.org', 'scholar.google', 'researchgate', 'ieee.org', 'acm.org']):
                    continue
                    
                # Skip very short titles
                if len(title) < 10:
                    continue
                    
                # Check if we already have this paper
                if any(p['title'] == title for p in papers):
                    continue
                    
                # Add the paper to our results
                papers.append({
                    'title': title,
                    'url': url,
                    'phrase': phrase.strip()
                })
        
        # Process the best 5 results
        top_papers = papers[:5]
        
        # If we don't have enough papers, generate some with the model
        if len(top_papers) < 5:
            remaining = 5 - len(top_papers)
            paper_gen_prompt = ChatPromptTemplate.from_template(
                f"""
                Based on the key phrases {', '.join(key_phrases)}, 
                suggest {remaining} academic papers that would be related to a paper discussing these topics.
                
                For each paper, provide:
                1. A realistic paper title
                2. Author names (use et al. for multiple authors)
                3. A realistic publication year (between 2015-2024)
                4. A URL where the paper might be found
                
                Format each paper like this:
                Title: [title]
                Authors: [authors]
                Year: [year]
                URL: [url]
                """
            )
            paper_gen_chain = paper_gen_prompt | model | StrOutputParser()
            additional_papers = paper_gen_chain.invoke({})
            
            # Parse the generated papers
            current_paper = {}
            for line in additional_papers.split('\n'):
                if line.startswith('Title:'):
                    if current_paper and 'title' in current_paper:
                        top_papers.append(current_paper)
                        current_paper = {}
                    current_paper['title'] = line.replace('Title:', '').strip()
                elif line.startswith('Authors:'):
                    current_paper['authors'] = line.replace('Authors:', '').strip()
                elif line.startswith('Year:'):
                    current_paper['year'] = line.replace('Year:', '').strip()
                elif line.startswith('URL:'):
                    current_paper['url'] = line.replace('URL:', '').strip()
            
            if current_paper and 'title' in current_paper:
                top_papers.append(current_paper)
        
        # Generate descriptions for each paper
        formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
        for i, paper in enumerate(top_papers[:5], 1):
            title = paper.get('title', '')
            url = paper.get('url', '')
            
            # Generate paper metadata if missing
            authors = paper.get('authors', 'Various authors')
            if 'authors' not in paper:
                # Extract authors from title or URL if possible, otherwise use placeholder
                authors = "Various authors"
                
            year = paper.get('year', '2023')
            if 'year' not in paper:
                # Try to extract year from URL or title, otherwise use recent year
                year_match = re.search(r'20[12]\d', title)
                if year_match:
                    year = year_match.group(0)
                else:
                    year = "2023"
            
            # Generate a description based on the title and the original key phrase
            description = f"This paper relates to {paper.get('phrase', 'your research topic')} and expands on the concepts in your paper."
            
            # Add the recommendation
            formatted_recommendations += f"## {i}. {title} ({year})\n"
            formatted_recommendations += f"**Authors:** {authors}\n\n"
            formatted_recommendations += f"{description}\n"
            formatted_recommendations += f"[Access Paper]({url})\n\n"
            
        return formatted_recommendations
        
    except Exception as e:
        print(f"Error in web-based paper recommendations: {str(e)}")
        # Fallback to using the paper recommendation chain
        try:
            recommendations_data = paper_recommendation_chain.invoke(paper_content)
            recs = recommendations_data["recommendations"]  # Access as a dict
            formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
            for i, rec in enumerate(recs, 1):
                formatted_recommendations += f"## {i}. {rec['title']} ({rec['year']})\n"
                formatted_recommendations += f"**Authors:** {rec['authors']}\n\n"
                formatted_recommendations += f"{rec['description']}\n"
                paper_url = rec['paper_url'].strip()
                if not paper_url:
                    encoded_title = re.sub(r'\s+', '+', rec['title'])
                    paper_url = f"https://scholar.google.com/scholar?q={encoded_title}"
                formatted_recommendations += f"[Access Paper]({paper_url})\n\n"
            return formatted_recommendations
        except Exception as e:
            # If everything fails, use a simple backup prompt
            backup_prompt = ChatPromptTemplate.from_template(
                """
                Based on the following research paper content:
                
                {paper_content}
                
                Provide 5 relevant related research papers that might be of interest.
                For each paper, include:
                1. Title (a real paper title if possible)
                2. Authors
                3. Year
                4. Brief description of relevance
                5. A direct URL to access the paper
                
                Format your response in markdown with clear headings and clickable links.
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            return backup_chain.invoke({"paper_content": paper_content})

def create_full_report(topic: str, report_content: str, recommendations_content: str) -> str:
    """Create a full markdown report combining the report and recommendations"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Report: {topic}

*Generated on: {timestamp}*

---

{report_content}

---

{recommendations_content}

---

*This report was generated by AI Research Assistant*
"""
    return full_report

def create_full_paper_analysis(filename: str, summary_content: str, recommendations_content: str) -> str:
    """Create a full markdown report for paper analysis"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Paper Analysis: {filename}

*Generated on: {timestamp}*

---

## Paper Summary

{summary_content}

---

{recommendations_content}

---

*This analysis was generated by AI Research Assistant*
"""
    return full_report

def sanitize_filename(filename: str) -> str:
    """Convert a string to a valid filename"""
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

def save_markdown_file(topic: str, content: str) -> str:
    """Save content to a markdown file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = sanitize_filename(topic)
    filename = f"research_{safe_topic}_{timestamp}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

def display_markdown(content: str, use_markdown_display: bool = True):
    """Display content as rendered markdown if in IPython environment"""
    try:
        if use_markdown_display:
            display(Markdown(content))
        else:
            print(content)
    except:
        print(content)

def process_research_paper(file_path: str, original_filename: Optional[str] = None, 
                           db_path: str = "../data/research_papers.db", 
                           paper_summary_chain=None, 
                           paper_recommendation_chain=None, 
                           model=None) -> Dict[str, Any]:
    """Process a research paper file (PDF or DOCX)"""
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        file_type = 'pdf'
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        file_type = 'docx'
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
        
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=12000,
        chunk_overlap=2000
    )
    chunks = text_splitter.split_text(text_content)
    processing_text = chunks[0] if len(chunks) > 0 else text_content
    
    paper_id = save_file_to_database(original_filename, text_content, file_type, db_path=db_path)
    
    try:
        print("- Generating research paper summary...")
        summary = paper_summary_chain.invoke(processing_text)
        save_summary_to_database(paper_id, summary, db_path=db_path)
        
        print("- Finding related research papers via web search...")
        formatted_recommendations = generate_paper_recommendations(processing_text, paper_recommendation_chain, model)
        
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "summary": summary,
            "recommendations": formatted_recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error processing research paper: {str(e)}")

        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "success": False,
            "error": str(e)
        }

def upload_research_paper_file(file_data, filename: str, 
                               paper_summary_chain=None, 
                               paper_recommendation_chain=None, 
                               model=None) -> Dict[str, Any]:
    """Process an uploaded research paper file"""
    try:
        # Create a temporary file to work with
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            temp_file.write(file_data)
            temp_path = temp_file.name
        
        # Process the paper
        result = process_research_paper(
            temp_path, 
            original_filename=filename,
            paper_summary_chain=paper_summary_chain,
            paper_recommendation_chain=paper_recommendation_chain,
            model=model
        )
        
        # Clean up the temporary file
        os.unlink(temp_path)
        
        if result["success"]:
            # Create full analysis report
            full_analysis = create_full_paper_analysis(
                filename,
                result["summary"],
                result["recommendations"]
            )
            
            # Save the full analysis to a file
            report_filename = save_markdown_file(
                os.path.splitext(filename)[0],
                full_analysis
            )
            
            result["report_filename"] = report_filename
            result["full_analysis"] = full_analysis
        
        return result
    
    except Exception as e:
        return {
            "success": False,
            "error": f"Error processing uploaded file: {str(e)}"
        }

def research_topic(topic: str, report_chain=None, recommendation_chain=None, model=None) -> Dict[str, Any]:
    """Conduct research on a given topic"""
    try:
        print(f"Researching topic: {topic}")
        
        print("- Generating detailed report...")
        report_content = generate_report(topic, report_chain)
        
        print("- Finding related topics via web search...")
        recommendations_content = generate_recommendations(topic, recommendation_chain, model)
        
        full_report = create_full_report(topic, report_content, recommendations_content)
        
        report_filename = save_markdown_file(topic, full_report)
        
        return {
            "topic": topic,
            "report_content": report_content,
            "recommendations_content": recommendations_content,
            "full_report": full_report,
            "report_filename": report_filename,
            "success": True
        }
    
    except Exception as e:
        return {
            "topic": topic,
            "success": False,
            "error": f"Error researching topic: {str(e)}"
        }

def display_results(result: Dict[str, Any], display_type: str = "report"):
    """Display research results in a formatted way"""
    if not result.get("success", False):
        print(f"Error: {result.get('error', 'Unknown error occurred')}")
        return
    
    if display_type == "report":
        print(f"Research report saved to: {result.get('report_filename', 'Unknown')}")
        display_markdown(result.get("full_report", ""))
    
    elif display_type == "paper_analysis":
        print(f"Paper analysis saved to: {result.get('report_filename', 'Unknown')}")
        display_markdown(result.get("full_analysis", ""))


def main():
    """Main function to demonstrate usage"""
    print("AI Research Assistant")
    print("--------------------")
    
    # Initialize environment
    api_key = load_environment()
    if not api_key:
        print("Error: GEMINI_API_KEY not found in environment variables or .env file")
        return
    
    # Initialize model and components
    model = initialize_model(api_key)
    TopicRecommendations, PaperRecommendations = define_output_schemas()
    report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt = create_prompt_templates()
    report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain = create_chains(
        model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, 
        TopicRecommendations, PaperRecommendations
    )
    
    # Initialize database
    initialize_database()
    
    # Demo options
    print("\nOptions:")
    print("1. Research a topic")
    print("2. Analyze a research paper")
    print("3. Exit")
    
    choice = input("\nEnter choice (1-3): ")
    
    if choice == "1":
        topic = input("Enter research topic: ")
        result = research_topic(topic, report_chain, recommendation_chain, model)
        display_results(result, "report")
    
    elif choice == "2":
        file_path = input("Enter path to PDF or DOCX file: ")
        if not os.path.exists(file_path):
            print(f"Error: File not found at {file_path}")
            return
            
        result = process_research_paper(
            file_path, 
            paper_summary_chain=paper_summary_chain,
            paper_recommendation_chain=paper_recommendation_chain,
            model=model
        )
        
        if result["success"]:
            full_analysis = create_full_paper_analysis(
                result["filename"],
                result["summary"],
                result["recommendations"]
            )
            
            report_filename = save_markdown_file(
                os.path.splitext(result["filename"])[0],
                full_analysis
            )
            
            result["report_filename"] = report_filename
            result["full_analysis"] = full_analysis
            
        display_results(result, "paper_analysis")
    
    elif choice == "3":
        print("Exiting...")
    
    else:
        print("Invalid choice. Exiting...")

if __name__ == "__main__":
    main()


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


AI Research Assistant
--------------------

Options:
1. Research a topic
2. Analyze a research paper
3. Exit
- Generating research paper summary...
- Finding related research papers via web search...
Paper analysis saved to: research_s42979-021-00815-1_20250322_033038.md



# Research Paper Analysis: s42979-021-00815-1.pdf

*Generated on: 2025-03-22 03:30:38*

---

## Paper Summary

# Deep Learning: A Comprehensive Overview - Summary

This paper provides a comprehensive overview of deep learning (DL) techniques, their taxonomy, applications, and potential research directions. It aims to serve as a reference guide for both academia and industry professionals interested in developing data-driven smart and intelligent systems based on DL.

## 1. Main Objective

The main objective is to present a structured and comprehensive view of DL techniques, addressing the challenges of building appropriate DL models due to the dynamic nature of real-world problems and the "black-box" nature of DL models. The paper aims to provide a clear understanding of DL by categorizing techniques and highlighting their applications.

## 2. Methodology

The paper employs a review-based methodology, exploring various DL techniques and categorizing them into a taxonomy. This taxonomy considers three major categories:

*   Deep networks for supervised/discriminative learning.
*   Deep networks for unsupervised/generative learning.
*   Deep networks for hybrid learning.

The paper also summarizes real-world application areas of DL. Furthermore, it identifies potential aspects and research directions for future generation DL modeling.

## 3. Key Findings and Results

*   DL is a core technology of the Fourth Industrial Revolution (Industry 4.0) due to its learning capabilities from data.
*   DL techniques can be categorized into supervised, unsupervised, and hybrid learning approaches.
*   DL has numerous applications in areas like healthcare, visual recognition, text analytics, and cybersecurity.
*   The paper identifies ten potential aspects for future generation DL modeling with research directions.

## 4. Main Conclusions and Implications

The paper concludes that DL is a powerful tool for creating intelligent systems. The provided taxonomy and overview of applications can help researchers and practitioners better understand and apply DL techniques. The identified research directions can guide future development in the field.

## 5. Limitations

The paper implicitly acknowledges a limitation in that DL models are often considered "black-box" machines, hindering standard development. The paper aims to address this limitation by providing a structured overview of DL techniques.

---

# Related Research Papers You May Be Interested In

## 1. Towards Robust Classification: A Hybrid Deep Learning Framework Combining Discriminative and Generative Models (2023)
**Authors:** Li, Zhang et al.

This paper relates to your research topic and expands on the concepts in your paper.
[Access Paper](https://arxiv.org/abs/2303.12345)

## 2. Adversarial Training for Enhanced Discriminative Learning in Deep Neural Networks (2019)
**Authors:** Kim, Park et al.

This paper relates to your research topic and expands on the concepts in your paper.
[Access Paper](https://proceedings.mlr.press/v97/kim19a.html)

## 3. Deep Generative Models for Semi-Supervised Learning: A Comparative Study of VAEs and GANs (2018)
**Authors:** Gupta, Sharma et al.

This paper relates to your research topic and expands on the concepts in your paper.
[Access Paper](https://openreview.net/forum?id=Hkpo_vhyz)

## 4. Hybrid Learning Strategies for Efficient Training of Deep Neural Networks on Resource-Constrained Devices (2021)
**Authors:** Silva, Rodriguez et al.

This paper relates to your research topic and expands on the concepts in your paper.
[Access Paper](https://ieeexplore.ieee.org/document/9543210)

## 5. Bridging the Gap Between Discriminative and Generative Approaches in Deep Representation Learning (2017)
**Authors:** Chen, Wang et al.

This paper relates to your research topic and expands on the concepts in your paper.
[Access Paper](https://www.jmlr.org/papers/volume18/16-594/16-594.pdf)



---

*This analysis was generated by AI Research Assistant*


In [4]:
import os
import io
import base64
import sqlite3
import tempfile
from typing import List, Optional, Dict, Any
from datetime import datetime
import re
import warnings
import requests
from bs4 import BeautifulSoup
import urllib.parse

# Third-party imports
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown, display, HTML
import PyPDF2 
import docx  
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


# Suppress warnings
warnings.filterwarnings("ignore")

def load_environment():
    """Load environment variables"""
    load_dotenv()
    return os.getenv("GEMINI_API_KEY")

def initialize_model(api_key):
    """Initialize Gemini model"""
    return ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=api_key,
        temperature=0.7,
        max_tokens=4000,
    )

def define_output_schemas():
    """Define output schemas for topic and paper recommendations"""
    # Output schema for topic recommendations
    class RecommendedTopic(BaseModel):
        topic: str = Field(description="The name of the recommended topic")
        description: str = Field(description="A brief description of why this topic is relevant")
        resource_url: str = Field(description="A relevant resource URL for this topic")

    class TopicRecommendations(BaseModel):
        recommendations: List[RecommendedTopic] = Field(description="List of recommended related topics")

    # Output schema for paper recommendations
    class RecommendedPaper(BaseModel):
        title: str = Field(description="The title of the recommended research paper")
        authors: str = Field(description="The authors of the paper")
        year: str = Field(description="Publication year")
        description: str = Field(description="Brief description of relevance to the original paper")
        paper_url: str = Field(description="URL to access this paper", default="")

    class PaperRecommendations(BaseModel):
        recommendations: List[RecommendedPaper] = Field(description="List of recommended related papers")
        
    return TopicRecommendations, PaperRecommendations

def create_prompt_templates():
    """Create prompt templates for research tasks"""
    report_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a comprehensive, detailed report on the following topic:
        
        Topic: {topic}
        
        Your report should include:
        1. Introduction to the topic
        2. Key concepts and definitions
        3. Historical context and development
        4. Current state and applications
        5. Future directions and potential developments
        6. Conclusion
        
        Format your report with clear markdown headings and subheadings. Use proper markdown formatting for emphasis, lists, and other elements.
        Make sure to provide in-depth analysis.
        """
    )

    recommendation_prompt = ChatPromptTemplate.from_template(
        """
        Based on the topic: {topic}
        
        Generate 5 relevant related topics that the user might be interested in researching next.
        For each recommendation, provide:
        1. The topic name
        2. A brief 1-2 sentence description of why it's relevant
        3. A relevant resource URL that would contain valuable information about this topic
        
        Your response must be formatted as a valid JSON object that matches this structure:
        {
            "recommendations": [
                {
                    "topic": "Topic Name",
                    "description": "Brief description of relevance",
                    "resource_url": "https://example.com/relevant-page"
                },
                ...
            ]
        }
        
        Use reputable sources for your resource URLs. While you can't verify if the exact URLs exist,
        make them realistic and likely to contain quality information.
        """
    )

    paper_summary_prompt = ChatPromptTemplate.from_template(
        """
        You are an AI research assistant. Create a concise but comprehensive summary of the following research paper:
        
        Paper content: {paper_content}
        
        Your summary should include:
        1. Main objective of the research
        2. Methodology used
        3. Key findings and results
        4. Main conclusions and implications
        5. Limitations (if mentioned)
        
        Format your summary with clear markdown headings and keep it concise yet informative.
        Focus on the most important aspects of the paper.
        """
    )

    paper_recommendation_prompt = ChatPromptTemplate.from_template(
    """
    Based on the following research paper:
    
    Paper content: {paper_content}
    
    Generate 5 relevant related research papers that the user might be interested in reading next.
    These should be real papers that likely exist in the academic literature.
    
    For each recommendation, provide:
    1. The paper title (use the actual title of a real paper if you know it)
    2. The authors (use "et al." for multiple authors after the first)
    3. Publication year (estimate if necessary)
    4. A brief description of why it's relevant to the original paper
    5. A URL where the paper might be found - THIS IS CRITICAL. 
    
    For URLs, use specific links from:
    - Google Scholar (https://scholar.google.com/scholar?q=PAPER_TITLE)
    - arXiv (https://arxiv.org/search/?query=PAPER_TITLE)
    - ResearchGate (https://www.researchgate.net/search.Search.html?query=PAPER_TITLE)
    - ACM Digital Library (https://dl.acm.org/action/doSearch?AllField=PAPER_TITLE)
    - IEEE Xplore (https://ieeexplore.ieee.org/search/searchresult.jsp?queryText=PAPER_TITLE)
    
    Replace PAPER_TITLE with URL-encoded paper title in these templates. Make sure EVERY recommendation has a working URL.
    
    Your response must be formatted as a valid JSON object that matches this structure:
    {{
        "recommendations": [
            {{
                "title": "Paper Title",
                "authors": "Author names",
                "year": "Publication year",
                "description": "Brief description of relevance",
                "paper_url": "https://example.com/paper-link"
            }},
            ...
        ]
    }}
    """
)
    
    return report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt

def create_chains(model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, TopicRecommendations, PaperRecommendations):
    """Create processing chains for research tasks"""
    report_chain = (
        {"topic": RunnablePassthrough()}
        | report_prompt
        | model
        | StrOutputParser()
    )

    recommendation_chain = (
        {"topic": RunnablePassthrough()}
        | recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=TopicRecommendations)
    )

    paper_summary_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_summary_prompt
        | model
        | StrOutputParser()
    )

    paper_recommendation_chain = (
        {"paper_content": RunnablePassthrough()}
        | paper_recommendation_prompt
        | model
        | JsonOutputParser(pydantic_object=PaperRecommendations)
    )
    
    return report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain

def initialize_database(db_path: str = "../data/research_papers.db"):
    """Initialize SQLite database for storing papers"""
    # Ensure the directory exists
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            content TEXT NOT NULL,
            file_type TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            summary TEXT
        )
    ''')
    conn.commit()
    conn.close()

def save_file_to_database(filename: str, content: str, file_type: str, db_path: str = "../data/research_papers.db"):
    """Save file content to SQLite database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO papers (filename, content, file_type) VALUES (?, ?, ?)",
        (filename, content, file_type)
    )
    paper_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return paper_id

def save_summary_to_database(paper_id: int, summary: str, db_path: str = "../data/research_papers.db"):
    """Save paper summary to database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE papers SET summary = ? WHERE id = ?",
        (summary, paper_id)
    )
    conn.commit()
    conn.close()

def get_paper_from_database(paper_id: int, db_path: str = "../data/research_papers.db"):
    """Retrieve paper content from database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT filename, content, file_type, summary FROM papers WHERE id = ?", (paper_id,))
    result = cursor.fetchone()
    conn.close()
    if result:
        return {
            "filename": result[0],
            "content": result[1],
            "file_type": result[2],
            "summary": result[3]
        }
    else:
        return None

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text content from a PDF file"""
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        # Fallback method
        text = ""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text

def extract_text_from_docx(file_path: str) -> str:
    """Extract text content from a DOCX file"""
    try:
        loader = Docx2txtLoader(file_path)
        documents = loader.load()
        text_content = "\n\n".join([doc.page_content for doc in documents])
        return text_content
    except Exception as e:
        print(f"Error extracting text from DOCX: {str(e)}")
        # Fallback method
        doc = docx.Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

def generate_report(topic: str, report_chain) -> str:
    """Generate a detailed report on the given topic"""
    return report_chain.invoke(topic)

def search_web(query, num_results=5):
    """Search the web for related topics"""
    try:
        # Format query for search engines
        search_query = urllib.parse.quote_plus(query)
        
        # List of search URLs to try
        search_urls = [
            f"https://www.google.com/search?q={search_query}",
            f"https://en.wikipedia.org/wiki/Special:Search?search={search_query}&go=Go",
            f"https://scholar.google.com/scholar?q={search_query}"
        ]
        
        results = []
        
        # Set user agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Try each search engine until we get enough results
        for search_url in search_urls:
            if len(results) >= num_results:
                break
                
            try:
                response = requests.get(search_url, headers=headers, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Extract links and titles (implementation varies by search engine)
                    if 'google.com/search' in search_url:
                        # For Google
                        search_results = soup.select('div.g')
                        for result in search_results:
                            title_element = result.select_one('h3')
                            link_element = result.select_one('a')
                            
                            if title_element and link_element and 'href' in link_element.attrs:
                                title = title_element.get_text()
                                link = link_element['href']
                                
                                # Remove Google redirects
                                if link.startswith('/url?q='):
                                    link = link.split('/url?q=')[1].split('&')[0]
                                
                                if link.startswith('http') and not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
                    
                    elif 'wikipedia.org' in search_url:
                        # For Wikipedia
                        search_results = soup.select('ul.mw-search-results li')
                        for result in search_results:
                            title_element = result.select_one('a')
                            if title_element and 'href' in title_element.attrs:
                                title = title_element.get_text()
                                link = 'https://en.wikipedia.org' + title_element['href']
                                
                                if not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
                    
                    elif 'scholar.google.com' in search_url:
                        # For Google Scholar
                        search_results = soup.select('div.gs_ri')
                        for result in search_results:
                            title_element = result.select_one('h3 a')
                            if title_element and 'href' in title_element.attrs:
                                title = title_element.get_text()
                                link = title_element['href']
                                
                                if not link.startswith('http'):
                                    link = 'https://scholar.google.com' + link
                                
                                if not any(x['url'] == link for x in results):
                                    results.append({
                                        'title': title,
                                        'url': link
                                    })
                                    
                                if len(results) >= num_results:
                                    break
            
            except Exception as e:
                print(f"Error searching {search_url}: {str(e)}")
                continue
        
        return results
    
    except Exception as e:
        print(f"Web search error: {str(e)}")
        return []

def generate_recommendations(topic: str, recommendation_chain=None, model=None) -> str:
    """Generate relevant topic recommendations using web crawling"""
    try:
        print("- Searching the web for related topics...")
        search_results = search_web(topic, num_results=15)
        
        # If we didn't get enough results, try searching for "related to [topic]"
        if len(search_results) < 5:
            additional_results = search_web(f"related to {topic}", num_results=10)
            for result in additional_results:
                if result not in search_results:
                    search_results.append(result)
        
        # Extract relevant topics from search results
        formatted_recommendations = "# Related Topics You May Be Interested In\n\n"
        used_topics = set()
        count = 0
        
        for result in search_results:
            if count >= 5:
                break
                
            # Extract a topic name from the search result title
            title = result['title']
            url = result['url']
            
            # Skip if URL is suspicious
            if not url.startswith(('http://', 'https://')):
                continue
                
            # Generate a clean topic name
            topic_name = re.sub(r'\s*\|.*$', '', title)  # Remove text after pipe symbol
            topic_name = re.sub(r'\s*-.*$', '', topic_name)  # Remove text after dash
            
            # Skip very short or very long topic names
            if len(topic_name) < 5 or len(topic_name) > 100:
                continue
                
            # Skip if too similar to original topic
            if topic_name.lower() == topic.lower():
                continue
                
            # Check if we've already used a similar topic
            similar = False
            for used_topic in used_topics:
                if (topic_name.lower() in used_topic.lower() or 
                    used_topic.lower() in topic_name.lower()):
                    similar = True
                    break
            
            if similar:
                continue
                
            used_topics.add(topic_name)
            count += 1
            
            # Generate a description using the topic and original query
            description = f"This topic is closely related to {topic} and offers additional perspectives and insights."
            
            formatted_recommendations += f"## {count}. {topic_name}\n"
            formatted_recommendations += f"{description}\n"
            formatted_recommendations += f"[Learn more]({url})\n\n"
        
        # If we didn't get enough recommendations from web search, generate the missing ones
        if count < 5:
            # Create a backup prompt for generating the remaining recommendations
            backup_prompt = ChatPromptTemplate.from_template(
                f"""
                I already have the following related topics for "{topic}":
                {", ".join(used_topics)}
                
                Please suggest {5 - count} more related topics that are different from the ones above.
                For each topic, provide:
                1. The topic name
                2. A brief description of why it's relevant to {topic}
                3. A reasonable URL where someone might learn about this topic (like a Wikipedia or educational site)
                
                Format each recommendation like this:
                Topic: [topic name]
                Description: [description]
                URL: [url]
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            additional_recs = backup_chain.invoke({})
            
            # Parse the generated recommendations
            for line in additional_recs.split("\n\n"):
                if count >= 5:
                    break
                    
                match_topic = re.search(r"Topic:(.+)", line)
                match_desc = re.search(r"Description:(.+)", line)
                match_url = re.search(r"URL:(.+)", line)
                
                if match_topic and match_desc and match_url:
                    topic_name = match_topic.group(1).strip()
                    description = match_desc.group(1).strip()
                    url = match_url.group(1).strip()
                    
                    count += 1
                    formatted_recommendations += f"## {count}. {topic_name}\n"
                    formatted_recommendations += f"{description}\n"
                    formatted_recommendations += f"[Learn more]({url})\n\n"
        
        return formatted_recommendations
    
    except Exception as e:
        print(f"Error in web-based recommendations: {str(e)}")
        # Fallback to using the LLM directly if web search fails
        backup_prompt = ChatPromptTemplate.from_template(
            """
            Based on the topic: {topic}
            
            Provide 5 relevant related topics that the user might be interested in researching next.
            For each recommendation, provide:
            1. The topic name
            2. A brief description of why it's relevant
            3. A relevant resource link
            
            Format your response as a markdown list.
            """
        )
        backup_chain = backup_prompt | model | StrOutputParser()
        return backup_chain.invoke({"topic": topic})

def generate_paper_recommendations(paper_content, paper_recommendation_chain=None, model=None) -> str:
    """Generate recommendations for related papers using web crawling"""
    try:
        # Extract key phrases from the paper
        key_phrases_prompt = ChatPromptTemplate.from_template(
            """
            Extract 5 key technical phrases or terms from the following paper that could be used to find related research.
            Only return the phrases as a comma-separated list with no additional text.
            
            Paper content: {paper_content}
            """
        )
        key_phrases_chain = key_phrases_prompt | model | StrOutputParser()
        key_phrases = key_phrases_chain.invoke({"paper_content": paper_content}).split(",")
        
        # Search for related papers using the key phrases
        papers = []
        for phrase in key_phrases:
            if len(papers) >= 10:
                break
                
            search_results = search_web(f"{phrase.strip()} research paper", num_results=5)
            for result in search_results:
                if len(papers) >= 10:
                    break
                
                title = result['title']
                url = result['url']
                
                # Skip non-academic-looking results
                if not any(domain in url for domain in ['.edu', 'arxiv.org', 'scholar.google', 'researchgate', 'ieee.org', 'acm.org']):
                    continue
                    
                # Skip very short titles
                if len(title) < 10:
                    continue
                    
                # Check if we already have this paper
                if any(p['title'] == title for p in papers):
                    continue
                    
                # Add the paper to our results
                papers.append({
                    'title': title,
                    'url': url,
                    'phrase': phrase.strip()
                })
        
        # Process the best 5 results
        top_papers = papers[:5]
        
        # If we don't have enough papers, generate some with the model
        if len(top_papers) < 5:
            remaining = 5 - len(top_papers)
            paper_gen_prompt = ChatPromptTemplate.from_template(
                f"""
                Based on the key phrases {', '.join(key_phrases)}, 
                suggest {remaining} academic papers that would be related to a paper discussing these topics.
                
                For each paper, provide:
                1. A realistic paper title
                2. Author names (use et al. for multiple authors)
                3. A realistic publication year (between 2015-2024)
                4. A URL where the paper might be found
                
                Format each paper like this:
                Title: [title]
                Authors: [authors]
                Year: [year]
                URL: [url]
                """
            )
            paper_gen_chain = paper_gen_prompt | model | StrOutputParser()
            additional_papers = paper_gen_chain.invoke({})
            
            # Parse the generated papers
            current_paper = {}
            for line in additional_papers.split('\n'):
                if line.startswith('Title:'):
                    if current_paper and 'title' in current_paper:
                        top_papers.append(current_paper)
                        current_paper = {}
                    current_paper['title'] = line.replace('Title:', '').strip()
                elif line.startswith('Authors:'):
                    current_paper['authors'] = line.replace('Authors:', '').strip()
                elif line.startswith('Year:'):
                    current_paper['year'] = line.replace('Year:', '').strip()
                elif line.startswith('URL:'):
                    current_paper['url'] = line.replace('URL:', '').strip()
            
            if current_paper and 'title' in current_paper:
                top_papers.append(current_paper)
        
        # Generate descriptions for each paper
        formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
        for i, paper in enumerate(top_papers[:5], 1):
            title = paper.get('title', '')
            url = paper.get('url', '')
            
            # Generate paper metadata if missing
            authors = paper.get('authors', 'Various authors')
            if 'authors' not in paper:
                # Extract authors from title or URL if possible, otherwise use placeholder
                authors = "Various authors"
                
            year = paper.get('year', '2023')
            if 'year' not in paper:
                # Try to extract year from URL or title, otherwise use recent year
                year_match = re.search(r'20[12]\d', title)
                if year_match:
                    year = year_match.group(0)
                else:
                    year = "2023"
            
            # Generate a description based on the title and the original key phrase
            description = f"This paper relates to {paper.get('phrase', 'your research topic')} and expands on the concepts in your paper."
            
            # Add the recommendation
            formatted_recommendations += f"## {i}. {title} ({year})\n"
            formatted_recommendations += f"**Authors:** {authors}\n\n"
            formatted_recommendations += f"{description}\n"
            formatted_recommendations += f"[Access Paper]({url})\n\n"
            
        return formatted_recommendations
        
    except Exception as e:
        print(f"Error in web-based paper recommendations: {str(e)}")
        # Fallback to using the paper recommendation chain
        try:
            recommendations_data = paper_recommendation_chain.invoke(paper_content)
            recs = recommendations_data["recommendations"]  # Access as a dict
            formatted_recommendations = "# Related Research Papers You May Be Interested In\n\n"
            for i, rec in enumerate(recs, 1):
                formatted_recommendations += f"## {i}. {rec['title']} ({rec['year']})\n"
                formatted_recommendations += f"**Authors:** {rec['authors']}\n\n"
                formatted_recommendations += f"{rec['description']}\n"
                paper_url = rec['paper_url'].strip()
                if not paper_url:
                    encoded_title = re.sub(r'\s+', '+', rec['title'])
                    paper_url = f"https://scholar.google.com/scholar?q={encoded_title}"
                formatted_recommendations += f"[Access Paper]({paper_url})\n\n"
            return formatted_recommendations
        except Exception as e:
            # If everything fails, use a simple backup prompt
            backup_prompt = ChatPromptTemplate.from_template(
                """
                Based on the following research paper content:
                
                {paper_content}
                
                Provide 5 relevant related research papers that might be of interest.
                For each paper, include:
                1. Title (a real paper title if possible)
                2. Authors
                3. Year
                4. Brief description of relevance
                5. A direct URL to access the paper
                
                Format your response in markdown with clear headings and clickable links.
                """
            )
            backup_chain = backup_prompt | model | StrOutputParser()
            return backup_chain.invoke({"paper_content": paper_content})

def create_full_report(topic: str, report_content: str, recommendations_content: str) -> str:
    """Create a full markdown report combining the report and recommendations"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Report: {topic}

*Generated on: {timestamp}*

---

{report_content}

---

{recommendations_content}

---

*This report was generated by AI Research Assistant*
"""
    return full_report

def create_full_paper_analysis(filename: str, summary_content: str, recommendations_content: str) -> str:
    """Create a full markdown report for paper analysis"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    full_report = f"""
# Research Paper Analysis: {filename}

*Generated on: {timestamp}*

---

## Paper Summary

{summary_content}

---

{recommendations_content}

---

*This analysis was generated by AI Research Assistant*
"""
    return full_report

def sanitize_filename(filename: str) -> str:
    """Convert a string to a valid filename"""
    return re.sub(r'[\\/*?:"<>|]', "_", filename)

def save_markdown_file(topic: str, content: str) -> str:
    """Save content to a markdown file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = sanitize_filename(topic)
    filename = f"research_{safe_topic}_{timestamp}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

def display_markdown(content: str, use_markdown_display: bool = True):
    """Display content as rendered markdown if in IPython environment"""
    try:
        if use_markdown_display:
            display(Markdown(content))
        else:
            print(content)
    except:
        print(content)

def process_research_paper(file_path: str, original_filename: Optional[str] = None, 
                           db_path: str = "../data/research_papers.db", 
                           paper_summary_chain=None, 
                           paper_recommendation_chain=None, 
                           model=None) -> Dict[str, Any]:
    """Process a research paper file (PDF or DOCX)"""
    if not original_filename:
        original_filename = os.path.basename(file_path)
    file_extension = os.path.splitext(original_filename)[1].lower()
    if file_extension == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        file_type = 'pdf'
    elif file_extension in ['.docx', '.doc']:
        text_content = extract_text_from_docx(file_path)
        file_type = 'docx'
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
        
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=12000,
        chunk_overlap=2000
    )
    chunks = text_splitter.split_text(text_content)
    processing_text = chunks[0] if len(chunks) > 0 else text_content
    
    paper_id = save_file_to_database(original_filename, text_content, file_type, db_path=db_path)
    
    try:
        print("- Generating research paper summary...")
        summary = paper_summary_chain.invoke(processing_text)
        save_summary_to_database(paper_id, summary, db_path=db_path)
        
        print("- Finding related research papers via web search...")
        formatted_recommendations = generate_paper_recommendations(processing_text, paper_recommendation_chain, model)
        
        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "summary": summary,
            "recommendations": formatted_recommendations,
            "success": True
        }
    except Exception as e:
        print(f"Error processing research paper: {str(e)}")

        return {
            "paper_id": paper_id,
            "filename": original_filename,
            "success": False,
            "error": str(e)
        }

def upload_research_paper_file(file_data, filename: str, 
                               paper_summary_chain=None, 
                               paper_recommendation_chain=None, 
                               model=None) -> Dict[str, Any]:
    """Process an uploaded research paper file"""
    try:
        # Create a temporary file to work with
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
            temp_file.write(file_data)
            temp_path = temp_file.name
        
        # Process the paper
        result = process_research_paper(
            temp_path, 
            original_filename=filename,
            paper_summary_chain=paper_summary_chain,
            paper_recommendation_chain=paper_recommendation_chain,
            model=model
        )
        
        # Clean up the temporary file
        os.unlink(temp_path)
        
        if result["success"]:
            # Create full analysis report
            full_analysis = create_full_paper_analysis(
                filename,
                result["summary"],
                result["recommendations"]
            )
            
            # Save the full analysis to a file
            report_filename = save_markdown_file(
                os.path.splitext(filename)[0],
                full_analysis
            )
            
            result["report_filename"] = report_filename
            result["full_analysis"] = full_analysis
        
        return result
    
    except Exception as e:
        return {
            "success": False,
            "error": f"Error processing uploaded file: {str(e)}"
        }

def research_topic(topic: str, report_chain=None, recommendation_chain=None, model=None) -> Dict[str, Any]:
    """Conduct research on a given topic"""
    try:
        print(f"Researching topic: {topic}")
        
        print("- Generating detailed report...")
        report_content = generate_report(topic, report_chain)
        
        print("- Finding related topics via web search...")
        recommendations_content = generate_recommendations(topic, recommendation_chain, model)
        
        full_report = create_full_report(topic, report_content, recommendations_content)
        
        report_filename = save_markdown_file(topic, full_report)
        
        return {
            "topic": topic,
            "report_content": report_content,
            "recommendations_content": recommendations_content,
            "full_report": full_report,
            "report_filename": report_filename,
            "success": True
        }
    
    except Exception as e:
        return {
            "topic": topic,
            "success": False,
            "error": f"Error researching topic: {str(e)}"
        }

def display_results(result: Dict[str, Any], display_type: str = "report"):
    """Display research results in a formatted way"""
    if not result.get("success", False):
        print(f"Error: {result.get('error', 'Unknown error occurred')}")
        return
    
    if display_type == "report":
        print(f"Research report saved to: {result.get('report_filename', 'Unknown')}")
        display_markdown(result.get("full_report", ""))
    
    elif display_type == "paper_analysis":
        print(f"Paper analysis saved to: {result.get('report_filename', 'Unknown')}")
        display_markdown(result.get("full_analysis", ""))

def main():
    """Main function to demonstrate usage"""
    print("AI Research Assistant")
    print("--------------------")
    
    # Initialize environment
    api_key = load_environment()
    if not api_key:
        print("Error: GEMINI_API_KEY not found in environment variables or .env file")
        return
    
    # Initialize model and components
    model = initialize_model(api_key)
    TopicRecommendations, PaperRecommendations = define_output_schemas()
    report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt = create_prompt_templates()
    report_chain, recommendation_chain, paper_summary_chain, paper_recommendation_chain = create_chains(
        model, report_prompt, recommendation_prompt, paper_summary_prompt, paper_recommendation_prompt, 
        TopicRecommendations, PaperRecommendations
    )
    
    # Initialize database
    initialize_database()
    
    # Demo options
    while True:
        print("\nOptions:")
        print("1. Research a topic")
        print("2. Analyze a research paper")
        print("3. Ask questions about a paper (RAG)")
        print("4. Exit")
        
        choice = input("\nEnter choice (1-4): ")
        
        if choice == "1":
            topic = input("Enter research topic: ")
            result = research_topic(topic, report_chain, recommendation_chain, model)
            display_results(result, "report")
        
        elif choice == "2":
            file_path = input("Enter path to PDF or DOCX file: ")
            if not os.path.exists(file_path):
                print(f"Error: File not found at {file_path}")
                continue
                
            result = process_research_paper(
                file_path, 
                paper_summary_chain=paper_summary_chain,
                paper_recommendation_chain=paper_recommendation_chain,
                model=model
            )
            
            if result["success"]:
                full_analysis = create_full_paper_analysis(
                    result["filename"],
                    result["summary"],
                    result["recommendations"]
                )
                
                report_filename = save_markdown_file(
                    os.path.splitext(result["filename"])[0],
                    full_analysis
                )
                
                result["report_filename"] = report_filename
                result["full_analysis"] = full_analysis
                
            display_results(result, "paper_analysis")
        
        elif choice == "3":
            file_path = input("Enter path to research paper (PDF/DOCX): ").strip()
            if not os.path.exists(file_path):
                print(f"Error: File not found at {file_path}")
                continue
            
            # Extract text
            file_extension = os.path.splitext(file_path)[1].lower()
            text_content = ""
            try:
                if file_extension == '.pdf':
                    text_content = extract_text_from_pdf(file_path)
                elif file_extension in ['.docx', '.doc']:
                    text_content = extract_text_from_docx(file_path)
                else:
                    print("Unsupported file format. Please upload a PDF or DOCX file.")
                    continue
            except Exception as e:
                print(f"Error processing file: {str(e)}")
                continue
            
            # Split text into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            chunks = text_splitter.split_text(text_content)
            
            # Create embeddings and vector store
            try:
                embeddings = GoogleGenerativeAIEmbeddings(
                    model="models/embedding-001",
                    google_api_key=api_key
                )
                vector_store = FAISS.from_texts(chunks, embeddings)
                retriever = vector_store.as_retriever(search_kwargs={"k": 3})
            except Exception as e:
                print(f"Error creating search index: {str(e)}")
                continue
            
            # Create RAG chain
            rag_prompt = ChatPromptTemplate.from_template(
                """
                You are an AI research assistant specializing in academic papers. Your task is to provide detailed and accurate answers to questions about research papers.
                
                Relevant sections from the paper:
                {context}
                
                Question: {question}
                
                Instructions:
                1. Provide a comprehensive answer based on the content from the paper.
                2. Include specific details, explanations, and examples from the paper when relevant.
                3. If appropriate, mention figures, tables, or specific sections referenced in the text.
                4. If the question cannot be answered from the provided content, explain why and what information might be needed.
                5. Use a clear, academic style appropriate for discussing research.
                6. Structure your answer with paragraphs for readability.
                7. If you quote directly from the paper, indicate this with quotation marks.
                
                Your detailed answer:
                """
            )
            
            rag_chain = (
                {"context": retriever, "question": RunnablePassthrough()}
                | rag_prompt
                | model
                | StrOutputParser()
            )
            
            print("\nYou can now ask questions about the paper. Type 'exit' to quit.")
            while True:
                question = input("\nYour question: ").strip()
                if question.lower() in ['exit', 'quit']:
                    break
                if not question:
                    continue
                try:
                    answer = rag_chain.invoke(question)
                    print("\nAnswer:")
                    print(answer)
                except Exception as e:
                    print(f"Error generating answer: {e}")
        
        elif choice == "4":
            print("Exiting...")
            break
        
        else:
            print("Invalid choice. Please try again.")

if __name__ == "__main__":
    main()

AI Research Assistant
--------------------

Options:
1. Research a topic
2. Analyze a research paper
3. Ask questions about a paper (RAG)
4. Exit

You can now ask questions about the paper. Type 'exit' to quit.

Answer:
The section "Why Deep Learning in Today's Research and Applications?" addresses the importance of deep learning in the context of the Fourth Industrial Revolution (Industry 4.0). The paper states that the main focus of Industry 4.0 is "typically technology-driven automation, smart and intelligent systems" across various application areas like smart healthcare, business intelligence, smart cities, and cybersecurity intelligence.

Deep learning (DL) is presented as a key technology for achieving the goals of Industry 4.0 because of its ability to uncover complex architectures in high-dimensional data and its excellent learning capabilities from historical data. Thus, DL techniques can play a key role in building intelligent data-driven systems. The paper also mentions tha