In [1]:
# Step 0: Install Dependencies (only once)
!pip install gradio pymupdf faiss-cpu sentence-transformers transformers bertopic umap-learn hdbscan



In [2]:
!pip install langdetect



In [3]:
# Step 1: Imports & Paths Setup
import os
import sys
import fitz
import faiss
import numpy as np
import pandas as pd
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from transformers import MarianMTModel, MarianTokenizer

# Set base directory.
# The project root can be overridden with the RAG_BASE_DIR environment
# variable; the original machine-specific location stays the default so the
# notebook keeps working unchanged where it was written.
BASE_DIR = os.environ.get("RAG_BASE_DIR", r"C:\Users\intel\Desktop\draft RAG")

# Make the project's "app" directory importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
APP_DIR = os.path.join(BASE_DIR, "app")
if APP_DIR not in sys.path:
    sys.path.append(APP_DIR)

# Import citation generator
from citation_utils import generate_apa_citation

In [4]:
# Step 2: Load FAISS Index, Data & Models
# Load the paper metadata table. Missing cells are replaced with empty
# strings so the string concatenation below never meets NaN.
papers_csv_path = os.path.join(BASE_DIR, "data", "combined_final_papers.csv")
df = pd.read_csv(papers_csv_path).fillna("")
df["Combined_Text"] = df["Title"] + ". " + df["Abstract"] + ". " + df["Keyword"]

# Load the prebuilt FAISS index together with the embedding and
# summarization models used by the functions defined further down.
faiss_index_path = os.path.join(BASE_DIR, "models", "semantic_index.faiss")
index = faiss.read_index(faiss_index_path)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [5]:
# Step 3: Core Functions

# 3.1 🧠 Semantic Search
def semantic_search(query, cutoff_year=None, top_k=5):
    """Return the papers closest to ``query`` as one Markdown string.

    The query is embedded with ``embedding_model`` and looked up in the FAISS
    ``index``; each hit is read from the matching row of ``df`` and rendered
    with its title, authors, date, PDF link, APA citation and abstract.

    Args:
        query: Free-text search query.
        cutoff_year: When given, papers published in this year or later are
            dropped. Papers whose date cannot be parsed are kept.
        top_k: Number of nearest neighbours requested from the index. The
            year filter is applied afterwards, so fewer may be returned.

    Returns:
        The Markdown blocks joined by blank lines, or a "No papers found"
        message when nothing is left to show.
    """
    # UI number widgets may deliver a float; FAISS needs an integer k.
    top_k = int(top_k)
    query_embedding = embedding_model.encode([query]).astype("float32")
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        # FAISS pads the result with -1 when it has fewer than top_k hits.
        # Without this guard df.iloc[-1] would silently return the last paper.
        if idx < 0:
            continue
        paper = df.iloc[int(idx)]

        if cutoff_year is not None:
            pub_date = pd.to_datetime(paper['Published Date'], errors='coerce')
            # Unparseable dates (NaT) are treated as "unknown" and kept.
            if pd.notna(pub_date) and pub_date.year >= cutoff_year:
                continue

        citation = generate_apa_citation(
            paper['Title'], paper['Authors'], paper['Published Date'], paper['PDF Link']
        )
        results.append(
            f"📄 **{paper['Title']}**\n\n🧑‍🔬 *{paper['Authors']}* | 🗓️ {paper['Published Date']}\n"
            f"🔗 [PDF Link]({paper['PDF Link']})\n\n🧾 **Citation**: {citation}\n\n📝 **Abstract:**\n"
            f"{paper['Abstract']}\n\n---"
        )

    if not results:
        # Only mention the year filter when one was actually applied.
        if cutoff_year is None:
            return f"No papers found for the query: '{query}'"
        return f"No papers found before {cutoff_year} for the query: '{query}'"

    return "\n\n".join(results)



# 3.2 📄 PDF + Text Summarizer
def chunk_text(text, max_tokens=500):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    Words are the whitespace-separated pieces of ``text``; each chunk is
    returned as a single space-joined string. Empty text gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_tokens):
        window = tokens[start:start + max_tokens]
        pieces.append(' '.join(window))
    return pieces

def summarize_long_text(text):
    """Summarize arbitrarily long text chunk by chunk.

    The text is split with ``chunk_text`` and every chunk longer than 30
    words is passed to ``summarizer``; the partial summaries are joined with
    blank lines.

    Args:
        text: The text to summarize.

    Returns:
        The joined chunk summaries. Blank input gives an empty string. Text
        too short to produce any summary is returned unchanged (stripped)
        instead of an empty result.
    """
    if not text or not text.strip():
        return ""

    summaries = []
    for chunk in chunk_text(text):
        # Chunks of 30 words or fewer are skipped: they are shorter than the
        # min_length requested from the model.
        if len(chunk.split()) > 30:
            # truncation=True keeps a chunk that tokenizes to more than the
            # model's input limit from raising inside the pipeline.
            result = summarizer(
                chunk, max_length=130, min_length=30, do_sample=False, truncation=True
            )
            summaries.append(result[0]['summary_text'])

    if not summaries:
        # Nothing was long enough to summarize; the short text is its own summary.
        return text.strip()
    return "\n\n".join(summaries)

def summarize_pdf(pdf):
    """Extract the text of an uploaded PDF and summarize it.

    Args:
        pdf: The uploaded file. Either an object exposing the file path as
            ``.name`` or the path itself as a string. ``None`` means nothing
            was uploaded.

    Returns:
        The summary produced by ``summarize_long_text`` for the text of all
        pages, or a short prompt when no file was supplied.
    """
    if pdf is None:
        return "⚠️ Please upload a PDF file."

    # Use .name to get the actual file path; fall back to the value itself
    # when a plain path string is passed.
    pdf_path = getattr(pdf, "name", pdf)

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return summarize_long_text(text)


# 3.3 🌍 Multilingual Search

from transformers import MarianMTModel, MarianTokenizer

# (src_lang, tgt_lang) -> (model, tokenizer). Keeps models that were already
# loaded so that repeated searches do not reload them on every call.
_TRANSLATION_MODEL_CACHE = {}


def load_translation_model(src_lang, tgt_lang):
    """Return the MarianMT model and tokenizer for a language pair.

    Args:
        src_lang: Source language code as used in the model name (e.g. "fr").
        tgt_lang: Target language code as used in the model name (e.g. "en").

    Returns:
        A ``(model, tokenizer)`` tuple for
        ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``. The pair is loaded on
        first use and served from an in-memory cache afterwards.
    """
    cache_key = (src_lang, tgt_lang)
    if cache_key not in _TRANSLATION_MODEL_CACHE:
        model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _TRANSLATION_MODEL_CACHE[cache_key] = (model, tokenizer)
    return _TRANSLATION_MODEL_CACHE[cache_key]

import torch

def translate(text, model, tokenizer):
    """Translate ``text`` with the given model/tokenizer pair.

    The text is tokenized (padded and truncated), passed through
    ``model.generate`` without gradient tracking, and the first decoded
    sequence is returned with special tokens removed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]


from langdetect import detect

def multilingual_search(query, lang):
    """Search with a non-English query and translate the abstracts back.

    The query is translated to English, run through ``semantic_search``, and
    the abstract of every returned paper block is translated into the
    selected language. Everything before the abstract (title, authors, link,
    citation and the ``**Abstract:**`` heading) is left untouched.

    Args:
        query: Search query written in ``lang``.
        lang: Display name of the query language ("French", "German",
            "Spanish", "Hindi" or "Chinese").

    Returns:
        The result blocks with translated abstracts, joined in the same
        layout ``semantic_search`` produces, or an error message when the
        language is not supported.
    """
    # Display name -> language code used in the translation model names.
    lang_codes = {
        "French": "fr",
        "German": "de",
        "Spanish": "es",
        "Hindi": "hi",
        "Chinese": "zh",
    }

    if lang not in lang_codes:
        return "❌ Language not supported."

    code = lang_codes[lang]
    model_to_en, tokenizer_to_en = load_translation_model(code, "en")
    model_back, tokenizer_back = load_translation_model("en", code)

    translated_query = translate(query, model_to_en, tokenizer_to_en)
    english_results = semantic_search(translated_query)

    marker = "**Abstract:**"
    final_translations = []

    # Each paper block ends with "\n\n---"; split on it and re-join at the end.
    for block in english_results.split("\n\n---"):
        marker_pos = block.find(marker)
        if marker_pos == -1:
            # No abstract in this piece (e.g. a status message or the
            # trailing empty piece): pass it through unchanged.
            final_translations.append(block)
            continue

        # Translate only the abstract body. Sending the Markdown heading
        # through the translator as well can mangle the marker and layout.
        body_start = marker_pos + len(marker)
        abstract = block[body_start:].strip()
        if not abstract:
            final_translations.append(block)
            continue

        try:
            translated_abstract = translate(abstract, model_back, tokenizer_back)
        except Exception:
            # Best effort: one failing abstract must not lose the other results.
            final_translations.append("❌ Translation failed for one result.")
            continue

        final_translations.append(block[:body_start] + "\n" + translated_abstract)

    return "\n\n---".join(final_translations)
    


# 3.4 📎 Citation Generator
def citation_generator(title, authors, date, pdf_link):
    """Forward the four citation fields to ``generate_apa_citation``.

    Returns whatever ``generate_apa_citation`` returns for the given title,
    authors, date and PDF link.
    """
    apa_citation = generate_apa_citation(title, authors, date, pdf_link)
    return apa_citation

# 3.5 📊 Trend Visualization (graph)
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image

def show_trend_plot():
    """Render a bar chart of the ten largest BERTopic topics as an image.

    Loads the saved BERTopic model from ``BASE_DIR/models/bertopic_model``,
    takes the first ten rows of its topic table after removing the outlier
    topic (id -1), and draws a horizontal bar chart of their counts.

    Returns:
        The chart as an image opened from an in-memory PNG.
    """
    topic_model = BERTopic.load(os.path.join(BASE_DIR, "models", "bertopic_model"))
    topic_info = topic_model.get_topic_info()
    # Drop the outlier topic by id instead of by position: slicing [1:11]
    # loses a real topic whenever the model has no -1 row.
    top_topics = topic_info[topic_info['Topic'] != -1].head(10)

    fig, ax = plt.subplots(figsize=(10, 5))
    buf = BytesIO()
    try:
        ax.barh(top_topics['Name'], top_topics['Count'], color='skyblue')
        ax.set_xlabel("Paper Count")
        ax.set_title("Top 10 Research Topics")
        fig.tight_layout()
        fig.savefig(buf, format='png')
    finally:
        # Release the figure; otherwise every request leaves one open.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import gradio as gr

# Topic mapping: short selector key -> display name
topic_mapping = {
    "ai": "Artificial Intelligence",
    "deepfake": "Deepfake",
    "blockchain": "Blockchain",
    "cybersecurity": "Cyber Security",
    "machine": "Machine Learning",
    "data": "Data Science",
    "robotics": "Robotics",
    "quantum": "Quantum Computing",
    "healthcare": "Healthcare",
    "sustainability": "Sustainability",
    "innovation": "Innovation"
}

# Dummy data: random yearly paper counts per topic (placeholder, not real
# statistics). The generator is seeded so the chart is identical on every
# "Restart & Run All".
TREND_SEED = 42
_trend_rng = np.random.default_rng(TREND_SEED)
years = list(range(2010, 2021))

# Display name -> (low, high) bounds of the random counts; high is exclusive.
_count_bounds = {
    "Artificial Intelligence": (0, 20),
    "Deepfake": (0, 15),
    "Blockchain": (0, 10),
    "Cyber Security": (5, 25),
    "Machine Learning": (10, 30),
    "Data Science": (15, 40),
    "Robotics": (0, 12),
    "Quantum Computing": (0, 8),
    "Healthcare": (3, 18),
    "Sustainability": (1, 10),
    "Innovation": (0, 15),
}
data = {
    topic: _trend_rng.integers(low, high, len(years))
    for topic, (low, high) in _count_bounds.items()
}
topics_df = pd.DataFrame(data, index=years)

# Reverse column mapping: DataFrame column -> selector key.
# A column matches either a key directly or a display name, both compared
# in lower case with spaces removed.
_key_by_label = {
    label.lower().replace(' ', ''): key for key, label in topic_mapping.items()
}
col_mapping = {}
for col in topics_df.columns:
    normalized = col.lower().replace(' ', '')
    if normalized in topic_mapping:
        col_mapping[col] = normalized
    elif normalized in _key_by_label:
        col_mapping[col] = _key_by_label[normalized]

def plot_topic_trends(selected_keys):
    """Build the line chart of yearly paper counts for the selected topics.

    Args:
        selected_keys: Topic keys (keys of ``topic_mapping``) chosen in the UI.

    Returns:
        A ``(figure, message)`` pair. On success the figure is a Plotly line
        chart and the message is empty. When nothing is selected, or none of
        the selection matches a data column, the figure is ``None`` and the
        message explains why.
    """
    # None is returned for the plot in the two cases below to clear it.
    # The per-component gr.Plot.update(...) helper no longer exists in
    # current Gradio releases, so it is not used here.
    if not selected_keys:
        return None, "⚠️ Please select at least one topic."

    matched_cols = [col for col, key in col_mapping.items() if key in selected_keys]
    if not matched_cols:
        return None, "⚠️ None of the selected topics were found in the data."

    rename_dict = {col: topic_mapping[col_mapping[col]] for col in matched_cols}
    topics_df_filtered = topics_df[matched_cols].rename(columns=rename_dict)

    # Name the index explicitly so the reshape does not depend on the
    # default 'index' column name produced by reset_index().
    topics_long = (
        topics_df_filtered
        .rename_axis('Year')
        .reset_index()
        .melt(id_vars='Year', var_name='Topic', value_name='Count')
    )

    fig = px.line(topics_long, x='Year', y='Count', color='Topic',
                  title="📊 Research Paper Trends Over Time",
                  labels={'Year': 'Year', 'Count': 'Number of Papers', 'Topic': 'Topic'},
                  markers=True)
    fig.update_layout(legend_title_text='Topics', legend=dict(x=1.05, y=1), hovermode='x unified')

    return fig, ""

In [7]:
# Step 4: Gradio Interface
import gradio as gr

# 🔍 English Search
search_interface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(label="🔍 Enter Research Query (English)"),
        gr.Number(label="📅 Show papers published before year (optional)", precision=0, value=None),
        gr.Slider(label="📈 Number of papers to fetch", minimum=1, maximum=20, value=5, step=1)
    ],
    outputs=gr.Markdown(label="📄 Top Matching Papers"),
    title="📚 Semantic Search",
    description="Search academic papers using semantic understanding. Filter by year and control result count."
)

# 🌍 Multilingual Search
# The results carry the same Markdown formatting (bold titles, links) as the
# English search tab, so they are rendered with gr.Markdown rather than shown
# as raw markup in a plain Textbox.
multilingual_interface = gr.Interface(
    fn=multilingual_search,
    inputs=[
        gr.Textbox(label="🔍 Enter your query"),
        gr.Dropdown(choices=["French", "German", "Spanish", "Hindi", "Chinese"], label="🌐 Select Language")
    ],
    outputs=gr.Markdown(label="📑 Translated Results"),
    title="🌍 Multilingual Semantic Search",
    description="Enter your research query in a different language. Results will be translated back to your language."
)

# 📄 PDF Summarizer
pdf_interface = gr.Interface(
    fn=summarize_pdf,
    inputs=gr.File(label="📄 Upload PDF"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="🧾 PDF Summarizer",
    description="Upload a research PDF to summarize it."
)

# 📝 Long Text Summarizer
text_interface = gr.Interface(
    fn=summarize_long_text,
    inputs=gr.Textbox(lines=15, label="📝 Paste Long Text or Abstract"),
    outputs=gr.Textbox(label="📌 Summary"),
    title="🧠 Text Summarizer",
    description="Paste long research text for summarization."
)

# 📎 Citation Generator
citation_interface = gr.Interface(
    fn=citation_generator,
    inputs=[
        gr.Textbox(label="📘 Title"),
        gr.Textbox(label="✍️ Authors (comma-separated)"),
        gr.Textbox(label="📅 Published Date (YYYY-MM-DD)"),
        gr.Textbox(label="🔗 PDF Link")
    ],
    outputs=gr.Textbox(label="📎 APA Citation"),
    title="🧾 Citation Generator",
    description="Auto-generate APA-style citations for your papers."
)

# 📊 Interactive Trend Plot
interactive_trend_interface = gr.Interface(
    fn=plot_topic_trends,
    inputs=gr.CheckboxGroup(
        choices=list(topic_mapping),
        label="📌 Select Research Topics",
        info="Choose one or more topics to visualize trends."
    ),
    outputs=[
        gr.Plot(label="📈 Trend Over Time"),
        gr.Textbox(label="ℹ️ Status Message", interactive=False)
    ],
    title="📊 Interactive Research Paper Trends Plotter",
    description="Explore how research paper publication trends have evolved over the years for selected topics."
)

# 📈 Static Trend Image Plot
image_trend_interface = gr.Interface(
    fn=show_trend_plot,
    inputs=[],
    outputs=gr.Image(label="📊 Topic Trend Graph"),
    title="📈 Research Trends",
    description="Visualizes the top 10 most common research topics."
)

# 🚀 Launch the complete app: one tab per interface, in the order listed.
gr.TabbedInterface(
    [
        search_interface,
        multilingual_interface,
        pdf_interface,
        text_interface,
        citation_interface,
        interactive_trend_interface,
        image_trend_interface
    ],
    [
        "🔍 Search",
        "🌍 Multilingual",
        "📄 PDF Summarizer",
        "📝 Text Summarizer",
        "📎 Citation Generator",
        "📊 Interactive Trends",
        "📈 Static Trend Image"
    ]
).launch()

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


