In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import gradio as gr

# ----------------------------
# Configuration
# ----------------------------
# Tunable settings live in one place so that the progress messages printed
# below can never drift out of sync with the values actually used.
file_path = r"D:\Downloads\arXiv-DataFrame.csv\arXiv-DataFrame.csv"   # update this to your actual file
REQUIRED_COLUMNS = ("Title", "Summary", "Author", "Link")  # columns read by this script
MAX_FEATURES = 5000   # TF-IDF vocabulary size cap
N_CLUSTERS = 10       # number of KMeans groups
RANDOM_STATE = 42     # fixed seed so cluster assignments are reproducible

# ----------------------------
# Load Dataset
# ----------------------------
df = pd.read_csv(file_path)
print("Columns found:", df.columns.tolist())
print(f"Loaded {len(df)} papers.")

# Fail fast with a readable message instead of a bare KeyError on first use
# (for "Author"/"Link" that would otherwise only surface during a search).
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# Combine title + summary for richer embeddings; missing values become empty
# strings so the concatenation never produces NaN.
df["text"] = df["Title"].fillna('') + " " + df["Summary"].fillna('')

# ----------------------------
# Compute TF-IDF Embeddings
# ----------------------------
print("Computing TF-IDF embeddings...")
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["text"])
print("TF-IDF shape:", tfidf_matrix.shape)

# ----------------------------
# Cluster Papers
# ----------------------------
print(f"Clustering papers into {N_CLUSTERS} groups...")
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
# One cluster label per paper, stored alongside the paper's metadata.
df["cluster"] = kmeans.fit_predict(tfidf_matrix)

# ----------------------------
# Define Search Function
# ----------------------------
def search_papers(query, top_k=5):
    """Return the papers most similar to *query* as a Markdown string.

    The query is transformed with the module-level ``vectorizer`` and ranked
    against every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text search string. ``None``, empty, or whitespace-only
            input yields a prompt message instead of results.
        top_k: Maximum number of papers to return. Values below 1 yield the
            "no matches" message.

    Returns:
        Markdown with one section per matching paper (title, authors,
        cluster, similarity, summary, link), best match first; or a short
        plain message when there is nothing to show.
    """
    if query is None or not query.strip():
        return "Please enter a search query."

    # ``seq[-0:]`` is the whole sequence, so a non-positive top_k would
    # otherwise return every paper instead of none.
    top_k = int(top_k)
    if top_k < 1:
        return "No matching papers found."

    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked = similarities.argsort()[-top_k:][::-1]

    # A similarity of zero means the paper shares no vocabulary with the
    # query; listing such rows would present arbitrary papers as "results".
    top_indices = [idx for idx in ranked if similarities[idx] > 0]
    if not top_indices:
        return "No matching papers found."

    results = []
    for idx in top_indices:
        row = df.iloc[idx]  # single positional lookup per hit
        result = (
            f"### 🧠 {row['Title']}\n"
            f"**Author(s):** {row['Author']}\n\n"
            f"**Cluster:** {row['cluster']}\n\n"
            f"**Similarity:** {similarities[idx]:.2f}\n\n"
            f"**Summary:** {row['Summary']}\n\n"
            f"[🔗 View Paper]({row['Link']})\n"
            f"---\n"
        )
        results.append(result)

    return "\n".join(results)

# ----------------------------
# Gradio Interface
# ----------------------------
# Single-textbox UI: the query is passed to search_papers (top_k keeps its
# default) and the returned Markdown string is rendered as the result.
# The emoji in the labels are intentional; keep this file saved as UTF-8.
interface = gr.Interface(
    fn=search_papers,
    inputs=gr.Textbox(
        label="🔍 Enter your query",
        placeholder="e.g. backtracking algorithms for constraint satisfaction",
    ),
    outputs="markdown",
    title="📚 Academic Paper Search (Local)",
    description="Search and cluster ~50k papers using TF-IDF + KMeans. Runs locally only.",
)

# ----------------------------
# Launch Locally (No Public Link)
# ----------------------------
# Bound to the loopback address with share=False, so no public link is created.
interface.launch(server_name="127.0.0.1", server_port=7860, share=False)


Columns found: ['Unnamed: 0', 'id', 'Title', 'Summary', 'Author', 'Link', 'Publish Date', 'Update Date', 'Primary Category', 'Category']
Loaded 53474 papers.
Computing TF-IDF embeddings...
TF-IDF shape: (53474, 5000)
Clustering papers into 10 groups...
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


