<a href="https://colab.research.google.com/github/fidelis2025/stankelly-website/blob/main/Intelligent_Manuscript_Routing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pypdf python-docx

Collecting pypdf
  Downloading pypdf-6.7.4-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pypdf-6.7.4-py3-none-any.whl (331 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.5/331.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, pypdf
Successfully installed pypdf-6.7.4 python-docx-1.2.0


In [4]:
import gradio as gr
import pandas as pd
import numpy as np
import datetime
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import docx

# --------------------------------------------------
# CONFIGURATION AND NLP MODEL
# --------------------------------------------------
# Minimum cosine similarity for the best-matching journal to count as "Aligned".
SIMILARITY_THRESHOLD = 0.20

# Sentence-embedding model used for both journal scopes and manuscript abstracts.
model = SentenceTransformer("all-MiniLM-L6-v2")

# --------------------------------------------------
# JOURNAL DATABASE
# --------------------------------------------------
# Each journal is declared as a (name, scope) pair and expanded into the
# {"name": ..., "scope": ...} records the rest of the notebook consumes.
journals = [
    {"name": journal_name, "scope": journal_scope}
    for journal_name, journal_scope in (
        (
            "FNAS Journal of Mathematical Modeling and Numerical Simulation",
            "It publishes original researched articles and reviews in Mathematical Modeling and Numerical Simulation, and other related disciplines and it is international in scope.",
        ),
        (
            "FNAS Journal of Computing and Applications",
            "It publishes original researched articles and reviews in Computer Science, Computing, Robotics, IoT, Artificial Intelligence, Machine Learning and other related disciplines.",
        ),
        (
            "FNAS Journal of Mathematical and Statistical Computing",
            "It publishes original researched articles and reviews in Mathematics, Statistics, Computing and other related disciplines.",
        ),
        (
            "FNAS Journal of Applied Chemical Science Research",
            "It publishes original researched articles and reviews in Chemistry, Biochemistry, and other related disciplines.",
        ),
        (
            "FNAS Journal of Health, Sports Science and Recreation",
            "It publishes original researched articles and reviews in Health, Sports and Recreation and other related disciplines.",
        ),
        (
            "FNAS Journal of Basic and Environmental Research",
            "It publishes original researched articles and reviews in Basic and Environmental research, and other related disciplines.",
        ),
        (
            "FNAS Journal of Applied and Physical Sciences",
            "It publishes original researched articles and reviews in Applied and Physical Sciences, and other related disciplines.",
        ),
        (
            "FNAS Journal of Applied Biological Sciences",
            "It publishes original researched articles and reviews in Applied Biological Sciences, and other related disciplines.",
        ),
        (
            "FNAS Journal of Mathematics, and Science Education",
            "It publishes original researched articles and reviews in Mathematics, Science Education and other related disciplines.",
        ),
        (
            "FNAS Journal of Scientific Innovations",
            "It publishes original researched articles and reviews in Natural and Applied Sciences and other related disciplines.",
        ),
    )
]

# Tabular view of the journal catalogue (columns: "name", "scope").
journal_df = pd.DataFrame(journals)

# Scope descriptions are embedded once here so each classification only has
# to encode the incoming manuscript.
journal_embeddings = model.encode(list(journal_df["scope"]))

# In-memory session state: one record per classification, and the raw text of
# every successfully parsed upload.
classification_log, document_repository = [], []

# --------------------------------------------------
# FILE TEXT EXTRACTION
# --------------------------------------------------
# Optional section numbering ("1.", "1.0", "I.", "2)") at the start of the
# line that carries a heading, e.g. "1. Introduction". Matching it as part of
# the terminator keeps the number out of the captured section.
_HEADING_NUMBER = r"(?:\n[ \t]*(?:\d+(?:\.\d+)*|[ivx]+)[.)]?[ \t]*)?"
_SECTION_FLAGS = re.IGNORECASE | re.DOTALL

# Abstract: text between "abstract" and the next "introduction"/"keywords".
_ABSTRACT_RE = re.compile(
    r"abstract(.*?)" + _HEADING_NUMBER + r"(introduction|keywords)", _SECTION_FLAGS
)
# Keywords: text between "keywords" and the next "introduction".
_KEYWORDS_RE = re.compile(
    r"keywords(.*?)" + _HEADING_NUMBER + r"(introduction)", _SECTION_FLAGS
)
# Fallback when no "introduction" follows: first non-blank line after "keywords".
_KEYWORDS_LINE_RE = re.compile(r"keywords[\s:.\-]*([^\n]*)", re.IGNORECASE)

# Characters commonly left between a heading word and its content
# ("Abstract: ...", "Keywords - ...", en/em dashes included).
_SECTION_SEPARATORS = ":;.- \t\r\n\u2013\u2014"


def _clean_section(raw):
    """Trim whitespace and the separator punctuation left over from a heading."""
    return raw.strip().lstrip(_SECTION_SEPARATORS)


def extract_text(file):
    """Extract the title, abstract and keywords from an uploaded manuscript.

    Parameters
    ----------
    file : object with a ``.name`` attribute, str, or None
        The upload handed over by the file component. The file name (``.name``
        when present, otherwise the value itself) selects the parser: ``.pdf``
        is read with ``PdfReader``, ``.docx`` with ``docx.Document``.

    Returns
    -------
    tuple[str, str, str]
        ``(title, abstract, keywords)``. For problems the first element
        carries a message and the other two are empty:
        ``"Unsupported file format"``, ``"Error reading file: ..."`` or
        ``"Empty document"``. ``None`` input yields three empty strings.

    Notes
    -----
    * The title is the first non-blank line of the extracted text.
    * The abstract is the text between "abstract" and the next
      "introduction"/"keywords" (case-insensitive); without such a match the
      first 1000 characters of the document are used instead.
    * Keywords are the text between "keywords" and "introduction", or the
      first non-blank line after "keywords" when no "introduction" follows.
    * On success the full extracted text is appended to
      ``document_repository``.
    """
    if file is None:
        return "", "", ""

    # Accept both objects exposing `.name` and plain path strings.
    filename = str(getattr(file, "name", file)).lower()

    try:
        if filename.endswith(".pdf"):
            reader = PdfReader(file)
            # Extract every page exactly once and keep a line break between
            # pages so words and headings are not glued across page borders.
            page_texts = [page.extract_text() for page in reader.pages]
            text = "\n".join(chunk for chunk in page_texts if chunk)

        elif filename.endswith(".docx"):
            document = docx.Document(file)
            text = "".join(para.text + "\n" for para in document.paragraphs)

        else:
            return "Unsupported file format", "", ""

    except Exception as e:
        # Best effort for the UI: surface the parser error in the title box
        # instead of crashing the event handler.
        return f"Error reading file: {str(e)}", "", ""

    if text.strip() == "":
        return "Empty document", "", ""

    # Leading blank lines/paragraphs are common; skip them when picking a title.
    title = next(
        (line.strip() for line in text.splitlines() if line.strip()),
        "Untitled",
    )

    abstract_match = _ABSTRACT_RE.search(text)
    if abstract_match:
        abstract = _clean_section(abstract_match.group(1))
    else:
        abstract = text[:1000].strip()

    keywords_match = _KEYWORDS_RE.search(text) or _KEYWORDS_LINE_RE.search(text)
    keywords = _clean_section(keywords_match.group(1)) if keywords_match else ""

    document_repository.append(text)

    return title, abstract, keywords

# --------------------------------------------------
# CLASSIFICATION ENGINE
# --------------------------------------------------
def classify_manuscript(title, abstract):
    """Rank the journals by semantic similarity to a manuscript abstract.

    Parameters
    ----------
    title : str
        Manuscript title; only recorded in ``classification_log``.
    abstract : str or None
        Abstract text that is embedded and compared with every journal scope.

    Returns
    -------
    tuple
        ``(top_journal, ranked_df, confidence, alignment_status)`` where
        ``ranked_df`` has the columns ``"Journal"`` and ``"Similarity Score"``
        sorted from best to worst match, ``confidence`` is the best cosine
        similarity times 100 rounded to two decimals, and
        ``alignment_status`` is ``"Aligned"`` when the best similarity reaches
        ``SIMILARITY_THRESHOLD`` and ``"Not Aligned"`` otherwise.
        A missing or blank abstract returns ``("", None, 0, "No Abstract")``
        and logs nothing.
    """
    if abstract is None or abstract.strip() == "":
        return "", None, 0, "No Abstract"

    manuscript_embedding = model.encode(abstract)

    # One vectorised call scores the manuscript against every journal scope.
    similarities = cosine_similarity([manuscript_embedding], journal_embeddings)[0]

    # Convert to built-in floats so NumPy float32 scalars do not leak into the
    # log, the ranking table or the confidence value.
    scores = sorted(
        (
            (name, float(sim))
            for name, sim in zip(journal_df["name"].tolist(), similarities)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_journal, top_score = scores[0]

    confidence = round(top_score * 100, 2)
    alignment_status = "Aligned" if top_score >= SIMILARITY_THRESHOLD else "Not Aligned"

    ranked_df = pd.DataFrame(scores, columns=["Journal", "Similarity Score"])

    classification_log.append({
        "Title": title,
        "Top Journal": top_journal,
        "Confidence": confidence,
        "Status": alignment_status,
        "Date": datetime.datetime.now()
    })

    return top_journal, ranked_df, confidence, alignment_status

# --------------------------------------------------
# GRADIO DASHBOARD
# --------------------------------------------------
# Layout: a single tab where a manuscript is uploaded, its extracted fields
# are shown (and can be edited), and the classification result is displayed.
with gr.Blocks() as demo:

    gr.Markdown("## Intelligent Manuscript Routing and Editorial Decision Support System")

    with gr.Tab("Upload Manuscript"):

        # ---- Upload widget -------------------------------------------
        file_input = gr.File(label="Upload PDF or Word Document", file_types=[".pdf", ".docx"])

        # ---- Fields filled in by extract_text ------------------------
        title_box = gr.Textbox(label="Extracted Title")
        abstract_box = gr.Textbox(label="Extracted Abstract")
        keywords_box = gr.Textbox(label="Extracted Keywords")

        classify_btn = gr.Button("Classify Manuscript")

        # ---- Fields filled in by classify_manuscript -----------------
        result = gr.Textbox(label="Top Journal Recommendation")
        ranking = gr.Dataframe(label="Journal Ranking")
        confidence = gr.Number(label="Confidence Score")
        status = gr.Textbox(label="Alignment Status")

        # A new upload immediately populates title / abstract / keywords.
        file_input.change(
            fn=extract_text,
            inputs=file_input,
            outputs=[title_box, abstract_box, keywords_box],
        )

        # Classification runs on whatever is currently in the title and
        # abstract boxes when the button is pressed.
        classify_btn.click(
            fn=classify_manuscript,
            inputs=[title_box, abstract_box],
            outputs=[result, ranking, confidence, status],
        )

demo.launch()

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d14f446c1e01abf58d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


