In [None]:
pip install bio

In [None]:
pip install pandas

In [None]:
pip install openpyxl

In [None]:
!pip install ipython-autotime

%load_ext autotime

In [None]:

import pandas as pd
from Bio import Entrez
import re
from openpyxl import Workbook

# Contact email configured on the Entrez module for NCBI Entrez requests.
# The value below is only a placeholder: replace it before running any search.
Entrez.email = "<your email>"  # Replace with your actual email

# Short function words that carry no search value; they are removed from a
# title before its words are used for querying and scoring.
STOP_WORDS = {
    "a", "and", "as", "at", "but", "by", "for", "from",
    "if", "in", "into", "of", "on", "or", "to", "the",
}


def extract_words(text):
    """Return the lowercase word tokens of ``text`` that are not stop words.

    The text is lowercased, split into runs of word characters, and every
    token found in ``STOP_WORDS`` is discarded. Order and duplicates of the
    remaining tokens are preserved.
    """
    kept = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept

def search_pubmed(title, author=None, max_results=200):
    """Search PubMed for articles whose titles share words with ``title``.

    The title is reduced to its significant words (stop words removed), the
    words are OR-ed together as ``[Title]`` terms, and the query is optionally
    narrowed with an ``[Author]`` term. Matching articles are then fetched and
    ranked by ``fetch_and_rank_pubmed_titles``.

    Args:
        title: Title to look up. Non-string values are converted to text;
            ``None`` is treated as an empty title.
        author: Optional author name. Anything that is not a non-blank string
            (``None``, ``""``, a pandas NaN, ...) means "no author filter".
        max_results: Maximum number of PubMed IDs requested from the search.

    Returns:
        A list of dicts with keys ``"PubMed ID"``, ``"Title"`` and ``"Score"``.
        The list is empty when the title has no significant words, and holds a
        single ``"No results"`` placeholder row when the search finds nothing.
    """
    # Values read from a spreadsheet may be numbers or None; normalise to text
    # so word extraction and .lower() cannot fail on a non-string.
    title = "" if title is None else str(title)

    words = extract_words(title)
    lowercase_title = title.lower()  # Passed through to the ranking step

    if not words:
        print(f"No significant words found in title: '{title}'")
        return []

    # Any single title word is enough for a hit; ranking happens afterwards.
    word_query = " OR ".join([f"{word}[Title]" for word in words])

    # A missing spreadsheet cell arrives as float NaN, which is truthy and
    # would otherwise produce "nan[Author]". Only real, non-blank strings
    # are accepted as an author filter.
    author_term = author.strip() if isinstance(author, str) else ""
    if author_term:
        query = f"({word_query}) AND {author_term}[Author]"
    else:
        query = word_query

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results, sort="relevance")
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails.
        handle.close()

    id_list = record["IdList"]

    if not id_list:
        print(f"No results found for title: '{title}'")
        return [{"PubMed ID": "No results", "Title": "", "Score": 0}]

    return fetch_and_rank_pubmed_titles(id_list, words, lowercase_title)

def calculate_similarity_score(title, words):
    """Count how many entries of ``words`` occur in ``title`` as whole words.

    Matching is case-insensitive. Each entry of ``words`` contributes at most
    one point, but a word listed twice is counted twice.

    Args:
        title: Text to search in.
        words: Iterable of words to look for.

    Returns:
        The number of entries in ``words`` found in ``title``.
    """
    score = 0
    for word in words:
        # re.escape keeps characters such as "." or "(" literal instead of
        # letting them act as regex syntax. The lookarounds behave like \b
        # for ordinary words, but still anchor correctly when the word itself
        # starts or ends with a non-word character (e.g. "c++").
        pattern = rf'(?<!\w){re.escape(word)}(?!\w)'
        if re.search(pattern, title, re.IGNORECASE):
            score += 1
    return score

def fetch_and_rank_pubmed_titles(id_list, words, lowercase_title, max_display=10):
    """Fetch the titles of the given PubMed IDs and rank them by word overlap.

    Args:
        id_list: PubMed IDs to fetch.
        words: Significant words of the input title, used for scoring.
        lowercase_title: Lowercased input title. Not used by the ranking; kept
            so existing callers keep working.
        max_display: Maximum number of ranked articles to return.

    Returns:
        Up to ``max_display`` dicts with keys ``"PubMed ID"``, ``"Title"`` and
        ``"Score"``, highest score first. Articles with equal scores keep the
        order in which they were returned by the fetch.
    """
    # Nothing to fetch: avoid sending a request with an empty ID list.
    if not id_list:
        return []

    # str() lets callers pass integer IDs as well as strings.
    handle = Entrez.efetch(db="pubmed", id=",".join(str(pmid) for pmid in id_list), retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails.
        handle.close()

    articles = []
    for record in records["PubmedArticle"]:
        citation = record["MedlineCitation"]
        title = citation["Article"]["ArticleTitle"]
        articles.append({
            "PubMed ID": citation["PMID"],
            "Title": title,
            "Score": calculate_similarity_score(title.lower(), words),
        })

    # list.sort is stable, so ties stay in fetch order.
    articles.sort(key=lambda article: article["Score"], reverse=True)
    return articles[:max_display]

def process_titles_from_excel(input_file, output_file, title_column="Title", author_column="Author"):
    """Look up every title of an Excel sheet on PubMed and save the matches.

    Rows whose title cell is empty are skipped. The author column is optional:
    when it is absent, or a row's author cell is empty or blank, the search
    for that row runs without an author filter.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write.
        title_column: Name of the column holding the titles.
        author_column: Name of the column holding the authors.

    Returns:
        None. One row per matched article is written to ``output_file`` with
        the columns ``Input Title``, ``Author``, ``PubMed ID``,
        ``Matched Title`` and ``Score``.
    """
    df = pd.read_excel(input_file)

    # Filter whole rows rather than just the title column, so each title stays
    # paired with the author from the same row.
    rows = df[df[title_column].notna()]
    titles = rows[title_column].tolist()
    if author_column in rows.columns:
        raw_authors = rows[author_column].tolist()
    else:
        raw_authors = [None] * len(titles)

    results = []

    for input_title, raw_author in zip(titles, raw_authors):
        # Empty cells are read as NaN, which is truthy; map them (and blank
        # strings) to None so they are never sent as an author filter.
        author_text = "" if pd.isna(raw_author) else str(raw_author).strip()
        input_author = author_text or None

        print(f"Processing title: {input_title} with author: {input_author if input_author else 'N/A'}")

        # Numeric cells are passed as text so the search can tokenize them.
        top_articles = search_pubmed(str(input_title), input_author)

        for article in top_articles:
            results.append({
                "Input Title": input_title,
                "Author": input_author if input_author else "",
                "PubMed ID": article["PubMed ID"],
                "Matched Title": article["Title"],
                "Score": article["Score"]
            })

    # Explicit columns keep the header row even when nothing matched.
    output_df = pd.DataFrame(
        results,
        columns=["Input Title", "Author", "PubMed ID", "Matched Title", "Score"],
    )
    output_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}")

# Example usage
# Both paths below are placeholders and must be replaced with real file paths
# before this cell/script is run.
input_file = "<input file>"    # Path to the input Excel file containing titles and authors
output_file = "<output file>"  # Path to save the output Excel file

# Process titles and save results to an output file.
# This call runs immediately at top level and uses the default column names
# ("Title" and "Author") of process_titles_from_excel.
process_titles_from_excel(input_file, output_file)
