In [None]:
pip install bio

In [None]:
pip install pandas

In [None]:
import pandas as pd
from Bio import Entrez
import re

# Contact email for NCBI Entrez, set on Biopython's Entrez module.
# The value below is a placeholder and must be replaced before running.
Entrez.email = "<your email>"  # Replace with your actual email

# Common English function words that carry no signal for title matching;
# they are dropped before building queries and computing scores.
STOP_WORDS = {
    "a", "and", "as", "at", "but", "by", "for", "from",
    "if", "in", "into", "of", "on", "or", "to", "the",
}


def extract_words(title):
    """Return the lowercase word tokens of *title* that are not stop words.

    Tokens are maximal runs of word characters, so punctuation such as
    hyphens splits a term into separate tokens. Order and duplicates are
    preserved.
    """
    return [
        token
        for token in re.findall(r'\b\w+\b', title.lower())
        if token not in STOP_WORDS
    ]

def search_pubmed(title, max_results=200):
    """Search PubMed for articles whose titles share words with *title*.

    The significant words of *title* (see ``extract_words``) are OR-ed
    together as ``[Title]`` terms, so any article matching at least one word
    is a candidate. Candidates are then fetched and ranked by
    ``fetch_and_rank_pubmed_titles``.

    Args:
        title: The reference title to look up.
        max_results: Maximum number of PubMed IDs requested from the search.

    Returns:
        A list of dicts with the keys ``"PubMed ID"``, ``"Title"`` and
        ``"Score"``, best match first. The list is empty when the title has
        no significant words or the search returns no IDs.
    """
    # Extract words from the title (case-insensitive) excluding stop words
    words = extract_words(title)

    if not words:
        print(f"No significant words found in title: '{title}'")
        return []

    # "OR" keeps the search broad: a hit on any single title word qualifies.
    word_query = " OR ".join(f"{word}[Title]" for word in words)

    handle = Entrez.esearch(db="pubmed", term=word_query, retmax=max_results, sort="relevance")
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    id_list = record["IdList"]
    if not id_list:
        # Nothing matched: skip the fetch step instead of sending a request
        # with an empty ID list.
        return []

    # Fetch and rank titles for each PubMed ID
    return fetch_and_rank_pubmed_titles(id_list, words, title.lower())

def calculate_similarity_score(title, words):
    """Count how many entries of *words* occur in *title* as whole words.

    Matching is case-insensitive. Each entry of *words* contributes at most
    one point, but an entry that appears several times in *words* is counted
    once per appearance.

    Args:
        title: The text to search in.
        words: Iterable of words to look for. They are matched literally.

    Returns:
        The number of entries of *words* found in *title*.
    """
    # re.escape makes every word match literally; without it a word containing
    # a regex metacharacter would raise re.error or match unintended text.
    return sum(
        1
        for word in words
        if re.search(rf'\b{re.escape(word)}\b', title, re.IGNORECASE)
    )

def fetch_and_rank_pubmed_titles(id_list, words, lowercase_title, max_display=10):
    """Fetch the PubMed records for *id_list* and rank them by word overlap.

    Args:
        id_list: PubMed IDs (strings) to fetch.
        words: Words to look for in each article title.
        lowercase_title: Not used by this function; kept so existing callers
            that pass it continue to work.
        max_display: Maximum number of ranked articles to return.

    Returns:
        Up to *max_display* dicts with the keys ``"PubMed ID"``, ``"Title"``
        and ``"Score"``, sorted by descending score. Articles with equal
        scores keep the order in which they were read from the response.
        An empty *id_list* yields an empty list without any request.
    """
    if not id_list:
        # Avoid sending an efetch request with an empty "id" parameter.
        return []

    handle = Entrez.efetch(db="pubmed", id=",".join(id_list), retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    articles = []
    for record in records["PubmedArticle"]:
        citation = record["MedlineCitation"]
        title = citation["Article"]["ArticleTitle"]
        articles.append({
            "PubMed ID": citation["PMID"],
            "Title": title,
            "Score": calculate_similarity_score(title.lower(), words),
        })

    # list.sort is stable, so ties stay in their original relative order.
    articles.sort(key=lambda article: article["Score"], reverse=True)
    return articles[:max_display]

def process_titles_from_excel(input_file, output_file, title_column="Title"):
    """Look up every title of an Excel sheet on PubMed and save the matches.

    Each non-empty cell of *title_column* is passed to ``search_pubmed``.
    Every returned article becomes one output row, and an empty separator row
    is written after each input title's group of matches.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write.
        title_column: Name of the column that holds the titles.
    """
    df = pd.read_excel(input_file)
    titles = df[title_column].dropna().tolist()  # Drop any empty cells in the title column

    results = []

    for raw_title in titles:
        # Cells may be read as non-string values (e.g. a purely numeric
        # title); the search helpers need text, so coerce those to str.
        input_title = raw_title if isinstance(raw_title, str) else str(raw_title)
        print(f"Processing title: {input_title}")

        # One output row per matched article, tagged with the input title.
        results.extend(
            {
                "Input Title": input_title,
                "PubMed ID": article["PubMed ID"],
                "Matched Title": article["Title"],
                "Score": article["Score"],
            }
            for article in search_pubmed(input_title)
        )

        # Empty row that visually separates the groups in the output sheet.
        results.append({"Input Title": "", "PubMed ID": "", "Matched Title": "", "Score": ""})

    # Convert results to a DataFrame and save to Excel
    output_df = pd.DataFrame(results)
    output_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}")

# Example usage: both paths below are placeholders and must be replaced with
# real file paths before this cell/script is run.
input_file = "<your input file>"    # Path to the input Excel file containing titles
output_file = "<your output file>"  # Path to save the output Excel file

# Read the titles from input_file, look each one up on PubMed and write the
# ranked matches to output_file.
process_titles_from_excel(input_file, output_file)
