In [None]:
#!pip install biopython

In [None]:
from Bio import Entrez

In [None]:
'''
sort_by possible strings for PubMed:
----
pub_date – descending sort by publication date
Author – ascending sort by first author
JournalName – ascending sort by journal name
relevance – default sort order, (“Best Match”) on web PubMed
'''
# Module-level setting: search_article() passes this value as the `sort`
# argument of Entrez.esearch, so change it here to reorder the search results.
sort_by = 'pub_date'

def search_article(query, number_of_articles=50, sort=None):
    """Search PubMed and return the IDs of the matching articles.

    Args:
        query: Search term, passed to ``Entrez.esearch`` as ``term``.
        number_of_articles: Maximum number of IDs to request (``retmax``).
        sort: Sort order for the results. ``None`` (the default) uses the
            module-level ``sort_by`` setting.

    Returns:
        The ``"IdList"`` entry of the parsed search response.
    """
    Entrez.email = "YourEmail@example.com"  # Always provide your email

    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=number_of_articles,
        sort=sort_by if sort is None else sort,
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]

def fetch_details(pmid):
    """Fetch a PubMed record as XML and return the parsed result.

    Note: this function does not set ``Entrez.email`` itself; in this
    notebook only ``search_article`` assigns it.

    Args:
        pmid: Value passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The object produced by ``Entrez.read`` for the efetch response.
    """
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()


In [None]:
# Example usage
query = "what are the biomarkers for Alzheimer"
pmid_list = search_article(query)
citation_arr = []
abstract_arr = []

# Lookup failures that mean "this field is not present in the parsed record".
# Catching only these (instead of a bare `except:`) lets KeyboardInterrupt and
# genuine programming errors surface.
_MISSING_FIELD = (KeyError, IndexError, TypeError, AttributeError)

if pmid_list:
    for pmid in pmid_list:  # Fetch and process every article returned by the search
        details = fetch_details(pmid)
        ##########################
        ## abstract of the article
        ##########################
        try:
            pubmed_article = details['PubmedArticle'][0]
            article = pubmed_article['MedlineCitation']['Article']
            abstract_sections = article['Abstract']['AbstractText']
        except _MISSING_FIELD:
            # No usable abstract: skip the article so both lists stay aligned.
            continue
        if isinstance(abstract_sections, str):
            abstract_sections = [abstract_sections]
        # Structured abstracts arrive as several AbstractText sections
        # (background, methods, ...); keep all of them, not only the first.
        abstract_text = ' '.join(str(section) for section in abstract_sections).strip()
        if not abstract_text:
            continue
        ##################
        ## article details
        ##################
        journal = article.get('Journal', {})
        # Not used for the citation; looked up with .get() so a record without
        # this element cannot abort the loop.
        pubmed_data = pubmed_article.get('PubmedData')

        # Article Title
        title = article.get('ArticleTitle', 'No title available')
        # Resolved before the try block so the fallback citation can always use
        # it (previously it could be undefined or left over from the prior article).
        journal_title = journal.get('Title', 'No journal title available')

        try:
            # Authors
            authors = article['AuthorList']
            author_str = ', '.join(f"{a['LastName']} {a['ForeName'][0]}" for a in authors)

            # Journal Info
            journal_issue_info = journal['JournalIssue']
            journal_volume = journal_issue_info.get('Volume', 'No volume')
            journal_issue = journal_issue_info.get('Issue', 'No issue')
            # ArticleDate may be missing or an empty list; in that case use the
            # journal issue's PubDate, and placeholders for any absent part.
            pub_date = (article.get('ArticleDate') or [journal_issue_info.get('PubDate', {})])[0]
            pub_year = pub_date.get('Year', 'No year')
            pub_month = pub_date.get('Month', 'No month')
            pub_day = pub_date.get('Day', 'No day')
            pages = article['Pagination'].get('StartPage', 'No pages')
            citation = f"{author_str}. {title}. {journal_title}. {pub_year} {pub_month} {pub_day};{journal_volume}({journal_issue}):{pages}. PMID: {pmid}."
        except _MISSING_FIELD:
            # Some bibliographic detail is missing: fall back to the short form.
            citation = f"{title}. {journal_title}. PMID: {pmid}."

        citation_arr.append(citation)
        abstract_arr.append(abstract_text)

else:
    print("No articles found")

In [None]:
# Report how many abstracts and citations were collected.
print(
    "number of abstracts: {}\nnumber of citations: {}".format(
        len(abstract_arr), len(citation_arr)
    )
)

In [None]:
from langchain_core.documents.base import Document

In [None]:
# Wrap every collected abstract in a Document, keeping the order of abstract_arr.
docs = [
    Document(page_content=abstract_body)
    for abstract_body in abstract_arr
]

In [None]:
# Peek at one document. The guard keeps this cell from raising IndexError
# when fewer than four abstracts were collected.
docs[3] if len(docs) > 3 else None

In [None]:
import os
from dotenv import load_dotenv

# Populate the process environment from the .env file one directory up.
load_dotenv("../.env")

# Resolves to None when the variable is not set.
openai_api_key = os.environ.get('openai_api_key')


In [None]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client authenticated with the key loaded from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# Embed the text of every document, in the same order as `docs`.
numeric_vectors = embeddings.embed_documents(
    [document.page_content for document in docs]
)

In [None]:
# First five components of the first embedding vector.
numeric_vectors[0][:5]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Within-cluster sum of squares (WCSS) for every cluster count from 1 up to
# one less than the number of vectors; fit() returns the fitted estimator.
wcss = [
    KMeans(n_clusters=cluster_count, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(numeric_vectors)
    .inertia_
    for cluster_count in range(1, len(numeric_vectors))
]

In [None]:
# Draw the elbow curve: WCSS against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(1, len(numeric_vectors)), wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')  # within cluster sum of squares
plt.show()

In [None]:
# NOTE(review): n_clusters equals the number of vectors, so each abstract can
# end up in its own cluster and the selection below keeps every abstract.
# Presumably this should be the cluster count read off the elbow plot — confirm.
# n_init and random_state are pinned (same values as the elbow-method cell) so
# the labels are reproducible across runs.
kmeans = KMeans(n_clusters=len(numeric_vectors), n_init=10, random_state=0)
kmeans.fit(numeric_vectors)

In [None]:
# tolist() converts the NumPy label array to plain Python ints so the output
# reads [0, 1, ...] on every NumPy version (NumPy 2 scalars print as np.int32(0)).
print(f"abstracts' category labels: \n\n{kmeans.labels_.tolist()}")

In [None]:
# Keep one representative per cluster: the first abstract seen with each label.
first_index_by_label = {}
for position, cluster_label in enumerate(kmeans.labels_):
    # setdefault stores the position only the first time a label is seen.
    first_index_by_label.setdefault(cluster_label, position)

# Dicts preserve insertion order, so both lists follow first-occurrence order.
unique_labels = list(first_index_by_label)
unique_indices = list(first_index_by_label.values())
docs_to_be_summarized = [docs[position] for position in unique_indices]

In [None]:
## Citations belonging to the abstracts chosen for summarization
np.asarray(citation_arr)[unique_indices]

In [None]:
from openai import OpenAI
# OpenAI client authenticated with the key loaded from the environment.
client = OpenAI(api_key=openai_api_key)

In [None]:
documents = docs_to_be_summarized
# System prompt that frames the summarization task.
system_message = {
    "role": "system",
    "content": """
             You are a professional biomedical researcher.
             You will be given a series of article abstracts.
             Merge the abstracts into a concise, smooth, and meaningful summary. 
             The information in your response should exclusively come from the abstracts.
                """,
}
# One user message per abstract, numbered from 0; an object without a
# `page_content` attribute contributes an empty body.
messages = [system_message] + [
    {"role": "user", "content": f"abstract {position}:\n" + getattr(document, 'page_content', '')}
    for position, document in enumerate(documents)
]


In [None]:
# Display the assembled conversation for inspection before sending it.
messages

In [None]:
# Add the user's question as the final message. The guard keeps this cell
# idempotent: re-running it does not append the same query a second time.
query_message = {"role": "user", "content": query}
if not messages or messages[-1] != query_message:
    messages.append(query_message)

In [None]:
# Send the assembled conversation to the chat-completions endpoint.
response = client.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo-1106",
    temperature=0.7,  # creativity
)
# The generated text is the message content of the first returned choice.
first_choice = response.choices[0]
summary = first_choice.message.content

In [None]:
# Show the model-generated summary of the selected abstracts.
print(summary)