<a href="https://colab.research.google.com/github/elrf3lipes/Python_Automation_Projects/blob/master/Pubmed_Clinical_Trials_data_extraction_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PubMed - Search query
Enter a search expression, an email address, and a date range (YYYY/MM/DD) to get the total number of matching publications and up to 10 related PMIDs.

In [None]:
# @title
!pip install Bio

from Bio import Entrez
from Bio import Medline
from datetime import date

MAX_COUNT = 10
TERM = input("Type your search term: ")

# Set the contact email before any Entrez request is issued.
Entrez.email = input("Enter your email: ")

# Collect every input first so the progress message below is printed only
# once the search is actually about to run.
min_date = input("Start date: ")
max_date = input("End date: ")

print('Getting {0} publications containing {1}...'.format(MAX_COUNT, TERM))
search_handle = Entrez.esearch(
    db='pubmed',
    retmax=MAX_COUNT,
    term=TERM,
    mindate=min_date,
    maxdate=max_date,
    datetype="pdat",
)
try:
    result = Entrez.read(search_handle)
finally:
    # Release the connection even if parsing the response fails.
    search_handle.close()

print('Total number of publications containing {0}: {1}'.format(TERM, result['Count']))
ids = result['IdList']

# Only fetch MEDLINE records when the search returned something; an efetch
# call with an empty ID list has nothing to retrieve.
if ids:
    fetch_handle = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    try:
        # Materialize the records now: Medline.parse is lazy and would
        # otherwise be reading from a handle that has already been closed.
        records = list(Medline.parse(fetch_handle))
    finally:
        fetch_handle.close()
else:
    records = []

# Print each ID one per line
for article_id in ids:
    print(article_id)

# Search for PMID affiliations
Enter a PMID to get all the affiliations listed for that article.

In [None]:
# @title
import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Local import: urlencode is only needed to build the request URL below.
from urllib.parse import urlencode

# Get the article ID from the user
article_id = input("Enter the article ID: ").strip()

# Build the efetch URL. urlencode escapes the user-supplied value so stray
# characters (spaces, '&', '#') cannot corrupt the query string.
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + urlencode(
    {"db": "pubmed", "retmode": "xml", "id": article_id}
)

# Fetch the XML; the context manager closes the connection even if read() fails
with urlopen(efetch_url) as response:
    xml_data = response.read()

# Parse the XML data
root = ET.fromstring(xml_data)

# Collect every Affiliation found under each Author element
affiliations = []
for author in root.findall(".//Author"):
    # findall (not find) so an author with several affiliations keeps all of them
    for affiliation_node in author.findall(".//Affiliation"):
        # itertext() also covers elements whose .text is None or whose text
        # is split across nested markup, where .text.strip() would fail or truncate
        affiliation_text = "".join(affiliation_node.itertext()).strip()
        if affiliation_text:
            affiliations.append(affiliation_text)

# Print affiliations, if any were found
if affiliations:
    print("Affiliations:")
    for i, affiliation in enumerate(affiliations, 1):
        print(f"{i}. {affiliation}")
else:
    print("No affiliations found for this article.")


Enter the article ID: 20301790
Affiliations:
1. Department of Pediatrics, Amalia Children’s Hospital, Radboud University Medical Center, Nijmegen, the Netherlands
2. Department of Neurology, Donders Institute for Brain, Cognition and Behaviour;, Radboud University Medical Center, Nijmegen, the Netherlands
3. Amalia Children's Hospital, Radboud University Medical Center, Nijmegen, the Netherlands
4. Department of Human Genetics, Radboud University Medical Center, Nijmegen, the Netherlands
5. Department of Pediatric Neurology, Donders Institute for Brain, Cognition and Behaviour;, Amalia Children's Hospital, Radboud University Medical Center, Nijmegen, the Netherlands


# Affiliations parser
Enter a PMID to get the simplified affiliation names, collapsed, with their counts.

In [None]:
# @title
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import re
import pandas as pd
from IPython.display import display

# Local import: urlencode is only needed to build the request URL below.
from urllib.parse import urlencode

# Get the article ID from the user
article_id = input("Enter the article ID: ").strip()

# Build the efetch URL. urlencode escapes the user-supplied value so stray
# characters (spaces, '&', '#') cannot corrupt the query string.
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + urlencode(
    {"db": "pubmed", "retmode": "xml", "id": article_id}
)

# Fetch the XML; the context manager closes the connection even if read() fails
with urlopen(efetch_url) as response:
    xml_data = response.read()

# Parse the XML data
root = ET.fromstring(xml_data)

# Collect every Affiliation found under each Author element (nothing is
# printed here; the list feeds the phrase counter defined below)
affiliations = []
for author in root.findall(".//Author"):
    # findall (not find) so an author with several affiliations keeps all of them
    for affiliation_node in author.findall(".//Affiliation"):
        # itertext() also covers elements whose .text is None or whose text
        # is split across nested markup, where .text.strip() would fail or truncate
        affiliation_text = "".join(affiliation_node.itertext()).strip()
        if affiliation_text:
            affiliations.append(affiliation_text)

def extract_phrases_and_countries(affiliations):
    """Count the institution-level phrase found in each affiliation sentence.

    Every affiliation is split into sentences on '.', and every sentence into
    comma-separated groups. The first group of a sentence that contains one of
    the keywords (Hospital, University, Institute, School, Academy, Clinic) is
    tallied; any later groups in that sentence are ignored.

    Keys are normalized before counting: typographic apostrophes become "'"
    and whitespace runs collapse to a single space. Without this, spelling
    variants of the same institution (e.g. "Children’s" vs "Children's")
    end up in separate rows.

    Note: despite the function name, no country information is extracted.

    Args:
        affiliations: Iterable of affiliation strings. None or empty entries
            are skipped.

    Returns:
        pandas.DataFrame with columns 'Affiliation' and 'Count', one row per
        distinct phrase, in first-seen order.
    """
    # Keywords that mark a group as naming an institution (case-sensitive)
    word_pattern = re.compile(r'\b(?:Hospital|University|Institute|School|Academy|Clinic)\b')

    # Map left/right single quotation marks to the ASCII apostrophe
    apostrophes = str.maketrans({"\u2018": "'", "\u2019": "'"})

    extracted_phrases = {}

    for affiliation in affiliations:
        if not affiliation:
            continue

        # Split the affiliation into sentences using '.'
        for sentence in affiliation.split('.'):
            # Split the sentence into groups using ','
            for group in sentence.split(','):
                if word_pattern.search(group):
                    group_key = " ".join(group.translate(apostrophes).split())
                    extracted_phrases[group_key] = extracted_phrases.get(group_key, 0) + 1
                    # Only the first matching group per sentence is counted
                    break

    # dicts preserve insertion order, so rows come out in first-seen order
    df = pd.DataFrame({
        'Affiliation': list(extracted_phrases.keys()),
        'Count': list(extracted_phrases.values()),
    })

    return df

# Tally the institution phrases across the affiliations gathered above
result_df = extract_phrases_and_countries(affiliations)

# Number formatting for the rendered table: '.' groups thousands, ',' marks decimals
number_format = {"precision": 3, "thousands": ".", "decimal": ","}
styled_df = result_df.style.format(**number_format)

# Render the styled table in the cell output
display(styled_df)

Enter the article ID: 20301790


Unnamed: 0,Affiliation,Count
0,Amalia Children’s Hospital,1
1,Donders Institute for Brain,2
2,Amalia Children's Hospital,1
3,Radboud University Medical Center,1


# ClinicalTrials - Search query
Enter a search expression to get the total number of matching studies and up to 10 related NCT IDs with their respective affiliations.

In [None]:
# @title
!pip install pytrials

from pytrials.client import ClinicalTrials
import pandas as pd

# Prompt the user for the search expression
search_expr = input("Enter your search expression: ")

# Upper bound on the number of studies requested; shared by both queries
# below so the two limits cannot drift apart.
MAX_STUDIES = 10

# Initialize the ClinicalTrials client
ct = ClinicalTrials()

# Get up to MAX_STUDIES full studies matching the search expression.
# NOTE(review): full_studies is not used later in this cell - confirm it is
# still needed before keeping this extra request.
full_studies = ct.get_full_studies(search_expr=search_expr, max_studies=MAX_STUDIES)

# Get the listed fields for up to MAX_STUDIES matching studies, in csv format.
# NOTE(review): these field names depend on the installed pytrials release;
# the install line above is unpinned - confirm they are still accepted.
corona_fields = ct.get_study_fields(
    search_expr=search_expr,
    fields=["NCTId", "Condition", "StudyFirstPostDate", "CompletionDate", "OrgFullName"],
    max_studies=MAX_STUDIES,
    fmt="csv",
)

# Get the count of studies related to the user-provided search expression.
study_count = ct.get_study_count(search_expr=search_expr)

# Display the count of studies
print(f"Total studies related to '{search_expr}': {study_count}")

# Load the csv rows into pandas: the first row supplies the column names,
# the remaining rows are the records
df = pd.DataFrame.from_records(corona_fields[1:], columns=corona_fields[0])

# Apply styling to the DataFrame for formatting
styled_df = (
    df.style
    .format(precision=3, thousands=".", decimal=",")
    .set_table_styles([{'selector': 'th', 'props': [('font-size', '12px')]}])  # You can customize styling as needed
)

# Display the styled DataFrame
styled_df

# Next-steps:
Produce a report and export the results to CSV or other file types.