# Getting Query from PubMed

In [1]:
from pathlib import Path
import requests
import time

# 1. Create a list of all the URLs you want to download
URLS = [
    "https://pubmed.ncbi.nlm.nih.gov/27242579/",
    "https://pubmed.ncbi.nlm.nih.gov/32457675/",
    "https://pubmed.ncbi.nlm.nih.gov/32528365/"
]

# Set browser-like headers to avoid being blocked by the site
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://pubmed.ncbi.nlm.nih.gov/",
}

# Create a directory to store the output files
output_dir = Path("html_outputs")
output_dir.mkdir(exist_ok=True)


# Use a single session for all requests for efficiency
with requests.Session() as s:
    s.headers.update(headers)
    
    # 2. Loop through each URL in the list
    for i, url in enumerate(URLS):
        print(f"Downloading URL {i+1}/{len(URLS)}: {url[:50]}...")
        
        try:
            # 3. Create a unique filename for each URL
            outfile = output_dir / f"output_{i+1}.html"

            resp = s.get(url, timeout=30)
            resp.raise_for_status()  # raise an error for non-200 responses

            # Use server-provided encoding when available; default to utf-8
            if not resp.encoding:
                resp.encoding = "utf-8"
                
            outfile.write_text(resp.text, encoding=resp.encoding)

            print(f"-> Saved {outfile.resolve()} ({outfile.stat().st_size:,} bytes)\n")

        except requests.exceptions.RequestException as e:
            print(f"!! Failed to download URL {i+1}. Error: {e}\n")
        
        # Optional: Add a small delay to be respectful to the server
        time.sleep(1)

print("All downloads complete.")

Downloading URL 1/3: https://pubmed.ncbi.nlm.nih.gov/27242579/...
-> Saved C:\Users\User\Desktop\Psychoinformatics Neuroinformatics\ETL_HW\html_outputs\output_1.html (134,094 bytes)

Downloading URL 2/3: https://pubmed.ncbi.nlm.nih.gov/32457675/...
-> Saved C:\Users\User\Desktop\Psychoinformatics Neuroinformatics\ETL_HW\html_outputs\output_2.html (138,335 bytes)

Downloading URL 3/3: https://pubmed.ncbi.nlm.nih.gov/32528365/...
-> Saved C:\Users\User\Desktop\Psychoinformatics Neuroinformatics\ETL_HW\html_outputs\output_3.html (128,509 bytes)

All downloads complete.


# Extracting PMID from the HTMLs

In [16]:
from bs4 import BeautifulSoup
from pathlib import Path

# 1. Define the directory containing your HTML files
INPUT_DIR = Path("html_outputs")

# 2. Create a list to store the results as dictionaries
extraction_results = []

# 3. Find all .html files in the directory and loop through them
html_files = list(INPUT_DIR.glob("*.html"))
print(f"Found {len(html_files)} HTML files to process in '{INPUT_DIR}'...\n")

for html_file in html_files:
    print(f"Processing: {html_file.name}")
    
    html_text = html_file.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')
    
    # Find the specific meta tag
    meta = soup.find('meta', attrs={'name': 'log_displayeduids'})
    
    if meta:
        pmids_str = meta.get('content', '')
        if pmids_str:
            # Split the string by comma to get individual IDs
            pmids = pmids_str.split(',')
            
            # For each PMID found, add it to our results with its source file
            for pmid in pmids:
                cleaned_pmid = pmid.strip()
                if cleaned_pmid: # Ensure it's not an empty string
                    extraction_results.append({
                        'source_file': html_file.name,
                        'pmid': cleaned_pmid
                    })
            
            print(f"  -> Found {len(pmids)} PMIDs.\n")
        else:
            print("  -> Meta tag found, but it has no content.\n")
    else:
        print(f"  -> WARNING: Could not find the 'log_displayeduids' meta tag in this file.\n")

## **Extraction Results**

# 4. Print the final results directly to the console
print(f"{'Source File':<25} | {'PMID'}")
print(f"{'-'*25}-|--------")

if not extraction_results:
    print("No PMIDs were found.")
else:
    for item in extraction_results:
        print(f"{item['source_file']:<25} | {item['pmid']}")

print("\n" + "="*40)
print("Processing complete!")
print(f"Total PMIDs extracted: {len(extraction_results)}")
extraction_results[:3]
pmids_only = [item['pmid'] for item in extraction_results]
print(pmids_only)


Found 3 HTML files to process in 'html_outputs'...

Processing: After the Honeymoon.html
  -> Found 1 PMIDs.

Processing: Attractive Alternative Partners.html
  -> Found 1 PMIDs.

Processing: Lucky Guy in Love.html
  -> Found 1 PMIDs.

Source File               | PMID
--------------------------|--------
After the Honeymoon.html  | 32457675
Attractive Alternative Partners.html | 32528365
Lucky Guy in Love.html    | 27242579

Processing complete!
Total PMIDs extracted: 3
['32457675', '32528365', '27242579']


# Extracting PMC ID from HTMLs

In [18]:
from bs4 import BeautifulSoup
from pathlib import Path

# 1. Define the directory containing your HTML files
INPUT_DIR = Path("html_outputs")

# 2. Create a list to store the results
pmc_results = []

# 3. Find all .html files in the directory and loop through them
html_files = list(INPUT_DIR.glob("*.html"))
print(f"Found {len(html_files)} HTML files to process in '{INPUT_DIR}'...\n")

for html_file in html_files:
    print(f"Processing: {html_file.name}")
    
    html_text = html_file.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')
    
    # Find the meta tag with name="keywords"
    keywords_meta_tag = soup.find('meta', attrs={'name': 'keywords'})
    
    pmc_id_found = False
    if keywords_meta_tag:
        # Get the content string, which contains pmid, pmcid, doi, etc.
        content_str = keywords_meta_tag.get('content', '')
        
        # Split the string by commas to get individual parts
        content_parts = content_str.split(',')
        
        # Loop through the parts to find the one starting with "PMC"
        for part in content_parts:
            # .strip() removes any leading/trailing whitespace
            cleaned_part = part.strip()
            if cleaned_part.startswith('PMC'):
                pmc_results.append({
                    'source_file': html_file.name,
                    'pmc_id': cleaned_part
                })
                print(f"  -> Found PMC ID: {cleaned_part}\n")
                pmc_id_found = True
                break # Stop searching once the PMC ID is found
    
    # If the loop finishes and no PMC ID was found
    if not pmc_id_found:
        print(f"  -> No PMC ID found in this file.\n")

## **Extraction Results**

# 4. Print the final results directly to the console
print(f"{'Source File':<25} | {'PMC ID'}")
print(f"{'-'*25}-|-----------")

if not pmc_results:
    print("No PMC IDs were found in any of the files.")
else:
    for item in pmc_results:
        print(f"{item['source_file']:<25} | {item['pmc_id']}")

print("\n" + "="*40)
print("Processing complete!")
print(f"Total files with PMC IDs: {len(pmc_results)}")

Found 3 HTML files to process in 'html_outputs'...

Processing: After the Honeymoon.html
  -> Found PMC ID: PMC7223160

Processing: Attractive Alternative Partners.html
  -> Found PMC ID: PMC7264388

Processing: Lucky Guy in Love.html
  -> Found PMC ID: PMC4863427

Source File               | PMC ID
--------------------------|-----------
After the Honeymoon.html  | PMC7223160
Attractive Alternative Partners.html | PMC7264388
Lucky Guy in Love.html    | PMC4863427

Processing complete!
Total files with PMC IDs: 3


# Extracted Neuroimaging Coordinates using Gemini 2.5 Pro

In [19]:
import pandas as pd

# The extracted data stored in a list of lists
data = [
    ["After the Honeymoon", 3, -6, -24, 57],
    ["After the Honeymoon", 3, 15, -15, -12],
    ["After the Honeymoon", 3, 54, 21, 3],
    ["After the Honeymoon", 4, 6, -21, -21],
    ["After the Honeymoon", 4, 3, -33, -21],
    ["After the Honeymoon", 4, 39, -27, -9],
    ["After the Honeymoon", 4, 15, -90, 3],
    ["After the Honeymoon", 4, 45, -78, 24],
    ["After the Honeymoon", 5, -3, -15, -21],
    ["After the Honeymoon", 5, 0, 0, 23],
    ["After the Honeymoon", 5, 3, 0, 24],
    ["Attractive Alternative Partners", 1, -4, 22, 32],
    ["Attractive Alternative Partners", 1, -4, 40, 18],
    ["Attractive Alternative Partners", 1, 0, 26, 26],
    ["Attractive Alternative Partners", 1, 2, 50, 0],
    ["Attractive Alternative Partners", 3, 0, 64, 2],
    ["Attractive Alternative Partners", 3, -56, -14, -4],
    ["Attractive Alternative Partners", 3, 50, -2, -22],
    ["Attractive Alternative Partners", 3, 2, -6, 6],
    ["Attractive Alternative Partners", 3, 2, -90, 0],
    ["Attractive Alternative Partners", 3, -20, 16, -4],
    ["Attractive Alternative Partners", 3, -16, -26, -12],
    ["Attractive Alternative Partners", 3, -26, -98, -2],
    ["Attractive Alternative Partners", 3, 40, -10, -24],
    ["Attractive Alternative Partners", 3, 28, -98, -10],
    ["Attractive Alternative Partners", 3, -38, -20, -12],
    ["Attractive Alternative Partners", 3, -36, -74, 56],
    ["Attractive Alternative Partners", 3, -34, -78, 54],
    ["Attractive Alternative Partners", 3, 6, -26, 22],
    ["Attractive Alternative Partners", 3, -6, -30, 20],
    ["Lucky Guy in Love", 2, 8, 12, 58],
    ["Lucky Guy in Love", 2, -30, 22, 4],
    ["Lucky Guy in Love", 2, 42, 10, 0],
    ["Lucky Guy in Love", 2, -6, 14, 42],
    ["Lucky Guy in Love", 2, -62, -22, 34],
    ["Lucky Guy in Love", 2, 68, -24, 38],
    ["Lucky Guy in Love", 2, 18, -70, -22],
    ["Lucky Guy in Love", 2, 32, -94, -8],
    ["Lucky Guy in Love", 2, 18, -88, 20],
    ["Lucky Guy in Love", 2, 46, -60, 28],
    ["Lucky Guy in Love", 2, -10, -56, 20],
    ["Lucky Guy in Love", 2, 26, 28, 44],
    ["Lucky Guy in Love", 2, 8, 42, -12],
    ["Lucky Guy in Love", 2, 62, -4, -18],
    ["Lucky Guy in Love", 2, -46, -78, 30],
    ["Lucky Guy in Love", 2, 62, -4, -18],
    ["Lucky Guy in Love", 3, -36, -12, 44],
    ["Lucky Guy in Love", 3, -6, 10, 34],
    ["Lucky Guy in Love", 3, -10, -26, 16],
    ["Lucky Guy in Love", 3, -12, -54, 54],
    ["Lucky Guy in Love", 3, 36, 12, 14],
    ["Lucky Guy in Love", 3, 54, -58, 34],
    ["Lucky Guy in Love", 3, 16, 36, 46],
    ["Lucky Guy in Love", 3, 12, 54, 20],
]

# Define the column headers
headers = ["Article", "Table", "X", "Y", "Z"]

# Create a pandas DataFrame
df = pd.DataFrame(data, columns=headers)

# Save the DataFrame to a CSV file
# index=False prevents pandas from writing row indices into the file
df.to_csv("coordinates.csv", index=False)

print("✅ Successfully created coordinates.csv in your folder!")

✅ Successfully created coordinates.csv in your folder!


# Extracting the keywords from HTMLs

In [20]:
from bs4 import BeautifulSoup, NavigableString
from pathlib import Path
import re

# 1. Define the directory containing your HTML files
INPUT_DIR = Path("html_outputs")

# 2. Create a list to store the results
keyword_results = []

# 3. Find all .html files and loop through them
html_files = list(INPUT_DIR.glob("*.html"))
print(f"Found {len(html_files)} HTML files to process in '{INPUT_DIR}'...\n")

for html_file in html_files:
    print(f"Processing: {html_file.name}")
    
    html_text = html_file.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')
    
    # Find the <strong> tag that contains the text "Keywords:"
    # We use a regex with re.IGNORECASE to match "Keywords:" or "keywords:"
    keyword_tag = soup.find('strong', string=re.compile(r'Keywords:', re.IGNORECASE))
    
    keywords_found = False
    if keyword_tag:
        # The keywords are in the text node immediately following the <strong> tag
        next_element = keyword_tag.next_sibling
        
        # Check if the next element is actually text (a NavigableString)
        if next_element and isinstance(next_element, NavigableString):
            # .strip() removes whitespace, .rstrip('.') removes the final period
            keyword_str = next_element.strip().rstrip('.')
            
            # Split the string by semicolon and clean up each keyword
            keywords = [kw.strip() for kw in keyword_str.split(';')]
            
            keyword_results.append({
                'source_file': html_file.name,
                'keywords': keywords
            })
            print(f"  -> Found {len(keywords)} keywords.\n")
            keywords_found = True

    if not keywords_found:
        print(f"  -> No keywords section found in this file.\n")

## **Extraction Results**

# 4. Print the final results
print(f"{'Source File':<25} | {'Keywords'}")
print(f"{'-'*25}-|-----------")

if not keyword_results:
    print("No keywords were found in any of the files.")
else:
    for item in keyword_results:
        # ', '.join() converts the list of keywords into a nice string for printing
        keywords_str = ', '.join(item['keywords'])
        print(f"{item['source_file']:<25} | {keywords_str}")

print("\n" + "="*40)
print("Processing complete!")
print(f"Total files with keywords: {len(keyword_results)}")

Found 3 HTML files to process in 'html_outputs'...

Processing: After the Honeymoon.html
  -> Found 5 keywords.

Processing: Attractive Alternative Partners.html
  -> Found 5 keywords.

Processing: Lucky Guy in Love.html
  -> Found 6 keywords.

Source File               | Keywords
--------------------------|-----------
After the Honeymoon.html  | dopamine, fMRI, marriage, pair-bonds, romantic love
Attractive Alternative Partners.html | attention to alternatives, close relationship, romantic love, self-expansion, social neuroscience
Lucky Guy in Love.html    | AI, MPFC, aMCC, fMRI, intrasexual competition, pain empathy

Processing complete!
Total files with keywords: 3


# Final Result

In [28]:
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from pathlib import Path
import re

## ----------------------------------------------------------------
## STEP 1: EXTRACT METADATA FROM ALL HTML FILES
## ----------------------------------------------------------------

INPUT_DIR = Path("html_outputs")
html_files = list(INPUT_DIR.glob("*.html"))
extracted_metadata = []

print("--- Starting Metadata Extraction ---")
for html_file in html_files:
    print(f"Processing: {html_file.name}")
    html_text = html_file.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')
    
    # Initialize variables for this file
    title = None
    pmid = None
    pmcid = None
    keywords = []
    
    # --- Extract Title ---
    title_tag = soup.find('h1', class_='heading-title')
    if title_tag:
        title = title_tag.get_text(strip=True)

    # --- Extract PMID and PMCID from the keywords meta tag ---
    keywords_meta_tag = soup.find('meta', attrs={'name': 'keywords'})
    if keywords_meta_tag:
        content_str = keywords_meta_tag.get('content', '')
        for part in content_str.split(','):
            cleaned_part = part.strip()
            if cleaned_part.startswith('pmid:'):
                pmid = cleaned_part.replace('pmid:', '').strip()
            elif cleaned_part.startswith('PMC'):
                pmcid = cleaned_part
    
    # --- Extract Keywords from the body ---
    keyword_strong_tag = soup.find('strong', string=re.compile(r'Keywords:', re.IGNORECASE))
    if keyword_strong_tag:
        next_element = keyword_strong_tag.next_sibling
        if next_element and isinstance(next_element, NavigableString):
            keyword_str = next_element.strip().rstrip('.')
            keywords = [kw.strip() for kw in keyword_str.split(';')]

    # Store all found data for this file
    extracted_metadata.append({
        'Title': title,
        'PMID': pmid,
        'PMCID': pmcid,
        'Keywords': '; '.join(keywords) # Join list into a single string
    })

# Convert the extracted data into a DataFrame
metadata_df = pd.DataFrame(extracted_metadata)
print("\n--- Metadata Extraction Complete ---")
print("Found the following articles:")
print(metadata_df[['Title', 'PMID']])


## ----------------------------------------------------------------
## STEP 2: LOAD YOUR COORDINATE DATA
## ----------------------------------------------------------------
coords_df = pd.read_csv("coordinates.csv")


## ----------------------------------------------------------------
## STEP 3: MAP THE TITLES TO MERGE THE DATASETS
## ----------------------------------------------------------------

title_map = {
    'After the Honeymoon': 'After the Honeymoon: Neural and Genetic Correlates of Romantic Love in Newlywed Marriages',
    'Attractive Alternative Partners': 'Manipulation of Self-Expansion Alters Responses to Attractive Alternative Partners',
    'Lucky Guy in Love': "Decreased Empathic Responses to the 'Lucky Guy' in Love: The Effect of Intrasexual Competition"
}

# Use the map to create a new 'Title' column in the coords_df for merging
coords_df['Title'] = coords_df['Article'].map(title_map)


## ----------------------------------------------------------------
## STEP 4: MERGE DATAFRAMES AND FINALIZE THE TABLE
## ----------------------------------------------------------------
# Merge the two dataframes using the 'Title' column as the key
final_df = pd.merge(coords_df, metadata_df, on='Title')

# Select and reorder columns to match your desired output
final_df = final_df[[
    'PMID',
    'PMCID',
    'Keywords',
    'Table',
    'X',
    'Y',
    'Z'
]]

print("\n--- Final Merged Table ---")
print(final_df.to_string())

# Optional: Save the final table to a new CSV file
final_df.to_csv('final_merged_data.csv', index=False)
print("\n✅ Successfully saved the final table to 'final_merged_data.csv'")

--- Starting Metadata Extraction ---
Processing: After the Honeymoon.html
Processing: Attractive Alternative Partners.html
Processing: Lucky Guy in Love.html

--- Metadata Extraction Complete ---
Found the following articles:
                                               Title      PMID
0  After the Honeymoon: Neural and Genetic Correl...  32457675
1  Manipulation of Self-Expansion Alters Response...  32528365
2  Decreased Empathic Responses to the 'Lucky Guy...  27242579

--- Final Merged Table ---
        PMID       PMCID                                                                                           Keywords  Table   X   Y   Z
0   32457675  PMC7223160                                                dopamine; fMRI; marriage; pair-bonds; romantic love      3  -6 -24  57
1   32457675  PMC7223160                                                dopamine; fMRI; marriage; pair-bonds; romantic love      3  15 -15 -12
2   32457675  PMC7223160                                         