## Jupyter Notebook: Exploring GEO API

This notebook will guide you through retrieving and analyzing GEO datasets using the GEO API.

In [None]:
# Install dependencies (Run this cell first)
!pip install GEOparse pandas matplotlib seaborn

In [None]:
# Import necessary libraries
import os
import time
import requests
import GEOparse
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from IPython.display import display

## Step 1: Retrieve a GEO Dataset (GSE Accession Number)

In [None]:
def fetch_gse_data(accession_id, destdir="./GEO/"):
    """Fetches GEO Series data for a given accession ID.

    Parameters:
    accession_id (str): GEO accession passed to GEOparse as ``geo`` (e.g. "GSE285812").
    destdir (str): Directory handed to ``GEOparse.get_GEO`` as ``destdir``.
        Defaults to "./GEO/", the location this function previously hard-coded.

    Returns:
    The object returned by ``GEOparse.get_GEO`` for the accession.
    """
    print(f"Fetching {accession_id} from GEO...")
    gse = GEOparse.get_GEO(geo=accession_id, destdir=destdir)
    print(f"Fetched {accession_id} Successfully!")
    return gse

# Example usage: Fetch GSE285812
gse = fetch_gse_data("GSE285812")

## Step 2: Extract Metadata

In [None]:
# Series-level metadata; each value is printed exactly as GEOparse stores it.
print(f"Dataset Title: {gse.metadata['title']}")
print(f"Dataset Summary: {gse.metadata['summary']}")
print(f"Dataset Overall Design: {gse.metadata['overall_design']}")

# One sample accession per line.
for item in gse.gsms:
    print(item)

## Step 3: Extract Sample Information

In [None]:
# Keep a handle on the sample mapping and report how many samples the series has.
samples = gse.gsms
print("Total Samples:", len(samples))
print(gse.metadata["title"])
def _first_or_none(values):
    """Return the first element of ``values``, or None when it is missing or empty."""
    return values[0] if values else None


def extract_sample_metadata(gse):
    """Extracts sample metadata from a GEO dataset.

    Parameters:
    gse: Object exposing ``gsms``, a mapping of sample ID -> sample, where each
        sample has a ``metadata`` mapping of key -> list of values.

    Returns:
    pandas.DataFrame: One row per sample with the columns "Sample ID", "Title",
        "Source Name", "Organism", "Molecule" and "Description". A field is None
        when its metadata key is absent or holds an empty list. The columns are
        present even when the dataset has no samples.
    """
    # Output column -> metadata key it is read from.
    field_keys = {
        "Title": "title",
        "Source Name": "source_name_ch1",
        "Organism": "organism_ch1",
        "Molecule": "molecule_ch1",
        "Description": "description",
    }

    records = []
    for sample_id, sample in gse.gsms.items():
        record = {"Sample ID": sample_id}
        for column, key in field_keys.items():
            # An empty list would make a plain [0] raise IndexError.
            record[column] = _first_or_none(sample.metadata.get(key))
        records.append(record)

    # Explicit columns keep the frame usable (e.g. sample_df["Organism"]) when there are no rows.
    return pd.DataFrame(records, columns=["Sample ID", *field_keys])

# Preview the raw (list) and unwrapped title of the first sample.
# The sample ID is taken from the dataset itself so the cell keeps working when
# a different accession is loaded (a hard-coded GSM ID would raise KeyError).
first_gsm_id = next(iter(gse.gsms), None)
if first_gsm_id is not None:
    print(gse.gsms[first_gsm_id].metadata.get("title"))
    print(gse.gsms[first_gsm_id].metadata.get("title", [None])[0])

# Show full cell contents instead of truncating long titles/descriptions.
pd.set_option('display.max_colwidth', None)
sample_df = extract_sample_metadata(gse)
display(sample_df)

## Step 4: Visualization - Sample Distribution

In [None]:
# Horizontal bar chart of samples per organism, most frequent organism first.
fig, ax = plt.subplots(figsize=(10, 6))
organism_order = sample_df["Organism"].value_counts().index
sns.countplot(y=sample_df["Organism"], order=organism_order, ax=ax)
ax.set_title("Sample Distribution by Organism")
plt.show()

## Step 5: Save Extracted Data

In [None]:
# Write the sample table to the working directory without the index column.
sample_df.to_csv(path_or_buf="GSE285812_sample_metadata.csv", index=False)

## Step 6: Extract SRX from GEO Metadata

In [None]:
# Function to extract SRX from the GEO metadata
def extract_srx_from_gse(gse):
    """
    Build a GSM -> SRX lookup from the "relation" entries of each sample.

    Only relation entries containing "SRA:" are considered. The text after the
    last "SRA:" is stripped; if it is an SRA search URL, only the part after
    "term=" is kept. When a sample has several such entries, the last one wins.
    Samples without an SRA relation are left out of the result.

    Returns a dictionary mapping GSM IDs to their corresponding SRX.
    """
    sra_url_prefix = "https://www.ncbi.nlm.nih.gov/sra?term="
    experiments_by_gsm = {}

    for gsm_id, gsm in gse.gsms.items():
        for entry in gsm.metadata.get("relation", []):
            if "SRA:" not in entry:
                continue
            # rpartition keeps everything after the last "SRA:" marker.
            accession = entry.rpartition("SRA:")[2].strip()
            if sra_url_prefix in accession:
                accession = accession.rpartition("term=")[2]
            experiments_by_gsm[gsm_id] = accession

    return experiments_by_gsm

## Step 7: Extract SRR from SRX

In [None]:
# Function to fetch SRR runs from an SRX experiment
def fetch_srr_from_srx(srx_id, timeout=30):
    """
    Queries SRA for SRR (Run IDs) using the SRX Experiment ID.

    The SRA search page for ``srx_id`` is downloaded and the text of every
    ``<a>`` element containing "SRR" is collected.

    Parameters:
    srx_id (str): SRA experiment accession used as the ``term`` query parameter.
    timeout (float): Seconds to wait for the HTTP response before giving up.
        Without a timeout a stalled connection would block the notebook indefinitely.

    Returns:
    list: Unique, whitespace-stripped link texts in page order. Empty when the
        request fails, times out, or returns a non-200 status.
    """
    try:
        # params= lets requests URL-encode the accession.
        response = requests.get(
            "https://www.ncbi.nlm.nih.gov/sra/",
            params={"term": srx_id},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Keep the batch loop going: report the failure and return "no runs".
        print(f"Failed to fetch {srx_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch {srx_id}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    srr_list = []

    # Search for SRR links, skipping repeats of the same link text.
    for link in soup.find_all("a"):
        text = link.text.strip()
        if "SRR" in text and text not in srr_list:
            srr_list.append(text)

    return srr_list

## Step 8: Loading GEO Dataset of Interest

In [None]:
# Load the GEO dataset (Change the accession number as needed)
# Reuses fetch_gse_data from Step 1 so the download location and progress
# messages stay consistent with the first fetch.
geo_id = "GSE285812"
gse = fetch_gse_data(geo_id)

## Step 9: Perform Queries

In [None]:
# Build the GSM -> SRX lookup from the sample metadata
gsm_to_srx = extract_srx_from_gse(gse)

# Resolve every SRX to its SRR runs, one request per experiment
geo_sra_data = []
for gsm, srx in gsm_to_srx.items():
    print(f"Fetching SRR for {gsm} -> {srx}...")
    srr_ids = fetch_srr_from_srx(srx)
    time.sleep(2)  # Avoid rate limits

    # Runs are stored as one comma-separated string, with a placeholder when none were found.
    if srr_ids:
        srr_field = ", ".join(srr_ids)
    else:
        srr_field = "No SRRs found"

    geo_sra_data.append(dict(GSM_ID=gsm, SRX_ID=srx, SRR_IDs=srr_field))

## Step 10: Save Data

In [None]:
# Convert to DataFrame (explicit columns keep the header even when no records were collected)
geo_sra_df = pd.DataFrame(geo_sra_data, columns=["GSM_ID", "SRX_ID", "SRR_IDs"])

# Display the structured DataFrame.
# display() only renders positional arguments; the previous keyword-only call
# (name=..., dataframe=...) passed no object and therefore showed nothing.
display(geo_sra_df)

# Save to CSV next to the downloaded GEO files
output_dir = "./GEO/"
os.makedirs(output_dir, exist_ok=True)  # no-op when the directory already exists
output_csv_name = os.path.join(output_dir, f"{geo_id}_sra_metadata.csv")
geo_sra_df.to_csv(output_csv_name, index=False)

## NCBI-AWS Data Test

In [1]:
import os
import sys
import time
import shutil
import requests
import GEOparse
import functools
import subprocess
import pandas as pd
from Bio import Entrez
from pathlib import Path
from bs4 import BeautifulSoup
# Path().resolve() is the absolute current working directory; .parent is the directory above it.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the Bloom repository — confirm.
project_root = Path().resolve().parent  # Moves to Bloom
# Appended so that the `bloom` package import below can be resolved.
sys.path.append(str(project_root)) # Add Bloom to sys.path
from bloom.data.geo_sra_downloader import GEODataDownloader

In [3]:
# Parameters
geo_id = "GSE285812"

# Machine-specific locations: override with environment variables on other hosts.
download_path = os.environ.get("BLOOM_DOWNLOAD_PATH", "/Users/egg/Projects/Bloom/data/raw/")
ncbi_path = os.environ.get("NCBI_SRA_PATH", "/Users/egg/ncbi_sra/")

# Credentials are read from the environment so they are never stored in the notebook.
# NOTE: the API key that was previously hard-coded here should be treated as
# compromised and regenerated in the NCBI account settings.
missing_vars = [name for name in ("NCBI_EMAIL", "NCBI_API_KEY") if not os.environ.get(name)]
if missing_vars:
    raise RuntimeError(
        "Set the following environment variables before running this cell: "
        + ", ".join(missing_vars)
    )
user_email = os.environ["NCBI_EMAIL"]
api_key = os.environ["NCBI_API_KEY"]

geo_downloader = GEODataDownloader(geo_id, download_path, user_email, api_key, ncbi_path)
print(geo_downloader)

20-Mar-2025 17:15:38 DEBUG utils - Directory /Users/egg/Projects/Bloom/data/raw/GSE285812/GSE285812_temp already exists. Skipping.
20-Mar-2025 17:15:38 INFO GEOparse - File already exist: using local version.
20-Mar-2025 17:15:38 INFO GEOparse - Parsing /Users/egg/Projects/Bloom/data/raw/GSE285812/GSE285812_temp/GSE285812_family.soft.gz: 
20-Mar-2025 17:15:38 DEBUG GEOparse - DATABASE: GeoMiame
20-Mar-2025 17:15:38 DEBUG GEOparse - SERIES: GSE285812
20-Mar-2025 17:15:38 DEBUG GEOparse - PLATFORM: GPL24247
20-Mar-2025 17:15:38 DEBUG GEOparse - PLATFORM: GPL24676
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698727
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698728
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698729
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698730
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698731
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698732
20-Mar-2025 17:15:38 DEBUG GEOparse - SAMPLE: GSM8698919
20-Mar-2025 17:15:38 DEBUG GEOpar

Fetching GEO dataset: GSE285812
Successfully fetched GEO dataset: GSE285812
Cleaning up temporary files at /Users/egg/Projects/Bloom/data/raw/GSE285812/GSE285812_temp
[GEODataDownloader] GEO ID: GSE285812 | Output Directory: /Users/egg/Projects/Bloom/data/raw/GSE285812


In [25]:
# Test with known SRR
!pip install selenium webdriver-manager
srr_id = "SRR31810743"
url = f"https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&acc={srr_id}&display=data-access"
response = requests.get(url, timeout=10)
print(response)
response.raise_for_status()
print(response)
soup = BeautifulSoup(response.text, "html.parser")
print(soup)
aws_links = [a["href"] for a in soup.find_all("a", href=True) if "https://sra-pub-src" in a["href"]]
print(aws_links)


Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading trio-0.29.0-py3-none-any.whl (492 kB)
Downloading t

## Testing with Selenium

In [26]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

In [27]:
def get_sra_aws_links(srr_id):
    """
    Collect AWS download links from the NCBI SRA Run Browser "data-access" view.

    The page is opened in headless Chrome through Selenium, given five seconds
    to render, and the resulting HTML is parsed with BeautifulSoup.

    Parameters:
    srr_id (str): The SRA run ID (e.g., "SRR31810743")

    Returns:
    list: Every anchor href containing "https://sra-pub-src" (possibly empty),
        or None when an exception is raised while loading or parsing the page.
    """
    run_browser_url = f"https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&acc={srr_id}&display=data-access"

    # Headless Chrome: no browser window is shown.
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    try:
        driver.get(run_browser_url)
        time.sleep(5)  # Wait for JavaScript to load the content

        # page_source now holds the JavaScript-rendered document.
        rendered_page = BeautifulSoup(driver.page_source, "html.parser")

        aws_links = []
        for anchor in rendered_page.find_all("a", href=True):
            href = anchor["href"]
            if "https://sra-pub-src" in href:
                aws_links.append(href)
        return aws_links

    except Exception as e:
        print(f"Error fetching AWS links: {e}")
        return None

    finally:
        # Always release the browser session, whether or not scraping succeeded.
        driver.quit()

In [35]:
# Test
# Results observed for other runs (kept for reference):
#   SRR31810744  (HAS PREFETCH)  No AWS links found or failed to retrieve data.
#   SRR31810743  (NO PREFETCH)   Get AWS links
#   SRR31810742  (NO PREFETCH)   Get AWS links
#   SRR31810741  (NO PREFETCH)   Get AWS links
#   SRR31814601  (NO PREFETCH)   Get AWS links
#   SRR31815262  (HAS PREFETCH)  No AWS links found or failed to retrieve data.
#   SRR31814605  (HAS PREFETCH)  No AWS links found or failed to retrieve data.
#   SRR31814604  (NO PREFETCH)   Get AWS links
# Assigned explicitly so the cell does not depend on a value left over in the
# kernel from an earlier cell or a previous run.
srr_id = "SRR31814604"
aws_links = get_sra_aws_links(srr_id)

if aws_links:
    print("AWS Download Links Found:")
    for link in aws_links:
        print(link)
else:
    print("No AWS links found or failed to retrieve data.")

AWS Download Links Found:
https://sra-pub-src-1.s3.amazonaws.com/SRR31814604/b12-month-BC-dscHiC_S2_L007_R2_001.fastq.gz.1
https://sra-pub-src-1.s3.amazonaws.com/SRR31814604/b12-month-BC-dscHiC_S2_L007_R1_001.fastq.gz.1
https://sra-pub-src-1.s3.amazonaws.com/SRR31814604/b12-month-BC-dscHiC_S2_L007_I2_001.fastq.gz.1
