In [1]:
### 
from agents import Agent, Runner
from IPython.display import display, Markdown
import os
from openai import OpenAI
from bs4 import BeautifulSoup
API_KEY = ''
os.environ['OPENAI_API_KEY']=API_KEY
client = OpenAI()

In [2]:
import os
import time
import json
import re
import requests
import pandas as pd
from typing import TypedDict, List, Dict, Any
from bs4 import BeautifulSoup
from geofetch import Geofetcher
from openai import OpenAI
from langgraph.graph import StateGraph
from langchain_core.runnables import RunnableLambda

# ----------------------------------------
# Agent state schema
# ----------------------------------------

class AgentState(TypedDict):
    query_gene: str
    pathway_info: Dict[str, Any]
    metadata: Dict[str, Any]
    gse_list: List[str]
    research_plan: str

# ----------------------------------------
# Utility functions (LLM + GEO search)
# ----------------------------------------

client = OpenAI()

def extract_json_block(text):
    match = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL)
    if match:
        return json.loads(match.group(1))
    else:
        return json.loads(text)

def generate_pathway_info(query_gene, model="gpt-4o-mini", temperature=0.3):
    prompt = f"""
You are a biomedical assistant.

Given the gene {query_gene}, return:
1. Key gene symbols in the same biological pathway.
2. Drugs or compounds that inhibit this pathway or {query_gene}'s activity.
3. The disease areas or biological processes this pathway is involved in.

Provide answers in JSON format with keys: "genes", "drugs", "pathways"
    """
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}]
    )
    content = completion.choices[0].message.content
    return extract_json_block(content)

def scrape_organism_from_geo_html(geo_accession):
    url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={geo_accession}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        organism_row = soup.find("td", string="Organism")
        if organism_row and organism_row.find_next_sibling("td"):
            return organism_row.find_next_sibling("td").text.strip()
    except Exception as e:
        print(f"⚠️ Failed to scrape organism for {geo_accession}: {e}")
    return "Unknown"

def search_geo_datasets(keyword, retmax=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "gds",
        "term": keyword,
        "retmode": "json",
        "retmax": retmax
    }
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])

def fetch_gse_accessions(id_list):
    if not id_list:
        return {}
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "gds",
        "id": ",".join(id_list),
        "retmode": "json"
    }
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    summaries = response.json().get("result", {})
    summaries.pop("uids", None)
    gse_dict = {}
    for uid, info in summaries.items():
        accession = info.get("accession")
        title = info.get("title")
        link = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={accession}"
        organism = scrape_organism_from_geo_html(accession)
        gse_dict[accession] = {
            "title": title,
            "organism": organism,
            "link": link
        }
    return gse_dict

def search_geo_items(item_list, label="gene", max_results=5):
    results = {}
    for item in item_list:
        query = f"{item} AND rna-seq"
        print(f"🔍 Searching GEO for {label}: {item}")
        ids = search_geo_datasets(query, retmax=max_results)
        datasets = fetch_gse_accessions(ids)
        results[item] = datasets
        time.sleep(0.3)
    return results

def get_geofetch_projects(gse_list, metadata_folder="geofetch_metadata"):
    geof = Geofetcher(
        processed=True,
        acc_anno=True,
        discard_soft=True,
        metadata_folder=metadata_folder
    )
    projects = {}
    for gse in gse_list:
        try:
            print(f"📥 Fetching metadata for {gse}")
            result = geof.get_projects(gse)
            projects.update(result)
        except Exception as e:
            print(f"❌ Failed to fetch {gse}: {e}")
    return projects

import subprocess
def download_processed_files_via_cli(gse_list, output_dir="geofetch_metadata", overwrite=False):
    for gse in gse_list:
        gse_path = os.path.join(output_dir, gse)
        if os.path.exists(gse_path) and not overwrite:
            print(f"✅ {gse}: already exists at {gse_path}, skipping.")
            continue
        try:
            subprocess.run(
                ["geofetch", "-i", gse, "--processed", "-m", output_dir],
                check=True
            )
            print(f"✅ Finished downloading for {gse}")
        except subprocess.CalledProcessError as e:
            print(f"❌ geofetch failed for {gse}: {e}")

def save_combined_metadata_csv_from_state(state: Dict, csv_path: str = "geofetch_metadata/combined_metadata.csv") -> pd.DataFrame:
    metadata = state.get("metadata", {})
    if not metadata:
        raise ValueError("No metadata found in the agent state.")

    all_dfs = []
    for gse, project in metadata.items():
        try:
            df = project.sample_table.copy()
            df["source_gse"] = gse
            all_dfs.append(df)
        except Exception as e:
            print(f"⚠️ Failed to extract sample_table from {gse}: {e}")

    if not all_dfs:
        raise ValueError("No sample tables to save.")

    combined_df = pd.concat(all_dfs, ignore_index=True)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    combined_df.to_csv(csv_path, index=False)
    print(f"✅ Combined metadata saved to: {os.path.abspath(csv_path)}")
    return combined_df

# ----------------------------------------
# Agent 1: Ingestor node
# ----------------------------------------

def ingest_and_prepare(state: dict) -> dict:
    query_gene = state["query_gene"]
    pathway_info = generate_pathway_info(query_gene)
    gene_list = pathway_info.get("genes", [])
    drug_list = pathway_info.get("drugs", [])

    gene_results = search_geo_items(gene_list, label="gene")
    drug_results = search_geo_items(drug_list, label="drug")

    all_gse = set()
    for r in [gene_results, drug_results]:
        for v in r.values():
            all_gse.update(v.keys())
    all_gse = list(all_gse)

    metadata = get_geofetch_projects(all_gse, metadata_folder="geofetch_metadata")
    download_processed_files_via_cli(all_gse, output_dir="geofetch_metadata")

    try:
        save_combined_metadata_csv_from_state({"metadata": metadata})
    except Exception as e:
        print(f"⚠️ Failed to save metadata CSV: {e}")

    return {
        "query_gene": query_gene,
        "pathway_info": pathway_info,
        "metadata": metadata,
        "gse_list": all_gse
    }

# ----------------------------------------
# Agent 2: Analyst node
# ----------------------------------------

def analyze_metadata_and_plan(state: AgentState) -> AgentState:
    metadata = state["metadata"]
    drug_list = state["pathway_info"].get("drugs", [])
    query_gene = state["query_gene"]
    selected = []

    for gse, project in metadata.items():
        df = project.sample_table
        if "processed_file_ftp" in df.columns and df["processed_file_ftp"].notna().any():
            if any(drug.lower() in df.to_string().lower() for drug in drug_list):
                selected.append((gse, df.shape[0]))

    plan = f"🧬 Research Plan for {query_gene} and drugs {drug_list}:\n"
    if not selected:
        plan += "No relevant processed datasets were found.\n"
    else:
        plan += f"{len(selected)} datasets selected:\n"
        for gse, n in selected:
            plan += f"  - {gse} ({n} samples)\n"
        plan += "\nNext: perform differential expression and gene signature clustering."

    return {**state, "research_plan": plan}


def analyze_metadata_and_plan(state: AgentState) -> AgentState:
    import pandas as pd
    import os

    csv_path = "geofetch_metadata/combined_metadata.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"❌ Metadata CSV not found at {csv_path}")

    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = {"sample_name", "sample_source_name_ch1", "sample_title"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"❌ Metadata CSV must include columns: {required_columns}")

    # Take first 50 rows for LLM context
    selected_df = df[list(required_columns)].fillna("").head(50)
    table_preview = selected_df.to_markdown(index=False)

    query_gene = state["query_gene"]
    drug_list = state["pathway_info"].get("drugs", [])
    selected_df = df[["sample_name", "sample_source_name_ch1", "sample_title"]].fillna("").head(50)
    table_preview = selected_df.to_markdown(index=False)
    print("🧪 Table preview sent to LLM:\n", table_preview)  # ✅ Add this line

    # Build LLM prompt
    prompt = f"""
You are a biomedical research assistant.

The target gene is **{query_gene}**, and the related drugs of interest are: {', '.join(drug_list)}.

Below is a preview of sample metadata (first 50 rows) from multiple GEO studies. Each row includes:
- sample name
- sample source (cell line, tissue)
- sample title (may indicate treatment or condition)

Your task:
1. Identify which studies include drug-treated samples.
2. Identify the control groups if available.
3. Determine the sample types (e.g., cell lines or tissues).
4. Recommend studies and sample comparisons suitable for differential gene expression and drug-response signature analysis.

Respond with:
- GSE or study names (if known)
- The experimental comparison design
- Why the dataset is suitable (or not)
- Bullet points summarizing each recommended comparison

Sample Metadata Table:

"""

    # Call LLM
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.3,
        messages=[{"role": "user", "content": prompt}]
    )

    research_plan = response.choices[0].message.content.strip()

    return {
        **state,
        "research_plan": research_plan
    }

###
def analyze_metadata_and_plan(state: AgentState) -> AgentState:
    import pandas as pd
    import os

    csv_path = "geofetch_metadata/combined_metadata.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"❌ Metadata CSV not found at {csv_path}")

    df = pd.read_csv(csv_path)

    # Ensure required columns are available
    required_cols = ["gse", "sample_name", "sample_title", "sample_source_name_ch1", "sample_geo_accession"]
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"❌ Metadata CSV must contain the following columns: {required_cols}")

    # Clean and preview first 30 rows
    preview_df = df[required_cols].fillna("").head(30)
    preview_text = preview_df.to_string(index=False)

    query_gene = state["query_gene"]
    drug_list = state["pathway_info"].get("drugs", [])

    prompt = f"""
You are a biomedical research assistant.

The target gene is **{query_gene}** and the related drugs of interest are: {', '.join(drug_list)}.

Below is a preview of sample metadata from several GEO datasets.
Each row includes:
- GSE accession
- Sample name
- Sample title (may contain treatment or control info)
- Sample source (cell type or tissue)
- Sample GEO accession

Sample Metadata Table:
{preview_text}

Based on the sample names, titles, and sources:
1. Which GSE studies contain drug-treated samples and matching control groups?
2. What cell types or tissues are used?
3. Which treatments are applied? What are the controls?
4. Recommend GSEs and sample pairs suitable for differential gene expression to identify drug-response gene signatures.

Be specific, refer to GSE and sample names where possible, and explain why you recommend them.
"""

    # Call OpenAI LLM
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.3,
        messages=[{"role": "user", "content": prompt}]
    )

    research_plan = response.choices[0].message.content.strip()

    return {
        **state,
        "research_plan": research_plan
    }

# ----------------------------------------
# LangGraph pipeline
# ----------------------------------------

workflow = StateGraph(state_schema=AgentState)
workflow.add_node("Ingestor", RunnableLambda(ingest_and_prepare))
workflow.add_node("Analyst", RunnableLambda(analyze_metadata_and_plan))
workflow.set_entry_point("Ingestor")
workflow.add_edge("Ingestor", "Analyst")
workflow.set_finish_point("Analyst")
graph = workflow.compile()

# ----------------------------------------
# Invoke the graph
# ----------------------------------------



In [3]:
from IPython.display import Image, display
try:
    display(Image(graph.get_graph().draw_mermaid_png()))
except Exception:
    # This requires some extra dependencies and is optional
    pass


In [4]:
if __name__ == "__main__":
    result = graph.invoke({"query_gene": "COX1"})
    print("\n📋 Final Research Plan:")
    print(result["research_plan"])

🔍 Searching GEO for gene: PTGS2
🔍 Searching GEO for gene: PTGS1
🔍 Searching GEO for gene: ALOX5
🔍 Searching GEO for gene: CYP2C9
🔍 Searching GEO for gene: CYP2C19
🔍 Searching GEO for drug: Aspirin
🔍 Searching GEO for drug: Ibuprofen
🔍 Searching GEO for drug: Naproxen
🔍 Searching GEO for drug: Celecoxib
🔍 Searching GEO for drug: Diclofenac


[1;30m[INFO][0m [32m[23:19:23][0m Metadata folder: C:\Users\difen\POPPER\geofetch_metadata\project_name
[1;30m[INFO][0m [32m[23:19:23][0m Trying GSE278083 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:23][0m [38;5;200mProcessing accession 1 of 1: 'GSE278083'[0m
[1;30m[INFO][0m [32m[23:19:23][0m Trying GSE278083 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:23][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:23][0m [38;5;200mProcessing accession 1 of 1: 'GSE278083'[0m


📥 Fetching metadata for GSE278083


[1;30m[INFO][0m [32m[23:19:25][0m Total number of processed SERIES files found is: 4
[1;30m[INFO][0m [32m[23:19:25][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:25][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:25][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:25][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:25][0m No files found. No data to save. File geofetch_metadata\project_name\GSE278083_samples\GSE278083_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:25][0m Trying GSE94840 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:25][0m Trying GSE94840 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:25][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:25][0m [38;5;200mProcessing accession 1 of 1: 'GSE94840'[0m


📥 Fetching metadata for GSE94840


[1;30m[INFO][0m [32m[23:19:26][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:19:26][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:26][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:26][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:26][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:26][0m No files found. No data to save. File geofetch_metadata\project_name\GSE94840_samples\GSE94840_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:26][0m Trying GSE244787 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:26][0m Trying GSE244787 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:26][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:26][0m [38;5;200mProcessing accession 1 of 1: 'GSE244787'[0m


📥 Fetching metadata for GSE244787


[1;30m[INFO][0m [32m[23:19:30][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:19:30][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:30][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:30][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:30][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:30][0m No files found. No data to save. File geofetch_metadata\project_name\GSE244787_samples\GSE244787_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:30][0m Trying GSE110293 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:30][0m Trying GSE110293 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:30][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:30][0m [38;5;200mProcessing accession 1 of 1: 'GSE110293'[0m


📥 Fetching metadata for GSE110293


[1;30m[INFO][0m [32m[23:19:32][0m 
Total number of processed SAMPLES files found is: 24
[1;30m[INFO][0m [32m[23:19:32][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:19:32][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:32][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:32][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:32][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:32][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:19:32][0m Trying GSE101766 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:32][0m Trying GSE101766 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:32][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:32][0m [38;5;200mProcessing accession 1 of 1: 'GSE101766'[0m


📥 Fetching metadata for GSE101766


[1;30m[INFO][0m [32m[23:19:44][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:19:44][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:44][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:44][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:44][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:44][0m No files found. No data to save. File geofetch_metadata\project_name\GSE101766_samples\GSE101766_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:44][0m Trying GSE277028 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:44][0m Trying GSE277028 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:44][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:44][0m [38;5;200mProcessing accession 1 of 1: 'GSE277028'[0m


📥 Fetching metadata for GSE277028


[1;30m[INFO][0m [32m[23:19:48][0m Total number of processed SERIES files found is: 2
[1;30m[INFO][0m [32m[23:19:48][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:48][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:48][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:48][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:48][0m No files found. No data to save. File geofetch_metadata\project_name\GSE277028_samples\GSE277028_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:48][0m Trying GSE180857 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:48][0m Trying GSE180857 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:48][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:48][0m [38;5;200mProcessing accession 1 of 1: 'GSE180857'[0m


📥 Fetching metadata for GSE180857


[1;30m[INFO][0m [32m[23:19:52][0m 
Total number of processed SAMPLES files found is: 31
[1;30m[INFO][0m [32m[23:19:52][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:19:52][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:52][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:52][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:52][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:52][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:19:52][0m Trying GSE231460 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:52][0m Trying GSE231460 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:52][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:52][0m [38;5;200mProcessing accession 1 of 1: 'GSE231460'[0m


📥 Fetching metadata for GSE231460


[1;30m[INFO][0m [32m[23:19:53][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:19:53][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:53][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:53][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:53][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:53][0m No files found. No data to save. File geofetch_metadata\project_name\GSE231460_samples\GSE231460_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:53][0m Trying GSE97066 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:53][0m Trying GSE97066 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:53][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:53][0m [38;5;200mProcessing accession 1 of 1: 'GSE97066'[0m


📥 Fetching metadata for GSE97066


[1;30m[INFO][0m [32m[23:19:59][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:19:59][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:59][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:19:59][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:19:59][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:19:59][0m No files found. No data to save. File geofetch_metadata\project_name\GSE97066_samples\GSE97066_samples.csv won't be created
[1;30m[INFO][0m [32m[23:19:59][0m Trying GSE144219 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:59][0m Trying GSE144219 (not a file) as accession...
[1;30m[INFO][0m [32m[23:19:59][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:19:59][0m [38;5;200mProcessing accession 1 of 1: 'GSE144219'[0m


📥 Fetching metadata for GSE144219


[1;30m[INFO][0m [32m[23:20:52][0m Total number of processed SERIES files found is: 5
[1;30m[INFO][0m [32m[23:20:52][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:52][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:52][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:20:52][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:20:52][0m No files found. No data to save. File geofetch_metadata\project_name\GSE144219_samples\GSE144219_samples.csv won't be created
[1;30m[INFO][0m [32m[23:20:52][0m Trying GSE131732 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:52][0m Trying GSE131732 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:52][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:20:52][0m [38;5;200mProcessing accession 1 of 1: 'GSE131732'[0m


📥 Fetching metadata for GSE131732


[1;30m[INFO][0m [32m[23:20:54][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:20:54][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:54][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:54][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:20:54][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:20:54][0m No files found. No data to save. File geofetch_metadata\project_name\GSE131732_samples\GSE131732_samples.csv won't be created
[1;30m[INFO][0m [32m[23:20:54][0m Trying GSE95802 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:54][0m Trying GSE95802 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:54][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:20:54][0m [38;5;200mProcessing accession 1 of 1: 'GSE95802'[0m


📥 Fetching metadata for GSE95802


[1;30m[INFO][0m [32m[23:20:58][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:20:58][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:58][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:20:58][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:20:58][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:20:58][0m No files found. No data to save. File geofetch_metadata\project_name\GSE95802_samples\GSE95802_samples.csv won't be created
[1;30m[INFO][0m [32m[23:20:58][0m Trying GSE222593 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:58][0m Trying GSE222593 (not a file) as accession...
[1;30m[INFO][0m [32m[23:20:58][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:20:58][0m [38;5;200mProcessing accession 1 of 1: 'GSE222593'[0m


📥 Fetching metadata for GSE222593


[1;30m[INFO][0m [32m[23:22:06][0m 
Total number of processed SAMPLES files found is: 355
[1;30m[INFO][0m [32m[23:22:06][0m Total number of processed SERIES files found is: 6
[1;30m[INFO][0m [32m[23:22:06][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:06][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:06][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:06][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:06][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:07][0m Trying GSE110282 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:07][0m Trying GSE110282 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:07][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:07][0m [38;5;200mProcessing accession 1 of 1: 'GSE110282'[0m


📥 Fetching metadata for GSE110282


[1;30m[INFO][0m [32m[23:22:08][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:08][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:08][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:08][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:08][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:08][0m No files found. No data to save. File geofetch_metadata\project_name\GSE110282_samples\GSE110282_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:08][0m Trying GSE279800 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:08][0m Trying GSE279800 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:08][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:08][0m [38;5;200mProcessing accession 1 of 1: 'GSE279800'[0m


📥 Fetching metadata for GSE279800


[1;30m[INFO][0m [32m[23:22:11][0m 
Total number of processed SAMPLES files found is: 8
[1;30m[INFO][0m [32m[23:22:11][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:22:11][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:11][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:11][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:11][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:11][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:11][0m Trying GSE279268 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:11][0m Trying GSE279268 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:11][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:11][0m [38;5;200mProcessing accession 1 of 1: 'GSE279268'[0m


📥 Fetching metadata for GSE279268


[1;30m[INFO][0m [32m[23:22:17][0m Total number of processed SERIES files found is: 8
[1;30m[INFO][0m [32m[23:22:17][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:17][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:17][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:17][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:17][0m No files found. No data to save. File geofetch_metadata\project_name\GSE279268_samples\GSE279268_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:17][0m Trying GSE139044 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:17][0m Trying GSE139044 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:17][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:17][0m [38;5;200mProcessing accession 1 of 1: 'GSE139044'[0m


📥 Fetching metadata for GSE139044


[1;30m[INFO][0m [32m[23:22:18][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:18][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:18][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:18][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:18][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:18][0m No files found. No data to save. File geofetch_metadata\project_name\GSE139044_samples\GSE139044_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:18][0m Trying GSE139045 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:18][0m Trying GSE139045 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:18][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:18][0m [38;5;200mProcessing accession 1 of 1: 'GSE139045'[0m


📥 Fetching metadata for GSE139045


[1;30m[INFO][0m [32m[23:22:19][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:19][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:19][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:19][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:19][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:20][0m No files found. No data to save. File geofetch_metadata\project_name\GSE139045_samples\GSE139045_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:20][0m Trying GSE286021 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:20][0m Trying GSE286021 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:20][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:20][0m [38;5;200mProcessing accession 1 of 1: 'GSE286021'[0m


📥 Fetching metadata for GSE286021


[1;30m[INFO][0m [32m[23:22:22][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:22][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:22][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:22][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:22][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:22][0m No files found. No data to save. File geofetch_metadata\project_name\GSE286021_samples\GSE286021_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:22][0m Trying GSE281885 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:22][0m Trying GSE281885 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:22][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:22][0m [38;5;200mProcessing accession 1 of 1: 'GSE281885'[0m


📥 Fetching metadata for GSE281885


[1;30m[INFO][0m [32m[23:22:24][0m 
Total number of processed SAMPLES files found is: 10
[1;30m[INFO][0m [32m[23:22:24][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:22:24][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:24][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:24][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:24][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:24][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:24][0m Trying GSE124074 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:24][0m Trying GSE124074 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:24][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:24][0m [38;5;200mProcessing accession 1 of 1: 'GSE124074'[0m


📥 Fetching metadata for GSE124074


[1;30m[INFO][0m [32m[23:22:31][0m 
Total number of processed SAMPLES files found is: 60
[1;30m[INFO][0m [32m[23:22:31][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:31][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:31][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:31][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:31][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:31][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:31][0m Trying GSE38809 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:31][0m Trying GSE38809 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:31][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:31][0m [38;5;200mProcessing accession 1 of 1: 'GSE38809'[0m


📥 Fetching metadata for GSE38809


[1;30m[INFO][0m [32m[23:22:34][0m 
Total number of processed SAMPLES files found is: 3
[1;30m[INFO][0m [32m[23:22:34][0m Total number of processed SERIES files found is: 3
[1;30m[INFO][0m [32m[23:22:34][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:34][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:34][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:34][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:34][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:34][0m Trying GSE184884 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:34][0m Trying GSE184884 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:34][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:34][0m [38;5;200mProcessing accession 1 of 1: 'GSE184884'[0m


📥 Fetching metadata for GSE184884


[1;30m[INFO][0m [32m[23:22:43][0m 
Total number of processed SAMPLES files found is: 197
[1;30m[INFO][0m [32m[23:22:43][0m Total number of processed SERIES files found is: 3
[1;30m[INFO][0m [32m[23:22:43][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:43][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:43][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:43][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:43][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:43][0m Trying GSE245768 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:43][0m Trying GSE245768 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:43][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:43][0m [38;5;200mProcessing accession 1 of 1: 'GSE245768'[0m


📥 Fetching metadata for GSE245768


[1;30m[INFO][0m [32m[23:22:46][0m 
Total number of processed SAMPLES files found is: 15
[1;30m[INFO][0m [32m[23:22:46][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:22:46][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:46][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:46][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:46][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:46][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:46][0m Trying GSE242369 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:46][0m Trying GSE242369 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:46][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:46][0m [38;5;200mProcessing accession 1 of 1: 'GSE242369'[0m


📥 Fetching metadata for GSE242369


[1;30m[INFO][0m [32m[23:22:50][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:50][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:50][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:50][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:50][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:50][0m No files found. No data to save. File geofetch_metadata\project_name\GSE242369_samples\GSE242369_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:50][0m Trying GSE263024 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:50][0m Trying GSE263024 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:50][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:50][0m [38;5;200mProcessing accession 1 of 1: 'GSE263024'[0m


📥 Fetching metadata for GSE263024


[1;30m[INFO][0m [32m[23:22:51][0m Total number of processed SERIES files found is: 2
[1;30m[INFO][0m [32m[23:22:51][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:51][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:51][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:51][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:51][0m No files found. No data to save. File geofetch_metadata\project_name\GSE263024_samples\GSE263024_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:51][0m Trying GSE255683 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:51][0m Trying GSE255683 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:51][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:51][0m [38;5;200mProcessing accession 1 of 1: 'GSE255683'[0m


📥 Fetching metadata for GSE255683


[1;30m[INFO][0m [32m[23:22:53][0m 
Total number of processed SAMPLES files found is: 20
[1;30m[INFO][0m [32m[23:22:53][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:22:53][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:53][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:53][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:53][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:53][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:22:53][0m Trying GSE175744 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:53][0m Trying GSE175744 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:53][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:53][0m [38;5;200mProcessing accession 1 of 1: 'GSE175744'[0m


📥 Fetching metadata for GSE175744


[1;30m[INFO][0m [32m[23:22:59][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:22:59][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:59][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:22:59][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:22:59][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:22:59][0m No files found. No data to save. File geofetch_metadata\project_name\GSE175744_samples\GSE175744_samples.csv won't be created
[1;30m[INFO][0m [32m[23:22:59][0m Trying GSE95588 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:59][0m Trying GSE95588 (not a file) as accession...
[1;30m[INFO][0m [32m[23:22:59][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:22:59][0m [38;5;200mProcessing accession 1 of 1: 'GSE95588'[0m


📥 Fetching metadata for GSE95588


[1;30m[INFO][0m [32m[23:23:03][0m Total number of processed SERIES files found is: 2
[1;30m[INFO][0m [32m[23:23:03][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:03][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:03][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:23:03][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:23:03][0m No files found. No data to save. File geofetch_metadata\project_name\GSE95588_samples\GSE95588_samples.csv won't be created
[1;30m[INFO][0m [32m[23:23:03][0m Trying GSE120596 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:03][0m Trying GSE120596 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:03][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:23:03][0m [38;5;200mProcessing accession 1 of 1: 'GSE120596'[0m


📥 Fetching metadata for GSE120596


[1;30m[INFO][0m [32m[23:23:06][0m Total number of processed SERIES files found is: 2
[1;30m[INFO][0m [32m[23:23:06][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:06][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:06][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:23:06][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:23:06][0m No files found. No data to save. File geofetch_metadata\project_name\GSE120596_samples\GSE120596_samples.csv won't be created
[1;30m[INFO][0m [32m[23:23:06][0m Trying GSE162256 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:06][0m Trying GSE162256 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:06][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:23:06][0m [38;5;200mProcessing accession 1 of 1: 'GSE162256'[0m


📥 Fetching metadata for GSE162256


[1;30m[INFO][0m [32m[23:23:28][0m Total number of processed SERIES files found is: 1
[1;30m[INFO][0m [32m[23:23:28][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:28][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:28][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:23:28][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:23:28][0m No files found. No data to save. File geofetch_metadata\project_name\GSE162256_samples\GSE162256_samples.csv won't be created
[1;30m[INFO][0m [32m[23:23:28][0m Trying GSE242272 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:28][0m Trying GSE242272 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:28][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:23:28][0m [38;5;200mProcessing accession 1 of 1: 'GSE242272'[0m


📥 Fetching metadata for GSE242272


[1;30m[INFO][0m [32m[23:23:30][0m 
Total number of processed SAMPLES files found is: 18
[1;30m[INFO][0m [32m[23:23:30][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:23:30][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:30][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:30][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:23:30][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:23:30][0m Unifying and saving of metadata... 
[1;30m[INFO][0m [32m[23:23:30][0m Trying GSE156453 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:30][0m Trying GSE156453 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:30][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:23:30][0m [38;5;200mProcessing accession 1 of 1: 'GSE156453'[0m


📥 Fetching metadata for GSE156453


[1;30m[INFO][0m [32m[23:23:45][0m Total number of processed SERIES files found is: 2
[1;30m[INFO][0m [32m[23:23:45][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:45][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:23:45][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:23:45][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:23:45][0m No files found. No data to save. File geofetch_metadata\project_name\GSE156453_samples\GSE156453_samples.csv won't be created
[1;30m[INFO][0m [32m[23:23:45][0m Trying GSE262419 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:45][0m Trying GSE262419 (not a file) as accession...
[1;30m[INFO][0m [32m[23:23:45][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:23:45][0m [38;5;200mProcessing accession 1 of 1: 'GSE262419'[0m


📥 Fetching metadata for GSE262419


[1;30m[INFO][0m [32m[23:24:17][0m Trying GSE221957 (not a file) as accession...
[1;30m[INFO][0m [32m[23:24:17][0m Trying GSE221957 (not a file) as accession...
[1;30m[INFO][0m [32m[23:24:17][0m Skipped 0 accessions. Starting now.
[1;30m[INFO][0m [32m[23:24:17][0m [38;5;200mProcessing accession 1 of 1: 'GSE221957'[0m


❌ Failed to fetch GSE262419: Error in requesting file: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE262419&form=text&view=full
📥 Fetching metadata for GSE221957


[1;30m[INFO][0m [32m[23:24:20][0m 
Total number of processed SAMPLES files found is: 16
[1;30m[INFO][0m [32m[23:24:20][0m Total number of processed SERIES files found is: 0
[1;30m[INFO][0m [32m[23:24:20][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:24:20][0m Expanding metadata list...
[1;30m[INFO][0m [32m[23:24:20][0m Finished processing 1 accession(s)
[1;30m[INFO][0m [32m[23:24:20][0m Cleaning soft files ...
[1;30m[INFO][0m [32m[23:24:20][0m Unifying and saving of metadata... 


✅ Finished downloading for GSE278083
✅ Finished downloading for GSE94840
✅ Finished downloading for GSE244787
❌ geofetch failed for GSE110293: Command '['geofetch', '-i', 'GSE110293', '--processed', '-m', 'geofetch_metadata']' returned non-zero exit status 1.
✅ Finished downloading for GSE101766
✅ Finished downloading for GSE277028
❌ geofetch failed for GSE180857: Command '['geofetch', '-i', 'GSE180857', '--processed', '-m', 'geofetch_metadata']' returned non-zero exit status 1.
✅ Finished downloading for GSE231460
✅ Finished downloading for GSE97066
✅ Finished downloading for GSE144219
✅ Finished downloading for GSE131732
✅ Finished downloading for GSE95802
❌ geofetch failed for GSE222593: Command '['geofetch', '-i', 'GSE222593', '--processed', '-m', 'geofetch_metadata']' returned non-zero exit status 1.
✅ Finished downloading for GSE110282
❌ geofetch failed for GSE279800: Command '['geofetch', '-i', 'GSE279800', '--processed', '-m', 'geofetch_metadata']' returned non-zero exit status

In [50]:
import os
import requests
import tarfile
import gzip
import shutil

def download_and_extract_gse(gse_id="GSE242272", base_dir="rna_seq_analysis"):
    """
    Download GEO supplementary file (.tar) for the given GSE ID and extract it.
    Returns the path to the extracted folder.
    """
    download_dir = os.path.join(base_dir, gse_id)
    os.makedirs(download_dir, exist_ok=True)

    url = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={gse_id}&format=file"
    tar_path = os.path.join(download_dir, f"{gse_id}_supplement.tar")

    print(f"📦 Downloading: {url}")
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(tar_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"✅ Downloaded to: {tar_path}")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return None

    print(f"🗂️ Extracting TAR file...")
    try:
        with tarfile.open(tar_path, "r:*") as tar:
            tar.extractall(path=download_dir)
        print(f"✅ Extracted contents to: {download_dir}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return None

    return download_dir


def decompress_gz_files(root_dir: str):
    """
    Recursively decompress all .txt.gz files in a directory.
    """
    print(f"🔍 Decompressing .txt.gz files under: {root_dir}")
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(".txt.gz"):
                gz_path = os.path.join(dirpath, filename)
                txt_path = gz_path[:-3]  # Remove .gz extension
                try:
                    with gzip.open(gz_path, 'rb') as f_in:
                        with open(txt_path, 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    print(f"✅ Decompressed: {gz_path}")
                except Exception as e:
                    print(f"❌ Failed to decompress {gz_path}: {e}")


def run_deg_analysis_from_gse(gse_id="GSE242272", base_dir="rna_seq_analysis"):
    gse_dir = os.path.join(base_dir, gse_id)
    output_csv = os.path.join(base_dir, f"{gse_id}_deseq2_results.csv")

    # Step 1: Assemble count matrix
    count_dfs = []
    sample_conditions = []

    for subdir in os.listdir(gse_dir):
        subpath = os.path.join(gse_dir, subdir)
        if os.path.isdir(subpath) and subdir.startswith("GSM"):
            for fname in os.listdir(subpath):
                if fname.endswith(".txt"):
                    file_path = os.path.join(subpath, fname)
                    try:
                        df = pd.read_csv(file_path, sep="\t", header=None, names=["gene", subdir])
                        df.set_index("gene", inplace=True)
                        count_dfs.append(df)

                        # infer condition from folder name
                        label = "treated" if any(x in subdir.lower() for x in ["treated", "pge2", "caffeine"]) else "control"
                        sample_conditions.append((subdir, label))
                        break
                    except Exception as e:
                        print(f"⚠️ Error reading {file_path}: {e}")

    if not count_dfs:
        raise ValueError("❌ No valid count files found.")

    combined_counts = pd.concat(count_dfs, axis=1).fillna(0).astype(int)
    # messege for combined_counts created:
    print("✅ Count matrix assembled.")
    print("📐 Shape (genes × samples):", combined_counts.shape)
    print("🔍 Preview:")
    print(combined_counts.iloc[:5, :5])  # Show first 5 genes × 5 samples
    # TEST: Save raw count matrix
    combined_counts.to_csv(f"{base_dir}/{gse_id}_raw_counts.csv")
    print(f"🧬 Raw count matrix saved to: {base_dir}/{gse_id}_raw_counts.csv")
    sample_df = pd.DataFrame(sample_conditions, columns=["sample", "condition"])
    sample_df.set_index("sample", inplace=True)
    
    # Step 2: DESeq2
    dds = py_DESeq2(
        count_matrix=combined_counts.T,
        design_matrix=sample_df,
        design_formula="~ condition",
        gene_column="gene"
    )

    dds.run_deseq()
    res = dds.get_deseq_result()
    res_sorted = res.sort_values("padj").dropna().head(50)
    res_sorted.to_csv(output_csv, index=False)
    print(f"📄 DESeq2 results saved to: {output_csv}")

    return res_sorted


In [52]:
import os

gse_path = "rna_seq_analysis/GSE242272"
for root, dirs, files in os.walk(gse_path):
    print(f"\n📁 {root}")
    for f in files:
        print(f"  📄 {f}")



📁 rna_seq_analysis/GSE242272
  📄 GSE242272_supplement.tar
  📄 GSM7757588_CD8_24h_Vehicle1.tabular.txt
  📄 GSM7757588_CD8_24h_Vehicle1.tabular.txt.gz
  📄 GSM7757589_CD8_24h_Vehicle2.tabular.txt
  📄 GSM7757589_CD8_24h_Vehicle2.tabular.txt.gz
  📄 GSM7757590_CD8_24h_Vehicle3.tabular.txt
  📄 GSM7757590_CD8_24h_Vehicle3.tabular.txt.gz
  📄 GSM7757591_CD8_24h_PGE2_1.tabular.txt
  📄 GSM7757591_CD8_24h_PGE2_1.tabular.txt.gz
  📄 GSM7757592_CD8_24h_PGE2_2.tabular.txt
  📄 GSM7757592_CD8_24h_PGE2_2.tabular.txt.gz
  📄 GSM7757593_CD8_24h_PGE2_3.tabular.txt
  📄 GSM7757593_CD8_24h_PGE2_3.tabular.txt.gz
  📄 GSM7757594_CD8_48h_Vehicle1.tabular.txt
  📄 GSM7757594_CD8_48h_Vehicle1.tabular.txt.gz
  📄 GSM7757595_CD8_48h_Vehicle2.tabular.txt
  📄 GSM7757595_CD8_48h_Vehicle2.tabular.txt.gz
  📄 GSM7757596_CD8_48h_Vehicle3.tabular.txt
  📄 GSM7757596_CD8_48h_Vehicle3.tabular.txt.gz
  📄 GSM7757597_CD8_48h_PGE2_1.tabular.txt
  📄 GSM7757597_CD8_48h_PGE2_1.tabular.txt.gz
  📄 GSM7757598_CD8_48h_PGE2_2.tabular.txt
  📄 G

In [7]:
import os
import requests
import tarfile
import gzip
import shutil
import pandas as pd

# Optional DESeq2 import
try:
    #from pyDESeq2 import py_DESeq2
    from pydeseq2.dds import DeseqDataSet
    from pydeseq2.default_inference import DefaultInference
    from pydeseq2.ds import DeseqStats
    from sklearn.preprocessing import LabelEncoder
    use_deseq2 = True
except ImportError:
    print("⚠️ pyDESeq2 not found. Skipping DEG step.")
    use_deseq2 = False


def download_and_extract_gse(gse_id="GSE242272", base_dir="rna_seq_analysis"):
    download_dir = os.path.join(base_dir, gse_id)
    os.makedirs(download_dir, exist_ok=True)

    url = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={gse_id}&format=file"
    tar_path = os.path.join(download_dir, f"{gse_id}_supplement.tar")

    print(f"📦 Downloading: {url}")
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(tar_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"✅ Downloaded to: {tar_path}")
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return None

    print(f"🗂️ Extracting TAR...")
    try:
        with tarfile.open(tar_path, "r:*") as tar:
            tar.extractall(path=download_dir)
        print(f"✅ Extracted contents to: {download_dir}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return None

    return download_dir


def decompress_gz_files(root_dir: str):
    print(f"🔍 Decompressing .txt.gz files...")
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(".txt.gz"):
                gz_path = os.path.join(dirpath, filename)
                txt_path = gz_path[:-3]
                if os.path.exists(txt_path):
                    print(f"⏭️ Skipping: {txt_path} already exists.")
                    continue
                try:
                    with gzip.open(gz_path, 'rb') as f_in, open(txt_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    print(f"✅ Decompressed: {gz_path}")
                except Exception as e:
                    print(f"❌ Failed to decompress {gz_path}: {e}")

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
def run_deg_analysis_from_gse(gse_id="GSE242272", base_dir="rna_seq_analysis"):
    gse_dir = os.path.join(base_dir, gse_id)
    output_csv = os.path.join(base_dir, f"{gse_id}_deseq2_results.csv")

    count_dfs = []
    sample_conditions = []

    print(f"\n🔍 Searching GSM folders in: {gse_dir}")
    for fname in os.listdir(gse_dir):
        if fname.endswith(".txt") and not fname.endswith(".txt.gz"):
            file_path = os.path.join(gse_dir, fname)
            sample_id = fname.split("_")[0]
            try:
                df = pd.read_csv(file_path, sep="\t", header=None, names=["gene", sample_id])
                df.set_index("gene", inplace=True)
                count_dfs.append(df)

                label = "treated" if any(x in fname.lower() for x in ["treated", "pge2", "caffeine"]) else "control"
                sample_conditions.append((sample_id, label))
                print(f"  ✅ Loaded: {fname} as {label}")
            except Exception as e:
                print(f"⚠️ Failed to read {file_path}: {e}")

    if not count_dfs:
        raise ValueError("❌ No valid count files found.")

    combined_counts = pd.concat(count_dfs, axis=1).fillna(0).astype(int)

    print(f"\n✅ Count matrix shape: {combined_counts.shape}")
    print(combined_counts.iloc[:5, :5])

    # Save raw counts
    raw_csv = os.path.join(base_dir, f"{gse_id}_raw_counts.csv")
    combined_counts.to_csv(raw_csv)
    print(f"📄 Saved raw count matrix to: {raw_csv}")

    if not use_deseq2:
        return combined_counts

    # Prepare design metadata
    sample_df = pd.DataFrame(sample_conditions, columns=["sample", "condition"])
    sample_df.set_index("sample", inplace=True)

    print("\n🚀 Running DESeq2...")
    dds = py_DESeq2(
        count_matrix=combined_counts.T,
        design_matrix=sample_df,
        design_formula="~ condition",
        gene_column="gene"
    )
    dds.run_deseq()
    res = dds.get_deseq_result()
    res_sorted = res.sort_values("padj").dropna().head(50)
    res_sorted.to_csv(output_csv, index=False)
    print(f"📊 DESeq2 top results saved to: {output_csv}")

    return res_sorted


In [8]:
import pydeseq2
pydeseq2.__version__

'0.5.1'

In [9]:
gse_id = "GSE242272"
base_dir = "rna_seq_analysis"

# Step 1: Download and extract
path = download_and_extract_gse(gse_id, base_dir)

# Step 2: Decompress only missing .txt files
if path:
    decompress_gz_files(path)

# Step 3: Assemble and analyze
result = run_deg_analysis_from_gse(gse_id, base_dir)

📦 Downloading: https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE242272&format=file
✅ Downloaded to: rna_seq_analysis\GSE242272\GSE242272_supplement.tar
🗂️ Extracting TAR...
✅ Extracted contents to: rna_seq_analysis\GSE242272
🔍 Decompressing .txt.gz files...
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757588_CD8_24h_Vehicle1.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757589_CD8_24h_Vehicle2.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757590_CD8_24h_Vehicle3.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757591_CD8_24h_PGE2_1.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757592_CD8_24h_PGE2_2.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757593_CD8_24h_PGE2_3.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757594_CD8_48h_Vehicle1.tabular.txt already exists.
⏭️ Skipping: rna_seq_analysis\GSE242272\GSM7757595_CD8_48h_Vehicle2.tabular

NameError: name 'py_DESeq2' is not defined

In [None]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [10]:
# Updated import using correct PyDESeq2 structure
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

def run_deseq2_manual(counts_df, metadata_df, contrast=["condition", "treated", "control"], output_path="rna_seq_analysis"):
    # Filter out samples with missing conditions
    print("📋 Initial metadata:")
    print(metadata_df)

    samples_to_keep = ~metadata_df.condition.isna()
    counts_df = counts_df.loc[samples_to_keep]
    metadata_df = metadata_df.loc[samples_to_keep]

    # Filter genes with low counts
    genes_to_keep = counts_df.columns[counts_df.sum(axis=0) >= 10]
    counts_df = counts_df[genes_to_keep]

    # Set up DESeq2
    inference = DefaultInference(n_cpus=1)
    dds = DeseqDataSet(
        counts=counts_df,
        metadata=metadata_df,
        design_factors="condition",
        refit_cooks=True,
        inference=inference,
    )

    dds.deseq2()
    print("📊 LFC matrix:")
    print(dds.varm["LFC"].head())

    # Run differential expression stats
    ds = DeseqStats(dds, contrast=contrast, inference=inference)
    ds.summary()

    results_path = os.path.join(output_path, "results.csv")
    ds.results_df.to_csv(results_path)
    print(f"✅ DESeq2 results saved to: {results_path}")

    return ds.results_df


In [None]:
# Load real data saved earlier
gse_id = "GSE242272"
base_dir = "rna_seq_analysis"
counts_path = os.path.join(base_dir, f"{gse_id}_raw_counts.csv")
metadata_path = os.path.join(base_dir, f"{gse_id}_sample_metadata.csv")

# Read count matrix
counts_df = pd.read_csv(counts_path, index_col=0)

# Read or reconstruct metadata (if not saved separately, recreate it from column names)
sample_names = counts_df.index.tolist()
conditions = ["treated" if any(x in name.lower() for x in ["pge2", "caffeine", "treated"])
              else "control" for name in sample_names]
metadata_df = pd.DataFrame({"condition": conditions}, index=sample_names)

# Run DESeq2 analysis on real data
results_df = run_deseq2_manual(counts_df, metadata_df, contrast=["condition", "treated", "control"], output_path=base_dir)
results_df.head()

📋 Initial metadata:
              condition
0610005C13Rik   control
0610009B22Rik   control
0610009E02Rik   control
0610009L18Rik   control
0610010F05Rik   control
...                 ...
Zyx             control
Zzef1           control
Zzz3            control
a               control
ccdc198         control

[25239 rows x 1 columns]
Using None as control genes, passed at DeseqDataSet initialization


  dds = DeseqDataSet(
Fitting size factors...
  self.fit_size_factors(
Fitting dispersions...
... done in 2.20 seconds.

Fitting MAP dispersions...
... done in 1.44 seconds.



In [None]:
results_df.head()