In [None]:
# Cargando el módulo Entrez de la biblioteca biopython y otras bibliotecas necesarias para esta sesión
from Bio import Entrez
import pandas as pd
from io import StringIO
import subprocess
import os

# Proporcionar correo electrónico
Entrez.email = "yeimicc@lcg.unam.mx"
# Buscar el UID del BioProject
project_acc = "PRJNA168994"
handle = Entrez.esearch(db="bioproject", term=project_acc)
search_results = Entrez.read(handle)
print(search_results)
handle.close()
# Extraer solo el UID
project_uid = search_results["IdList"][0]
print("UID del BioProject:", project_uid)

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['168994'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'PRJNA168994[All Fields]', 'Field': 'All Fields', 'Count': '1', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'PRJNA168994[All Fields]'}
UID del BioProject: 168994


* Bio.Entrez es un modulo de biopiton para interactuar con las cases de datos de NCBI
* pandas para manipular datos tabulares
* StringIO para tratar cadenas como archivos
* subprocess para ejecutar comandos del sistema desde Python
* os para interactuar con el sistema operativo (rutas o archivos)

NCBI requiere un correo electronico para rastrear el uso de su API, obligatorio para usar Entrez

project_acc: es el num de acceso del BioProject a consultar
Entrez.esearch busca en la base de datos bioproject usando el ID
handle es un archivo temporal, formato XML, tienes que leer el contenido y convertirlo a un diccionario de Python


In [4]:
# Obtener resumen del BioProject
handle = Entrez.esummary(db="bioproject", id=project_uid)
proj_record = Entrez.read(handle)
print(proj_record)
handle.close()

{'DocumentSummarySet': DictElement({'DocumentSummary': [DictElement({'TaxId': '7227', 'Project_Id': '168994', 'Project_Acc': 'PRJNA168994', 'Project_Type': 'Primary submission', 'Project_Data_Type': 'Transcriptome or Gene expression', 'Sort_By_ProjectType': '889987', 'Sort_By_DataType': '887347', 'Sort_By_Organism': '130844', 'Project_Subtype': '', 'Project_Target_Scope': 'Multiisolate', 'Project_Target_Material': 'Transcriptome', 'Project_Target_Capture': 'Whole', 'Project_MethodType': 'Array', 'Project_Method': '', 'Project_Objectives_List': [{'Project_ObjectivesType': 'Expression', 'Project_Objectives': ''}], 'Registration_Date': '2012/06/20 00:00', 'Project_Name': 'Drosophila melanogaster', 'Project_Title': 'Drosophila melanogaster Transcriptome or Gene expression', 'Project_Description': 'RNA profiling data sets generated by the Drosophila modENCODE project.', 'Keyword': '', 'Relevance_Agricultural': '', 'Relevance_Medical': '', 'Relevance_Industrial': '', 'Relevance_Environmental

In [5]:
# Seleccionando la información relacionada al proyecto
proj = proj_record["DocumentSummarySet"]["DocumentSummary"][0]

# Mostrando todos los campos del diccionario
print(proj.keys())

# Accediendo a algunos campos
print("\n=== Información del BioProject ===")
print("Acceso:", proj["Project_Acc"])
print("Título:", proj["Project_Title"])
print("Organismo:", proj["Organism_Name"])
print("Descripción:", proj.get("Project_Description"))

dict_keys(['TaxId', 'Project_Id', 'Project_Acc', 'Project_Type', 'Project_Data_Type', 'Sort_By_ProjectType', 'Sort_By_DataType', 'Sort_By_Organism', 'Project_Subtype', 'Project_Target_Scope', 'Project_Target_Material', 'Project_Target_Capture', 'Project_MethodType', 'Project_Method', 'Project_Objectives_List', 'Registration_Date', 'Project_Name', 'Project_Title', 'Project_Description', 'Keyword', 'Relevance_Agricultural', 'Relevance_Medical', 'Relevance_Industrial', 'Relevance_Environmental', 'Relevance_Evolution', 'Relevance_Model', 'Relevance_Other', 'Organism_Name', 'Organism_Strain', 'Organism_Label', 'Sequencing_Status', 'Submitter_Organization', 'Submitter_Organization_List', 'Supergroup'])

=== Información del BioProject ===
Acceso: PRJNA168994
Título: Drosophila melanogaster Transcriptome or Gene expression
Organismo: Drosophila melanogaster
Descripción: RNA profiling data sets generated by the Drosophila modENCODE project.


In [6]:
# Revisar enlaces desde BioProject hacia GEO (GDS), con el uid del bioproject
handle = Entrez.elink(dbfrom="bioproject", db="gds", id=project_uid)
linksBioProj = Entrez.read(handle)
handle.close()

#print(linksBioProj)

if linksBioProj[0]["LinkSetDb"]:
    geo_ids = [link["Id"] for link in linksBioProj[0]["LinkSetDb"][0]["Link"]]
    print("\nEl proyecto", project_acc, " está asociado a la base de datos de GEO")
    print("UIDs GEO asociados:", geo_ids[:5], "... total:", len(geo_ids))
else:
    print("\nEl proyecto", project_acc, " NO tiene enlaces directos a GEO")    


El proyecto PRJNA168994  está asociado a la base de datos de GEO
UIDs GEO asociados: ['200040045', '200040043', '200040042', '200040040', '200040039'] ... total: 156


In [7]:
# Verificando la información de GEO
geo_uid = geo_ids[1]  # ejemplo, segundo UID
handle = Entrez.esummary(db="gds", id=geo_uid)
summary = Entrez.read(handle)
handle.close()
print(summary[0])
print(summary[0].keys())

{'Item': [], 'Id': '200040043', 'Accession': 'GSE40043', 'GDS': '', 'title': 'Drosophila melanogaster Pupae 2-4 days', 'summary': 'modENCODE_submission_757 This submission comes from a modENCODE project of Eric Lai. For full list of modENCODE projects, see http://www.genome.gov/26524648 Project Goal: We plan to generate a comprehensive catalog of expressed and functional microRNAs, and generate biological evidence for their regulatory activity. We plan also to delineate the primary transcription units of microRNA genes. Finally, we plan to annotate other classes of non-miRNA expressed small RNAs, as least some of which may define novel classes of small RNA genes. For data usage terms and conditions, please refer to http://www.genome.gov/27528022 and http://www.genome.gov/Pages/Research/ENCODE/ENCODEDataReleasePolicyFinal2008.pdf', 'GPL': '9058', 'GSE': '40043', 'taxon': 'Drosophila melanogaster', 'entryType': 'GSE', 'gdsType': 'Non-coding RNA profiling by high throughput sequencing', '

In [8]:
# imprimiendo los campos
geo_record=summary[0]
print("=== GEO Accession ===")
print("Accession:", geo_record.get("Accession"))
print("Tipo:", geo_record.get("entryType"))
print("Título:", geo_record.get("title"))
print("Resumen:", geo_record.get("summary"))
print("Organismo:", geo_record.get("taxon"))
print("Número de muestras:", geo_record.get("n_samples"))
print("PubMed IDs:", geo_record.get("PubMedIds"))
print("Enlace FTP:", geo_record.get("FTPLink"))
print("Archivos suplementarios:", geo_record.get("suppFile"))
print("Proyectos: ",geo_record.get("Projects"))

# Muestras asociadas
print("\n=== Muestras ===")
for sample in geo_record.get("Samples",[]):
    print(sample['Accession'],":",sample['Title'])

=== GEO Accession ===
Accession: GSE40043
Tipo: GSE
Título: Drosophila melanogaster Pupae 2-4 days
Resumen: modENCODE_submission_757 This submission comes from a modENCODE project of Eric Lai. For full list of modENCODE projects, see http://www.genome.gov/26524648 Project Goal: We plan to generate a comprehensive catalog of expressed and functional microRNAs, and generate biological evidence for their regulatory activity. We plan also to delineate the primary transcription units of microRNA genes. Finally, we plan to annotate other classes of non-miRNA expressed small RNAs, as least some of which may define novel classes of small RNA genes. For data usage terms and conditions, please refer to http://www.genome.gov/27528022 and http://www.genome.gov/Pages/Research/ENCODE/ENCODEDataReleasePolicyFinal2008.pdf
Organismo: Drosophila melanogaster
Número de muestras: IntegerElement(2, attributes={})
PubMed IDs: []
Enlace FTP: ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE40nnn/GSE40043/
Archivos s

In [9]:
# Obtener el UID correspondiente a la muestra
gsm_id = "GSM461177"
handle = Entrez.esearch(db="sra", term=gsm_id)
record = Entrez.read(handle)
print(record)

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['14790'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'GSM461177[All Fields]', 'Field': 'All Fields', 'Count': '1', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'GSM461177[All Fields]'}


In [13]:
# Consultar si existe el identificador
if not record["IdList"]:
    print("No se encontraron SRR para", gsm_id)
else:
    sra_id = record["IdList"]
# Obtener la información de la corrida como texto
    handle = Entrez.efetch(db="sra", id=sra_id, rettype="runinfo", retmode="text")
    runinfo = handle.read()
    handle.close()
    
    if isinstance(runinfo, bytes):
        runinfo = runinfo.decode("utf-8")

# Cargar en pandas el resultado, directamente desde la cadena
    df = pd.read_csv(StringIO(runinfo))
    print(df.columns)
    display(df[['Run','Experiment','Platform','LibraryName','LibraryLayout','Sample','ScientificName','SampleName']].head())

Index(['Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates',
       'avgLength', 'size_MB', 'AssemblyName', 'download_path', 'Experiment',
       'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource',
       'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model',
       'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample',
       'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName',
       'g1k_pop_code', 'source', 'g1k_analysis_group', 'Subject_ID', 'Sex',
       'Disease', 'Tumor', 'Affection_Status', 'Analyte_Type',
       'Histological_Type', 'Body_Site', 'CenterName', 'Submission',
       'dbgap_study_accession', 'Consent', 'RunHash', 'ReadHash'],
      dtype='object')


Unnamed: 0,Run,Experiment,Platform,LibraryName,LibraryLayout,Sample,ScientificName,SampleName
0,SRR031714,SRX014459,ILLUMINA,S2_DRSC_Untreated-3,PAIRED,SRS008447,Drosophila melanogaster,GSM461177
1,SRR031715,SRX014459,ILLUMINA,S2_DRSC_Untreated-3,PAIRED,SRS008447,Drosophila melanogaster,GSM461177


In [None]:
# Crear la ruta del directorio de entrada y salida de datos
output_dir = ""

# Hacer el prefetch de los archivos srr
subprocess.run(["prefetch", "--output-directory", output_dir, "SRR031714"], check=True)
subprocess.run(["prefetch", "--output-directory", output_dir, "SRR031715"], check=True)

FileNotFoundError: [WinError 2] El sistema no puede encontrar el archivo especificado