In [1]:
!pip3 install GEOparse



In [2]:
import GEOparse
import pandas as pd
import os
import logging

In [3]:
# Create the project directories that hold the results and the data files
# (exist_ok=True makes this a no-op when they already exist).
for directory in (
    "/Users/user/Desktop/breast_cancer_fr/results",
    "/Users/user/Desktop/breast_cancer_fr/data",
):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Only let messages of level ERROR or above through the "GEOparse" logger
geoparse_logger = logging.getLogger("GEOparse")
geoparse_logger.setLevel(logging.ERROR)

# Fetch the GSE50948 series (files are stored under ../data) and preview its phenotype table
gse = GEOparse.get_GEO(geo="GSE50948", destdir="../data")
gse.phenotype_data.head()

100%|██████████| 80.4M/80.4M [00:16<00:00, 5.11MB/s]  
  return read_csv(StringIO(data), index_col=None, sep="\t")


Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,characteristics_ch1.6.er,characteristics_ch1.7.pr,characteristics_ch1.8.her2,characteristics_ch1.9.arm,characteristics_ch1.10.treatment,characteristics_ch1.11.age,characteristics_ch1.12.ecog,characteristics_ch1.13.inflammatory.brca,characteristics_ch1.14.menopausal.status,characteristics_ch1.15.pcr
GSM1232992,BreastCancer_S002,GSM1232992,Public on Jan 21 2014,Sep 17 2013,Jun 06 2022,RNA,1,breast cancer,Homo sapiens,9606,...,,,,,,,,,,
GSM1232993,BreastCancer_S003,GSM1232993,Public on Jan 21 2014,Sep 17 2013,Jun 06 2022,RNA,1,breast cancer,Homo sapiens,9606,...,,,,,,,,,,
GSM1232994,BreastCancer_S004,GSM1232994,Public on Jan 21 2014,Sep 17 2013,Jun 06 2022,RNA,1,breast cancer,Homo sapiens,9606,...,,,,,,,,,,
GSM1232995,BreastCancer_S005,GSM1232995,Public on Jan 21 2014,Sep 17 2013,Jun 06 2022,RNA,1,breast cancer,Homo sapiens,9606,...,,,,,,,,,,
GSM1232996,BreastCancer_S006,GSM1232996,Public on Jan 21 2014,Sep 17 2013,Jun 06 2022,RNA,1,breast cancer,Homo sapiens,9606,...,,,,,,,,,,


In [5]:
# Parse structured metadata out of the 'characteristics_ch1' strings
def parse_characteristics(characteristics_list):
    """Convert a list of "key: value" strings into a dictionary.

    Each entry is split on its first colon only, so values may themselves
    contain colons. Keys are stripped and lower-cased; values are stripped.
    Entries without a colon are skipped, and a repeated key keeps the value
    of its last occurrence.

    Parameters
    ----------
    characteristics_list : iterable of str
        Raw characteristic strings.

    Returns
    -------
    dict
        Mapping from normalised key to stripped value.
    """
    split_entries = (
        entry.partition(":")
        for entry in characteristics_list
        if ":" in entry
    )
    return {
        label.strip().lower(): content.strip()
        for label, _, content in split_entries
    }

# Parse the characteristics of every sample: one record per GSM, in the order of gse.gsms
sample_records = [
    parse_characteristics(sample.metadata.get("characteristics_ch1", []))
    for sample in gse.gsms.values()
]
characteristics_df = pd.DataFrame(sample_records, index=gse.gsms.keys())

characteristics_df.head(2)

Unnamed: 0,centerid,patid,size_dissected_area,percentage_tumor_cells,cell_number_dissected,inflammatory_cells [%],invasive_tumor_area_size1 [mm],invasive_tumor_area_size2 [mm],invasive_tumor_cells [%],invasive_tumor_grade,...,er,pr,her2,arm,treatment,age,ecog,inflammatory.brca,menopausal.status,pcr
GSM1232992,40,161,40,70,22400,10,4,10,70,3,...,ER-,PR-,HER2+,HER2+ CT,neoadjuvant doxorubicin/paclitaxel (AT) follow...,66,0,yes,post,RD
GSM1232993,40,171,4,60,1500,10,2,7,80,2,...,ER-,PR-,HER2+,HER2+ CT,neoadjuvant doxorubicin/paclitaxel (AT) follow...,58,1,yes,post,RD


# Tratamiento de datos: characteristics_df

In [6]:
def map_binary(col, mapping, frame=None):
    """Encode a text status column as 0/1 flags.

    The column is read as strings, stripped, upper-cased and looked up in
    ``mapping``. Values that are not found in ``mapping`` but are already
    numerically 0 or 1 are kept as they are, so applying the encoding a second
    time (e.g. when the cell is re-run) does not wipe the column to NaN.
    Anything else becomes NaN.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    mapping : dict
        Upper-cased label -> encoded value.
    frame : pandas.DataFrame, optional
        Frame to read ``col`` from. Defaults to the notebook-level
        ``characteristics_df`` so existing calls keep working.

    Returns
    -------
    pandas.Series
        The encoded column, indexed like the source column.
    """
    source = characteristics_df if frame is None else frame
    raw = source[col]

    encoded = raw.astype(str).str.strip().str.upper().map(mapping)

    # Recover values that were already encoded as 0/1 instead of turning them into NaN
    numeric = pd.to_numeric(raw, errors="coerce")
    already_binary = numeric.where(numeric.isin([0, 1]))
    return encoded.fillna(already_binary)

# Lookup table from upper-cased status labels to 0/1 flags:
# positive / complete-response / yes labels -> 1, their counterparts -> 0
binary_map = {label: 1 for label in ("ER+", "PR+", "HER2+", "PCR", "YES")}
binary_map.update({label: 0 for label in ("ER-", "PR-", "HER2-", "RD", "NO")})

# Encode each status column in place using the shared lookup table
for col in ("er", "pr", "her2", "pcr", "inflammatory.brca"):
    characteristics_df[col] = map_binary(col, binary_map)

In [7]:
# Check how many samples (rows) characteristics_df holds
n_samples = characteristics_df.shape[0]
print("numero de samples en characteristics_df:", n_samples)

numero de samples en characteristics_df: 156


La columna 'pcr' indica la respuesta patológica al tratamiento neoadyuvante:
* 'pCR' (pathological Complete Response): no se detecta cáncer residual tras el tratamiento
* 'RD' (Residual Disease): queda enfermedad residual (el tratamiento no eliminó todo el tumor)

mapeamos estas categorías a valores binarios para facilitar el modelado predictivo:
* pCR → 1 (respuesta completa)
* RD  → 0 (sin respuesta completa)
esto permite tratar el problema como una clasificación binaria

In [8]:
# Rename the 'pcr' column to 'pcr_binary' (it is used directly as the target variable)
# and keep only the samples whose value is a valid binary label (0 or 1).
# rename() ignores labels that are not present, so re-running this cell after the
# column has already been renamed no longer fails with a KeyError on 'pcr'.
characteristics_df = characteristics_df.rename(columns={"pcr": "pcr_binary"})
characteristics_df = characteristics_df[
    characteristics_df["pcr_binary"].isin([0, 1])
].copy()
characteristics_df.head(2)

Unnamed: 0,centerid,patid,size_dissected_area,percentage_tumor_cells,cell_number_dissected,inflammatory_cells [%],invasive_tumor_area_size1 [mm],invasive_tumor_area_size2 [mm],invasive_tumor_cells [%],invasive_tumor_grade,...,er,pr,her2,arm,treatment,age,ecog,inflammatory.brca,menopausal.status,pcr_binary
GSM1232992,40,161,40,70,22400,10,4,10,70,3,...,0,0,1,HER2+ CT,neoadjuvant doxorubicin/paclitaxel (AT) follow...,66,0,1,post,0
GSM1232993,40,171,4,60,1500,10,2,7,80,2,...,0,0,1,HER2+ CT,neoadjuvant doxorubicin/paclitaxel (AT) follow...,58,1,1,post,0


In [9]:
# Save the DataFrame to a CSV file; index=True writes the row index as the first column
characteristics_df.to_csv(
    path_or_buf="/Users/user/Desktop/breast_cancer_fr/data/characteristics_df.csv",
    index=True,
)

In [10]:
def _sample_column(sample):
    """Return the sample's 'VALUE' column, indexed by 'ID_REF' and named after the sample."""
    values = sample.table.set_index("ID_REF")[["VALUE"]]
    return values.rename(columns={"VALUE": sample.name})


# Build the expression matrix: rows are indexed by 'ID_REF', one column per sample
expression_matrix = pd.concat(
    [_sample_column(sample) for sample in gse.gsms.values()],
    axis=1,
)

# Transpose it so that samples become rows and the 'ID_REF' entries become columns
expression_t = expression_matrix.transpose()
expression_t.index.name = "Sample"

# Strip any surrounding whitespace from the sample identifiers of both tables
expression_t.index = expression_t.index.str.strip()
characteristics_df.index = characteristics_df.index.str.strip()

# Restrict both tables to the samples they have in common
common_samples = expression_t.index.intersection(characteristics_df.index)
expression_t, characteristics_df = (
    expression_t.loc[common_samples],
    characteristics_df.loc[common_samples],
)

In [11]:
# Join the clinical data (characteristics_df) with the expression matrix (expression_t)
# on the row index (inner join, so only shared samples are kept), then turn that index
# into a regular column.
full_data = characteristics_df.join(expression_t, how="inner").reset_index()

# Final check: print the frame's shape (rows, columns) and its first 31 column names
print(full_data.shape)
print(full_data.columns[:31])

(156, 54705)
Index(['index', 'centerid', 'patid', 'size_dissected_area',
       'percentage_tumor_cells', 'cell_number_dissected',
       'inflammatory_cells [%]', 'invasive_tumor_area_size1 [mm]',
       'invasive_tumor_area_size2 [mm]', 'invasive_tumor_cells [%]',
       'invasive_tumor_grade', 'necrotic_cells [%]',
       'normal_epithelial_cells [%]', 'preneoplastic_tumor_cells [%]',
       'stroma_cells [%]', 'bgus.ct', 'er.ct', 'her2.ct', 'pr.ct', 'race',
       'er', 'pr', 'her2', 'arm', 'treatment', 'age', 'ecog',
       'inflammatory.brca', 'menopausal.status', 'pcr_binary', '1007_s_at'],
      dtype='object')


Let's explore where the nulls come from

In [12]:
# Count missing values per column and show only the columns that have at least one
faltantes = full_data.isna().sum()
print(faltantes.loc[faltantes.gt(0)])

size_dissected_area               2
percentage_tumor_cells            2
cell_number_dissected             2
inflammatory_cells [%]            2
invasive_tumor_area_size1 [mm]    3
invasive_tumor_area_size2 [mm]    2
invasive_tumor_cells [%]          2
invasive_tumor_grade              2
necrotic_cells [%]                2
normal_epithelial_cells [%]       2
preneoplastic_tumor_cells [%]     2
stroma_cells [%]                  2
bgus.ct                           1
pr.ct                             2
dtype: int64


In [13]:
# Show the rows that contain at least one missing value
rows_with_nulls = full_data.isna().any(axis="columns")
full_data.loc[rows_with_nulls]

Unnamed: 0,index,centerid,patid,size_dissected_area,percentage_tumor_cells,cell_number_dissected,inflammatory_cells [%],invasive_tumor_area_size1 [mm],invasive_tumor_area_size2 [mm],invasive_tumor_cells [%],...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
17,GSM1233009,40,321,6.0,90.0,3240.0,10.0,,5.0,80.0,...,2.082038,1.61801,2.025026,2.023271,2.79102,1.687044,1.474732,1.310008,1.702085,2.1671
56,GSM1233048,26,441,24.0,75.0,14400.0,5.0,8.0,3.0,75.0,...,2.380466,2.001925,2.622977,2.501423,3.6857,1.79407,2.11088,1.877111,2.270053,2.090914
154,GSM1233146,7,161,,,,,,,,...,2.000076,1.395142,2.077709,1.7849,3.027264,2.055914,1.718584,1.587461,1.794698,1.727104
155,GSM1233147,28,231,,,,,,,,...,1.994588,1.567696,2.187575,1.897464,3.037246,1.468921,1.495367,1.471581,1.673586,1.662956


In [14]:
# Only 4 samples contain missing values, so we drop those rows
full_data = full_data.dropna(axis="index", how="any")
full_data.head()

Unnamed: 0,index,centerid,patid,size_dissected_area,percentage_tumor_cells,cell_number_dissected,inflammatory_cells [%],invasive_tumor_area_size1 [mm],invasive_tumor_area_size2 [mm],invasive_tumor_cells [%],...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,GSM1232992,40,161,40,70,22400,10,4,10,70,...,1.617069,1.55283,1.984722,1.771254,2.813851,1.631844,1.55537,1.291948,1.621068,1.704353
1,GSM1232993,40,171,4,60,1500,10,2,7,80,...,1.763601,1.728114,1.875366,2.290922,3.171749,1.878034,1.457797,1.532488,1.845472,2.110035
2,GSM1232994,40,191,19,45,5130,10,6,10,60,...,1.754134,1.486587,1.752249,2.392434,2.765988,1.736611,1.490477,1.597638,1.692226,1.959056
3,GSM1232995,40,141,40,90,28800,5,10,4,90,...,1.687064,1.283018,1.701278,1.89929,2.705988,1.396333,1.340778,1.385656,1.585541,1.924786
4,GSM1232996,40,131,40,90,28800,5,4,10,90,...,1.80656,1.376001,2.109005,1.785332,2.86205,1.70406,1.684241,1.368954,1.808343,1.74166


In [15]:
# Check that the 4 rows have been removed
full_data.shape

(152, 54705)

# Data types

In [16]:
# Show the dtypes of the first 20 columns
print(full_data.dtypes.iloc[:20])

index                             object
centerid                          object
patid                             object
size_dissected_area               object
percentage_tumor_cells            object
cell_number_dissected             object
inflammatory_cells [%]            object
invasive_tumor_area_size1 [mm]    object
invasive_tumor_area_size2 [mm]    object
invasive_tumor_cells [%]          object
invasive_tumor_grade              object
necrotic_cells [%]                object
normal_epithelial_cells [%]       object
preneoplastic_tumor_cells [%]     object
stroma_cells [%]                  object
bgus.ct                           object
er.ct                             object
her2.ct                           object
pr.ct                             object
race                              object
dtype: object


In [17]:
# Clinical columns that hold numbers but are currently stored as text
cols_numericas = [
    "size_dissected_area",
    "percentage_tumor_cells",
    "cell_number_dissected",
    "inflammatory_cells [%]",
    "invasive_tumor_area_size1 [mm]",
    "invasive_tumor_area_size2 [mm]",
    "invasive_tumor_cells [%]",
    "invasive_tumor_grade",
    "necrotic_cells [%]",
    "normal_epithelial_cells [%]",
    "preneoplastic_tumor_cells [%]",
    "stroma_cells [%]",
    "bgus.ct",
    "er.ct",
    "her2.ct",
    "pr.ct",
]

# Convert them to numeric dtypes; errors="coerce" turns unparseable entries into NaN
full_data = full_data.assign(
    **{
        name: pd.to_numeric(full_data[name], errors="coerce")
        for name in cols_numericas
    }
)



In [18]:
# Show the dtypes of the first 20 columns again, after the numeric conversion
print(full_data.dtypes.iloc[:20])

index                              object
centerid                           object
patid                              object
size_dissected_area               float64
percentage_tumor_cells              int64
cell_number_dissected               int64
inflammatory_cells [%]              int64
invasive_tumor_area_size1 [mm]    float64
invasive_tumor_area_size2 [mm]    float64
invasive_tumor_cells [%]            int64
invasive_tumor_grade                int64
necrotic_cells [%]                  int64
normal_epithelial_cells [%]         int64
preneoplastic_tumor_cells [%]       int64
stroma_cells [%]                    int64
bgus.ct                           float64
er.ct                             float64
her2.ct                           float64
pr.ct                             float64
race                               object
dtype: object


# Ahora vamos a inspeccionar las variables categóricas para ver cómo las tratamos.

In [19]:
# Collect the names of all columns with dtype object
object_columns = full_data.select_dtypes(include="object").columns
cols_object = list(object_columns)

# Print them as a bulleted list
print("Columnas de tipo object:")
for name in cols_object:
    print(f"- {name}")

Columnas de tipo object:
- index
- centerid
- patid
- race
- arm
- treatment
- age
- ecog
- menopausal.status


In [20]:
# Object-typed columns to inspect, one per line
cols_object = [
    "index",
    "centerid",
    "patid",
    "race",
    "arm",
    "treatment",
    "age",
    "ecog",
    "menopausal.status",
]

# Print the distinct values of each column to decide how it should be treated
for name in cols_object:
    print(f"\n{name}:")
    print(full_data[name].unique())


index:
['GSM1232992' 'GSM1232993' 'GSM1232994' 'GSM1232995' 'GSM1232996'
 'GSM1232997' 'GSM1232998' 'GSM1232999' 'GSM1233000' 'GSM1233001'
 'GSM1233002' 'GSM1233003' 'GSM1233004' 'GSM1233005' 'GSM1233006'
 'GSM1233007' 'GSM1233008' 'GSM1233010' 'GSM1233011' 'GSM1233012'
 'GSM1233013' 'GSM1233014' 'GSM1233015' 'GSM1233016' 'GSM1233017'
 'GSM1233018' 'GSM1233019' 'GSM1233020' 'GSM1233021' 'GSM1233022'
 'GSM1233023' 'GSM1233024' 'GSM1233025' 'GSM1233026' 'GSM1233027'
 'GSM1233028' 'GSM1233029' 'GSM1233030' 'GSM1233031' 'GSM1233032'
 'GSM1233033' 'GSM1233034' 'GSM1233035' 'GSM1233036' 'GSM1233037'
 'GSM1233038' 'GSM1233039' 'GSM1233040' 'GSM1233041' 'GSM1233042'
 'GSM1233043' 'GSM1233044' 'GSM1233045' 'GSM1233046' 'GSM1233047'
 'GSM1233049' 'GSM1233050' 'GSM1233051' 'GSM1233052' 'GSM1233053'
 'GSM1233054' 'GSM1233055' 'GSM1233056' 'GSM1233057' 'GSM1233058'
 'GSM1233059' 'GSM1233060' 'GSM1233061' 'GSM1233062' 'GSM1233063'
 'GSM1233064' 'GSM1233065' 'GSM1233066' 'GSM1233067' 'GSM1233068'
 '

In [21]:
# Drop the columns that are not needed
full_data = full_data.drop(columns=["index", "centerid", "patid"])

# Save this intermediate frame (categorical columns not yet encoded) as CSV in 'data'
full_data.to_csv("/Users/user/Desktop/breast_cancer_fr/data/data_categorical_plots.csv", index=False)

# Convert the numeric columns stored as text ('age', 'ecog') and
# encode 'menopausal.status' as pre -> 0, post -> 1
full_data = full_data.assign(
    age=pd.to_numeric(full_data["age"], errors="coerce"),
    ecog=pd.to_numeric(full_data["ecog"], errors="coerce"),
    **{"menopausal.status": full_data["menopausal.status"].map({"pre": 0, "post": 1})},
)

# Encode 'race' and 'arm' as dummy variables, dropping the first level of each
full_data = pd.get_dummies(full_data, columns=["race", "arm"], drop_first=True)

In [22]:
# Count the samples per target value, including missing values if there are any
target_counts = full_data["pcr_binary"].value_counts(dropna=False)
print(target_counts)

pcr_binary
0    102
1     50
Name: count, dtype: int64


In [23]:
# Verify that no missing values remain: list the columns that still contain NaN
# and count the NaN values in each of them
has_nan = full_data.isna().any()
columnas_con_nan = full_data.columns[has_nan]
full_data[columnas_con_nan].isna().sum()

Series([], dtype: float64)

# nombre de las columnas:

In [24]:
# Literal (non-regex) substitutions applied to the column names, in this order:
# unit suffixes first, then remaining spaces, dots and square brackets
replacements = [
    (" [mm]", "_mm"),
    (" [%]", "_pct"),
    (" ", "_"),
    (".", ""),
    ("[", ""),
    ("]", ""),
]

cleaned_columns = full_data.columns.str.strip()
for old, new in replacements:
    cleaned_columns = cleaned_columns.str.replace(old, new, regex=False)
full_data.columns = cleaned_columns

In [25]:
# Save the final DataFrame as a CSV file in the 'data' folder (row index not written)
full_data.to_csv(
    path_or_buf="/Users/user/Desktop/breast_cancer_fr/data/processed_data.csv",
    index=False,
)