In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import geopandas

In [2]:
# geo_simple_parser.py
# Minimal parser for GEO Series Matrix (.txt) into two DataFrames:
#   sample_df (rows = samples, columns = metadata)
#   expr_df   (rows = features/probes, columns = samples)
#
# Notes:
# - Assumes !Sample_characteristics_ch1 rows like "key: value"
# - Uses !series_matrix_table_begin / !series_matrix_table_end to find the table
# - Does NOT need GeoPandas (GEO here is Gene Expression Omnibus)

import io

def load_geo_series_matrix(path):
    # read all lines
    with open(path, encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    # --------- 1) collect sample metadata ---------
    # keep any !Sample_* lines
    sample_lines = [ln for ln in lines if ln.startswith("!Sample_")]

    # helper to split a tab line into fields (and strip quotes)
    def cells(line):
        parts = [p.strip().strip('"') for p in line.rstrip("\n").split("\t")]
        return parts

    # find number of samples (length of the first !Sample_* row minus the tag)
    n = max((len(cells(ln)) - 1 for ln in sample_lines), default=0)

    # start a dict of columns -> values
    meta = {}

    # capture a sample id column if present
    id_candidates = ["!Sample_geo_accession", "!Sample_title", "!Sample_name"]

    for ln in sample_lines:
        cols = cells(ln)
        tag, vals = cols[0], cols[1:]
        # pad short rows so everything has length n
        vals += [""] * (n - len(vals))

        # characteristics rows look like: "key: value"
        if tag.lower().startswith("!sample_characteristics"):
            # infer the column name from the first value's key
            first = vals[0]
            key = first.split(":", 1)[0].strip().lower().replace(" ", "_") if ":" in first else "characteristic"
            # keep only the value part
            vals = [v.split(":", 1)[1].strip() if ":" in v else v for v in vals]
            colname = key
        else:
            colname = tag[1:].lower()  # drop leading "!"

        # avoid overwriting duplicate column names
        while colname in meta:
            colname += "_dup"
        meta[colname] = vals

    # choose sample_id from common candidates or make generic names
    sample_id = None
    for tag in id_candidates:
        key = tag[1:].lower()
        if key in meta:
            sample_id = meta[key]
            break
    if sample_id is None:
        sample_id = [f"sample_{i+1}" for i in range(n)]
    meta["sample_id"] = sample_id

    sample_df = pd.DataFrame(meta).set_index("sample_id", drop=False)

    # --------- 2) read the expression table ---------
    # locate begin/end markers
    begin = next((i for i, ln in enumerate(lines) if ln.strip().lower() == "!series_matrix_table_begin"), None)
    end   = next((i for i, ln in enumerate(lines) if ln.strip().lower() == "!series_matrix_table_end"), None)

    if begin is None or end is None or end <= begin:
        # no table found -> empty frame
        expr_df = pd.DataFrame()
        return sample_df, expr_df

    table_text = "".join(lines[begin+1:end])
    expr_df = pd.read_csv(io.StringIO(table_text), sep="\t", dtype=str)
    expr_df = expr_df.set_index(expr_df.columns[0])       # first col = feature id
    expr_df.columns = [c.strip().strip('"') for c in expr_df.columns]
    expr_df = expr_df.apply(pd.to_numeric, errors="coerce")  # make numeric where possible

    return sample_df, expr_df

# quick demo (uncomment to run as a script):
# if __name__ == "__main__":
#     s, e = load_geo_series_matrix("GSE_series_matrix.txt")
#     print(s.head())
#     print(e.shape)


In [3]:
s, e = load_geo_series_matrix("GSE121239_series_matrix.txt")

In [4]:
#removing unnecessary collumns 
s = s.drop(columns=['sample_status', 'sample_status',
       'sample_submission_date', 'sample_last_update_date', 'sample_type',
       'sample_channel_count','sample_organism_ch1', 'sample_label_protocol_ch1', 'sample_taxid_ch1', 'sample_hyb_protocol',
       'sample_scan_protocol', 'sample_description', 'sample_data_processing',
       'sample_platform_id', 'sample_contact_name', 'sample_contact_email',
       'sample_contact_department', 'sample_contact_institute',
       'sample_contact_address', 'sample_contact_city', 'sample_contact_state',
       'sample_contact_zip/postal_code', 'sample_contact_country',
       'sample_supplementary_file', 'sample_data_row_count', 'sample_relation', 'sample_treatment_protocol_ch1','sample_growth_protocol_ch1', 'sample_molecule_ch1', 
       'sample_extract_protocol_ch1','sample_label_ch1', 'tissue'])

In [5]:
s['sledai'] = s['sledai'].astype(int)

In [None]:
#classificando as lesões e o caso do paciente: sledai: leve: 0-6, média 7-11, grave >=12

s['class'] = s['sledai'].apply(lambda x: 'leve' if 0 <= x <= 6 
                               else ('média' if 7 <= x <= 11 
                                             else 'grave'))
s.loc[s['sledai']==0, 'class'] = 'healthy'

In [None]:
#apenas pacientes com lupus 
s_lupus = s.loc[s['disease_state'] != 'Healthy' ]
s_lupus['patient_id'] = s_lupus['patient_id'].astype(int)
s_lupus = s_lupus.sort_values('patient_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s_lupus['patient_id'] = s_lupus['patient_id'].astype(int)


In [11]:
s_lupus

Unnamed: 0_level_0,sample_title,sample_geo_accession,sample_source_name_ch1,disease_state,patient_id,sledai,visit_date,imputed_neutrophil_percentage,sample_id,class
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GSM3428330,PBMC_SLE24_v1,GSM3428330,PBMC_SLE24_v1,Systemic Lupus Erythematosus,24,4,2009-10-15,89.3,GSM3428330,leve
GSM3428331,PBMC_SLE24_v2,GSM3428331,PBMC_SLE24_v2,Systemic Lupus Erythematosus,24,0,2009-12-07,68,GSM3428331,healthy
GSM3428332,PBMC_SLE24_v3,GSM3428332,PBMC_SLE24_v3,Systemic Lupus Erythematosus,24,8,2010-01-07,94.7,GSM3428332,média
GSM3428333,PBMC_SLE24_v4,GSM3428333,PBMC_SLE24_v4,Systemic Lupus Erythematosus,24,0,2010-02-19,64.9,GSM3428333,healthy
GSM3428334,PBMC_SLE24_v5,GSM3428334,PBMC_SLE24_v5,Systemic Lupus Erythematosus,24,0,2010-03-08,64.9,GSM3428334,healthy
...,...,...,...,...,...,...,...,...,...,...
GSM3428615,PBMC_SLE2129_v3,GSM3428615,PBMC_SLE2129_v3,Systemic Lupus Erythematosus,2129,6,2010-02-15,,GSM3428615,leve
GSM3428620,PBMC_SLE2132_v3,GSM3428620,PBMC_SLE2132_v3,Systemic Lupus Erythematosus,2132,4,2010-05-24,48.2,GSM3428620,leve
GSM3428621,PBMC_SLE2132_v4,GSM3428621,PBMC_SLE2132_v4,Systemic Lupus Erythematosus,2132,2,2010-12-15,56.2,GSM3428621,leve
GSM3428618,PBMC_SLE2132_v1,GSM3428618,PBMC_SLE2132_v1,Systemic Lupus Erythematosus,2132,2,2009-11-30,40.7,GSM3428618,leve


In [51]:
df_patient = pd.DataFrame()
df_patient['patient_id'] = np.unique(s_lupus['patient_id'])

In [52]:
df_patient['first_visit'] = s_lupus.groupby('patient_id')['visit_date'].min().values
df_patient['last_visit'] = s_lupus.groupby('patient_id')['visit_date'].max().values

df_patient['min_sledai'] = s_lupus.groupby('patient_id')['sledai'].min().values
df_patient['max_sledai'] = s_lupus.groupby('patient_id')['sledai'].max().values

In [57]:
def classify_lesions(df, col_sledai, col_to): 
    df[col_to] = df[col_sledai].apply(lambda x: 'leve' if 0 <= x <= 6 
                               else ('média' if 7 <= x <= 11 
                                             else 'grave'))

In [53]:
s_lupus['visit_date'] = pd.to_datetime(s_lupus['visit_date'])

# ---- primeira visita ----
first_visits = s_lupus.sort_values('visit_date').groupby('patient_id').first()

# ---- última visita ----
last_visits = s_lupus.sort_values('visit_date').groupby('patient_id').last()

# ---- merge no df_patient ----
df_patient = df_patient.merge(
    first_visits[['sledai']], 
    left_on='patient_id', 
    right_index=True, 
    how='left'
).rename(columns={'sledai': 'first_sledai'})

df_patient['first_class'] = df_patient['first_sledai'].apply(lambda x: 'leve' if 0 <= x <= 6 
                               else ('média' if 7 <= x <= 11 
                                             else 'grave'))

df_patient = df_patient.merge(
    last_visits[['sledai']], 
    left_on='patient_id', 
    right_index=True, 
    how='left'
).rename(columns={'sledai': 'last_sledai'})

df_patient['last_class'] = df_patient['last_sledai'].apply(lambda x: 'leve' if 0 <= x <= 6 
                               else ('média' if 7 <= x <= 11 
                                             else 'grave'))

In [54]:
df_patient['delta_sledai'] = df_patient['last_sledai'] - df_patient['first_sledai']

In [56]:
df_patient

Unnamed: 0,patient_id,first_visit,last_visit,min_sledai,max_sledai,first_sledai,first_class,last_sledai,last_class,delta_sledai
0,24,2009-10-15,2010-03-08,0,8,4,leve,0,leve,-4
1,46,2009-10-29,2010-08-26,0,10,2,leve,2,leve,0
2,113,2009-11-12,2010-07-29,0,2,0,leve,0,leve,0
3,244,2010-02-25,2011-11-10,0,10,0,leve,2,leve,2
4,317,2009-10-15,2011-10-31,2,10,2,leve,2,leve,0
...,...,...,...,...,...,...,...,...,...,...
60,2119,2009-09-28,2011-04-18,2,10,4,leve,10,média,6
61,2122,2010-07-29,2012-06-11,0,8,8,média,0,leve,-8
62,2128,2009-11-05,2011-11-03,2,4,4,leve,4,leve,0
63,2129,2009-10-15,2011-01-13,2,12,10,média,2,leve,-8


In [55]:
df_patient.loc[df_patient['last_class'] =='média']

Unnamed: 0,patient_id,first_visit,last_visit,min_sledai,max_sledai,first_sledai,first_class,last_sledai,last_class,delta_sledai
12,704,2009-12-14,2012-01-09,0,8,0,leve,8,média,8
40,1679,2009-10-22,2010-02-16,2,10,2,leve,10,média,8
46,1807,2009-10-29,2010-05-06,0,10,2,leve,10,média,8
60,2119,2009-09-28,2011-04-18,2,10,4,leve,10,média,6


In [40]:
df_patient['delta_sledai'].max()

np.int64(8)

In [24]:
s_lupus = s_lupus.sort_values('patient_id')
s_lupus.groupby("patient_id", group_keys=True)[['sledai', 'visit_date', 'class']].apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,sledai,visit_date,class
patient_id,sample_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,GSM3428330,4,2009-10-15,leve
24,GSM3428331,0,2009-12-07,healthy
24,GSM3428332,8,2010-01-07,média
24,GSM3428333,0,2010-02-19,healthy
24,GSM3428334,0,2010-03-08,healthy
...,...,...,...,...
2129,GSM3428617,2,2011-01-13,leve
2132,GSM3428618,2,2009-11-30,leve
2132,GSM3428619,4,2010-02-22,leve
2132,GSM3428620,4,2010-05-24,leve


In [None]:
df_group = s_group.groupby('patient_id')['visit_date'].min()
#df_group = s_group.groupby('patient_id')['visit_date'].max()

In [None]:
s_lupus.groupby('patient_id')['sledai'].max()

In [None]:
s_lupus.groupby('patient_id')['visit_date'].max()

In [None]:
df_patient = pd.DataFrame()
df_patient['patient_id'] = s_lupus.groupby('patient_id').index

In [None]:
s_lupus.groupby('patient_id')['patient_id']

In [None]:
s_lupus.groupby(as_index='patient_id', by='patient_id')[]

In [None]:
e.T