In [1]:
# importing libraries
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import GEOparse
import plotly.graph_objects as go

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### To-do List

* Entrega Parcial
    * Construir df final (ok)
    * Diferenciar e agrupar pacientes leves e medios/graves pelo sledai (ok)
    * Analise de expressao diferencial
    * Montar a rede de expressão diferencial e analisar

* Neutrofilos -> Velocidade da resposta imune (reduz vies da analise do médico)
    * -> Encontrar genes mais responsáveis pelo aumento no perecentual de neutrofilos
    * -> Selecionar os genes mais relevantes para o aumento dos neutrofilos
    * -> Correlação entre nivel de expressao genica e resposta imune

In [2]:
import GEOparse
import pandas as pd
import numpy as np

class GEOParser:
    """
    Parser for GEO datasets using GEOparse.
    First version specific for GSE121239 (Lúpus).

    Parameters
    -----------
    geo_id : str
        GEO Series ID (e.g., 'GSE121239').
    destdir : str
        Directory to download GEO data.
    gene_id_cols : list of str, optional
        Columns to use for gene ID mapping. Default is ['ID', 'Gene Symbol', 'Gene Title'].
    meta_map : dict, optional
        Mapping of metadata fields to their index in characteristics_ch1. 
    """
    def __init__(self, geo_id, destdir, gene_id_cols=None, meta_map=None):
        self.geo_id = geo_id
        self.destdir = destdir
        self.gene_id_cols = gene_id_cols or ['ID', 'Gene Symbol', 'Gene Title']
        self.meta_map = meta_map
        self.gse = None
        self.gpl = None

    def load(self):
        """
        Load GEO Series data.
        """
        self.gse = GEOparse.get_GEO(geo=self.geo_id, destdir=self.destdir, silent=True)

    def try_cast(self, val, typ):
        """
        Try to cast a value to a specified type, return NaN on failure.

        Parameters
        ----------
        val : any
            Value to cast.
        typ : type
            Target type for casting.
        """
        try:
            return typ(val)
        except (ValueError, TypeError):
            return np.nan

    def select_main_gene(self, gene_str, separator='///'):
        """
        Select the main gene from a string of gene symbols separated by a given separator.
        Parameters
        ----------
        gene_str : str
            String containing gene symbols separated by the specified separator.
        separator : str, optional
            Separator used in the gene string. Default is '///'.
        Returns
        -------
        main_gene : str or np.nan
            The selected main gene symbol, or NaN if input is NaN.
        """
        if pd.isna(gene_str):
            return np.nan
        # Lista de genes encontrados
        genes = [g.strip() for g in gene_str.split(separator)]

        # Prioriza genes que não começam com LOC (Locus genômico)
        known_genes = [g for g in genes if not g.startswith('LOC')]
        if known_genes:
            return known_genes[0]
        return genes[0]  # Se só tem LOC, retorna o primeiro

    def parse(self):
        """
        Parse the loaded GEO Series data.

        Returns
        -------
        gene_data: pd.DataFrame
            Dataframe with Gene expression data.
        patient_data: pd.DataFrame
            Dataframe with Patient metadata.
        """
        if self.gse is None:
            self.load()

        # 1. Cria listas com metadados e expressao dos paciente e expressao genica 
        patient_meta_list = []
        gene_data_list = []

        # 2. Itera sobre os GSMs para extrair metadados e dados de expressão
        for gsm_name, gsm in self.gse.gsms.items():

            # 2.1. Extrai metadados do paciente
            meta = gsm.metadata['characteristics_ch1']
            patient_meta = {'sample_code': gsm_name}

            # 2.2. Parse de metadados de acordo com o mapeamento fornecido
            for key, idx in self.meta_map.items():
                # 2.3 . Extrai valor da lista de metadados e faz cast apropriado
                val = meta[idx].split(':')[1].strip() if len(meta) > idx else None
                if key == 'sledai':
                    val = self.try_cast(val, int)
                elif key == 'visit_date':
                    val = self.try_cast(val, pd.Timestamp)
                elif key == 'neutrophil_percent':
                    val = self.try_cast(val, float)
                # 2.4. Adiciona informação ao dicionario de metadados do paciente
                patient_meta[key] = val
            
            patient_meta_list.append(patient_meta)

            # 2.5. Extrai dados de expressão gênica
            gene_df = gsm.table[['ID_REF', 'VALUE']].copy()
            gene_df['sample_code'] = gsm_name
            gene_data_list.append(gene_df)

        # 3. Concatena dados de expressão gênica em um único DataFrame
        gene_data = pd.concat(gene_data_list, ignore_index=True)

        # 4. GPLS: plataforma com anotação de genes, incluindo nome e símbolo
        self.gpl = self.gse.gpls[list(self.gse.gpls.keys())[0]].table

        # 5. Recupera nome e título do gene com base no ID
        gene_data = gene_data.merge(self.gpl[self.gene_id_cols], left_on='ID_REF', right_on='ID', how='left')
        gene_data = gene_data.rename(columns={'VALUE': 'gene_expression_value', 'Gene Symbol': 'gene_symbol', 'Gene Title': 'gene_title'})
        gene_data = gene_data[['sample_code', 'ID', 'gene_symbol', 'gene_title', 'gene_expression_value']]

        # 6. Metadados do paciente em dataframe
        patient_data = pd.DataFrame(patient_meta_list)
        patient_data['patient_id'] = patient_data['patient_id'].replace({'NA': np.nan})
        patient_data = patient_data.sort_values(by=['patient_id', 'visit_date'])

        return gene_data, patient_data

## Loading Data

In [3]:
meta_map = {
    'state': 0,
    'patient_id': 1,
    'sledai': 2,
    'visit_date': 3,
    'neutrophil_percent': 5
}

# Cria objeto parser
parser = GEOParser(geo_id="GSE121239", destdir="../../data/raw/", meta_map=meta_map)

# Parse dos dados e armazena em tabelas
gene_data, patient_data = parser.parse()

# Mantem o "gene principal" em casos de genes separados por '///'
gene_data['main_gene'] = gene_data['gene_symbol'].apply(parser.select_main_gene)

# Remove valores de microarray (Affymetrix) -> Controle de qualidade de experimento
gene_data = gene_data[~gene_data['ID'].str.startswith('AFFX')]

  return read_csv(StringIO(data), index_col=None, sep="\t")


In [4]:
patient_data.head()

Unnamed: 0,sample_code,state,patient_id,sledai,visit_date,neutrophil_percent
105,GSM3428415,Systemic Lupus Erythematosus,1001,4,2009-10-08,65.2
106,GSM3428416,Systemic Lupus Erythematosus,1001,2,2010-01-11,61.5
107,GSM3428417,Systemic Lupus Erythematosus,1001,6,2010-03-29,57.5
108,GSM3428418,Systemic Lupus Erythematosus,1041,2,2009-09-24,73.8
109,GSM3428419,Systemic Lupus Erythematosus,1041,10,2009-12-10,64.1


In [6]:
gene_data.head()

Unnamed: 0,sample_code,ID,gene_symbol,gene_title,gene_expression_value,main_gene
0,GSM3428310,1007_PM_s_at,DDR1,discoidin domain receptor tyrosine kinase 1,4.955257,DDR1
1,GSM3428310,1053_PM_at,RFC2,"replication factor C (activator 1) 2, 40kDa",5.984784,RFC2
2,GSM3428310,117_PM_at,HSPA6,heat shock 70kDa protein 6 (HSP70B'),9.477945,HSPA6
3,GSM3428310,121_PM_at,PAX8,paired box 8,4.553229,PAX8
4,GSM3428310,1255_PM_g_at,GUCA1A,guanylate cyclase activator 1A (retina),1.92119,GUCA1A


### Classificando lesões

In [7]:
def classify_lesion(sledai_score):
    if sledai_score == 0:
        return 'healthy'
    if 1 <= sledai_score <= 6:
        return 'leve'
    elif 7 <= sledai_score <= 11:
        return 'media'
    else:
        return 'grave'

In [8]:
patient_data['class'] = patient_data['sledai'].apply(classify_lesion)
patient_data['class_group'] = np.where(patient_data['class'].isin(['grave', 'media']), 'medio/grave', 'leve')
patient_data['class_group'] = np.where(patient_data['class'] == 'healthy', 'healthy', patient_data['class_group'])

In [9]:
patient_data.head()

Unnamed: 0,sample_code,state,patient_id,sledai,visit_date,neutrophil_percent,class,class_group
105,GSM3428415,Systemic Lupus Erythematosus,1001,4,2009-10-08,65.2,leve,leve
106,GSM3428416,Systemic Lupus Erythematosus,1001,2,2010-01-11,61.5,leve,leve
107,GSM3428417,Systemic Lupus Erythematosus,1001,6,2010-03-29,57.5,leve,leve
108,GSM3428418,Systemic Lupus Erythematosus,1041,2,2009-09-24,73.8,leve,leve
109,GSM3428419,Systemic Lupus Erythematosus,1041,10,2009-12-10,64.1,media,medio/grave


In [27]:
gene_data.to_parquet('../../data/processed/gene_data.parquet', index=False)
patient_data.to_csv('../../data/processed/patient_data.csv', index=False)

In [24]:
patient_data

Unnamed: 0,sample_code,state,patient_id,sledai,visit_date,neutrophil_percent,class,class_group
105,GSM3428415,Systemic Lupus Erythematosus,1001,4,2009-10-08,65.2,leve,leve
106,GSM3428416,Systemic Lupus Erythematosus,1001,2,2010-01-11,61.5,leve,leve
107,GSM3428417,Systemic Lupus Erythematosus,1001,6,2010-03-29,57.5,leve,leve
108,GSM3428418,Systemic Lupus Erythematosus,1041,2,2009-09-24,73.8,leve,leve
109,GSM3428419,Systemic Lupus Erythematosus,1041,10,2009-12-10,64.1,media,medio/grave
...,...,...,...,...,...,...,...,...
15,GSM3428325,Healthy,,0,NaT,,healthy,healthy
16,GSM3428326,Healthy,,0,NaT,,healthy,healthy
17,GSM3428327,Healthy,,0,NaT,,healthy,healthy
18,GSM3428328,Healthy,,0,NaT,,healthy,healthy


In [None]:
def plot_pie_distribution(patient_data, col='class'):
    if col == 'class':
        colors = ["#ABACB2", "#bb77ba", "#39b3c2", "#7e7ae7"]
    else:
        colors = ["#39b3c2", "#7e7ae7"]
        
    fig = go.Figure(data=[go.Pie(
        labels=[label.capitalize() for label in np.sort(patient_data[col].unique())],
        values=patient_data[col].value_counts(normalize=True).sort_index().values,
        hole=0.34,
        marker_colors=colors,
        textinfo='percent+label',
        insidetextorientation='radial',
        textfont=dict(family='Arial Black')
    )])
    
    fig.update_layout(
        title=dict(x=0.5, text='Distribuição do Grau de Lesão (SLEDAI) nos Pacientes'),
        template='plotly_white',
        showlegend=False,
        width=500
    )
    
    fig.show()

fig = plot_pie_distribution(patient_data, col='class')
fig = plot_pie_distribution(patient_data.query("class_group != 'healthy'"), col='class_group')

In [17]:
# Filtra a observação com maior SLEDAI por paciente
patient_data['max_sledai'] = patient_data.groupby('patient_id')['sledai'].transform('max')
patient_data_max_sledai = patient_data.query("sledai == max_sledai").drop(columns=['max_sledai'])

In [18]:
df = gene_data.merge(patient_data_max_sledai, on='sample_code')
# df = df[['sample_code', 'patient_id', 'state', 'sledai', 'visit_date', 'neutrophil_percent', 'gene_symbol', 'gene_title', 'gene_expression_value', 'ID_REF']]

In [26]:
import scipy.stats as stats

# Supondo que df tem colunas: 'gene_symbol', 'gene_expression_value', 'class'
# Exemplo: comparar 'leve' vs 'grave'

genes = df['main_gene'].dropna().unique()
results = []

for gene in genes:
    group1 = df[(df['main_gene'] == gene) & (df['class_group'] == 'leve')]['gene_expression_value']
    group2 = df[(df['main_gene'] == gene) & (df['class_group'] == 'medio/grave')]['gene_expression_value']
    stat, pval = stats.ttest_ind(group1, group2, equal_var=False)
    results.append({'gene': gene, 'pval': pval, 'mean_leve': group1.mean(), 'mean_grave': group2.mean()})

KeyboardInterrupt: 

In [None]:
# DataFrame com resultados
res_df = pd.DataFrame(results)
res_df['log2fc'] = np.log2(res_df['mean_grave'] + 1) - np.log2(res_df['mean_leve'] + 1)
res_df = res_df.sort_values('pval')

# Genes mais diferenciais
top_genes = res_df[res_df['pval'] < 0.05].sort_values('log2fc', ascending=False)

In [68]:
results

[{'gene': 'DDR1',
  'pval': np.float64(0.2100797001084677),
  'mean_leve': np.float64(4.425244505479166),
  'mean_grave': np.float64(4.369157444695312)},
 {'gene': 'RFC2',
  'pval': np.float64(0.20727321476765737),
  'mean_leve': np.float64(6.259382690402777),
  'mean_grave': np.float64(6.191040201328125)},
 {'gene': 'HSPA6',
  'pval': np.float64(0.9562692446052213),
  'mean_leve': np.float64(9.587159257618055),
  'mean_grave': np.float64(9.582517183390625)},
 {'gene': 'PAX8',
  'pval': np.float64(0.9477600426952131),
  'mean_leve': np.float64(3.476633524646701),
  'mean_grave': np.float64(3.4725149381523437)},
 {'gene': 'GUCA1A',
  'pval': np.float64(0.8995143302419972),
  'mean_leve': np.float64(3.27648892130787),
  'mean_grave': np.float64(3.2895532785624995)},
 {'gene': 'UBA7',
  'pval': np.float64(0.027651728205105915),
  'mean_leve': np.float64(7.942164412815973),
  'mean_grave': np.float64(7.825528932)},
 {'gene': 'THRA',
  'pval': np.float64(0.9894981719394134),
  'mean_leve': 

In [None]:
import plotly.graph_objects as go

# Adiciona coluna para -log10(pval)
res_df['-log10_pval'] = -np.log10(res_df['pval'])

# Define cores: destaque genes significativos
colors = np.where((res_df['pval'] < 0.05) & (abs(res_df['log2fc']) > 1), '#A259C6', '#4CB944')

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=res_df['log2fc'],
    y=res_df['-log10_pval'],
    mode='markers',
    marker=dict(
        color=colors,
        size=8,
        line=dict(width=0.5, color='DarkSlateGrey')
    ),
    text=res_df['gene'],  # hover com nome do gene
    hovertemplate='Gene: %{text}<br>log2FC: %{x:.2f}<br>-log10(pval): %{y:.2f}<extra></extra>'
))

fig.update_layout(
    title='Volcano Plot - Expressão Diferencial',
    xaxis_title='log2 Fold Change',
    yaxis_title='-log10(p-valor)',
    template='plotly_white',
    width=700,
    height=500
)

fig.show()

In [None]:
# Visualizacoes uteis

# gse = GEOparse.get_GEO(geo="GSE121239", destdir="../../data/raw/", silent=True)
# gse.gsms['GSM3428621'].metadata # Metadados pacientes
# gse.gsms['GSM3428621'].table # Tabela de genes
# gpl.head() # De para de Genes

{'title': ['PBMC_SLE2132_v4'],
 'geo_accession': ['GSM3428621'],
 'status': ['Public on Oct 15 2018'],
 'submission_date': ['Oct 15 2018'],
 'last_update_date': ['Oct 15 2018'],
 'type': ['RNA'],
 'channel_count': ['1'],
 'source_name_ch1': ['PBMC_SLE2132_v4'],
 'organism_ch1': ['Homo sapiens'],
 'taxid_ch1': ['9606'],
 'characteristics_ch1': ['disease state: Systemic Lupus Erythematosus',
  'patient id: 2132',
  'sledai: 2',
  'visit date: 2010-12-15',
  'tissue: PBMC',
  'imputed neutrophil percentage: 56.2'],
 'treatment_protocol_ch1': ['Standard.'],
 'growth_protocol_ch1': ['Whole Blood samples in collected fom PAXgene tubes.'],
 'molecule_ch1': ['total RNA'],
 'extract_protocol_ch1': ['The RNA was extracted with Trizol reagent (Life Technologies) and cleaned up with RNA easy (Qiagen) according to the manufacturer’s protocols.'],
 'label_ch1': ['Biotin'],
 'label_protocol_ch1': ['NuGEN Ovation.'],
 'hyb_protocol': ['Standard Affymetrix protcol.'],
 'scan_protocol': ['HTAPS scan was