# Capstone Project: Neuroblastoma gene expression data analysis

In [1]:
import GEOparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
import itertools
import requests

# Working on the metadata table

In [76]:
gse_id = "GSE49711"
geo_dir = "data"
os.makedirs(geo_dir, exist_ok=True)
# if you need to download from GEO:
# gse = GEOparse.get_GEO(geo=gse_id, destdir=geo_dir)
# if you already have the SOFT file locally:
gse = GEOparse.get_GEO(filepath=os.path.join(geo_dir, gse_id + "_family.soft.gz"))
meta_df = gse.phenotype_data


08-Jul-2025 14:53:33 INFO GEOparse - Parsing data/GSE49711_family.soft.gz: 
08-Jul-2025 14:53:33 DEBUG GEOparse - DATABASE: GeoMiame
08-Jul-2025 14:53:33 DEBUG GEOparse - SERIES: GSE49711
08-Jul-2025 14:53:33 DEBUG GEOparse - PLATFORM: GPL17553
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205736
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205737
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205738
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205739
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205740
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205741
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205742
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205743
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205744
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205745
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205746
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205747
08-Jul-2025 14:53:33 DEBUG GEOparse - SAMPLE: GSM1205748
08-Jul-2025 14

In [38]:
meta_df.columns

Index(['title', 'tissue', 'dataset', 'Sex', 'age_at_diagnosis', 'mycn_status',
       'high_risk', 'inss_stage', 'class_label', 'progression',
       'death_from_disease'],
      dtype='object')

In [77]:
# optionally, drop columns
meta_cols_to_keep = meta_df.columns[(meta_df.columns == 'title') | (meta_df.columns.str.startswith('characteristics'))]
meta_df = meta_df[meta_cols_to_keep]

# cleanup column names a bit
# e.g. "characteristics_ch1.3.age at diagnosis" -> "age_at_diagnosis"
newcols = meta_df.columns.str.replace(r'^.*\.', '', regex=True)
newcols
newcols = newcols.str.replace(' ', '_')
meta_df.columns = newcols

meta_df.head()

Unnamed: 0,title,tissue,dataset,Sex,age_at_diagnosis,mycn_status,high_risk,inss_stage,class_label,progression,death_from_disease
GSM1205736,SEQC_NB001,neuroblastoma,1,M,987,0,1,4,1.0,1,1
GSM1205737,SEQC_NB002,neuroblastoma,2,M,1808,0,1,4,1.0,1,1
GSM1205738,SEQC_NB003,neuroblastoma,1,F,625,1,1,4,1.0,1,1
GSM1205739,SEQC_NB004,neuroblastoma,2,F,335,1,1,2,,0,0
GSM1205740,SEQC_NB005,neuroblastoma,1,F,536,1,1,4,1.0,1,1


In [78]:
# makin ga copy of the original meta_df, to avoid SettingWithCopyWarning
meta_tr = meta_df.copy()
meta_tr.head()


Unnamed: 0,title,tissue,dataset,Sex,age_at_diagnosis,mycn_status,high_risk,inss_stage,class_label,progression,death_from_disease
GSM1205736,SEQC_NB001,neuroblastoma,1,M,987,0,1,4,1.0,1,1
GSM1205737,SEQC_NB002,neuroblastoma,2,M,1808,0,1,4,1.0,1,1
GSM1205738,SEQC_NB003,neuroblastoma,1,F,625,1,1,4,1.0,1,1
GSM1205739,SEQC_NB004,neuroblastoma,2,F,335,1,1,2,,0,0
GSM1205740,SEQC_NB005,neuroblastoma,1,F,536,1,1,4,1.0,1,1


In [79]:
meta_tr.columns

Index(['title', 'tissue', 'dataset', 'Sex', 'age_at_diagnosis', 'mycn_status',
       'high_risk', 'inss_stage', 'class_label', 'progression',
       'death_from_disease'],
      dtype='object')

In [80]:
meta_tr.head()

Unnamed: 0,title,tissue,dataset,Sex,age_at_diagnosis,mycn_status,high_risk,inss_stage,class_label,progression,death_from_disease
GSM1205736,SEQC_NB001,neuroblastoma,1,M,987,0,1,4,1.0,1,1
GSM1205737,SEQC_NB002,neuroblastoma,2,M,1808,0,1,4,1.0,1,1
GSM1205738,SEQC_NB003,neuroblastoma,1,F,625,1,1,4,1.0,1,1
GSM1205739,SEQC_NB004,neuroblastoma,2,F,335,1,1,2,,0,0
GSM1205740,SEQC_NB005,neuroblastoma,1,F,536,1,1,4,1.0,1,1


In [81]:
#clean up table - remove NAs from class_label
meta_tr["class_label"] = meta_tr["class_label"].map({"N/A": "Neither", "0": "Favorable", "1": "Unfavorable"})

meta_tr.class_label.value_counts()


class_label
Neither        226
Favorable      181
Unfavorable     91
Name: count, dtype: int64

In [82]:
# redefine mycn_status
meta_tr["mycn_status"] = meta_tr["mycn_status"].map({"0": "Normal", "1": "Amplified"})

meta_tr["mycn_status"].value_counts()

mycn_status
Normal       401
Amplified     92
Name: count, dtype: int64

In [83]:
meta_tr.isna().sum()

title                 0
tissue                0
dataset               0
Sex                   0
age_at_diagnosis      0
mycn_status           5
high_risk             0
inss_stage            0
class_label           0
progression           0
death_from_disease    0
dtype: int64

In [84]:
meta_tr = meta_tr.dropna()

In [85]:
meta_tr.dataset.value_counts()

dataset
2    247
1    246
Name: count, dtype: int64

In [86]:
meta_tr.head(4)

Unnamed: 0,title,tissue,dataset,Sex,age_at_diagnosis,mycn_status,high_risk,inss_stage,class_label,progression,death_from_disease
GSM1205736,SEQC_NB001,neuroblastoma,1,M,987,Normal,1,4,Unfavorable,1,1
GSM1205737,SEQC_NB002,neuroblastoma,2,M,1808,Normal,1,4,Unfavorable,1,1
GSM1205738,SEQC_NB003,neuroblastoma,1,F,625,Amplified,1,4,Unfavorable,1,1
GSM1205739,SEQC_NB004,neuroblastoma,2,F,335,Amplified,1,2,Neither,0,0


# Working on the expression data table

## download the table and read in as an object df_expression

In [None]:
# DOWNLOADING EXPRESSION TABLE
'''url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE49nnn/GSE49711/matrix/GSE49711_SEQC_NB_MAV_G_log2.20121127.txt.gz"
output_file = "GSE49711_SEQC_NB_MAV_G_log2.20121127.txt.gz"

response = requests.get(url)
with open(output_file, "wb") as:
    f.write(response.content)

print("Download complete.")'''

In [9]:
df_expression = pd.read_csv("/home/gracie/Downloads/day4-20250702T073025Z-1-001/data/GSE49711_SEQC_NB_MAV_G_log2.20121127.txt", sep="\t")

  df_expression = pd.read_csv("/home/gracie/Downloads/day4-20250702T073025Z-1-001/data/GSE49711_SEQC_NB_MAV_G_log2.20121127.txt", sep="\t")


In [11]:
df_expression.head()

Unnamed: 0,#Gene,#NCBI GeneId,#RefSeq transcript Id,#Chromosome,#Strand,#from base,#to base,#Title,#Measured object,SEQC_NB001,...,SEQC_NB489,SEQC_NB490,SEQC_NB491,SEQC_NB492,SEQC_NB493,SEQC_NB494,SEQC_NB495,SEQC_NB496,SEQC_NB497,SEQC_NB498
0,ALB,213,NM_000477.5,4,+,74249329,74287139,"complex locus ALB, encoding albumin.",ALB:Gene_AceView,9.29,...,9.89,6.48,7.73,9.02,8.47,7.78,8.58,8.41,8.62,10.2
1,CD24L4.1,938;100133941,NM_013230.2,Y,-,21152409,21154677,"gene CD24L4.1, encoding CD24 molecule-like 4 p...",CD24L4.1:Gene_AceView,18.82,...,19.99,20.17,18.94,20.42,20.75,20.86,20.57,21.04,20.13,20.02
2,RPS11,6205,NM_001015.4,19,+,49999634,50002944,ribosomal protein S11,RPS11:Gene_RefSeq,21.17,...,22.98,23.31,21.14,22.85,20.08,20.91,20.67,21.1,21.04,20.82
3,RPS18,6222,NM_022551.2,6,+,33239799,33244290,"gene RPS18, encoding ribosomal protein S18.",RPS18:Gene_AceView,20.9,...,22.28,22.87,20.93,22.39,20.71,20.95,21.03,21.66,20.49,21.11
4,C5orf13,9315,NM_001142474.1;NM_001142475.1;NM_001142476.1;N...,5,-,110998323,111312627,"gene C5orf13, encoding chromosome 5 open readi...",C5orf13:Gene_AceView,20.02,...,21.55,21.04,20.57,21.06,22.16,21.98,21.56,21.8,22.06,21.31


## Clean up the table and remove duplicated genes

In [18]:
# keep only genes with a NCBI gene ID
df = df_expression.loc[~df_expression["#NCBI GeneId"].isna() & ~df_expression['#RefSeq transcript Id'].isna()]
# select relevant columns from expression df before merging with sample metadata
expr = df[ df.columns[df.columns.str.contains('#Gene') | df.columns.str.startswith("SEQC_")] ].rename(columns={"#Gene": "ID"})
expr.head()

Unnamed: 0,ID,SEQC_NB001,SEQC_NB002,SEQC_NB003,SEQC_NB004,SEQC_NB005,SEQC_NB006,SEQC_NB007,SEQC_NB008,SEQC_NB009,...,SEQC_NB489,SEQC_NB490,SEQC_NB491,SEQC_NB492,SEQC_NB493,SEQC_NB494,SEQC_NB495,SEQC_NB496,SEQC_NB497,SEQC_NB498
0,ALB,9.29,7.99,9.25,9.35,8.99,9.26,8.15,9.54,10.72,...,9.89,6.48,7.73,9.02,8.47,7.78,8.58,8.41,8.62,10.2
1,CD24L4.1,18.82,19.8,20.25,20.44,20.09,20.38,19.75,20.21,19.66,...,19.99,20.17,18.94,20.42,20.75,20.86,20.57,21.04,20.13,20.02
2,RPS11,21.17,20.42,22.44,21.22,22.09,21.23,21.0,22.6,21.43,...,22.98,23.31,21.14,22.85,20.08,20.91,20.67,21.1,21.04,20.82
3,RPS18,20.9,20.36,22.0,21.08,21.71,20.74,20.88,22.55,21.63,...,22.28,22.87,20.93,22.39,20.71,20.95,21.03,21.66,20.49,21.11
4,C5orf13,20.02,21.7,21.05,21.89,21.65,21.14,21.48,20.64,21.81,...,21.55,21.04,20.57,21.06,22.16,21.98,21.56,21.8,22.06,21.31


In [None]:
# check for duplicated gene names...
dup_id = expr.loc[expr.ID.duplicated(), :].ID

# check the expression levels of these dup genes
expr.loc[expr["ID"].isin(dup_id)]
# expression values of dup genes in the same samples are not always identical
# for the sake of simplicity, remove dup genes using ~expr.ID.duplicated()


Unnamed: 0,ID,SEQC_NB001,SEQC_NB002,SEQC_NB003,SEQC_NB004,SEQC_NB005,SEQC_NB006,SEQC_NB007,SEQC_NB008,SEQC_NB009,...,SEQC_NB489,SEQC_NB490,SEQC_NB491,SEQC_NB492,SEQC_NB493,SEQC_NB494,SEQC_NB495,SEQC_NB496,SEQC_NB497,SEQC_NB498
66,INS-IGF2,15.67,16.07,15.25,14.51,18.72,17.67,21.72,15.94,16.77,...,15.37,19.42,13.16,15.23,17.46,18.49,16.32,18.20,15.28,17.28
105,RPS21,19.16,18.34,20.40,18.89,20.23,18.95,18.81,20.53,20.37,...,20.70,20.91,20.09,20.69,18.10,18.71,19.20,19.47,18.52,19.02
106,RPS21,19.25,18.39,20.49,18.96,20.31,19.03,18.89,20.59,20.45,...,20.80,21.01,20.17,20.80,18.18,18.82,19.32,19.52,18.60,19.13
167,INS-IGF2,14.64,14.95,14.06,13.28,17.44,16.52,20.60,14.73,15.59,...,14.29,18.29,11.99,14.02,16.45,17.47,15.26,17.13,14.44,16.34
256,EIF4A1,17.79,18.22,19.36,18.56,19.12,19.16,17.96,19.44,19.17,...,19.49,19.73,18.88,19.33,18.51,18.61,18.60,18.91,18.55,17.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56218,LOC728728,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
56427,OR1D5,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,3.85,...,0.00,0.00,6.93,0.00,0.00,0.00,0.00,0.00,0.00,0.00
56428,OR1D5,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,4.44,0.00,0.00,0.00,0.00,0.00,0.00,0.00
56787,SFTPA1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,5.38,0.16,0.00,0.00


In [22]:

print(f"before removing dup: {expr.shape}")
expr = expr.loc[~expr.ID.duplicated(), :].copy()
print(f"after removing dup: {expr.shape}")


before removing dup: (22861, 499)
after removing dup: (22674, 499)


In [None]:
# transpose to have patient IDs on the rows
# gene names as column names
expr = expr.set_index('ID').T
expr.head()

ID,ALB,CD24L4.1,RPS11,RPS18,C5orf13,CCT2,COL1A1,DDX1,EEF1A1,FLT3LG_,...,USP17L5,USP17L7,USP17L8,VTRNA2,WFDC11,WFDC9,XAGE2,XAGE2B,ZFATAS,ZP4
SEQC_NB001,9.29,18.82,21.17,20.9,20.02,16.31,18.6,15.73,21.71,20.02,...,0.0,0.0,0.0,0.0,4.74,0.0,0.0,0.0,0.0,0.0
SEQC_NB002,7.99,19.8,20.42,20.36,21.7,16.74,18.76,15.62,21.08,19.23,...,0.0,0.0,0.0,0.0,2.98,0.0,0.0,0.0,0.0,0.0
SEQC_NB003,9.25,20.25,22.44,22.0,21.05,17.06,19.39,22.84,22.72,21.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SEQC_NB004,9.35,20.44,21.22,21.08,21.89,16.83,20.87,15.81,22.18,20.02,...,0.0,0.0,0.0,0.0,5.14,0.0,0.0,0.0,0.0,0.0
SEQC_NB005,8.99,20.09,22.09,21.71,21.65,16.85,23.02,15.79,22.24,20.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
expr.isna().sum()

ID
ALB         0
CD24L4.1    0
RPS11       0
RPS18       0
C5orf13     0
           ..
WFDC9       0
XAGE2       0
XAGE2B      0
ZFATAS      0
ZP4         0
Length: 22674, dtype: int64

In [None]:
expr.isna().sum().sum()

0

If many NAs, one needs to impute:
- replace NAs by 0
- replace by median/mean
- ...

# merge expression table and metadata

In [26]:
print(expr.shape)
print(meta_df.shape)

(498, 22674)
(498, 11)


In [87]:
# merge expression and metadata
expr_tr = meta_tr.merge(expr, left_on="title", right_index=True)
expr_tr.head(4)

Unnamed: 0,title,tissue,dataset,Sex,age_at_diagnosis,mycn_status,high_risk,inss_stage,class_label,progression,...,USP17L5,USP17L7,USP17L8,VTRNA2,WFDC11,WFDC9,XAGE2,XAGE2B,ZFATAS,ZP4
GSM1205736,SEQC_NB001,neuroblastoma,1,M,987,Normal,1,4,Unfavorable,1,...,0.0,0.0,0.0,0.0,4.74,0.0,0.0,0.0,0.0,0.0
GSM1205737,SEQC_NB002,neuroblastoma,2,M,1808,Normal,1,4,Unfavorable,1,...,0.0,0.0,0.0,0.0,2.98,0.0,0.0,0.0,0.0,0.0
GSM1205738,SEQC_NB003,neuroblastoma,1,F,625,Amplified,1,4,Unfavorable,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1205739,SEQC_NB004,neuroblastoma,2,F,335,Amplified,1,2,Neither,0,...,0.0,0.0,0.0,0.0,5.14,0.0,0.0,0.0,0.0,0.0
