In [1]:
#Import pandas and load the file into a DataFrame 
import pandas as pd

In [2]:
df_gpl16570 = pd.read_csv("GPL16570.txt", sep="\t")

In [3]:
#select required columns to create a new dataframe
df_gpl16570_selected = df_gpl16570[["ID", "gene_assignment"]]


In [4]:
#remove null data(na/---) and put it into a new dataframe
df_gpl16570_annotation = df_gpl16570_selected.dropna(subset=["gene_assignment"])
df_gpl16570_annotation = df_gpl16570_annotation[df_gpl16570_annotation["gene_assignment"] != "---"]

In [5]:
#extract gene symbol from gene assignment by string split, save as a new dataframe
df_gpl16570_annotation["Gene Symbol"] = df_gpl16570_annotation["gene_assignment"].str.split(" // ").str[1]
df_gpl16570_genesymbol= df_gpl16570_annotation[["ID","Gene Symbol"]]

In [6]:
# Download the Series Matrix file (Processed Data)
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE225nnn/GSE225001/matrix/GSE225001_series_matrix.txt.gz

--2025-03-20 21:00:35--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE225nnn/GSE225001/matrix/GSE225001_series_matrix.txt.gz
130.14.250.7, 130.14.250.12, 130.14.250.10, ...h.gov)... 
connected. to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:443... 
HTTP request sent, awaiting response... 200 OK
Length: 937104 (915K) [application/x-gzip]
Saving to: ‘GSE225001_series_matrix.txt.gz’


2025-03-20 21:00:36 (1.69 MB/s) - ‘GSE225001_series_matrix.txt.gz’ saved [937104/937104]



In [7]:
# Load the Series Matrix file into pandas
df_expression = pd.read_csv("GSE225001_series_matrix.txt.gz", sep="\t", comment="!", compression="gzip",index_col=0)
df_expression = df_expression.reset_index().rename(columns={"index": "ID"})  # Ensure 'ID' is a column


In [8]:
df_expression.head()

Unnamed: 0,ID_REF,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854
0,17210850,4.99,6.96,6.06,4.17,4.32,4.66,7.78,10.85,4.66,5.5,4.29,6.54
1,17210852,6.36,6.32,7.06,6.28,5.43,5.98,5.5,8.11,7.78,6.96,5.31,6.06
2,17210855,770.69,803.41,843.36,849.22,942.27,792.35,861.08,935.76,855.13,809.0,765.36,849.22
3,17210869,436.55,369.65,424.61,413.0,418.77,487.75,410.15,471.14,330.84,461.44,464.65,424.61
4,17210883,12.47,13.36,15.03,15.03,10.13,13.09,15.35,22.63,16.11,14.83,17.51,15.14


In [9]:
df_gpl16570_genesymbol.head()

Unnamed: 0,ID,Gene Symbol
2,17210855,Lypla1
3,17210869,Tcea1
5,17210887,Atp6v1h
6,17210904,Oprk1
7,17210912,Rb1cc1


In [12]:
df_expression.rename(columns={"ID_REF": "ID"}, inplace=True)

In [13]:
df_expression.head()

Unnamed: 0,ID,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854
0,17210850,4.99,6.96,6.06,4.17,4.32,4.66,7.78,10.85,4.66,5.5,4.29,6.54
1,17210852,6.36,6.32,7.06,6.28,5.43,5.98,5.5,8.11,7.78,6.96,5.31,6.06
2,17210855,770.69,803.41,843.36,849.22,942.27,792.35,861.08,935.76,855.13,809.0,765.36,849.22
3,17210869,436.55,369.65,424.61,413.0,418.77,487.75,410.15,471.14,330.84,461.44,464.65,424.61
4,17210883,12.47,13.36,15.03,15.03,10.13,13.09,15.35,22.63,16.11,14.83,17.51,15.14


In [14]:
# Merge expression data with annotation
df_merged = df_expression.merge(df_gpl16570_genesymbol, on="ID", how="left")

In [15]:
df_merged.head()

Unnamed: 0,ID,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854,Gene Symbol
0,17210850,4.99,6.96,6.06,4.17,4.32,4.66,7.78,10.85,4.66,5.5,4.29,6.54,
1,17210852,6.36,6.32,7.06,6.28,5.43,5.98,5.5,8.11,7.78,6.96,5.31,6.06,
2,17210855,770.69,803.41,843.36,849.22,942.27,792.35,861.08,935.76,855.13,809.0,765.36,849.22,Lypla1
3,17210869,436.55,369.65,424.61,413.0,418.77,487.75,410.15,471.14,330.84,461.44,464.65,424.61,Tcea1
4,17210883,12.47,13.36,15.03,15.03,10.13,13.09,15.35,22.63,16.11,14.83,17.51,15.14,


In [16]:
# Drop Probe ID and set Gene Symbol as index
df_merged.drop(columns=["ID"], inplace=True)
df_merged.set_index("Gene Symbol", inplace=True)


In [19]:
# If multiple probes map to the same gene, take the average expression
df_final = df_merged.groupby("Gene Symbol").mean()

In [20]:
df_final.head()

Unnamed: 0_level_0,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0610007N19Rik,36.5,40.22,44.32,48.5,34.54,35.26,30.7,37.53,29.24,39.4,32.9,34.06
0610007P08Rik,84.45,73.01,82.71,104.69,118.6,95.01,72.5,73.01,76.11,77.17,64.45,76.64
0610007P14Rik,786.88,666.29,685.02,613.11,652.58,588.13,749.61,508.46,922.88,617.37,719.08,749.61
0610009B14Rik,45.57,45.57,45.25,39.67,39.67,43.71,42.22,53.08,46.53,48.5,49.52,43.71
0610009B22Rik,81.01,64.0,72.0,89.26,123.64,88.65,83.29,89.26,68.12,80.45,75.58,67.65


In [21]:
# Save the Processed Data
df_final.to_csv("GSE225001_gene_expression.csv")

In [29]:
# Z-score normalization for each gene (row)
df_normalized = df_final.copy()  # Make a copy to keep the original data
df_normalized.iloc[:] = df_final.iloc[:].apply(lambda x: (x - x.mean()) / x.std(), axis=1)

In [30]:
df_normalized.head()

Unnamed: 0_level_0,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0610007N19Rik,-0.077852,0.594358,1.335235,2.090568,-0.432028,-0.301922,-1.125922,0.10827,-1.389746,0.446183,-0.728378,-0.518764
0610007P08Rik,0.080657,-0.65506,-0.031244,1.382309,2.276873,0.759779,-0.687858,-0.65506,-0.455696,-0.387527,-1.205561,-0.421611
0610007P14Rik,0.914155,-0.203556,-0.029953,-0.696464,-0.330629,-0.927996,0.568711,-1.666432,2.174696,-0.65698,0.285738,0.568711
0610009B14Rik,0.081847,0.081847,0.0,-1.427215,-1.427215,-0.393891,-0.774993,2.002705,0.32739,0.831263,1.092152,-0.393891
0610009B22Rik,-0.056845,-1.132208,-0.626452,0.464716,2.638202,0.426152,0.087296,0.464716,-0.871744,-0.092248,-0.400127,-0.901457


In [31]:
# Save the cleaned, filtered, and normalized data
df_normalized.to_csv("GSE225001_cleaned_normalized.csv", index=False)

In [32]:
df_normalized.head()

Unnamed: 0_level_0,GSM7036843,GSM7036844,GSM7036845,GSM7036846,GSM7036847,GSM7036848,GSM7036849,GSM7036850,GSM7036851,GSM7036852,GSM7036853,GSM7036854
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0610007N19Rik,-0.077852,0.594358,1.335235,2.090568,-0.432028,-0.301922,-1.125922,0.10827,-1.389746,0.446183,-0.728378,-0.518764
0610007P08Rik,0.080657,-0.65506,-0.031244,1.382309,2.276873,0.759779,-0.687858,-0.65506,-0.455696,-0.387527,-1.205561,-0.421611
0610007P14Rik,0.914155,-0.203556,-0.029953,-0.696464,-0.330629,-0.927996,0.568711,-1.666432,2.174696,-0.65698,0.285738,0.568711
0610009B14Rik,0.081847,0.081847,0.0,-1.427215,-1.427215,-0.393891,-0.774993,2.002705,0.32739,0.831263,1.092152,-0.393891
0610009B22Rik,-0.056845,-1.132208,-0.626452,0.464716,2.638202,0.426152,0.087296,0.464716,-0.871744,-0.092248,-0.400127,-0.901457
