In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy.io as sio
import os

from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

# GSE8052

In [2]:
path = "../../../../nasdatafolder/MTL/Data/Raw_Data/"

In [3]:
annotation = pd.read_csv(path + "GPL570-55999.txt", sep='\t')

In [4]:
annotation

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
0,1007_s_at,DDR1 /// MIR4640,U48705,U48705,discoidin domain receptor tyrosine kinase 1 //...
1,1053_at,RFC2,M87338,M87338,"replication factor C (activator 1) 2, 40kDa"
2,117_at,HSPA6,X51757,X51757,heat shock 70kDa protein 6 (HSP70B')
3,121_at,PAX8,X69699,X69699,paired box 8
4,1255_g_at,GUCA1A,L36861,L36861,guanylate cyclase activator 1A (retina)
...,...,...,...,...,...
54670,AFFX-ThrX-5_at,,,AFFX-ThrX-5,
54671,AFFX-ThrX-M_at,,,AFFX-ThrX-M,
54672,AFFX-TrpnX-3_at,,,AFFX-TrpnX-3,
54673,AFFX-TrpnX-5_at,,,AFFX-TrpnX-5,


In [5]:
annotation = annotation[~annotation["Gene Symbol"].isna()].reset_index(drop = True)

In [6]:
annotation

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
0,1007_s_at,DDR1 /// MIR4640,U48705,U48705,discoidin domain receptor tyrosine kinase 1 //...
1,1053_at,RFC2,M87338,M87338,"replication factor C (activator 1) 2, 40kDa"
2,117_at,HSPA6,X51757,X51757,heat shock 70kDa protein 6 (HSP70B')
3,121_at,PAX8,X69699,X69699,paired box 8
4,1255_g_at,GUCA1A,L36861,L36861,guanylate cyclase activator 1A (retina)
...,...,...,...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH,,AFFX-HUMGAPDH/M33197_M,glyceraldehyde-3-phosphate dehydrogenase
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1,,AFFX-HUMISGF3A/M97935_3,signal transducer and activator of transcripti...
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1,,AFFX-HUMISGF3A/M97935_5,signal transducer and activator of transcripti...
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1,,AFFX-HUMISGF3A/M97935_MA,signal transducer and activator of transcripti...


In [7]:
annotation[annotation["Gene Symbol"] == "1-Dec"]

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
26949,220781_at,1-Dec,NM_017418,NM_017418,deleted in esophageal cancer 1


In [8]:
mapping_annotation = annotation[["ID", "Gene Symbol"]]

In [9]:
mapping_annotation

Unnamed: 0,ID,Gene Symbol
0,1007_s_at,DDR1 /// MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1


In [10]:
np.unique(mapping_annotation["Gene Symbol"]), np.unique(mapping_annotation["Gene Symbol"]).size

(array(['1-Dec', '1-Mar', '1-Sep', ...,
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10',
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10',
        'mir-223'], dtype=object),
 23518)

In [11]:
%time
asthma_gse_8052 = pd.read_excel(path + "GSE8052.xlsx")

CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 11.4 µs


In [12]:
asthma_gse_8052

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column396,Column397,Column398,Column399,Column400,Column401,Column402,Column403,Column404,Column405
0,,,,,,,,,,,...,,,,,,,,,,
1,Gender,MALE,FEMALE,MALE,MALE,FEMALE,FEMALE,MALE,MALE,FEMALE,...,MALE,MALE,MALE,MALE,FEMALE,FEMALE,MALE,MALE,FEMALE,FEMALE
2,status,CONTROL,CONTROL,ASTHMA,ASTHMA,ASTHMA,CONTROL,ASTHMA,ASTHMA,ASTHMA,...,CONTROL,ASTHMA,CONTROL,ASTHMA,CONTROL,ASTHMA,CONTROL,CONTROL,ASTHMA,ASTHMA
3,Sample ID,GSM199024,GSM199025,GSM199026,GSM199027,GSM199028,GSM199029,GSM199030,GSM199031,GSM199032,...,GSM199418,GSM199419,GSM199420,GSM199421,GSM199422,GSM199423,GSM199424,GSM199425,GSM199426,GSM199427
4,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.103,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54675,AFFX-ThrX-M_at,2.882,2.826,2.988,2.898,2.968,2.794,2.906,2.891,2.955,...,2.823,2.886,3.043,2.928,3.033,2.969,3.159,2.940,2.820,2.894
54676,AFFX-TrpnX-3_at,2.532,2.632,2.630,2.608,2.871,2.605,2.543,2.700,3.224,...,2.870,2.741,2.663,2.712,2.547,2.774,2.764,2.782,2.653,2.655
54677,AFFX-TrpnX-5_at,2.929,2.714,2.932,2.894,2.888,2.997,2.878,2.837,3.007,...,2.888,3.205,3.075,2.956,2.968,2.987,2.896,2.793,3.009,2.931
54678,AFFX-TrpnX-M_at,2.858,2.687,2.989,2.752,2.994,3.035,2.912,3.027,3.020,...,2.787,2.765,2.923,2.747,2.820,2.672,3.055,2.941,2.713,2.773


In [13]:
asthma_gse_8052 = asthma_gse_8052.T
asthma_gse_8052

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54670,54671,54672,54673,54674,54675,54676,54677,54678,54679
Column1,,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
Column2,,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
Column3,,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
Column4,,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
Column5,,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Column401,,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
Column402,,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
Column403,,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
Column404,,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [14]:
asthma_gse_8052 = asthma_gse_8052.drop(0, axis = 1).reset_index(drop = True)
asthma_gse_8052

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,54670,54671,54672,54673,54674,54675,54676,54677,54678,54679
0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
1,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
2,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
3,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
4,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
401,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
402,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
403,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [15]:
asthma_gse_8052.columns = asthma_gse_8052.loc[0]

In [16]:
asthma_gse_8052 = asthma_gse_8052.drop(0).reset_index(drop = True)

In [17]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
0,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
1,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
2,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
3,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
4,FEMALE,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
400,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
401,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
402,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [18]:
%time
check_gse_8052 = pd.read_excel(path + "GSE_8052.xlsx")

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 27.7 µs


In [19]:
check_gse_8052

Unnamed: 0,Sample name,Unnamed: 1,Unnamed: 2,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,GK1394.1,,,DNA_Sample_for_Individual_GK1394.1,,,H.sapiens,GK1394,1,0,...,GG,CC,AA,CC,TT,AA,,,,
2765,GK1395.1,,,DNA_Sample_for_Individual_GK1395.1,,,H.sapiens,GK1395,1,0,...,AG,CC,AG,TC,TC,GG,,,,
2766,GK1396.1,,,DNA_Sample_for_Individual_GK1396.1,,,H.sapiens,GK1396,1,0,...,AA,CC,AG,TC,CC,GG,,,,
2767,GK1397.1,,,DNA_Sample_for_Individual_GK1397.1,,,H.sapiens,GK1397,1,0,...,AA,CC,AG,CC,TC,GG,,,,


In [20]:
check_gse_8052 = check_gse_8052.rename(columns = {"Unnamed: 1" : "Sample ID", "Unnamed: 2" : "Type of Sample"})

In [21]:
check_gse_8052

Unnamed: 0,Sample name,Sample ID,Type of Sample,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,GK1394.1,,,DNA_Sample_for_Individual_GK1394.1,,,H.sapiens,GK1394,1,0,...,GG,CC,AA,CC,TT,AA,,,,
2765,GK1395.1,,,DNA_Sample_for_Individual_GK1395.1,,,H.sapiens,GK1395,1,0,...,AG,CC,AG,TC,TC,GG,,,,
2766,GK1396.1,,,DNA_Sample_for_Individual_GK1396.1,,,H.sapiens,GK1396,1,0,...,AA,CC,AG,TC,CC,GG,,,,
2767,GK1397.1,,,DNA_Sample_for_Individual_GK1397.1,,,H.sapiens,GK1397,1,0,...,AA,CC,AG,CC,TC,GG,,,,


In [22]:
check_gse_8052 = check_gse_8052[~check_gse_8052["Type of Sample"].isna()]

In [23]:
check_gse_8052

Unnamed: 0,Sample name,Sample ID,Type of Sample,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,8211.3,GSM199423,Lymphoblastoid Cell Line for Individual 8211.3...,Lymphoblastoid_Cell_Line_for_Individual_8211.3...,8211.3.CEL,Lymphoblastoid_cell_line,H.sapiens,8211,3,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
400,8213.5,GSM199424,Lymphoblastoid Cell Line for Individual 8213.5...,Lymphoblastoid_Cell_Line_for_Individual_8213.5...,8213.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,5,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
401,8213.4,GSM199425,Lymphoblastoid Cell Line for Individual 8213.4...,Lymphoblastoid_Cell_Line_for_Individual_8213.4...,8213.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
402,8213.3,GSM199426,Lymphoblastoid Cell Line for Individual 8213.3...,Lymphoblastoid_Cell_Line_for_Individual_8213.3...,8213.3.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,3,1,...,AA,CC,AG,CC,TT,AG,total_rna,biotin,-,GPL570


In [24]:
to_compare_check_gse_8052 = check_gse_8052[["Sample ID", "sex", "DDAST"]]

In [25]:
to_compare_asthma_gse_8052 = asthma_gse_8052[["Gender", "status", "Sample ID"]]

In [26]:
compared_gse_8052 = pd.merge(to_compare_check_gse_8052, to_compare_asthma_gse_8052, how = "inner")

In [27]:
compared_gse_8052

Unnamed: 0,Sample ID,sex,DDAST,Gender,status
0,GSM199024,1,CONTROL,MALE,CONTROL
1,GSM199025,2,CONTROL,FEMALE,CONTROL
2,GSM199026,1,CASE,MALE,ASTHMA
3,GSM199027,1,CASE,MALE,ASTHMA
4,GSM199028,2,CASE,FEMALE,ASTHMA
...,...,...,...,...,...
399,GSM199423,2,CASE,FEMALE,ASTHMA
400,GSM199424,1,CONTROL,MALE,CONTROL
401,GSM199425,1,CONTROL,MALE,CONTROL
402,GSM199426,2,CASE,FEMALE,ASTHMA


In [28]:
compared_gse_8052['sex'] = compared_gse_8052['sex'].apply(lambda x : "MALE" if x == 1 else "FEMALE")

In [29]:
compared_gse_8052['status'] = compared_gse_8052['status'].apply(lambda x : x if x.find("CONTROL") != -1 else "CASE")

In [30]:
compared_gse_8052

Unnamed: 0,Sample ID,sex,DDAST,Gender,status
0,GSM199024,MALE,CONTROL,MALE,CONTROL
1,GSM199025,FEMALE,CONTROL,FEMALE,CONTROL
2,GSM199026,MALE,CASE,MALE,CASE
3,GSM199027,MALE,CASE,MALE,CASE
4,GSM199028,FEMALE,CASE,FEMALE,CASE
...,...,...,...,...,...
399,GSM199423,FEMALE,CASE,FEMALE,CASE
400,GSM199424,MALE,CONTROL,MALE,CONTROL
401,GSM199425,MALE,CONTROL,MALE,CONTROL
402,GSM199426,FEMALE,CASE,FEMALE,CASE


In [33]:
(compared_gse_8052["sex"].values == compared_gse_8052["Gender"].values).sum()

404

In [34]:
(compared_gse_8052['DDAST'] == compared_gse_8052['status']).sum()

404

---

In [36]:
asthma_gse_8052["!series_matrix_table_end"].isna().sum()

404

In [37]:
asthma_gse_8052 = asthma_gse_8052.drop(columns = ["!series_matrix_table_end"])

In [38]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,FEMALE,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [39]:
asthma_gse_8052.iloc[:, 3:].isna().sum().sum()

0

In [40]:
(asthma_gse_8052.iloc[:, 3:].sum(axis = 0) == 0).sum()

0

In [41]:
np.unique(asthma_gse_8052["status"])

array(['ASTHMA', 'CONTROL'], dtype=object)

In [42]:
np.unique(asthma_gse_8052["Gender"])

array(['FEMALE', 'MALE'], dtype=object)

In [43]:
asthma_gse_8052['Gender'] = asthma_gse_8052['Gender'].apply(lambda x : 0 if x.find('FEMALE') != -1 else 1)

In [44]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [45]:
asthma_gse_8052['status'] = asthma_gse_8052['status'].apply(lambda x : 1 if x.find('ASTHMA') != -1 else 0)

In [46]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [47]:
asthma_gse_8052 = asthma_gse_8052.rename(columns = {"status" : "Label"})

In [48]:
asthma_gse_8052

Unnamed: 0,Gender,Label,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [49]:
asthma_gse_8052 = asthma_gse_8052.drop(columns = ["Sample ID"])

In [50]:
asthma_gse_8052

Unnamed: 0,Gender,Label,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [51]:
asthma_gse_8052.iloc[:, (asthma_gse_8052.columns.str.find("_") != -1)]

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [52]:
asthma_gse_8052.iloc[:, 2:].columns

Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at',
       ...
       'AFFX-r2-Ec-bioD-3_at', 'AFFX-r2-Ec-bioD-5_at', 'AFFX-r2-P1-cre-3_at',
       'AFFX-r2-P1-cre-5_at', 'AFFX-ThrX-3_at', 'AFFX-ThrX-5_at',
       'AFFX-ThrX-M_at', 'AFFX-TrpnX-3_at', 'AFFX-TrpnX-5_at',
       'AFFX-TrpnX-M_at'],
      dtype='object', name=0, length=54675)

In [53]:
asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')

Index(['1007_s', '1053', '117', '121', '1255_g', '1294', '1316', '1320',
       '1405_i', '1431',
       ...
       'AFFX-r2-Ec-bioD-3', 'AFFX-r2-Ec-bioD-5', 'AFFX-r2-P1-cre-3',
       'AFFX-r2-P1-cre-5', 'AFFX-ThrX-3', 'AFFX-ThrX-5', 'AFFX-ThrX-M',
       'AFFX-TrpnX-3', 'AFFX-TrpnX-5', 'AFFX-TrpnX-M'],
      dtype='object', name=0, length=54675)

In [54]:
uniq_col = asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('-3_at')

In [55]:
uniq_col

Index(['1007_s', '105', '117', '121', '1255_g', '1294', '1316', '1320',
       '1405_i', '1431',
       ...
       'AFFX-r2-Ec-bioD', 'AFFX-r2-Ec-bioD-5', 'AFFX-r2-P1-cre',
       'AFFX-r2-P1-cre-5', 'AFFX-ThrX', 'AFFX-ThrX-5', 'AFFX-ThrX-M',
       'AFFX-TrpnX', 'AFFX-TrpnX-5', 'AFFX-TrpnX-M'],
      dtype='object', name=0, length=54675)

In [56]:
np.unique(uniq_col.str.rstrip('-5_at')), np.unique(uniq_col.str.rstrip('-5_at')).size

(array(['1', '10', '1007_s', ..., 'AFFX-r2-Ec-bioC', 'AFFX-r2-Ec-bioD',
        'AFFX-r2-P1-cre'], dtype=object),
 51927)

In [57]:
uniq_col = np.unique(uniq_col.str.rstrip('-5_at'))

In [58]:
uniq_col, uniq_col.size

(array(['1', '10', '1007_s', ..., 'AFFX-r2-Ec-bioC', 'AFFX-r2-Ec-bioD',
        'AFFX-r2-P1-cre'], dtype=object),
 51927)

In [59]:
np.unique(asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')), np.unique(asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')).size

(array(['1007_s', '1053', '117', ..., 'AFFX-r2-Ec-bioD-5',
        'AFFX-r2-P1-cre-3', 'AFFX-r2-P1-cre-5'], dtype=object),
 54675)

In [60]:
asthma_gse_8052.iloc[:, 2:]

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [61]:
asthma_gse_8052.iloc[:, :2]

Unnamed: 0,Gender,Label
0,1,0
1,0,0
2,1,1
3,1,1
4,0,1
...,...,...
399,0,1
400,1,0
401,1,0
402,0,1


In [62]:
asthma_gse_8052["Label"]

0      0
1      0
2      1
3      1
4      1
      ..
399    1
400    0
401    0
402    1
403    1
Name: Label, Length: 404, dtype: int64

## Mapping GSE 8052 with Annotation

In [145]:
mapping_gse_8052 = asthma_gse_8052.iloc[:, 2:].T.reset_index().rename(columns = {"index" : "ID"})

In [146]:
mapping_gse_8052

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,394,395,396,397,398,399,400,401,402,403
0,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.103,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361
1,1053_at,7.507,7.044,7.318,7.027,7.674,7.628,7.896,7.449,7.104,...,7.635,7.642,7.439,8.069,7.703,7.790,7.540,7.600,7.833,7.688
2,117_at,4.402,4.510,5.014,4.654,4.479,4.428,4.344,4.652,4.765,...,4.559,4.498,4.753,4.894,4.164,4.358,4.887,4.661,4.162,4.137
3,121_at,6.424,6.282,6.450,6.434,6.636,6.745,6.220,6.259,6.782,...,6.357,6.413,6.673,6.765,6.646,6.542,6.350,6.631,6.594,6.356
4,1255_g_at,2.572,2.518,2.599,2.531,2.777,2.590,2.703,2.542,2.955,...,2.734,2.614,2.570,2.683,2.676,2.544,2.612,2.652,2.604,2.630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54670,AFFX-ThrX-5_at,3.104,3.125,3.343,3.186,3.315,3.231,3.187,3.210,3.177,...,2.972,3.092,3.282,3.218,3.319,3.072,3.529,3.298,2.984,3.197
54671,AFFX-ThrX-M_at,2.882,2.826,2.988,2.898,2.968,2.794,2.906,2.891,2.955,...,2.823,2.886,3.043,2.928,3.033,2.969,3.159,2.940,2.820,2.894
54672,AFFX-TrpnX-3_at,2.532,2.632,2.630,2.608,2.871,2.605,2.543,2.700,3.224,...,2.870,2.741,2.663,2.712,2.547,2.774,2.764,2.782,2.653,2.655
54673,AFFX-TrpnX-5_at,2.929,2.714,2.932,2.894,2.888,2.997,2.878,2.837,3.007,...,2.888,3.205,3.075,2.956,2.968,2.987,2.896,2.793,3.009,2.931


In [143]:
mapping_annotation

Unnamed: 0,ID,Gene Symbol
0,1007_s_at,DDR1 /// MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1


In [159]:
mapped_gse_8052 = pd.merge(mapping_gse_8052, mapping_annotation, on = "ID")

In [160]:
mapped_gse_8052

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,395,396,397,398,399,400,401,402,403,Gene Symbol
0,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361,DDR1 /// MIR4640
1,1053_at,7.507,7.044,7.318,7.027,7.674,7.628,7.896,7.449,7.104,...,7.642,7.439,8.069,7.703,7.790,7.540,7.600,7.833,7.688,RFC2
2,117_at,4.402,4.510,5.014,4.654,4.479,4.428,4.344,4.652,4.765,...,4.498,4.753,4.894,4.164,4.358,4.887,4.661,4.162,4.137,HSPA6
3,121_at,6.424,6.282,6.450,6.434,6.636,6.745,6.220,6.259,6.782,...,6.413,6.673,6.765,6.646,6.542,6.350,6.631,6.594,6.356,PAX8
4,1255_g_at,2.572,2.518,2.599,2.531,2.777,2.590,2.703,2.542,2.955,...,2.614,2.570,2.683,2.676,2.544,2.612,2.652,2.604,2.630,GUCA1A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,12.851,12.758,13.192,12.812,12.991,12.999,12.898,12.963,12.939,...,12.970,13.020,13.304,12.921,13.069,12.801,12.954,12.903,12.958,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,10.800,11.271,10.731,10.603,10.531,10.578,10.258,10.768,10.286,...,10.663,10.357,10.739,10.262,10.386,10.425,10.148,10.426,10.579,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,7.925,8.728,7.707,8.308,8.737,9.068,5.526,8.701,8.145,...,8.467,8.723,8.805,8.594,8.673,8.561,8.154,8.818,8.614,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,8.792,9.810,9.518,9.279,9.547,9.776,8.309,9.581,8.644,...,9.653,9.749,9.641,9.492,9.641,9.636,9.187,9.761,9.529,STAT1


In [161]:
mapped_gse_8052["Gene Symbol"].isna().sum()

0

In [162]:
mapped_gse_8052 = mapped_gse_8052.drop(["ID", "Gene Symbol"], axis = 1).T

In [163]:
mapped_gse_8052

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45772,45773,45774,45775,45776,45777,45778,45779,45780,45781
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,13.260,12.360,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,13.041,12.759,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.459,13.207,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.260,13.078,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.187,13.112,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.354,13.422,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.117,13.023,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.125,12.952,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.150,13.134,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227


In [164]:
mapped_gse_8052.columns = mapping_annotation["Gene Symbol"]

In [165]:
mapped_gse_8052

Gene Symbol,DDR1 /// MIR4640,RFC2,HSPA6,PAX8,GUCA1A,MIR5193 /// UBA7,THRA,PTPN21,CCL5,CYP2E1,...,ACTB,ACTB.1,ACTB.2,GAPDH,GAPDH.1,GAPDH.2,STAT1,STAT1.1,STAT1.2,STAT1.3
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,13.260,12.360,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,13.041,12.759,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.459,13.207,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.260,13.078,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.187,13.112,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.354,13.422,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.117,13.023,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.125,12.952,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.150,13.134,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227


In [168]:
asthma_gse_8052.iloc[:, :2]

Unnamed: 0,Gender,Label
0,1,0
1,0,0
2,1,1
3,1,1
4,0,1
...,...,...
399,0,1
400,1,0
401,1,0
402,0,1


In [191]:
mapped_gse_8052.astype('float64')

Gene Symbol,DDR1 /// MIR4640,RFC2,HSPA6,PAX8,GUCA1A,MIR5193 /// UBA7,THRA,PTPN21,CCL5,CYP2E1,...,ACTB,ACTB.1,ACTB.2,GAPDH,GAPDH.1,GAPDH.2,STAT1,STAT1.1,STAT1.2,STAT1.3
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,13.260,12.360,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,13.041,12.759,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.459,13.207,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.260,13.078,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.187,13.112,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.354,13.422,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.117,13.023,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.125,12.952,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.150,13.134,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227


In [192]:
mapped_gse_8052_df = pd.concat([mapped_gse_8052.astype('float64'), asthma_gse_8052.iloc[:, :2]], axis = 1)

In [239]:
mapped_gse_8052_df

Unnamed: 0,DDR1 /// MIR4640,RFC2,HSPA6,PAX8,GUCA1A,MIR5193 /// UBA7,THRA,PTPN21,CCL5,CYP2E1,...,ACTB,GAPDH,GAPDH.1,GAPDH.2,STAT1,STAT1.1,STAT1.2,STAT1.3,Gender,Label
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936,1,0
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951,0,0
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369,1,1
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253,1,1
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072,0,1
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077,1,0
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732,1,0
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227,0,1


In [56]:
mapped_gse_8052_df["STAT1"].sum(axis = 0)

NameError: name 'mapped_gse_8052_df' is not defined

In [203]:
unique_col = np.unique(mapped_gse_8052_df.columns)
unique_col.size

23520

In [204]:
unique_col

array(['1-Dec', '1-Mar', '1-Sep', ...,
       'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10',
       'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10',
       'mir-223'], dtype=object)

### Select 

In [279]:
new_gse_8052_df = pd.DataFrame()
for col in unique_col:
    gene_df = pd.DataFrame(mapped_gse_8052_df[col])
    if gene_df.columns.size == 1:
        new_gse_8052_df[col] = gene_df[col]
    else:
        new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]

In [280]:
new_gse_8052_df

Unnamed: 0,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,15-Sep,2-Mar,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,9.464,5.714,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,9.638,5.601,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,8.968,6.726,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,9.517,5.387,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,9.758,5.305,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,9.376,5.998,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,9.762,4.946,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,9.748,5.595,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,9.947,5.116,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [288]:
new_gse_8052_data = new_gse_8052_df.drop(["Gender", "Label"], axis = 1)

In [290]:
new_gse_8052_data

Unnamed: 0,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,15-Sep,2-Mar,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,9.464,5.714,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,9.638,5.601,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,8.968,6.726,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,9.517,5.387,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,9.758,5.305,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,9.376,5.998,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,9.762,4.946,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,9.748,5.595,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,9.947,5.116,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [289]:
new_gse_8052_label = new_gse_8052_df[["Gender", "Label"]]

In [293]:
new_gse_8052_label

Unnamed: 0,Gender,Label
0,1,0
1,0,0
2,1,1
3,1,1
4,0,1
...,...,...
399,0,1
400,1,0
401,1,0
402,0,1


In [294]:
new_gse_8052_df = pd.concat([new_gse_8052_label, new_gse_8052_data], axis = 1)

In [295]:
new_gse_8052_df

Unnamed: 0,Gender,Label,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,1,0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,0,0,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,1,1,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,1,1,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,0,1,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,1,0,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,1,0,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,0,1,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [298]:
new_gse_8052_df.iloc[:, 2:]

Unnamed: 0,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,15-Sep,2-Mar,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,9.464,5.714,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,9.638,5.601,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,8.968,6.726,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,9.517,5.387,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,9.758,5.305,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,9.376,5.998,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,9.762,4.946,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,9.748,5.595,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,9.947,5.116,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [299]:
new_gse_8052_df["Label"]

0      0
1      0
2      1
3      1
4      1
      ..
399    1
400    0
401    0
402    1
403    1
Name: Label, Length: 404, dtype: int64

In [297]:
np.unique(new_gse_8052_df.columns), np.unique(new_gse_8052_df.columns).size

(array(['1-Dec', '1-Mar', '1-Sep', ...,
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10',
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10',
        'mir-223'], dtype=object), 23520)

In [319]:
new_gse_8052_df.to_csv("GSE_8052/Preprocessed_Asthma_GSE_8052.csv", index = False)

## GSE172367

In [36]:
asthma_gse_172367 = pd.read_csv(data_path + "expr_normalized.GSE172367-outcome.csv", header = None).T

In [37]:
asthma_gse_172367.columns = asthma_gse_172367.loc[0]
asthma_gse_172367 = asthma_gse_172367.drop(0).reset_index(drop = True)

In [38]:
asthma_gse_172367

Unnamed: 0,ID,Astha status,sex,age,smokingstatus,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM9,ZUP1,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,76b3_439a_1A,Control,Male,66,smoking: Current,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,...,8.71244,9.4362,10.2855,9.70126,10.745,10.5148,10.2715,13.3416,11.3458,10.7375
1,76b3_439a_2A,Control,Male,66,smoking: Current,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,...,8.63175,9.38492,10.263,9.35008,10.4968,10.5935,10.51,13.3414,11.4162,10.8231
2,db6c_413d_1A,Asthma,Male,31,smoking: Never,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,...,8.88493,9.30034,10.2334,10.2515,11.2972,10.4806,9.77559,13.7929,11.5671,11.1429
3,db6c_413d_2A,Asthma,Male,31,smoking: Never,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,...,9.00215,9.24455,10.2267,10.0231,11.0728,10.4891,9.82957,13.7626,11.4896,11.2081
4,9623_473f_1A,Asthma,Male,62,smoking: Prior,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,...,8.63467,9.42603,10.0973,10.2414,10.8029,10.1019,10.4993,14.3796,11.4035,11.3108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,1413_46H2_2A,Asthma,Male,32,smoking: Never,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,...,9.24414,8.96687,10.0069,9.81089,10.5649,9.90491,10.2855,14.5355,11.4407,10.4166
186,6f9H_42Hd_1A,Asthma,Female,38,smoking: Never,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,...,9.25697,8.98965,10.2343,9.66254,10.9973,10.1297,10.0943,14.3967,11.4649,10.2278
187,6f9H_42Hd_2A,Asthma,Female,38,smoking: Never,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,...,9.19119,9.00754,10.0834,9.65809,10.8122,9.96472,10.0349,14.3207,11.4428,10.3131
188,8f7H_4HbH_1A,Asthma,Male,24,smoking: Never,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,...,8.60208,9.1581,10.6159,10.86,11.662,9.67822,10.6057,13.4176,11.2266,10.9113


In [43]:
asthma_gse_172367 = pd.concat([asthma_gse_172367.iloc[:, 5:], asthma_gse_172367["sex"], asthma_gse_172367["Astha status"]], axis = 1)

In [44]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Astha status
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.2855,9.70126,10.745,10.5148,10.2715,13.3416,11.3458,10.7375,Male,Control
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.263,9.35008,10.4968,10.5935,10.51,13.3414,11.4162,10.8231,Male,Control
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.2334,10.2515,11.2972,10.4806,9.77559,13.7929,11.5671,11.1429,Male,Asthma
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.2267,10.0231,11.0728,10.4891,9.82957,13.7626,11.4896,11.2081,Male,Asthma
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.0973,10.2414,10.8029,10.1019,10.4993,14.3796,11.4035,11.3108,Male,Asthma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.0069,9.81089,10.5649,9.90491,10.2855,14.5355,11.4407,10.4166,Male,Asthma
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.2343,9.66254,10.9973,10.1297,10.0943,14.3967,11.4649,10.2278,Female,Asthma
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.0834,9.65809,10.8122,9.96472,10.0349,14.3207,11.4428,10.3131,Female,Asthma
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.6159,10.86,11.662,9.67822,10.6057,13.4176,11.2266,10.9113,Male,Asthma


In [45]:
asthma_gse_172367["sex"] = asthma_gse_172367["sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [47]:
asthma_gse_172367["Astha status"] = asthma_gse_172367["Astha status"].apply(lambda x : 0 if x.find('Control') != -1 else 1)

In [48]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Astha status
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.2855,9.70126,10.745,10.5148,10.2715,13.3416,11.3458,10.7375,1,0
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.263,9.35008,10.4968,10.5935,10.51,13.3414,11.4162,10.8231,1,0
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.2334,10.2515,11.2972,10.4806,9.77559,13.7929,11.5671,11.1429,1,1
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.2267,10.0231,11.0728,10.4891,9.82957,13.7626,11.4896,11.2081,1,1
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.0973,10.2414,10.8029,10.1019,10.4993,14.3796,11.4035,11.3108,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.0069,9.81089,10.5649,9.90491,10.2855,14.5355,11.4407,10.4166,1,1
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.2343,9.66254,10.9973,10.1297,10.0943,14.3967,11.4649,10.2278,0,1
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.0834,9.65809,10.8122,9.96472,10.0349,14.3207,11.4428,10.3131,0,1
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.6159,10.86,11.662,9.67822,10.6057,13.4176,11.2266,10.9113,1,1


In [50]:
asthma_gse_172367 = asthma_gse_172367.rename(columns = {"Astha status" : "Type"})

In [51]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Type
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.2855,9.70126,10.745,10.5148,10.2715,13.3416,11.3458,10.7375,1,0
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.263,9.35008,10.4968,10.5935,10.51,13.3414,11.4162,10.8231,1,0
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.2334,10.2515,11.2972,10.4806,9.77559,13.7929,11.5671,11.1429,1,1
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.2267,10.0231,11.0728,10.4891,9.82957,13.7626,11.4896,11.2081,1,1
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.0973,10.2414,10.8029,10.1019,10.4993,14.3796,11.4035,11.3108,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.0069,9.81089,10.5649,9.90491,10.2855,14.5355,11.4407,10.4166,1,1
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.2343,9.66254,10.9973,10.1297,10.0943,14.3967,11.4649,10.2278,0,1
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.0834,9.65809,10.8122,9.96472,10.0349,14.3207,11.4428,10.3131,0,1
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.6159,10.86,11.662,9.67822,10.6057,13.4176,11.2266,10.9113,1,1


In [52]:
asthma_gse_172367.to_csv(data_path + "Asthma_GSE_172367_with_Gender_1124.csv", index = False)