In [155]:
import numpy as np
import pandas as pd
import math
import scipy.io as sio
import os
from scipy import sparse
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

# GSE8052

In [2]:
path = "../../../../nasdatafolder/MTL/Data/Raw_Data/"

In [3]:
annotation = pd.read_csv(path + "GPL570-55999.txt", sep='\t')

In [4]:
annotation

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
0,1007_s_at,DDR1 /// MIR4640,U48705,U48705,discoidin domain receptor tyrosine kinase 1 //...
1,1053_at,RFC2,M87338,M87338,"replication factor C (activator 1) 2, 40kDa"
2,117_at,HSPA6,X51757,X51757,heat shock 70kDa protein 6 (HSP70B')
3,121_at,PAX8,X69699,X69699,paired box 8
4,1255_g_at,GUCA1A,L36861,L36861,guanylate cyclase activator 1A (retina)
...,...,...,...,...,...
54670,AFFX-ThrX-5_at,,,AFFX-ThrX-5,
54671,AFFX-ThrX-M_at,,,AFFX-ThrX-M,
54672,AFFX-TrpnX-3_at,,,AFFX-TrpnX-3,
54673,AFFX-TrpnX-5_at,,,AFFX-TrpnX-5,


In [5]:
annotation = annotation[~annotation["Gene Symbol"].isna()].reset_index(drop = True)

In [6]:
annotation

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
0,1007_s_at,DDR1 /// MIR4640,U48705,U48705,discoidin domain receptor tyrosine kinase 1 //...
1,1053_at,RFC2,M87338,M87338,"replication factor C (activator 1) 2, 40kDa"
2,117_at,HSPA6,X51757,X51757,heat shock 70kDa protein 6 (HSP70B')
3,121_at,PAX8,X69699,X69699,paired box 8
4,1255_g_at,GUCA1A,L36861,L36861,guanylate cyclase activator 1A (retina)
...,...,...,...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH,,AFFX-HUMGAPDH/M33197_M,glyceraldehyde-3-phosphate dehydrogenase
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1,,AFFX-HUMISGF3A/M97935_3,signal transducer and activator of transcripti...
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1,,AFFX-HUMISGF3A/M97935_5,signal transducer and activator of transcripti...
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1,,AFFX-HUMISGF3A/M97935_MA,signal transducer and activator of transcripti...


In [7]:
annotation[annotation["Gene Symbol"] == "1-Dec"]

Unnamed: 0,ID,Gene Symbol,GB_ACC,Representative Public ID,Gene Title
26949,220781_at,1-Dec,NM_017418,NM_017418,deleted in esophageal cancer 1


In [8]:
mapping_annotation = annotation[["ID", "Gene Symbol"]]

In [9]:
mapping_annotation

Unnamed: 0,ID,Gene Symbol
0,1007_s_at,DDR1 /// MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1


In [10]:
np.unique(mapping_annotation["Gene Symbol"]), np.unique(mapping_annotation["Gene Symbol"]).size

(array(['1-Dec', '1-Mar', '1-Sep', ...,
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10',
        'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10',
        'mir-223'], dtype=object),
 23518)

In [11]:
%time
asthma_gse_8052 = pd.read_excel(path + "GSE8052.xlsx")

CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 11.4 µs


In [12]:
asthma_gse_8052

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column396,Column397,Column398,Column399,Column400,Column401,Column402,Column403,Column404,Column405
0,,,,,,,,,,,...,,,,,,,,,,
1,Gender,MALE,FEMALE,MALE,MALE,FEMALE,FEMALE,MALE,MALE,FEMALE,...,MALE,MALE,MALE,MALE,FEMALE,FEMALE,MALE,MALE,FEMALE,FEMALE
2,status,CONTROL,CONTROL,ASTHMA,ASTHMA,ASTHMA,CONTROL,ASTHMA,ASTHMA,ASTHMA,...,CONTROL,ASTHMA,CONTROL,ASTHMA,CONTROL,ASTHMA,CONTROL,CONTROL,ASTHMA,ASTHMA
3,Sample ID,GSM199024,GSM199025,GSM199026,GSM199027,GSM199028,GSM199029,GSM199030,GSM199031,GSM199032,...,GSM199418,GSM199419,GSM199420,GSM199421,GSM199422,GSM199423,GSM199424,GSM199425,GSM199426,GSM199427
4,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.103,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54675,AFFX-ThrX-M_at,2.882,2.826,2.988,2.898,2.968,2.794,2.906,2.891,2.955,...,2.823,2.886,3.043,2.928,3.033,2.969,3.159,2.940,2.820,2.894
54676,AFFX-TrpnX-3_at,2.532,2.632,2.630,2.608,2.871,2.605,2.543,2.700,3.224,...,2.870,2.741,2.663,2.712,2.547,2.774,2.764,2.782,2.653,2.655
54677,AFFX-TrpnX-5_at,2.929,2.714,2.932,2.894,2.888,2.997,2.878,2.837,3.007,...,2.888,3.205,3.075,2.956,2.968,2.987,2.896,2.793,3.009,2.931
54678,AFFX-TrpnX-M_at,2.858,2.687,2.989,2.752,2.994,3.035,2.912,3.027,3.020,...,2.787,2.765,2.923,2.747,2.820,2.672,3.055,2.941,2.713,2.773


In [13]:
asthma_gse_8052 = asthma_gse_8052.T
asthma_gse_8052

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54670,54671,54672,54673,54674,54675,54676,54677,54678,54679
Column1,,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
Column2,,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
Column3,,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
Column4,,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
Column5,,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Column401,,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
Column402,,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
Column403,,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
Column404,,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [14]:
asthma_gse_8052 = asthma_gse_8052.drop(0, axis = 1).reset_index(drop = True)
asthma_gse_8052

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,54670,54671,54672,54673,54674,54675,54676,54677,54678,54679
0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
1,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
2,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
3,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
4,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
401,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
402,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
403,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [15]:
asthma_gse_8052.columns = asthma_gse_8052.loc[0]

In [16]:
asthma_gse_8052 = asthma_gse_8052.drop(0).reset_index(drop = True)

In [17]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,!series_matrix_table_end
0,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858,
1,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687,
2,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989,
3,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752,
4,FEMALE,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672,
400,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055,
401,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941,
402,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713,


In [18]:
%time
check_gse_8052 = pd.read_excel(path + "GSE_8052.xlsx")

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 27.7 µs


In [19]:
check_gse_8052

Unnamed: 0,Sample name,Unnamed: 1,Unnamed: 2,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,GK1394.1,,,DNA_Sample_for_Individual_GK1394.1,,,H.sapiens,GK1394,1,0,...,GG,CC,AA,CC,TT,AA,,,,
2765,GK1395.1,,,DNA_Sample_for_Individual_GK1395.1,,,H.sapiens,GK1395,1,0,...,AG,CC,AG,TC,TC,GG,,,,
2766,GK1396.1,,,DNA_Sample_for_Individual_GK1396.1,,,H.sapiens,GK1396,1,0,...,AA,CC,AG,TC,CC,GG,,,,
2767,GK1397.1,,,DNA_Sample_for_Individual_GK1397.1,,,H.sapiens,GK1397,1,0,...,AA,CC,AG,CC,TC,GG,,,,


In [20]:
check_gse_8052 = check_gse_8052.rename(columns = {"Unnamed: 1" : "Sample ID", "Unnamed: 2" : "Type of Sample"})

In [21]:
check_gse_8052

Unnamed: 0,Sample name,Sample ID,Type of Sample,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,GK1394.1,,,DNA_Sample_for_Individual_GK1394.1,,,H.sapiens,GK1394,1,0,...,GG,CC,AA,CC,TT,AA,,,,
2765,GK1395.1,,,DNA_Sample_for_Individual_GK1395.1,,,H.sapiens,GK1395,1,0,...,AG,CC,AG,TC,TC,GG,,,,
2766,GK1396.1,,,DNA_Sample_for_Individual_GK1396.1,,,H.sapiens,GK1396,1,0,...,AA,CC,AG,TC,CC,GG,,,,
2767,GK1397.1,,,DNA_Sample_for_Individual_GK1397.1,,,H.sapiens,GK1397,1,0,...,AA,CC,AG,CC,TC,GG,,,,


In [22]:
check_gse_8052 = check_gse_8052[~check_gse_8052["Type of Sample"].isna()]

In [23]:
check_gse_8052

Unnamed: 0,Sample name,Sample ID,Type of Sample,title,CEL file,source name,organism,ped,id,fatid,...,rs981684,rs12453124,rs4132126,rs7209228,rs7219451,rs7211017,molecule,label,description,platform
0,8001.4,GSM199024,Lymphoblastoid Cell Line for Individual 8001.4...,Lymphoblastoid_Cell_Line_for_Individual_8001.4...,8001.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8001,4,1,...,AG,CT,AG,TC,TC,GG,total_rna,biotin,-,GPL570
1,8002.4,GSM199025,Lymphoblastoid Cell Line for Individual 8002.4...,Lymphoblastoid_Cell_Line_for_Individual_8002.4...,8002.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8002,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
2,8003.6,GSM199026,Lymphoblastoid Cell Line for Individual 8003.6...,Lymphoblastoid_Cell_Line_for_Individual_8003.6...,8003.6.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,6,1,...,AA,CC,AG,CC,TC,GG,total_rna,biotin,-,GPL570
3,8003.5,GSM199027,Lymphoblastoid Cell Line for Individual 8003.5...,Lymphoblastoid_Cell_Line_for_Individual_8003.5...,8003.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,5,1,...,AG,CT,AG,CC,TC,GG,total_rna,biotin,-,GPL570
4,8003.4,GSM199028,Lymphoblastoid Cell Line for Individual 8003.4...,Lymphoblastoid_Cell_Line_for_Individual_8003.4...,8003.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8003,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,8211.3,GSM199423,Lymphoblastoid Cell Line for Individual 8211.3...,Lymphoblastoid_Cell_Line_for_Individual_8211.3...,8211.3.CEL,Lymphoblastoid_cell_line,H.sapiens,8211,3,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
400,8213.5,GSM199424,Lymphoblastoid Cell Line for Individual 8213.5...,Lymphoblastoid_Cell_Line_for_Individual_8213.5...,8213.5.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,5,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
401,8213.4,GSM199425,Lymphoblastoid Cell Line for Individual 8213.4...,Lymphoblastoid_Cell_Line_for_Individual_8213.4...,8213.4.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,4,1,...,--,--,--,--,--,--,total_rna,biotin,-,GPL570
402,8213.3,GSM199426,Lymphoblastoid Cell Line for Individual 8213.3...,Lymphoblastoid_Cell_Line_for_Individual_8213.3...,8213.3.CEL,Lymphoblastoid_cell_line,H.sapiens,8213,3,1,...,AA,CC,AG,CC,TT,AG,total_rna,biotin,-,GPL570


In [24]:
to_compare_check_gse_8052 = check_gse_8052[["Sample ID", "sex", "DDAST"]]

In [25]:
to_compare_asthma_gse_8052 = asthma_gse_8052[["Gender", "status", "Sample ID"]]

In [26]:
compared_gse_8052 = pd.merge(to_compare_check_gse_8052, to_compare_asthma_gse_8052, how = "inner")

In [27]:
compared_gse_8052

Unnamed: 0,Sample ID,sex,DDAST,Gender,status
0,GSM199024,1,CONTROL,MALE,CONTROL
1,GSM199025,2,CONTROL,FEMALE,CONTROL
2,GSM199026,1,CASE,MALE,ASTHMA
3,GSM199027,1,CASE,MALE,ASTHMA
4,GSM199028,2,CASE,FEMALE,ASTHMA
...,...,...,...,...,...
399,GSM199423,2,CASE,FEMALE,ASTHMA
400,GSM199424,1,CONTROL,MALE,CONTROL
401,GSM199425,1,CONTROL,MALE,CONTROL
402,GSM199426,2,CASE,FEMALE,ASTHMA


In [28]:
compared_gse_8052['sex'] = compared_gse_8052['sex'].apply(lambda x : "MALE" if x == 1 else "FEMALE")

In [29]:
compared_gse_8052['status'] = compared_gse_8052['status'].apply(lambda x : x if x.find("CONTROL") != -1 else "CASE")

In [30]:
compared_gse_8052

Unnamed: 0,Sample ID,sex,DDAST,Gender,status
0,GSM199024,MALE,CONTROL,MALE,CONTROL
1,GSM199025,FEMALE,CONTROL,FEMALE,CONTROL
2,GSM199026,MALE,CASE,MALE,CASE
3,GSM199027,MALE,CASE,MALE,CASE
4,GSM199028,FEMALE,CASE,FEMALE,CASE
...,...,...,...,...,...
399,GSM199423,FEMALE,CASE,FEMALE,CASE
400,GSM199424,MALE,CONTROL,MALE,CONTROL
401,GSM199425,MALE,CONTROL,MALE,CONTROL
402,GSM199426,FEMALE,CASE,FEMALE,CASE


In [33]:
(compared_gse_8052["sex"].values == compared_gse_8052["Gender"].values).sum()

404

In [34]:
(compared_gse_8052['DDAST'] == compared_gse_8052['status']).sum()

404

---

In [36]:
asthma_gse_8052["!series_matrix_table_end"].isna().sum()

404

In [37]:
asthma_gse_8052 = asthma_gse_8052.drop(columns = ["!series_matrix_table_end"])

In [38]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,MALE,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,FEMALE,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,MALE,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,MALE,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,FEMALE,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,FEMALE,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,MALE,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,MALE,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,FEMALE,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [39]:
asthma_gse_8052.iloc[:, 3:].isna().sum().sum()

0

In [40]:
(asthma_gse_8052.iloc[:, 3:].sum(axis = 0) == 0).sum()

0

In [41]:
np.unique(asthma_gse_8052["status"])

array(['ASTHMA', 'CONTROL'], dtype=object)

In [42]:
np.unique(asthma_gse_8052["Gender"])

array(['FEMALE', 'MALE'], dtype=object)

In [43]:
asthma_gse_8052['Gender'] = asthma_gse_8052['Gender'].apply(lambda x : 0 if x.find('FEMALE') != -1 else 1)

In [44]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,CONTROL,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,CONTROL,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,ASTHMA,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,ASTHMA,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,ASTHMA,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,ASTHMA,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,CONTROL,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,CONTROL,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,ASTHMA,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [45]:
asthma_gse_8052['status'] = asthma_gse_8052['status'].apply(lambda x : 1 if x.find('ASTHMA') != -1 else 0)

In [46]:
asthma_gse_8052

Unnamed: 0,Gender,status,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [47]:
asthma_gse_8052 = asthma_gse_8052.rename(columns = {"status" : "Label"})

In [48]:
asthma_gse_8052

Unnamed: 0,Gender,Label,Sample ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,GSM199024,6.945,7.507,4.402,6.424,2.572,7.299,4.167,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,GSM199025,6.300,7.044,4.510,6.282,2.518,7.500,4.160,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,GSM199026,6.097,7.318,5.014,6.450,2.599,8.192,4.329,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,GSM199027,6.693,7.027,4.654,6.434,2.531,7.676,4.335,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,GSM199028,6.156,7.674,4.479,6.636,2.777,7.500,4.328,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,GSM199423,5.943,7.790,4.358,6.542,2.544,7.431,4.154,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,GSM199424,5.971,7.540,4.887,6.350,2.612,7.698,4.185,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,GSM199425,6.208,7.600,4.661,6.631,2.652,7.441,4.307,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,GSM199426,6.303,7.833,4.162,6.594,2.604,7.573,4.277,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [49]:
asthma_gse_8052 = asthma_gse_8052.drop(columns = ["Sample ID"])

In [50]:
asthma_gse_8052

Unnamed: 0,Gender,Label,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [51]:
asthma_gse_8052.iloc[:, (asthma_gse_8052.columns.str.find("_") != -1)]

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [52]:
asthma_gse_8052.iloc[:, 2:].columns

Index(['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at',
       '1316_at', '1320_at', '1405_i_at', '1431_at',
       ...
       'AFFX-r2-Ec-bioD-3_at', 'AFFX-r2-Ec-bioD-5_at', 'AFFX-r2-P1-cre-3_at',
       'AFFX-r2-P1-cre-5_at', 'AFFX-ThrX-3_at', 'AFFX-ThrX-5_at',
       'AFFX-ThrX-M_at', 'AFFX-TrpnX-3_at', 'AFFX-TrpnX-5_at',
       'AFFX-TrpnX-M_at'],
      dtype='object', name=0, length=54675)

In [53]:
asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')

Index(['1007_s', '1053', '117', '121', '1255_g', '1294', '1316', '1320',
       '1405_i', '1431',
       ...
       'AFFX-r2-Ec-bioD-3', 'AFFX-r2-Ec-bioD-5', 'AFFX-r2-P1-cre-3',
       'AFFX-r2-P1-cre-5', 'AFFX-ThrX-3', 'AFFX-ThrX-5', 'AFFX-ThrX-M',
       'AFFX-TrpnX-3', 'AFFX-TrpnX-5', 'AFFX-TrpnX-M'],
      dtype='object', name=0, length=54675)

In [54]:
uniq_col = asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('-3_at')

In [55]:
uniq_col

Index(['1007_s', '105', '117', '121', '1255_g', '1294', '1316', '1320',
       '1405_i', '1431',
       ...
       'AFFX-r2-Ec-bioD', 'AFFX-r2-Ec-bioD-5', 'AFFX-r2-P1-cre',
       'AFFX-r2-P1-cre-5', 'AFFX-ThrX', 'AFFX-ThrX-5', 'AFFX-ThrX-M',
       'AFFX-TrpnX', 'AFFX-TrpnX-5', 'AFFX-TrpnX-M'],
      dtype='object', name=0, length=54675)

In [56]:
np.unique(uniq_col.str.rstrip('-5_at')), np.unique(uniq_col.str.rstrip('-5_at')).size

(array(['1', '10', '1007_s', ..., 'AFFX-r2-Ec-bioC', 'AFFX-r2-Ec-bioD',
        'AFFX-r2-P1-cre'], dtype=object),
 51927)

In [57]:
uniq_col = np.unique(uniq_col.str.rstrip('-5_at'))

In [58]:
uniq_col, uniq_col.size

(array(['1', '10', '1007_s', ..., 'AFFX-r2-Ec-bioC', 'AFFX-r2-Ec-bioD',
        'AFFX-r2-P1-cre'], dtype=object),
 51927)

In [59]:
np.unique(asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')), np.unique(asthma_gse_8052.iloc[:, 2:].columns.str.rstrip('_at')).size

(array(['1007_s', '1053', '117', ..., 'AFFX-r2-Ec-bioD-5',
        'AFFX-r2-P1-cre-3', 'AFFX-r2-P1-cre-5'], dtype=object),
 54675)

In [60]:
asthma_gse_8052.iloc[:, 2:]

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [61]:
asthma_gse_8052.iloc[:, :2]

Unnamed: 0,Gender,Label
0,1,0
1,0,0
2,1,1
3,1,1
4,0,1
...,...,...
399,0,1
400,1,0
401,1,0
402,0,1


In [62]:
asthma_gse_8052["Label"]

0      0
1      0
2      1
3      1
4      1
      ..
399    1
400    0
401    0
402    1
403    1
Name: Label, Length: 404, dtype: int64

## Mapping GSE 8052 with Annotation

In [65]:
asthma_gse_8052

Unnamed: 0,Gender,Label,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,...,12.054,11.645,13.341,13.386,3.427,3.104,2.882,2.532,2.929,2.858
1,0,0,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,...,10.743,10.294,12.572,12.424,3.367,3.125,2.826,2.632,2.714,2.687
2,1,1,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,...,11.639,11.170,12.907,12.832,3.281,3.343,2.988,2.630,2.932,2.989
3,1,1,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,...,11.411,10.909,12.846,12.690,3.350,3.186,2.898,2.608,2.894,2.752
4,0,1,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,...,11.459,11.085,12.897,12.856,3.291,3.315,2.968,2.871,2.888,2.994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,...,11.148,10.779,12.789,12.692,3.386,3.072,2.969,2.774,2.987,2.672
400,1,0,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,...,11.484,11.003,12.834,12.861,3.809,3.529,3.159,2.764,2.896,3.055
401,1,0,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,...,11.188,10.754,12.836,12.764,3.546,3.298,2.940,2.782,2.793,2.941
402,0,1,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,...,11.367,10.925,12.873,12.751,3.282,2.984,2.820,2.653,3.009,2.713


In [74]:
mapping_gse_8052 = asthma_gse_8052.iloc[:, 2:].T
mapping_gse_8052

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,394,395,396,397,398,399,400,401,402,403
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,6.066,...,6.103,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361
1053_at,7.507,7.044,7.318,7.027,7.674,7.628,7.896,7.449,7.104,7.533,...,7.635,7.642,7.439,8.069,7.703,7.790,7.540,7.600,7.833,7.688
117_at,4.402,4.510,5.014,4.654,4.479,4.428,4.344,4.652,4.765,4.582,...,4.559,4.498,4.753,4.894,4.164,4.358,4.887,4.661,4.162,4.137
121_at,6.424,6.282,6.450,6.434,6.636,6.745,6.220,6.259,6.782,6.617,...,6.357,6.413,6.673,6.765,6.646,6.542,6.350,6.631,6.594,6.356
1255_g_at,2.572,2.518,2.599,2.531,2.777,2.590,2.703,2.542,2.955,2.593,...,2.734,2.614,2.570,2.683,2.676,2.544,2.612,2.652,2.604,2.630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AFFX-ThrX-5_at,3.104,3.125,3.343,3.186,3.315,3.231,3.187,3.210,3.177,3.119,...,2.972,3.092,3.282,3.218,3.319,3.072,3.529,3.298,2.984,3.197
AFFX-ThrX-M_at,2.882,2.826,2.988,2.898,2.968,2.794,2.906,2.891,2.955,2.992,...,2.823,2.886,3.043,2.928,3.033,2.969,3.159,2.940,2.820,2.894
AFFX-TrpnX-3_at,2.532,2.632,2.630,2.608,2.871,2.605,2.543,2.700,3.224,2.800,...,2.870,2.741,2.663,2.712,2.547,2.774,2.764,2.782,2.653,2.655
AFFX-TrpnX-5_at,2.929,2.714,2.932,2.894,2.888,2.997,2.878,2.837,3.007,3.398,...,2.888,3.205,3.075,2.956,2.968,2.987,2.896,2.793,3.009,2.931


In [75]:
mapping_gse_8052.index.name = "ID"

In [77]:
mapping_gse_8052.reset_index()

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,394,395,396,397,398,399,400,401,402,403
0,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.103,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361
1,1053_at,7.507,7.044,7.318,7.027,7.674,7.628,7.896,7.449,7.104,...,7.635,7.642,7.439,8.069,7.703,7.790,7.540,7.600,7.833,7.688
2,117_at,4.402,4.510,5.014,4.654,4.479,4.428,4.344,4.652,4.765,...,4.559,4.498,4.753,4.894,4.164,4.358,4.887,4.661,4.162,4.137
3,121_at,6.424,6.282,6.450,6.434,6.636,6.745,6.220,6.259,6.782,...,6.357,6.413,6.673,6.765,6.646,6.542,6.350,6.631,6.594,6.356
4,1255_g_at,2.572,2.518,2.599,2.531,2.777,2.590,2.703,2.542,2.955,...,2.734,2.614,2.570,2.683,2.676,2.544,2.612,2.652,2.604,2.630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54670,AFFX-ThrX-5_at,3.104,3.125,3.343,3.186,3.315,3.231,3.187,3.210,3.177,...,2.972,3.092,3.282,3.218,3.319,3.072,3.529,3.298,2.984,3.197
54671,AFFX-ThrX-M_at,2.882,2.826,2.988,2.898,2.968,2.794,2.906,2.891,2.955,...,2.823,2.886,3.043,2.928,3.033,2.969,3.159,2.940,2.820,2.894
54672,AFFX-TrpnX-3_at,2.532,2.632,2.630,2.608,2.871,2.605,2.543,2.700,3.224,...,2.870,2.741,2.663,2.712,2.547,2.774,2.764,2.782,2.653,2.655
54673,AFFX-TrpnX-5_at,2.929,2.714,2.932,2.894,2.888,2.997,2.878,2.837,3.007,...,2.888,3.205,3.075,2.956,2.968,2.987,2.896,2.793,3.009,2.931


In [78]:
mapping_annotation

Unnamed: 0,ID,Gene Symbol
0,1007_s_at,DDR1 /// MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,STAT1


In [79]:
mapped_gse_8052 = pd.merge(mapping_gse_8052, mapping_annotation, on = "ID")

In [80]:
mapped_gse_8052

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,395,396,397,398,399,400,401,402,403,Gene Symbol
0,1007_s_at,6.945,6.300,6.097,6.693,6.156,6.163,5.675,6.070,6.129,...,6.040,5.938,6.421,6.058,5.943,5.971,6.208,6.303,6.361,DDR1 /// MIR4640
1,1053_at,7.507,7.044,7.318,7.027,7.674,7.628,7.896,7.449,7.104,...,7.642,7.439,8.069,7.703,7.790,7.540,7.600,7.833,7.688,RFC2
2,117_at,4.402,4.510,5.014,4.654,4.479,4.428,4.344,4.652,4.765,...,4.498,4.753,4.894,4.164,4.358,4.887,4.661,4.162,4.137,HSPA6
3,121_at,6.424,6.282,6.450,6.434,6.636,6.745,6.220,6.259,6.782,...,6.413,6.673,6.765,6.646,6.542,6.350,6.631,6.594,6.356,PAX8
4,1255_g_at,2.572,2.518,2.599,2.531,2.777,2.590,2.703,2.542,2.955,...,2.614,2.570,2.683,2.676,2.544,2.612,2.652,2.604,2.630,GUCA1A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45777,AFFX-HUMGAPDH/M33197_M_at,12.851,12.758,13.192,12.812,12.991,12.999,12.898,12.963,12.939,...,12.970,13.020,13.304,12.921,13.069,12.801,12.954,12.903,12.958,GAPDH
45778,AFFX-HUMISGF3A/M97935_3_at,10.800,11.271,10.731,10.603,10.531,10.578,10.258,10.768,10.286,...,10.663,10.357,10.739,10.262,10.386,10.425,10.148,10.426,10.579,STAT1
45779,AFFX-HUMISGF3A/M97935_5_at,7.925,8.728,7.707,8.308,8.737,9.068,5.526,8.701,8.145,...,8.467,8.723,8.805,8.594,8.673,8.561,8.154,8.818,8.614,STAT1
45780,AFFX-HUMISGF3A/M97935_MA_at,8.792,9.810,9.518,9.279,9.547,9.776,8.309,9.581,8.644,...,9.653,9.749,9.641,9.492,9.641,9.636,9.187,9.761,9.529,STAT1


In [81]:
mapped_gse_8052["Gene Symbol"].isna().sum()

0

In [82]:
mapped_gse_8052 = mapped_gse_8052.drop(["ID", "Gene Symbol"], axis = 1).T

In [83]:
mapped_gse_8052

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45772,45773,45774,45775,45776,45777,45778,45779,45780,45781
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,13.260,12.360,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,13.041,12.759,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.459,13.207,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.260,13.078,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.187,13.112,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.354,13.422,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.117,13.023,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.125,12.952,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.150,13.134,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227


In [84]:
mapped_gse_8052.columns = mapping_annotation["Gene Symbol"]

In [85]:
mapped_gse_8052

Gene Symbol,DDR1 /// MIR4640,RFC2,HSPA6,PAX8,GUCA1A,MIR5193 /// UBA7,THRA,PTPN21,CCL5,CYP2E1,...,ACTB,ACTB.1,ACTB.2,GAPDH,GAPDH.1,GAPDH.2,STAT1,STAT1.1,STAT1.2,STAT1.3
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,13.260,12.360,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,13.041,12.759,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.459,13.207,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.260,13.078,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.187,13.112,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.354,13.422,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.117,13.023,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.125,12.952,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.150,13.134,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227


In [86]:
asthma_gse_8052.iloc[:, :2]

Unnamed: 0,Gender,Label
0,1,0
1,0,0
2,1,1
3,1,1
4,0,1
...,...,...
399,0,1
400,1,0
401,1,0
402,0,1


In [88]:
mapped_gse_8052_df = pd.concat([mapped_gse_8052.astype('float64'), asthma_gse_8052.iloc[:, :2]], axis = 1)

In [89]:
mapped_gse_8052_df

Unnamed: 0,DDR1 /// MIR4640,RFC2,HSPA6,PAX8,GUCA1A,MIR5193 /// UBA7,THRA,PTPN21,CCL5,CYP2E1,...,ACTB,GAPDH,GAPDH.1,GAPDH.2,STAT1,STAT1.1,STAT1.2,STAT1.3,Gender,Label
0,6.945,7.507,4.402,6.424,2.572,7.299,4.167,3.478,7.828,2.952,...,12.636,12.862,12.753,12.851,10.800,7.925,8.792,8.936,1,0
1,6.300,7.044,4.510,6.282,2.518,7.500,4.160,3.286,7.258,2.998,...,12.902,12.790,12.812,12.758,11.271,8.728,9.810,9.951,0,0
2,6.097,7.318,5.014,6.450,2.599,8.192,4.329,3.447,8.038,2.894,...,13.486,13.133,12.936,13.192,10.731,7.707,9.518,9.369,1,1
3,6.693,7.027,4.654,6.434,2.531,7.676,4.335,3.532,8.063,3.042,...,13.213,12.721,12.734,12.812,10.603,8.308,9.279,9.253,1,1
4,6.156,7.674,4.479,6.636,2.777,7.500,4.328,3.407,5.236,3.058,...,13.226,12.880,12.964,12.991,10.531,8.737,9.547,9.223,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,5.943,7.790,4.358,6.542,2.544,7.431,4.154,3.341,7.044,3.052,...,13.544,12.906,12.988,13.069,10.386,8.673,9.641,9.072,0,1
400,5.971,7.540,4.887,6.350,2.612,7.698,4.185,3.885,5.900,3.150,...,13.153,12.681,12.766,12.801,10.425,8.561,9.636,9.077,1,0
401,6.208,7.600,4.661,6.631,2.652,7.441,4.307,3.778,5.240,3.032,...,13.145,12.864,12.954,12.954,10.148,8.154,9.187,8.732,1,0
402,6.303,7.833,4.162,6.594,2.604,7.573,4.277,3.394,6.275,3.164,...,13.264,12.747,12.833,12.903,10.426,8.818,9.761,9.227,0,1


In [91]:
unique_col = np.unique(mapped_gse_8052_df.columns)
unique_col.size

23520

In [92]:
unique_col

array(['1-Dec', '1-Mar', '1-Sep', ...,
       'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10',
       'hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10',
       'mir-223'], dtype=object)

### Select a gene out of redundant genes

In [93]:
new_gse_8052_df = pd.DataFrame()
for col in unique_col:
    gene_df = pd.DataFrame(mapped_gse_8052_df[col])
    if gene_df.columns.size == 1:
        new_gse_8052_df[col] = gene_df[col]
    else:
        new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]

  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df[col]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, gene_df.sum(axis = 0).argmax()]
  new_gse_8052_df[col] = gene_df.iloc[:, g

In [101]:
new_gse_8052_df

Unnamed: 0,Gender,Label,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,1,0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,0,0,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,1,1,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,1,1,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,0,1,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,1,0,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,1,0,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,0,1,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [102]:
new_gse_8052_df.to_csv("Preprocessed_Asthma_GSE8052.csv", index = False)

---

## Pathway

In [103]:
def import_selected_pathway(filename, database = ['KEGG']):
    f = open(filename, "r")
    data = f.readlines()
    pathway_db = pd.DataFrame()
    for line in data:
        pathway_db = pd.concat([pathway_db, pd.DataFrame([tuple(line.strip().split('\t'))])], ignore_index=True)
    # use the database name as index
    pathway_db.index = pathway_db.iloc[:, 0].apply(lambda x: x.split('_')[0])
    # keep the recodes from reactome and kegg database
    pathways = pathway_db.loc[database]
    print(pathways)
    # reset the index, and drop url
    #pathways = pathways.rename_axis('DATABASE').rename_axis(None, 1).reset_index(drop=True).drop(1, axis=1)
    pathways = pathways.rename_axis('DATABASE').reset_index(drop=True).drop(1, axis=1)
    # drop all missing columns
    pathways_data = pathways.dropna(axis = 1, how = 'all')
    return(pathways_data)

filename = "c2.cp.v7.4.symbols.gmt"
# Feb 7, 2019: use defalut rather than 'KEGG', 'REACTOME', 'BIOCARTA', 'PID'
pathways_data = import_selected_pathway(filename)
#print("Number of pathways in KEGG, REACTOME: ", pathways_data.shape[0])

                                                   0     \
0                                                         
KEGG                         KEGG_N_GLYCAN_BIOSYNTHESIS   
KEGG                      KEGG_OTHER_GLYCAN_DEGRADATION   
KEGG                         KEGG_O_GLYCAN_BIOSYNTHESIS   
KEGG                 KEGG_GLYCOSAMINOGLYCAN_DEGRADATION   
KEGG  KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SU...   
...                                                 ...   
KEGG                                        KEGG_ASTHMA   
KEGG                    KEGG_AUTOIMMUNE_THYROID_DISEASE   
KEGG                           KEGG_ALLOGRAFT_REJECTION   
KEGG                     KEGG_GRAFT_VERSUS_HOST_DISEASE   
KEGG                             KEGG_VIRAL_MYOCARDITIS   

                                                   1         2         3     \
0                                                                             
KEGG  http://www.gsea-msigdb.org/gsea/msigdb/cards/K...     ALG13    DOLPP

In [104]:
### Discussed with Dr. Kang on March 12, 2019
# exclude large (i.e. > 300) AND small (i.e. < 15) pathways 
'''Exclude small and large sizes of pathways'''
def pathway_filter(pathways_data, small_cutoff, large_cutoff):
    # minus first column (i.e. pathway_name)
    pathway_sizes = pathways_data.count(axis=1) - 1
    pathways = pathways_data[(pathway_sizes > small_cutoff) & (pathway_sizes < large_cutoff)]
    return(pathways)

pathways = pathway_filter(pathways_data, 15, 300)
print("Number of pathways with length between 15 and 300: ", pathways.shape[0])

Number of pathways with length between 15 and 300:  173


In [105]:
pathways

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,381,382,383,384,385,386,387,388,389,390
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
5,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
182,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
183,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
184,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [106]:
pathways = pathways.reset_index(drop = True)

In [107]:
pathways

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,381,382,383,384,385,386,387,388,389,390
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [108]:
pathways = pathways.dropna(axis = 1, how = 'all')

In [109]:
pathways.columns = range(pathways.shape[1])

In [110]:
pathways

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [111]:
pathways = pathways.rename(columns = {0 : 'Name'})

In [112]:
pathways

Unnamed: 0,Name,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [113]:
pathway_name = pathways['Name']

In [114]:
pathway_name

0              KEGG_N_GLYCAN_BIOSYNTHESIS
1           KEGG_OTHER_GLYCAN_DEGRADATION
2              KEGG_O_GLYCAN_BIOSYNTHESIS
3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
4            KEGG_GLYCEROLIPID_METABOLISM
                      ...                
168                           KEGG_ASTHMA
169       KEGG_AUTOIMMUNE_THYROID_DISEASE
170              KEGG_ALLOGRAFT_REJECTION
171        KEGG_GRAFT_VERSUS_HOST_DISEASE
172                KEGG_VIRAL_MYOCARDITIS
Name: Name, Length: 173, dtype: object

In [115]:
pathways.Name.size

173

In [116]:
set_pathway = {'pathway' : [], 'gene' : []}
def make_pathway_set(pathway_df):
    set_pathway['pathway'].append(pathway_df[~pathway_df.isna()].Name)
    set_pathway['gene'].append(pathway_df[~pathway_df.isna()][1:])
    #print(pathway_df[~pathway_df.isna()][1:])
    #print(pathway_df[~pathway_df.isna()].Name)
    #print(pathway_df.index())
    
pathways.apply(make_pathway_set, 1)

0      None
1      None
2      None
3      None
4      None
       ... 
168    None
169    None
170    None
171    None
172    None
Length: 173, dtype: object

In [119]:
pathway_list = np.concatenate(pathways.iloc[:, 1:].values.astype('str'), axis = None)
pathway_list

array(['ALG13', 'DOLPP1', 'RPN1', ..., 'nan', 'nan', 'nan'], dtype='<U13')

In [120]:
pathway_list.size

47056

In [121]:
np.unique(pathway_list), np.unique(pathway_list).size 

(array(['A2M', 'AACS', 'AADAT', ..., 'ZNF274', 'ZYX', 'nan'], dtype='<U13'),
 4818)

In [122]:
unique_gene_in_pathway = np.delete(np.unique(pathway_list), np.where(np.unique(pathway_list) == 'nan'))
unique_gene_in_pathway

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype='<U13')

In [123]:
unique_gene_in_pathway.size ### number of genes in KEGG

4817

---

In [124]:
new_gse_8052_df

Unnamed: 0,Gender,Label,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,...,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,abParts /// IGKC /// IGKV4-1 /// IGKV4-1,av27s1 /// TRAV39 /// TRAV39,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// MIRLET7BHG /// RP4-695O20__B.10,hsa-let-7a-3 /// hsa-let-7b /// hsa-mir-4763 /// RP4-695O20__B.10,mir-223
0,1,0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,...,4.719,7.619,6.160,6.828,7.323,7.370,3.553,5.034,3.493,2.569
1,0,0,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,...,4.196,7.753,5.776,7.040,7.722,7.049,3.313,4.602,3.310,2.978
2,1,1,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,...,5.119,7.312,7.466,6.385,6.436,8.557,3.446,4.822,3.647,2.864
3,1,1,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,...,4.918,7.463,6.938,7.126,7.671,8.604,3.456,4.993,3.331,2.741
4,0,1,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,...,4.432,7.416,6.168,6.692,7.487,10.699,3.143,4.629,3.419,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,...,4.590,7.566,6.775,6.786,7.241,10.978,3.386,4.807,3.205,3.059
400,1,0,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,...,3.294,7.725,6.303,6.655,7.435,10.478,3.252,4.922,3.602,2.731
401,1,0,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,...,4.125,7.743,6.035,6.916,7.577,11.603,3.260,4.755,3.350,2.729
402,0,1,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,...,3.701,7.940,5.628,6.722,7.613,10.910,3.279,4.213,3.229,3.295


In [125]:
asthma_gse_8052_df = new_gse_8052_df.iloc[:, ~new_gse_8052_df.columns.str.contains('/')]

In [126]:
asthma_gse_8052_df

Unnamed: 0,Gender,Label,1-Dec,1-Mar,1-Sep,10-Mar,10-Sep,11-Mar,11-Sep,12-Sep,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,mir-223
0,1,0,2.808,5.375,6.708,4.489,3.817,3.125,8.196,4.833,...,9.619,4.121,4.953,5.675,4.719,7.619,6.160,6.828,7.323,2.569
1,0,0,2.745,4.995,7.218,4.298,4.185,3.119,8.574,4.613,...,8.099,4.835,5.583,6.861,4.196,7.753,5.776,7.040,7.722,2.978
2,1,1,2.846,5.603,7.708,4.719,8.948,3.186,8.526,4.899,...,9.222,3.665,4.197,5.505,5.119,7.312,7.466,6.385,6.436,2.864
3,1,1,2.771,5.332,6.991,4.403,5.533,3.085,8.441,4.944,...,8.525,4.985,5.454,6.871,4.918,7.463,6.938,7.126,7.671,2.741
4,0,1,2.501,5.191,7.424,4.224,5.268,3.217,8.121,4.698,...,10.051,4.240,4.957,6.020,4.432,7.416,6.168,6.692,7.487,3.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,2.888,4.710,7.514,4.163,5.377,3.271,8.552,4.940,...,10.040,3.929,4.904,5.985,4.590,7.566,6.775,6.786,7.241,3.059
400,1,0,2.994,4.760,6.984,4.703,4.473,3.274,8.018,4.859,...,9.675,3.826,4.684,5.676,3.294,7.725,6.303,6.655,7.435,2.731
401,1,0,2.943,5.279,7.146,4.606,6.384,2.922,8.264,4.923,...,9.955,4.085,4.728,5.846,4.125,7.743,6.035,6.916,7.577,2.729
402,0,1,2.863,4.944,7.328,4.156,4.241,2.924,8.445,4.537,...,9.602,4.388,5.116,6.024,3.701,7.940,5.628,6.722,7.613,3.295


In [127]:
(asthma_gse_8052_df.iloc[:, 2:] == 0).sum(axis = 0)

1-Dec      0
1-Mar      0
1-Sep      0
10-Mar     0
10-Sep     0
          ..
ZYG11B     0
ZYX        0
ZZEF1      0
ZZZ3       0
mir-223    0
Length: 21651, dtype: int64

In [133]:
(asthma_gse_8052_df.iloc[:, 2:] == 0).sum(axis = 0).sum()

0

In [134]:
filtered_columns = (((asthma_gse_8052_df.iloc[:, 2:] == 0).sum(axis = 0) / asthma_gse_8052_df.shape[0]) < 0.8)

In [135]:
filtered_columns

1-Dec      True
1-Mar      True
1-Sep      True
10-Mar     True
10-Sep     True
           ... 
ZYG11B     True
ZYX        True
ZZEF1      True
ZZZ3       True
mir-223    True
Length: 21651, dtype: bool

In [136]:
filtered_columns.sum()

21651

---

### Remove non pathway-gene from GSE 8052

In [138]:
asthma_gse_8052_df.columns.values[2:], asthma_gse_8052_df.columns.values[2:].size

(array(['1-Dec', '1-Mar', '1-Sep', ..., 'ZZEF1', 'ZZZ3', 'mir-223'],
       dtype=object),
 21651)

In [139]:
removable_gene_gse_8052 = np.setdiff1d(asthma_gse_8052_df.columns.values[2:], unique_gene_in_pathway)

In [140]:
removable_gene_gse_8052, removable_gene_gse_8052.size

(array(['1-Dec', '1-Mar', '1-Sep', ..., 'ZZEF1', 'ZZZ3', 'mir-223'],
       dtype=object),
 17257)

In [141]:
21651 - 17257

4394

In [142]:
only_pathway_gene_asthma_gse_8052_df = asthma_gse_8052_df.drop(removable_gene_gse_8052, axis = 1)

In [143]:
only_pathway_gene_asthma_gse_8052_df

Unnamed: 0,Gender,Label,A2M,AACS,AADAT,AANAT,AARS2,AASDH,AASDHPPT,AASS,...,ZBTB16,ZBTB17,ZCCHC7,ZFYVE16,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX
0,1,0,3.779,6.841,3.176,3.077,6.758,6.973,9.062,3.263,...,3.346,6.884,8.523,6.047,5.843,2.877,8.200,9.680,7.758,6.160
1,0,0,3.882,7.722,2.936,2.972,6.129,7.425,9.037,3.554,...,3.229,5.801,9.736,6.355,4.870,2.614,8.008,10.632,8.100,5.776
2,1,1,4.012,7.845,3.221,3.335,6.649,6.732,8.450,3.912,...,3.278,6.488,8.178,4.964,5.187,2.411,8.515,9.045,7.160,7.466
3,1,1,4.241,7.359,3.157,3.170,6.342,6.094,8.874,3.488,...,3.639,6.268,9.012,6.003,5.195,2.843,7.925,8.894,7.980,6.938
4,0,1,4.152,7.545,3.033,3.259,6.541,6.560,8.993,3.932,...,3.166,6.106,8.839,5.919,5.498,2.705,8.717,8.103,7.658,6.168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,1,3.911,7.408,3.183,3.020,6.602,7.136,9.091,3.205,...,3.483,6.628,8.475,5.843,5.165,2.805,8.508,8.469,7.544,6.775
400,1,0,3.917,7.531,3.074,3.895,6.358,6.239,9.119,3.492,...,3.942,6.227,8.934,5.767,4.307,2.578,8.352,7.713,7.205,6.303
401,1,0,3.349,7.602,3.016,3.429,6.473,6.273,9.302,3.491,...,3.537,6.146,8.691,6.369,4.884,2.640,8.387,7.748,7.696,6.035
402,0,1,3.724,7.727,3.136,3.336,6.477,6.537,9.298,3.007,...,3.580,6.413,9.315,6.364,4.950,2.701,8.294,7.587,7.317,5.628


In [144]:
only_pathway_gene_asthma_gse_8052_df.iloc[:, 2:].isna().sum().sum()

0

In [145]:
np.intersect1d(only_pathway_gene_asthma_gse_8052_df.columns.values[2:], unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [146]:
np.intersect1d(only_pathway_gene_asthma_gse_8052_df.columns.values[2:], unique_gene_in_pathway).size

4394

In [147]:
np.savetxt("GSE_8052_gene_list.txt", only_pathway_gene_asthma_gse_8052_df.columns.values[2:], delimiter = ",", fmt = '%s')

In [148]:
only_pathway_gene_asthma_gse_8052_df.to_csv("KEGG_Based_Genes_GSE_8052_Dataset.csv", index = False, header= True)

---

## Mask between Genes and Pathways

In [151]:
gene_name = only_pathway_gene_asthma_gse_8052_df.columns.values[2:]

In [152]:
gene_name.size, np.unique(gene_name).size

(4394, 4394)

In [153]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [156]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4394)

In [157]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [158]:
pathway_sparse_mat.sum().sum()

0.0

In [159]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4394))

In [160]:
pathway_sparse_mat.sum()

11013.0

In [162]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz("Asthma_GSE_8052_Gene_KEGG_Mask.npz", pathway_sparse_mat)

---

---

---

# GSE172367

In [164]:
data_path = "../../../../nasdatafolder/MTL/Data/Raw_Data/"

In [165]:
asthma_gse_172367 = pd.read_csv(data_path + "expr_normalized.GSE172367-outcome.csv", header = None).T

  asthma_gse_172367 = pd.read_csv(data_path + "expr_normalized.GSE172367-outcome.csv", header = None).T


In [166]:
asthma_gse_172367.columns = asthma_gse_172367.loc[0]
asthma_gse_172367 = asthma_gse_172367.drop(0).reset_index(drop = True)

In [167]:
asthma_gse_172367

Unnamed: 0,ID,Astha status,sex,age,smokingstatus,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM9,ZUP1,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,76b3_439a_1A,Control,Male,66,smoking: Current,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,...,8.712442,9.436202,10.28552,9.701255,10.744954,10.514774,10.271467,13.341647,11.345814,10.737504
1,76b3_439a_2A,Control,Male,66,smoking: Current,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,...,8.631747,9.38492,10.262957,9.350077,10.496802,10.593466,10.509996,13.341385,11.41617,10.823129
2,db6c_413d_1A,Asthma,Male,31,smoking: Never,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,...,8.884935,9.300335,10.233382,10.251471,11.297232,10.480638,9.775593,13.792914,11.567059,11.142879
3,db6c_413d_2A,Asthma,Male,31,smoking: Never,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,...,9.002151,9.244552,10.226691,10.023102,11.072794,10.489052,9.829573,13.762602,11.489648,11.208111
4,9623_473f_1A,Asthma,Male,62,smoking: Prior,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,...,8.634675,9.426026,10.097321,10.241384,10.802852,10.10187,10.499305,14.379587,11.403497,11.310799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,1413_46H2_2A,Asthma,Male,32,smoking: Never,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,...,9.244143,8.966868,10.006864,9.810887,10.564854,9.904911,10.285497,14.535487,11.440723,10.41663
186,6f9H_42Hd_1A,Asthma,Female,38,smoking: Never,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,...,9.256966,8.989647,10.234348,9.662538,10.997327,10.129738,10.094306,14.396685,11.464855,10.227823
187,6f9H_42Hd_2A,Asthma,Female,38,smoking: Never,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,...,9.191192,9.007537,10.083404,9.658088,10.812213,9.964722,10.034907,14.320728,11.442836,10.313125
188,8f7H_4HbH_1A,Asthma,Male,24,smoking: Never,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,...,8.602085,9.158097,10.615867,10.859995,11.661953,9.678222,10.60566,13.417643,11.226631,10.911255


In [168]:
asthma_gse_172367 = pd.concat([asthma_gse_172367.iloc[:, 5:], asthma_gse_172367["sex"], asthma_gse_172367["Astha status"]], axis = 1)

In [169]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Astha status
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.28552,9.701255,10.744954,10.514774,10.271467,13.341647,11.345814,10.737504,Male,Control
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.262957,9.350077,10.496802,10.593466,10.509996,13.341385,11.41617,10.823129,Male,Control
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.233382,10.251471,11.297232,10.480638,9.775593,13.792914,11.567059,11.142879,Male,Asthma
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.226691,10.023102,11.072794,10.489052,9.829573,13.762602,11.489648,11.208111,Male,Asthma
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.097321,10.241384,10.802852,10.10187,10.499305,14.379587,11.403497,11.310799,Male,Asthma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.006864,9.810887,10.564854,9.904911,10.285497,14.535487,11.440723,10.41663,Male,Asthma
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.234348,9.662538,10.997327,10.129738,10.094306,14.396685,11.464855,10.227823,Female,Asthma
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.083404,9.658088,10.812213,9.964722,10.034907,14.320728,11.442836,10.313125,Female,Asthma
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.615867,10.859995,11.661953,9.678222,10.60566,13.417643,11.226631,10.911255,Male,Asthma


In [170]:
asthma_gse_172367["sex"] = asthma_gse_172367["sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [171]:
asthma_gse_172367["Astha status"] = asthma_gse_172367["Astha status"].apply(lambda x : 0 if x.find('Control') != -1 else 1)

In [172]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Astha status
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.28552,9.701255,10.744954,10.514774,10.271467,13.341647,11.345814,10.737504,1,0
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.262957,9.350077,10.496802,10.593466,10.509996,13.341385,11.41617,10.823129,1,0
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.233382,10.251471,11.297232,10.480638,9.775593,13.792914,11.567059,11.142879,1,1
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.226691,10.023102,11.072794,10.489052,9.829573,13.762602,11.489648,11.208111,1,1
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.097321,10.241384,10.802852,10.10187,10.499305,14.379587,11.403497,11.310799,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.006864,9.810887,10.564854,9.904911,10.285497,14.535487,11.440723,10.41663,1,1
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.234348,9.662538,10.997327,10.129738,10.094306,14.396685,11.464855,10.227823,0,1
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.083404,9.658088,10.812213,9.964722,10.034907,14.320728,11.442836,10.313125,0,1
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.615867,10.859995,11.661953,9.678222,10.60566,13.417643,11.226631,10.911255,1,1


In [173]:
asthma_gse_172367 = asthma_gse_172367.rename(columns = {"Astha status" : "Label"})

In [174]:
asthma_gse_172367

Unnamed: 0,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,sex,Label
0,11.26277043,11.70782026,10.19449532,10.98239746,7.201320584,11.6615504,12.41538764,6.899753785,12.46986548,10.56304537,...,10.28552,9.701255,10.744954,10.514774,10.271467,13.341647,11.345814,10.737504,1,0
1,10.80428201,11.76620557,10.17716131,11.1617882,7.089056595,11.70509317,12.43493536,7.046407038,12.36572933,10.66968172,...,10.262957,9.350077,10.496802,10.593466,10.509996,13.341385,11.41617,10.823129,1,0
2,10.96104261,11.56179994,10.79612493,10.97000529,7.5272221,11.45307918,12.09571346,6.778553685,12.27191576,10.68196385,...,10.233382,10.251471,11.297232,10.480638,9.775593,13.792914,11.567059,11.142879,1,1
3,10.98960849,11.61274312,10.79534327,11.13099084,7.569946152,11.40570429,12.03296894,6.719778351,12.27368854,10.70456218,...,10.226691,10.023102,11.072794,10.489052,9.829573,13.762602,11.489648,11.208111,1,1
4,8.980355904,11.09227383,10.56830344,10.69486456,7.439625724,11.47183605,12.30056033,6.957044666,12.20922533,10.34190587,...,10.097321,10.241384,10.802852,10.10187,10.499305,14.379587,11.403497,11.310799,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,8.669623511,11.63044846,10.75347811,11.37833834,7.722190152,11.17987921,11.18323117,7.588338801,12.79177711,10.64678964,...,10.006864,9.810887,10.564854,9.904911,10.285497,14.535487,11.440723,10.41663,1,1
186,11.16545099,11.25316755,10.80917088,11.2691359,7.676080427,11.02580461,10.75349474,7.395532254,12.68342293,10.81788291,...,10.234348,9.662538,10.997327,10.129738,10.094306,14.396685,11.464855,10.227823,0,1
187,10.78478852,11.41048454,10.79614436,11.51920914,7.565932529,11.24989011,10.79290895,7.502488142,12.73637941,10.71641757,...,10.083404,9.658088,10.812213,9.964722,10.034907,14.320728,11.442836,10.313125,0,1
188,9.438896338,11.16327666,11.19346802,10.87379956,8.012580033,11.39242805,10.9201516,7.004261456,12.36019363,10.72941693,...,10.615867,10.859995,11.661953,9.678222,10.60566,13.417643,11.226631,10.911255,1,1


In [175]:
asthma_gse_172367.to_csv("Asthma_GSE_172367_with_Gender.csv", index = False)

### Remove non pathway-genes from GSE 172367

In [176]:
asthma_gse_172367.columns.values[:-2], asthma_gse_172367.columns.values[:-2].size

(array(['A2ML1', 'A4GALT', 'AAAS', ..., 'ZYX', 'ZZEF1', 'ZZZ3'],
       dtype=object),
 11521)

In [177]:
removable_gene_gse_172367 = np.setdiff1d(asthma_gse_172367.columns.values[:-2], unique_gene_in_pathway)

In [178]:
removable_gene_gse_172367, removable_gene_gse_172367.size

(array(['A2ML1', 'A4GALT', 'AAAS', ..., 'ZYG11B', 'ZZEF1', 'ZZZ3'],
       dtype=object),
 8502)

In [181]:
only_pathway_gene_asthma_gse_172367_df = asthma_gse_172367.drop(removable_gene_gse_172367, axis = 1)

In [182]:
only_pathway_gene_asthma_gse_172367_df

Unnamed: 0,AACS,AADAT,AARS1,AARS2,AASDH,AASDHPPT,AASS,ABAT,ABCA1,ABCA10,...,ZBTB17,ZCCHC7,ZFYVE16,ZFYVE9,ZMAT2,ZMAT3,ZNF274,ZYX,sex,Label
0,10.98239746,7.201320584,14.51830819,8.829623089,8.973184548,10.05503968,8.218121944,8.109477398,11.21388528,6.79440559,...,9.983884,10.232379,11.17871,11.038989,11.20254,9.859287,10.983392,13.341647,1,0
1,11.1617882,7.089056595,14.44468551,9.055671707,9.102322122,10.17384838,8.267345872,7.995847121,11.09807639,6.463591354,...,9.931934,10.165771,11.023819,10.822222,11.227147,10.566657,10.883747,13.341385,1,0
2,10.97000529,7.5272221,14.28764053,9.511803168,8.752173563,10.24327665,8.537421192,8.084297947,11.10450641,6.942282723,...,10.32008,10.226747,10.369099,10.603494,10.910209,9.380333,10.870341,13.792914,1,1
3,11.13099084,7.569946152,14.27309354,9.564250636,9.00215131,10.18454871,8.489560067,8.154700507,10.77317817,6.755555,...,10.362484,10.141137,10.37592,10.513571,10.886389,9.587557,10.876972,13.762602,1,1
4,10.69486456,7.439625724,14.18018964,9.208839128,8.599558084,10.35434959,8.726278577,8.286347808,12.02878268,7.097376019,...,10.246529,10.196364,10.722445,11.089414,10.668281,9.541149,10.84483,14.379587,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,11.37833834,7.722190152,12.01374124,9.932280154,8.318602476,10.32610473,8.376386507,10.16855523,12.19600382,7.035358594,...,9.974708,9.103704,10.066196,10.200679,10.909774,10.622299,10.457079,14.535487,1,1
186,11.2691359,7.676080427,12.22844072,10.00553503,8.462805355,9.939172408,8.258832141,8.560015181,11.42550031,6.409200491,...,10.353013,8.974097,10.150587,10.087114,10.642621,9.955064,10.664523,14.396685,0,1
187,11.51920914,7.565932529,12.30825075,10.11485157,8.342639074,9.961841985,8.057188001,9.14107267,11.68630295,6.479216817,...,10.212559,9.21076,9.976184,10.070096,10.807411,10.540452,10.602757,14.320728,0,1
188,10.87379956,8.012580033,12.41538652,9.804505571,8.919671808,10.81368237,8.981878656,7.912935814,10.68759755,6.201496362,...,9.761272,9.734117,10.564094,10.320843,11.005033,10.572987,10.067485,13.417643,1,1


In [183]:
np.savetxt("GSE_172367_gene_list.txt", only_pathway_gene_asthma_gse_172367_df.columns.values[:-2], delimiter = ",", fmt = '%s')

In [184]:
11521 - 8502

3019

In [185]:
np.intersect1d(only_pathway_gene_asthma_gse_172367_df.columns.values[:-2], unique_gene_in_pathway)

array(['AACS', 'AADAT', 'AARS1', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [186]:
np.intersect1d(only_pathway_gene_asthma_gse_172367_df.columns.values[:-2], unique_gene_in_pathway).size

3019

In [187]:
only_pathway_gene_asthma_gse_172367_df.to_csv("KEGG_Based_Genes_GSE_172367_Dataset.csv", index = False, header= True)

---

## Mask between Genes and Pathways

In [188]:
only_pathway_gene_asthma_gse_172367_df.columns.values[:-2], only_pathway_gene_asthma_gse_172367_df.columns.values[:-2].size

(array(['AACS', 'AADAT', 'AARS1', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
       dtype=object),
 3019)

In [189]:
gene_name = only_pathway_gene_asthma_gse_172367_df.columns.values[:-2]

In [190]:
gene_name.size, np.unique(gene_name).size

(3019, 3019)

In [191]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 3019)

In [192]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [193]:
pathway_sparse_mat.sum().sum()

0.0

In [194]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 3019))

In [195]:
pathway_sparse_mat.sum()

7579.0

In [196]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"Asthma_GSE_172367_Gene_KEGG_Mask.npz", pathway_sparse_mat)