In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
pd.set_option("display.max_rows", 11)

In [3]:
data_path = "../../../../nasdatafolder/MTL/Data/TCGA/"

---

# KEGG Pathway

In [6]:
def import_selected_pathway(filename, database = ['KEGG']):
    f = open(filename, "r")
    data = f.readlines()
    pathway_db = pd.DataFrame()
    for line in data:
        pathway_db = pd.concat([pathway_db, pd.DataFrame([tuple(line.strip().split('\t'))])], ignore_index=True)
    # use the database name as index
    pathway_db.index = pathway_db.iloc[:, 0].apply(lambda x: x.split('_')[0])
    # keep the recodes from reactome and kegg database
    pathways = pathway_db.loc[database]
    #print(pathways)
    # reset the index, and drop url
    #pathways = pathways.rename_axis('DATABASE').rename_axis(None, 1).reset_index(drop=True).drop(1, axis=1)
    pathways = pathways.rename_axis('DATABASE').reset_index(drop=True).drop(1, axis=1)
    # drop all missing columns
    pathways_data = pathways.dropna(axis = 1, how = 'all')
    return(pathways_data)

filename = "c2.cp.v7.4.symbols.gmt"
# Feb 7, 2019: use defalut rather than 'KEGG', 'REACTOME', 'BIOCARTA', 'PID'
pathways_data = import_selected_pathway(filename)
#print("Number of pathways in KEGG, REACTOME: ", pathways_data.shape[0])

In [7]:
### Discussed with Dr. Kang on March 12, 2019
# exclude large (i.e. > 300) AND small (i.e. < 15) pathways 
'''Exclude small and large sizes of pathways'''
def pathway_filter(pathways_data, small_cutoff, large_cutoff):
    # minus first column (i.e. pathway_name)
    pathway_sizes = pathways_data.count(axis=1) - 1
    pathways = pathways_data[(pathway_sizes > small_cutoff) & (pathway_sizes < large_cutoff)]
    return(pathways)

pathways = pathway_filter(pathways_data, 15, 300)
print("Number of pathways with length between 15 and 300: ", pathways.shape[0])

Number of pathways with length between 15 and 300:  173


In [8]:
pathways

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,381,382,383,384,385,386,387,388,389,390
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
5,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
182,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
183,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
184,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [9]:
pathways = pathways.reset_index(drop = True)

In [10]:
pathways

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,381,382,383,384,385,386,387,388,389,390
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [11]:
pathways = pathways.dropna(axis = 1, how = 'all')

In [12]:
pathways.columns = range(pathways.shape[1])

In [13]:
pathways

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [14]:
pathways = pathways.rename(columns = {0 : 'Name'})

In [15]:
pathways

Unnamed: 0,Name,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,KEGG_N_GLYCAN_BIOSYNTHESIS,ALG13,DOLPP1,RPN1,ALG14,MAN1B1,ALG3,B4GALT1,MGAT5,RPN2,...,,,,,,,,,,
1,KEGG_OTHER_GLYCAN_DEGRADATION,ENGASE,GLB1,MANBA,MAN2B1,GBA,NEU4,NEU2,NEU1,FUCA1,...,,,,,,,,,,
2,KEGG_O_GLYCAN_BIOSYNTHESIS,GALNT4,GALNT15,GALNTL5,GALNT6,GALNT5,GALNT16,GALNTL6,GALNT13,GCNT3,...,,,,,,,,,,
3,KEGG_GLYCOSAMINOGLYCAN_DEGRADATION,HS3ST3A1,HPSE,HPSE2,GLB1,GUSB,HYAL3,GNS,HYAL4,HYAL1,...,,,,,,,,,,
4,KEGG_GLYCEROLIPID_METABOLISM,MBOAT2,GPAM,LIPG,DGKZ,DGKE,DGKD,DGKH,MBOAT1,GK,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,KEGG_ASTHMA,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,IL3,TNF,CCL11,EPX,...,,,,,,,,,,
169,KEGG_AUTOIMMUNE_THYROID_DISEASE,HLA-DOA,HLA-DOB,CD80,CD86,CD28,IFNA5,IFNA4,IFNA2,TSHR,...,,,,,,,,,,
170,KEGG_ALLOGRAFT_REJECTION,HLA-DRB4,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,CD28,TNF,...,,,,,,,,,,
171,KEGG_GRAFT_VERSUS_HOST_DISEASE,HLA-DRB4,KIR2DL1,HLA-DRB5,HLA-DOA,HLA-DOB,HLA-DRB3,CD80,CD86,KLRD1,...,,,,,,,,,,


In [16]:
pathway_name = pathways['Name']

In [17]:
pathway_name

0              KEGG_N_GLYCAN_BIOSYNTHESIS
1           KEGG_OTHER_GLYCAN_DEGRADATION
2              KEGG_O_GLYCAN_BIOSYNTHESIS
3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
4            KEGG_GLYCEROLIPID_METABOLISM
                      ...                
168                           KEGG_ASTHMA
169       KEGG_AUTOIMMUNE_THYROID_DISEASE
170              KEGG_ALLOGRAFT_REJECTION
171        KEGG_GRAFT_VERSUS_HOST_DISEASE
172                KEGG_VIRAL_MYOCARDITIS
Name: Name, Length: 173, dtype: object

In [18]:
np.savetxt("pathway_list.txt", pathway_name.values, delimiter = ",", fmt = '%s')

In [19]:
set_pathway = {'pathway' : [], 'gene' : []}
def make_pathway_set(pathway_df):
    set_pathway['pathway'].append(pathway_df[~pathway_df.isna()].Name)
    set_pathway['gene'].append(pathway_df[~pathway_df.isna()][1:])
    #print(pathway_df[~pathway_df.isna()][1:])
    #print(pathway_df[~pathway_df.isna()].Name)
    #print(pathway_df.index())
    
pathways.apply(make_pathway_set, 1)

0      None
1      None
2      None
3      None
4      None
       ... 
168    None
169    None
170    None
171    None
172    None
Length: 173, dtype: object

In [20]:
pathway_list = np.concatenate(pathways.iloc[:, 1:].values.astype('str'), axis = None)
pathway_list

array(['ALG13', 'DOLPP1', 'RPN1', ..., 'nan', 'nan', 'nan'], dtype='<U13')

In [21]:
pathway_list.size

47056

In [22]:
np.unique(pathway_list), np.unique(pathway_list).size 

(array(['A2M', 'AACS', 'AADAT', ..., 'ZNF274', 'ZYX', 'nan'], dtype='<U13'),
 4818)

In [23]:
unique_gene_in_pathway = np.delete(np.unique(pathway_list), np.where(np.unique(pathway_list) == 'nan'))
unique_gene_in_pathway

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype='<U13')

In [24]:
unique_gene_in_pathway.size ### number of genes in KEGG

4817

---

# TCGA - LIHC

## clinical data

In [25]:
lihc_survival_info_df = pd.read_csv(data_path + "lihc_tcga/data_clinical_patient.txt", sep = '\t')

In [26]:
lihc_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Sex,Patient Height,Patient Weight,Race Category,Ethnicity Category,...,Informed consent verified,Project code,Stage Other,Tissue Source Site,Tumor Tissue Site,Year of initial pathologic diagnosis,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,Sex,Height in centimeters.,Weight measured in kilograms.,The text for reporting information about race.,The text for reporting information about ethni...,...,Informed consent verified,Project code,Stage Other,"A Tissue Source Site collects samples (tissue,...",Text term that describes the anatomic site of ...,Year of initial pathologic diagnosis.,Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,NUMBER,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,HEIGHT,WEIGHT,RACE,ETHNICITY,...,INFORMED_CONSENT_VERIFIED,PROJECT_CODE,STAGE_OTHER,TISSUE_SOURCE_SITE,SITE_OF_TUMOR_TISSUE,YEAR_OF_INITIAL_PATHOLOGIC_DIAGNOSIS,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,2C041473-EAAA-4B7C-9B2E-B9DA3543AD18,TCGA-2V-A95S,6/5/15,NO,YES,Male,173,78,ASIAN,NOT HISPANIC OR LATINO,...,YES,[Not Available],[Not Available],2V,Liver,[Not Available],0:LIVING,[Not Available],0:DiseaseFree,[Not Available]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,4244912D-8527-410C-A084-A56C55EB1F82,TCGA-ZP-A9D4,3/28/14,NO,YES,Female,155,55,WHITE,NOT HISPANIC OR LATINO,...,YES,[Not Available],[Not Available],ZP,Liver,2012,0:LIVING,12.98,0:DiseaseFree,12.98
377,9197CA4C-E48C-43AF-BA61-D894FAF3CC24,TCGA-ZS-A9CD,4/30/14,NO,YES,Male,[Not Available],[Not Available],WHITE,NOT HISPANIC OR LATINO,...,YES,[Not Available],[Not Available],ZS,Liver,2010,1:DECEASED,45.53,1:Recurred/Progressed,12.19
378,7BDC5F86-4D7D-4F1F-BC23-AB51FA9FB947,TCGA-ZS-A9CE,4/30/14,NO,YES,Female,[Not Available],[Not Available],WHITE,NOT HISPANIC OR LATINO,...,YES,[Not Available],[Not Available],ZS,Liver,2011,0:LIVING,40.77,1:Recurred/Progressed,29.3
379,7FE5E2D9-C514-47BC-8387-3F940F7A822D,TCGA-ZS-A9CF,5/1/14,NO,YES,Male,[Not Available],[Not Available],WHITE,NOT HISPANIC OR LATINO,...,YES,[Not Available],[Not Available],ZS,Liver,2008,0:LIVING,79.24,1:Recurred/Progressed,20.89


In [27]:
lihc_survival_info_df = lihc_survival_info_df[["Patient Identifier", "Overall Survival Status", "Overall Survival (Months)"]]

In [28]:
lihc_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,NUMBER
2,1,1,1
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-2V-A95S,0:LIVING,[Not Available]
...,...,...,...
376,TCGA-ZP-A9D4,0:LIVING,12.98
377,TCGA-ZS-A9CD,1:DECEASED,45.53
378,TCGA-ZS-A9CE,0:LIVING,40.77
379,TCGA-ZS-A9CF,0:LIVING,79.24


In [29]:
lihc_survival_info_df = lihc_survival_info_df.loc[3:]

In [30]:
lihc_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-2V-A95S,0:LIVING,[Not Available]
5,TCGA-2Y-A9GS,1:DECEASED,23.78
6,TCGA-2Y-A9GT,1:DECEASED,53.35
7,TCGA-2Y-A9GU,0:LIVING,63.7
...,...,...,...
376,TCGA-ZP-A9D4,0:LIVING,12.98
377,TCGA-ZS-A9CD,1:DECEASED,45.53
378,TCGA-ZS-A9CE,0:LIVING,40.77
379,TCGA-ZS-A9CF,0:LIVING,79.24


In [31]:
lihc_survival_info_df = lihc_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [32]:
lihc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-2V-A95S,0:LIVING,[Not Available]
1,TCGA-2Y-A9GS,1:DECEASED,23.78
2,TCGA-2Y-A9GT,1:DECEASED,53.35
3,TCGA-2Y-A9GU,0:LIVING,63.7
4,TCGA-2Y-A9GV,1:DECEASED,83.18
...,...,...,...
372,TCGA-ZP-A9D4,0:LIVING,12.98
373,TCGA-ZS-A9CD,1:DECEASED,45.53
374,TCGA-ZS-A9CE,0:LIVING,40.77
375,TCGA-ZS-A9CF,0:LIVING,79.24


In [33]:
lihc_survival_info_df = lihc_survival_info_df[~lihc_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [34]:
lihc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-2Y-A9GS,1:DECEASED,23.78
1,TCGA-2Y-A9GT,1:DECEASED,53.35
2,TCGA-2Y-A9GU,0:LIVING,63.7
3,TCGA-2Y-A9GV,1:DECEASED,83.18
4,TCGA-2Y-A9GW,1:DECEASED,41.75
...,...,...,...
371,TCGA-ZP-A9D4,0:LIVING,12.98
372,TCGA-ZS-A9CD,1:DECEASED,45.53
373,TCGA-ZS-A9CE,0:LIVING,40.77
374,TCGA-ZS-A9CF,0:LIVING,79.24


In [35]:
lihc_survival_info_df["OS_STATUS"][lihc_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(132,)

In [36]:
lihc_survival_info_df["OS_STATUS"][lihc_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(244,)

In [37]:
132 + 244

376

In [38]:
lihc_survival_info_df["OS_STATUS"] = lihc_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [39]:
lihc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-2Y-A9GS,1,23.78
1,TCGA-2Y-A9GT,1,53.35
2,TCGA-2Y-A9GU,0,63.7
3,TCGA-2Y-A9GV,1,83.18
4,TCGA-2Y-A9GW,1,41.75
...,...,...,...
371,TCGA-ZP-A9D4,0,12.98
372,TCGA-ZS-A9CD,1,45.53
373,TCGA-ZS-A9CE,0,40.77
374,TCGA-ZS-A9CF,0,79.24


---

## gene expression data

In [40]:
lihc_df = pd.read_csv(data_path + "lihc_tcga/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep = '\t').T

In [41]:
lihc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20521,20522,20523,20524,20525,20526,20527,20528,20529,20530
Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
Entrez_Gene_Id,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,7789,158586,79364,440590,79699,7791,23140,26009,387590,389932
TCGA-2V-A95S-01,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
TCGA-2Y-A9GS-01,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
TCGA-2Y-A9GT-01,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CD-01,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
TCGA-ZS-A9CE-01,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
TCGA-ZS-A9CF-01,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
TCGA-ZS-A9CF-02,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


In [42]:
lihc_df.loc["Hugo_Symbol"]

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [43]:
lihc_df.columns = lihc_df.loc["Hugo_Symbol"]

In [44]:
lihc_df

Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
Entrez_Gene_Id,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,7789,158586,79364,440590,79699,7791,23140,26009,387590,389932
TCGA-2V-A95S-01,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
TCGA-2Y-A9GS-01,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
TCGA-2Y-A9GT-01,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CD-01,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
TCGA-ZS-A9CE-01,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
TCGA-ZS-A9CF-01,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
TCGA-ZS-A9CF-02,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


In [45]:
lihc_df = lihc_df.drop(["Hugo_Symbol", "Entrez_Gene_Id"])

In [46]:
lihc_df

Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
TCGA-2V-A95S-01,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
TCGA-2Y-A9GS-01,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
TCGA-2Y-A9GT-01,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
TCGA-2Y-A9GU-01,-3.4841,0.3139,0.2085,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,,...,-0.5698,-0.7205,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647
TCGA-2Y-A9GV-01,-3.4841,1.2833,0.2882,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,,...,0.7846,0.947,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CD-01,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
TCGA-ZS-A9CE-01,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
TCGA-ZS-A9CF-01,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
TCGA-ZS-A9CF-02,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


In [47]:
lihc_df = lihc_df.reset_index()

In [48]:
lihc_df = lihc_df.rename(columns = {"index" : "Patient ID"})

In [49]:
lihc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-2V-A95S-01,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
1,TCGA-2Y-A9GS-01,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
2,TCGA-2Y-A9GT-01,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
3,TCGA-2Y-A9GU-01,-3.4841,0.3139,0.2085,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,...,-0.5698,-0.7205,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647
4,TCGA-2Y-A9GV-01,-3.4841,1.2833,0.2882,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,...,0.7846,0.947,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,TCGA-ZS-A9CD-01,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
369,TCGA-ZS-A9CE-01,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
370,TCGA-ZS-A9CF-01,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
371,TCGA-ZS-A9CF-02,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


In [50]:
lihc_df["Patient ID"] = lihc_df["Patient ID"].apply(lambda x : x[:-3])

In [51]:
lihc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-2V-A95S,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
1,TCGA-2Y-A9GS,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
2,TCGA-2Y-A9GT,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
3,TCGA-2Y-A9GU,-3.4841,0.3139,0.2085,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,...,-0.5698,-0.7205,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647
4,TCGA-2Y-A9GV,-3.4841,1.2833,0.2882,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,...,0.7846,0.947,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,TCGA-ZS-A9CD,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
369,TCGA-ZS-A9CE,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
370,TCGA-ZS-A9CF,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
371,TCGA-ZS-A9CF,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


---

### remove nan column

In [52]:
lihc_df.iloc[:, lihc_df.columns.isna()]

Hugo_Symbol,NaN
0,-0.5504
1,1.4898
2,-0.8357
3,-1.2435
4,0.5248
...,...
368,0.467
369,1.5113
370,0.3613
371,0.5293


In [53]:
lihc_df = lihc_df.iloc[:, ~lihc_df.columns.isna()]

In [54]:
lihc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-2V-A95S,-3.4841,-1.2291,-0.2271,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
1,TCGA-2Y-A9GS,-3.4841,2.5111,-0.6416,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
2,TCGA-2Y-A9GT,-3.4841,-2.6646,0.0259,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
3,TCGA-2Y-A9GU,-3.4841,0.3139,0.2085,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,...,-0.5698,-0.7205,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647
4,TCGA-2Y-A9GV,-3.4841,1.2833,0.2882,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,...,0.7846,0.947,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,TCGA-ZS-A9CD,-3.4841,-0.0807,-1.0957,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
369,TCGA-ZS-A9CE,-3.4841,-2.6646,0.9808,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
370,TCGA-ZS-A9CF,-3.4841,0.651,1.1637,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
371,TCGA-ZS-A9CF,-3.4841,0.8235,1.3877,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


### check redundant genes

In [55]:
lihc_df.columns

Index(['Patient ID', 'LOC100130426', 'UBE2Q2P3', 'UBE2Q2P3', 'HMGB1P1',
       'TIMM23', 'MOXD2', 'LOC155060', 'RNU12-2P', 'SSX9',
       ...
       'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3',
       'TPTEP1', 'AKR1C6P'],
      dtype='object', name='Hugo_Symbol', length=20531)

In [56]:
duplicated_genes_lihc = lihc_df.columns[lihc_df.columns.duplicated()]

In [57]:
duplicated_genes_lihc, duplicated_genes_lihc.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object', name='Hugo_Symbol'),
 15)

In [58]:
lihc_df[duplicated_genes_lihc]

Hugo_Symbol,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,-1.2291,-0.2271,-2.0211,0.2624,0.727,-0.7109,0.5096,0.6214,0.2405,0.4636,...,-1.9656,-1.9013,-0.5774,0.1907,1.2898,1.716,0.6589,1.138,-1.0403,-2.1226
1,2.5111,-0.6416,-2.0211,-0.4155,-2.2726,-0.9711,0.7467,0.9465,1.9306,0.7115,...,0.4473,0.1749,0.1054,-0.5173,-0.5361,-0.6475,1.4797,0.2311,0.5436,-0.4735
2,-2.6646,0.0259,-0.1573,-0.532,0.8178,0.6785,0.7119,1.0124,0.8969,1.7673,...,1.0827,1.5213,-0.2547,1.3856,-0.0645,-0.331,0.7761,0.3936,-1.0221,-1.462
3,0.3139,0.2085,-2.0211,-0.6183,1.6471,-0.1915,-3.0519,-3.1867,-2.6101,0.1766,...,0.0586,-0.1281,-0.9583,-0.9723,0.1372,0.2999,-1.3353,-0.7346,-2.0565,-2.1226
4,1.2833,0.2882,-0.2362,1.3956,-1.318,1.1017,-3.0519,-3.1867,-0.0133,1.0573,...,1.1144,1.0802,-1.9727,0.8108,-0.5454,-0.6569,0.0907,0.0432,-2.0565,-2.1226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,-0.0807,-1.0957,-2.0211,0.7476,-0.1118,-0.0741,0.5651,0.1362,-0.6881,-0.606,...,0.4411,0.2656,-1.0355,-0.3713,-0.1682,-0.1346,-0.3402,-0.835,0.9518,1.2315
369,-2.6646,0.9808,-2.0211,0.702,0.9161,0.8167,-3.0519,-3.1867,-0.2372,0.9911,...,0.6414,0.6317,1.4115,2.0683,-0.7258,-1.3857,1.8677,-1.5885,-2.0565,-2.1226
370,0.651,1.1637,-0.4947,0.0264,-2.2726,0.5599,0.8033,0.8379,1.9228,1.1023,...,0.6854,1.5716,2.187,2.019,-0.8345,-1.1604,0.6641,0.2909,0.4975,0.7041
371,0.8235,1.3877,1.0325,-0.9513,-0.9604,0.7644,0.8016,0.8512,2.0484,0.8128,...,0.4634,1.4012,1.3244,2.4697,-1.1479,-1.1403,0.0998,-1.0114,0.6229,0.3415


In [59]:
for gene in duplicated_genes_lihc:
    print(gene)
    print(lihc_df[gene])
    #print(lihc_df[gene])
    print(lihc_df[gene].iloc[:, 0])
    print(lihc_df[gene].iloc[:, 1])

UBE2Q2P3
Hugo_Symbol UBE2Q2P3 UBE2Q2P3
0            -1.2291  -0.2271
1             2.5111  -0.6416
2            -2.6646   0.0259
3             0.3139   0.2085
4             1.2833   0.2882
..               ...      ...
368          -0.0807  -1.0957
369          -2.6646   0.9808
370            0.651   1.1637
371           0.8235   1.3877
372          -2.6646   0.8888

[373 rows x 2 columns]
0     -1.2291
1      2.5111
2     -2.6646
3      0.3139
4      1.2833
        ...  
368   -0.0807
369   -2.6646
370     0.651
371    0.8235
372   -2.6646
Name: UBE2Q2P3, Length: 373, dtype: object
0     -0.2271
1     -0.6416
2      0.0259
3      0.2085
4      0.2882
        ...  
368   -1.0957
369    0.9808
370    1.1637
371    1.3877
372    0.8888
Name: UBE2Q2P3, Length: 373, dtype: object
CC2D2B
Hugo_Symbol  CC2D2B  CC2D2B
0           -2.0211  0.2624
1           -2.0211 -0.4155
2           -0.1573  -0.532
3           -2.0211 -0.6183
4           -0.2362  1.3956
..              ...     ...
368       

In [60]:
for gene in duplicated_genes_lihc:
    var_gene = np.var(lihc_df[gene])
    print(var_gene)
    print(lihc_df[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        print(var_gene.values[1])
        print(lihc_df[gene].iloc[:, 1])
        lihc_df[gene] = lihc_df[gene].iloc[:, 1]
    else:
        print(var_gene.values[0])
        print(lihc_df[gene].iloc[:, 0])
        lihc_df[gene] = lihc_df[gene].iloc[:, 0]

Hugo_Symbol
UBE2Q2P3    1.894733
UBE2Q2P3    1.599405
dtype: float64
Hugo_Symbol UBE2Q2P3 UBE2Q2P3
0            -1.2291  -0.2271
1             2.5111  -0.6416
2            -2.6646   0.0259
3             0.3139   0.2085
4             1.2833   0.2882
..               ...      ...
368          -0.0807  -1.0957
369          -2.6646   0.9808
370            0.651   1.1637
371           0.8235   1.3877
372          -2.6646   0.8888

[373 rows x 2 columns]
1.894733235161038
0     -1.2291
1      2.5111
2     -2.6646
3      0.3139
4      1.2833
        ...  
368   -0.0807
369   -2.6646
370     0.651
371    0.8235
372   -2.6646
Name: UBE2Q2P3, Length: 373, dtype: object
Hugo_Symbol
CC2D2B    1.412131
CC2D2B    1.434578
dtype: float64
Hugo_Symbol  CC2D2B  CC2D2B
0           -2.0211  0.2624
1           -2.0211 -0.4155
2           -0.1573  -0.532
3           -2.0211 -0.6183
4           -0.2362  1.3956
..              ...     ...
368         -2.0211  0.7476
369         -2.0211   0.702
370         -0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lihc_df[gene] = lihc_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lihc_df[gene] = lihc_df[gene].iloc[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lihc_df[gene] = lihc_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [61]:
lihc_df = lihc_df.iloc[:, ~lihc_df.columns.duplicated()]

In [62]:
lihc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-2V-A95S,-3.4841,-1.2291,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,,...,-0.0767,-0.7757,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678
1,TCGA-2Y-A9GS,-3.4841,2.5111,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,1.6343,0.3389,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427
2,TCGA-2Y-A9GT,-3.4841,-2.6646,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.9827,1.2123,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406
3,TCGA-2Y-A9GU,-3.4841,0.3139,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,,...,-0.5698,-0.7205,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647
4,TCGA-2Y-A9GV,-3.4841,1.2833,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,,...,0.7846,0.947,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,TCGA-ZS-A9CD,-3.4841,-0.0807,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,0.2454,1.2487,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958
369,TCGA-ZS-A9CE,-3.4841,-2.6646,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.4204,-0.9016,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654
370,TCGA-ZS-A9CF,-3.4841,0.651,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,1.287,1.3301,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619
371,TCGA-ZS-A9CF,-3.4841,0.8235,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.9951,0.8085,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876


---

### sex data

In [63]:
lihc_sex_info_df = pd.read_csv(data_path + "lihc_tcga/Sex.txt", sep ='\t')

In [64]:
lihc_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,lihc_tcga,TCGA-2V-A95S,Male
1,lihc_tcga,TCGA-2Y-A9GS,Male
2,lihc_tcga,TCGA-2Y-A9GT,Male
3,lihc_tcga,TCGA-2Y-A9GU,Female
4,lihc_tcga,TCGA-2Y-A9GV,Female
...,...,...,...
372,lihc_tcga,TCGA-ZP-A9D4,Female
373,lihc_tcga,TCGA-ZS-A9CD,Male
374,lihc_tcga,TCGA-ZS-A9CE,Female
375,lihc_tcga,TCGA-ZS-A9CF,Male


In [65]:
lihc_sex_info_df["Sex"] = lihc_sex_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [66]:
lihc_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,lihc_tcga,TCGA-2V-A95S,1
1,lihc_tcga,TCGA-2Y-A9GS,1
2,lihc_tcga,TCGA-2Y-A9GT,1
3,lihc_tcga,TCGA-2Y-A9GU,0
4,lihc_tcga,TCGA-2Y-A9GV,0
...,...,...,...
372,lihc_tcga,TCGA-ZP-A9D4,0
373,lihc_tcga,TCGA-ZS-A9CD,1
374,lihc_tcga,TCGA-ZS-A9CE,0
375,lihc_tcga,TCGA-ZS-A9CF,1


---

### merge gene expression, clincic, and sex data

In [67]:
lihc_df_with_sex = pd.merge(lihc_df, lihc_sex_info_df, how = 'inner', on = 'Patient ID')

In [68]:
lihc_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex
0,TCGA-2V-A95S,-3.4841,-1.2291,0.1181,-0.2593,-5.0936,0.102,-0.3353,-1.524,,...,-0.6186,-0.2364,-1.0434,-0.1128,0.7418,-0.121,-0.0078,-1.1678,lihc_tcga,1
1,TCGA-2Y-A9GS,-3.4841,2.5111,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,0.7077,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427,lihc_tcga,1
2,TCGA-2Y-A9GT,-3.4841,-2.6646,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.6443,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406,lihc_tcga,1
3,TCGA-2Y-A9GU,-3.4841,0.3139,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,,...,0.6969,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647,lihc_tcga,0
4,TCGA-2Y-A9GV,-3.4841,1.2833,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,,...,1.9398,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367,lihc_tcga,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,TCGA-ZS-A9CD,-3.4841,-0.0807,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,-0.7111,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958,lihc_tcga,1
369,TCGA-ZS-A9CE,-3.4841,-2.6646,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.113,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654,lihc_tcga,0
370,TCGA-ZS-A9CF,-3.4841,0.651,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,1.54,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619,lihc_tcga,1
371,TCGA-ZS-A9CF,-3.4841,0.8235,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.9097,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876,lihc_tcga,1


In [69]:
lihc_df_with_sex["Sex"].isna().sum()

0

In [70]:
lihc_df_with_sex = pd.merge(lihc_df_with_sex, lihc_survival_info_df, how = 'inner', on = 'Patient ID')

In [71]:
lihc_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-2Y-A9GS,-3.4841,2.5111,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,...,0.3493,0.7811,0.381,1.2172,0.487,1.5427,lihc_tcga,1,1,23.78
1,TCGA-2Y-A9GT,-3.4841,-2.6646,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,...,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406,lihc_tcga,1,1,53.35
2,TCGA-2Y-A9GU,-3.4841,0.3139,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,,...,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647,lihc_tcga,0,0,63.7
3,TCGA-2Y-A9GV,-3.4841,1.2833,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,,...,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367,lihc_tcga,0,1,83.18
4,TCGA-2Y-A9GW,-3.4841,-0.1729,0.3577,-1.4437,-5.0936,-0.7053,-2.5813,-1.524,,...,-0.2357,1.5964,0.9814,-0.4042,0.5639,0.54,lihc_tcga,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,TCGA-ZS-A9CD,-3.4841,-0.0807,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,...,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958,lihc_tcga,1,1,45.53
368,TCGA-ZS-A9CE,-3.4841,-2.6646,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,...,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654,lihc_tcga,0,0,40.77
369,TCGA-ZS-A9CF,-3.4841,0.651,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,...,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619,lihc_tcga,1,0,79.24
370,TCGA-ZS-A9CF,-3.4841,0.8235,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,...,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876,lihc_tcga,1,0,79.24


In [72]:
lihc_df_with_sex = lihc_df_with_sex.drop(["Patient ID", "Study ID"], axis = 1)

In [73]:
lihc_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-3.4841,2.5111,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,,0.3941,...,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427,1,1,23.78
1,-3.4841,-2.6646,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,,-1.2793,...,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406,1,1,53.35
2,-3.4841,0.3139,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,,-1.2793,...,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647,0,0,63.7
3,-3.4841,1.2833,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,,-1.2793,...,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367,0,1,83.18
4,-3.4841,-0.1729,0.3577,-1.4437,-5.0936,-0.7053,-2.5813,-1.524,,-1.2793,...,0.1273,-0.2357,1.5964,0.9814,-0.4042,0.5639,0.54,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,-3.4841,-0.0807,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,,-1.2793,...,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958,1,1,45.53
368,-3.4841,-2.6646,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,,-1.2793,...,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654,0,0,40.77
369,-3.4841,0.651,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,,-1.2793,...,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619,1,0,79.24
370,-3.4841,0.8235,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,,-0.5569,...,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876,1,0,79.24


In [74]:
lihc_df_with_sex.shape[0]

372

In [75]:
lihc_df_with_sex = lihc_df_with_sex.iloc[:, ~(lihc_df_with_sex.isna().sum() / lihc_df_with_sex.shape[0]  > 0.8).values]

In [76]:
lihc_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,EZHIP,EFCAB8,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-3.4841,2.5111,-0.4579,-1.6792,-5.0936,-0.0577,1.4818,-1.524,0.3941,-1.8246,...,0.7829,0.3493,0.7811,0.381,1.2172,0.487,1.5427,1,1,23.78
1,-3.4841,-2.6646,0.259,-1.2217,-5.0936,-0.3453,2.0169,-0.9116,-1.2793,-0.5605,...,-0.5059,0.8576,-0.0875,0.6682,-0.0543,1.3632,0.406,1,1,53.35
2,-3.4841,0.3139,-0.7977,0.213,-5.0936,0.8754,0.1338,-1.524,-1.2793,-1.8246,...,-0.7263,0.2281,-0.157,1.6425,-1.1314,-0.414,-1.6647,0,0,63.7
3,-3.4841,1.2833,0.4761,-0.7084,-5.0936,0.8847,-2.5813,-1.524,-1.2793,-0.614,...,-1.238,0.9947,-0.2676,-0.2191,0.5605,-0.4018,0.6367,0,1,83.18
4,-3.4841,-0.1729,0.3577,-1.4437,-5.0936,-0.7053,-2.5813,-1.524,-1.2793,0.5325,...,0.1273,-0.2357,1.5964,0.9814,-0.4042,0.5639,0.54,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,-3.4841,-0.0807,-0.6873,-0.0535,-5.0936,-1.2764,-2.5813,-1.524,-1.2793,0.5054,...,-0.8634,0.7764,0.3031,0.0004,-0.5332,0.5432,0.958,1,1,45.53
368,-3.4841,-2.6646,-0.6714,0.0334,-5.0936,-1.4599,-2.5813,-1.524,-1.2793,0.0933,...,0.1048,0.0989,0.3087,-0.5642,0.1946,-1.9218,0.6654,0,0,40.77
369,-3.4841,0.651,0.5088,-2.1696,-5.0936,0.8016,-2.5813,-1.524,-1.2793,-1.8246,...,0.5866,0.8067,-1.8457,-0.8524,1.1452,-0.3297,0.6619,1,0,79.24
370,-3.4841,0.8235,0.5742,-1.5076,-5.0936,1.0062,1.0209,-1.0637,-0.5569,-0.2354,...,1.1194,0.7427,-2.9768,0.7493,1.4011,-0.0581,0.1876,1,0,79.24


In [77]:
lihc_df_with_sex.isna().sum().sum()

0

---

### Remove non pathway-genes from LIHC

In [78]:
unique_gene_in_pathway

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype='<U13')

In [79]:
lihc_df_with_sex.columns.values[:-3], lihc_df_with_sex.columns.values[:-3].size

(array(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', ..., 'ZZZ3', 'TPTEP1',
        'AKR1C6P'], dtype=object),
 20023)

In [80]:
removable_gene_lihc = np.setdiff1d(lihc_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

In [81]:
removable_gene_lihc, removable_gene_lihc.size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15663)

In [82]:
np.unique(removable_gene_lihc), np.unique(removable_gene_lihc).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15663)

In [83]:
lihc_df_with_sex = lihc_df_with_sex.drop(removable_gene_lihc, axis = 1)

In [84]:
lihc_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-0.4579,-0.2749,0.2537,0.3082,0.0792,0.308,0.7932,0.0252,-0.3093,0.7072,...,1.1714,0.8965,0.3385,0.3242,-0.2765,-0.8901,0.7811,1,1,23.78
1,0.259,-0.6611,-1.5083,1.2439,-2.027,-0.6591,0.3035,0.0952,1.1039,0.8076,...,0.624,0.5703,0.5101,-0.7796,0.653,-0.0861,-0.0875,1,1,53.35
2,-0.7977,1.4223,0.9841,-0.9023,-2.027,-0.5989,0.1142,0.3536,0.5283,0.5764,...,0.4081,-0.0825,0.0121,-1.222,-0.5053,1.0836,-0.157,0,0,63.7
3,0.4761,-1.3221,-1.9983,1.4541,-2.027,0.822,0.3332,1.0881,0.0164,1.3414,...,0.7449,0.8242,-2.4195,-1.5752,0.4504,0.6772,-0.2676,0,1,83.18
4,0.3577,0.006,0.0038,-0.6596,-2.027,-0.6835,-0.4084,-0.7568,-0.4736,0.3716,...,-0.5116,-0.9817,0.4782,0.0868,0.9322,-0.8486,1.5964,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,-0.6873,0.4198,-0.1862,-0.4012,-2.027,-0.3729,0.0078,-1.1184,-0.4914,0.4142,...,-0.976,-0.3673,-0.069,-0.7032,-0.4128,-0.8771,0.3031,1,1,45.53
368,-0.6714,-0.6625,-1.1085,0.795,-2.027,1.1797,0.2652,0.1413,-0.1388,1.184,...,-1.3322,0.8841,-0.0415,0.054,-1.238,0.0221,0.3087,0,0,40.77
369,0.5088,-0.3136,-0.8363,1.735,-2.027,-1.2987,0.2478,0.8356,1.643,1.6705,...,-0.861,0.3497,0.9416,-0.0258,1.7398,-0.0156,-1.8457,1,0,79.24
370,0.5742,-0.126,-1.9095,1.5389,1.1931,-0.4865,1.0476,1.3813,2.0297,1.1241,...,1.4361,1.5646,1.1577,-0.1308,0.5543,0.9046,-2.9768,1,0,79.24


In [85]:
20026 - 15663

4363

In [87]:
np.intersect1d(lihc_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [88]:
np.intersect1d(lihc_df_with_sex.columns.values[:-3], unique_gene_in_pathway).size

4360

In [89]:
lihc_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-0.4579,-0.2749,0.2537,0.3082,0.0792,0.308,0.7932,0.0252,-0.3093,0.7072,...,1.1714,0.8965,0.3385,0.3242,-0.2765,-0.8901,0.7811,1,1,23.78
1,0.259,-0.6611,-1.5083,1.2439,-2.027,-0.6591,0.3035,0.0952,1.1039,0.8076,...,0.624,0.5703,0.5101,-0.7796,0.653,-0.0861,-0.0875,1,1,53.35
2,-0.7977,1.4223,0.9841,-0.9023,-2.027,-0.5989,0.1142,0.3536,0.5283,0.5764,...,0.4081,-0.0825,0.0121,-1.222,-0.5053,1.0836,-0.157,0,0,63.7
3,0.4761,-1.3221,-1.9983,1.4541,-2.027,0.822,0.3332,1.0881,0.0164,1.3414,...,0.7449,0.8242,-2.4195,-1.5752,0.4504,0.6772,-0.2676,0,1,83.18
4,0.3577,0.006,0.0038,-0.6596,-2.027,-0.6835,-0.4084,-0.7568,-0.4736,0.3716,...,-0.5116,-0.9817,0.4782,0.0868,0.9322,-0.8486,1.5964,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,-0.6873,0.4198,-0.1862,-0.4012,-2.027,-0.3729,0.0078,-1.1184,-0.4914,0.4142,...,-0.976,-0.3673,-0.069,-0.7032,-0.4128,-0.8771,0.3031,1,1,45.53
368,-0.6714,-0.6625,-1.1085,0.795,-2.027,1.1797,0.2652,0.1413,-0.1388,1.184,...,-1.3322,0.8841,-0.0415,0.054,-1.238,0.0221,0.3087,0,0,40.77
369,0.5088,-0.3136,-0.8363,1.735,-2.027,-1.2987,0.2478,0.8356,1.643,1.6705,...,-0.861,0.3497,0.9416,-0.0258,1.7398,-0.0156,-1.8457,1,0,79.24
370,0.5742,-0.126,-1.9095,1.5389,1.1931,-0.4865,1.0476,1.3813,2.0297,1.1241,...,1.4361,1.5646,1.1577,-0.1308,0.5543,0.9046,-2.9768,1,0,79.24


In [90]:
lihc_df_with_sex.to_csv("TCGA_LIHC_gene_expression_data.csv", index = False, header = True)

---

## Pathway Mask

In [91]:
gene_name = lihc_df_with_sex.columns.values[:-3]

In [92]:
gene_name.size, np.unique(gene_name).size

(4360, 4360)

In [93]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [94]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4360)

In [95]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
pathway_sparse_mat.sum().sum()

0.0

In [96]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4360))

In [97]:
pathway_sparse_mat.sum()

10840.0

In [98]:
data_type = "LIHC"

In [99]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"TCGA_{data_type}_Pathway_Mask.npz", pathway_sparse_mat)

---

# TCGA - STAD

## clinical data

In [100]:
stad_survival_info_df = pd.read_csv(data_path + "stad_tcga/data_bcr_clinical_data_patient.txt", sep = '\t')

In [101]:
stad_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,Neoplasm Histologic Type Name,Neoplasm Histologic Grade,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Sex,Race Category,Ethnicity Category,...,Number of lymphnodes positive by ihc,Project code,Stage Other,Adjuvant Postoperative Targeted Therapy Administered Indicator,Tissue Source Site,Tumor Tissue Site,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,Text term for the structural pattern of cancer...,Numeric value to express the degree of abnorma...,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,Sex,The text for reporting information about race.,The text for reporting information about ethni...,...,Number of lymphnodes positive by ihc,Project code,Stage Other,Text term to signify postoperative adjuvant ca...,"A Tissue Source Site collects samples (tissue,...",Text term that describes the anatomic site of ...,Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,HISTOLOGICAL_DIAGNOSIS,GRADE,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,RACE,ETHNICITY,...,NUMBER_OF_LYMPHNODES_POSITIVE_BY_IHC,PROJECT_CODE,STAGE_OTHER,TARGETED_MOLECULAR_THERAPY,TISSUE_SOURCE_SITE,SITE_OF_TUMOR_TISSUE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,BE6531B2-D1F3-44AB-9C02-1CEAE51EF2BB,TCGA-3M-AB46,7/15/14,"Stomach, Adenocarcinoma, Not Otherwise Specifi...",G2,NO,YES,Male,WHITE,HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],YES,3M,Stomach,0:LIVING,57.98,0:DiseaseFree,57.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,3CCBE7C7-C779-4A98-B495-FF020D3E9E55,TCGA-VQ-AA6I,9/9/14,"Stomach, Intestinal Adenocarcinoma, Not Otherw...",G3,NO,YES,Male,WHITE,[Not Available],...,[Not Available],[Not Available],[Not Available],YES,VQ,Stomach,1:DECEASED,16.13,1:Recurred/Progressed,15.6
443,A267DDB4-B2D9-4292-B120-6E13E70D01F8,TCGA-VQ-AA6J,9/10/14,"Stomach, Intestinal Adenocarcinoma, Not Otherw...",G3,NO,YES,Male,BLACK OR AFRICAN AMERICAN,[Not Available],...,[Not Available],[Not Available],[Not Available],NO,VQ,Stomach,0:LIVING,27.53,0:DiseaseFree,27.53
444,83A0F9EF-4BF0-4EFE-8713-F8B6AB4F5773,TCGA-VQ-AA6K,8/26/14,"Stomach Adenocarcinoma, Signet Ring Type",G3,NO,YES,Male,WHITE,[Not Available],...,[Not Available],[Not Available],[Not Available],YES,VQ,Stomach,1:DECEASED,12.42,1:Recurred/Progressed,10.91
445,9130C730-B0BC-4D3A-9EE1-09BF43D9BD67,TCGA-ZA-A8F6,2/19/14,"Stomach, Intestinal Adenocarcinoma, Not Otherw...",G2,YES,NO,Male,WHITE,NOT HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],NO,ZA,Stomach,0:LIVING,17.25,0:DiseaseFree,17.25


In [102]:
stad_survival_info_df = stad_survival_info_df[["Patient Identifier", "Overall Survival Status", "Overall Survival (Months)"]]

In [103]:
stad_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,NUMBER
2,1,1,1
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-3M-AB46,0:LIVING,57.98
...,...,...,...
442,TCGA-VQ-AA6I,1:DECEASED,16.13
443,TCGA-VQ-AA6J,0:LIVING,27.53
444,TCGA-VQ-AA6K,1:DECEASED,12.42
445,TCGA-ZA-A8F6,0:LIVING,17.25


In [104]:
stad_survival_info_df = stad_survival_info_df.loc[3:]

In [105]:
stad_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-3M-AB46,0:LIVING,57.98
5,TCGA-3M-AB47,1:DECEASED,[Not Available]
6,TCGA-B7-5816,0:LIVING,26.68
7,TCGA-B7-5818,0:LIVING,11.7
...,...,...,...
442,TCGA-VQ-AA6I,1:DECEASED,16.13
443,TCGA-VQ-AA6J,0:LIVING,27.53
444,TCGA-VQ-AA6K,1:DECEASED,12.42
445,TCGA-ZA-A8F6,0:LIVING,17.25


In [106]:
stad_survival_info_df = stad_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [107]:
stad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-3M-AB46,0:LIVING,57.98
1,TCGA-3M-AB47,1:DECEASED,[Not Available]
2,TCGA-B7-5816,0:LIVING,26.68
3,TCGA-B7-5818,0:LIVING,11.7
4,TCGA-B7-A5TI,0:LIVING,19.55
...,...,...,...
438,TCGA-VQ-AA6I,1:DECEASED,16.13
439,TCGA-VQ-AA6J,0:LIVING,27.53
440,TCGA-VQ-AA6K,1:DECEASED,12.42
441,TCGA-ZA-A8F6,0:LIVING,17.25


In [108]:
stad_survival_info_df = stad_survival_info_df[~stad_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [109]:
stad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-3M-AB46,0:LIVING,57.98
1,TCGA-B7-5816,0:LIVING,26.68
2,TCGA-B7-5818,0:LIVING,11.7
3,TCGA-B7-A5TI,0:LIVING,19.55
4,TCGA-B7-A5TJ,0:LIVING,11.01
...,...,...,...
430,TCGA-VQ-AA6I,1:DECEASED,16.13
431,TCGA-VQ-AA6J,0:LIVING,27.53
432,TCGA-VQ-AA6K,1:DECEASED,12.42
433,TCGA-ZA-A8F6,0:LIVING,17.25


In [110]:
stad_survival_info_df["OS_STATUS"][stad_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(170,)

In [111]:
stad_survival_info_df["OS_STATUS"][stad_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(265,)

In [112]:
170 + 265

435

In [113]:
stad_survival_info_df["OS_STATUS"] = stad_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [114]:
stad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-3M-AB46,0,57.98
1,TCGA-B7-5816,0,26.68
2,TCGA-B7-5818,0,11.7
3,TCGA-B7-A5TI,0,19.55
4,TCGA-B7-A5TJ,0,11.01
...,...,...,...
430,TCGA-VQ-AA6I,1,16.13
431,TCGA-VQ-AA6J,0,27.53
432,TCGA-VQ-AA6K,1,12.42
433,TCGA-ZA-A8F6,0,17.25


---

## gene expression data

In [115]:
stad_df = pd.read_csv(data_path + "stad_tcga/data_RNA_Seq_v2_mRNA_median_all_sample_Zscores.txt", sep = '\t')

In [116]:
stad_df = stad_df.T

In [117]:
stad_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20521,20522,20523,20524,20525,20526,20527,20528,20529,20530
Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
Entrez_Gene_Id,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,7789,158586,79364,440590,79699,7791,23140,26009,387590,389932
TCGA-3M-AB46-01,-1.8526,-0.7213,-0.0476,0.5935,2.5965,,0.8529,1.311,-1.1795,-3.7535,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
TCGA-3M-AB47-01,-1.8526,-0.4327,-1.174,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,-3.7535,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
TCGA-B7-5816-01,-1.8526,-2.0692,-0.5451,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,-3.7535,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-VQ-AA6I-01,-1.8526,0.7609,0.806,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,-3.7535,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
TCGA-VQ-AA6J-01,-1.0166,0.4456,0.1285,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,-3.7535,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
TCGA-VQ-AA6K-01,-0.5402,0.4934,0.2687,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,-3.7535,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
TCGA-ZA-A8F6-01,-1.8526,0.526,-0.1961,0.0491,-1.056,,0.699,-1.9838,-1.1795,-3.7535,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


In [118]:
stad_df.loc["Hugo_Symbol"]

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [119]:
stad_df.columns = stad_df.loc["Hugo_Symbol"]

In [120]:
stad_df = stad_df.reset_index()

In [121]:
stad_df

Hugo_Symbol,index,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,Hugo_Symbol,LOC100130426,UBE2Q2P3,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
1,Entrez_Gene_Id,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,...,7789,158586,79364,440590,79699,7791,23140,26009,387590,389932
2,TCGA-3M-AB46-01,-1.8526,-0.7213,-0.0476,0.5935,2.5965,,0.8529,1.311,-1.1795,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
3,TCGA-3M-AB47-01,-1.8526,-0.4327,-1.174,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
4,TCGA-B7-5816-01,-1.8526,-2.0692,-0.5451,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,TCGA-VQ-AA6I-01,-1.8526,0.7609,0.806,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
413,TCGA-VQ-AA6J-01,-1.0166,0.4456,0.1285,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
414,TCGA-VQ-AA6K-01,-0.5402,0.4934,0.2687,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
415,TCGA-ZA-A8F6-01,-1.8526,0.526,-0.1961,0.0491,-1.056,,0.699,-1.9838,-1.1795,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


In [122]:
stad_df = stad_df.drop([0, 1]).reset_index(drop = True).rename(columns = {"index" : "Patient ID"})

In [123]:
stad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-3M-AB46-01,-1.8526,-0.7213,-0.0476,0.5935,2.5965,,0.8529,1.311,-1.1795,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
1,TCGA-3M-AB47-01,-1.8526,-0.4327,-1.174,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
2,TCGA-B7-5816-01,-1.8526,-2.0692,-0.5451,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
3,TCGA-B7-5818-01,-1.8526,-1.7893,1.2804,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,...,-2.5807,-5.509,1.3616,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333
4,TCGA-B7-A5TI-01,-0.6312,-0.7019,-0.0425,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,...,-1.5956,-1.0806,-0.4311,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,TCGA-VQ-AA6I-01,-1.8526,0.7609,0.806,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
411,TCGA-VQ-AA6J-01,-1.0166,0.4456,0.1285,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
412,TCGA-VQ-AA6K-01,-0.5402,0.4934,0.2687,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
413,TCGA-ZA-A8F6-01,-1.8526,0.526,-0.1961,0.0491,-1.056,,0.699,-1.9838,-1.1795,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


In [124]:
stad_df["Patient ID"] = stad_df["Patient ID"].apply(lambda x : x[:-3])

In [125]:
stad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-3M-AB46,-1.8526,-0.7213,-0.0476,0.5935,2.5965,,0.8529,1.311,-1.1795,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
1,TCGA-3M-AB47,-1.8526,-0.4327,-1.174,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
2,TCGA-B7-5816,-1.8526,-2.0692,-0.5451,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
3,TCGA-B7-5818,-1.8526,-1.7893,1.2804,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,...,-2.5807,-5.509,1.3616,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333
4,TCGA-B7-A5TI,-0.6312,-0.7019,-0.0425,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,...,-1.5956,-1.0806,-0.4311,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,TCGA-VQ-AA6I,-1.8526,0.7609,0.806,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
411,TCGA-VQ-AA6J,-1.0166,0.4456,0.1285,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
412,TCGA-VQ-AA6K,-0.5402,0.4934,0.2687,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
413,TCGA-ZA-A8F6,-1.8526,0.526,-0.1961,0.0491,-1.056,,0.699,-1.9838,-1.1795,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


### remove nan column

In [126]:
stad_df.iloc[:, stad_df.columns.isna()]

Hugo_Symbol,NaN
0,0.4837
1,0.424
2,0.1805
3,0.214
4,0.5055
...,...
410,1.08
411,0.4102
412,-1.2388
413,0.9399


In [127]:
stad_df = stad_df.iloc[:, ~stad_df.columns.isna()]

In [128]:
stad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-3M-AB46,-1.8526,-0.7213,-0.0476,0.5935,2.5965,,0.8529,1.311,-1.1795,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
1,TCGA-3M-AB47,-1.8526,-0.4327,-1.174,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
2,TCGA-B7-5816,-1.8526,-2.0692,-0.5451,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
3,TCGA-B7-5818,-1.8526,-1.7893,1.2804,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,...,-2.5807,-5.509,1.3616,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333
4,TCGA-B7-A5TI,-0.6312,-0.7019,-0.0425,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,...,-1.5956,-1.0806,-0.4311,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,TCGA-VQ-AA6I,-1.8526,0.7609,0.806,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
411,TCGA-VQ-AA6J,-1.0166,0.4456,0.1285,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
412,TCGA-VQ-AA6K,-0.5402,0.4934,0.2687,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
413,TCGA-ZA-A8F6,-1.8526,0.526,-0.1961,0.0491,-1.056,,0.699,-1.9838,-1.1795,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


### check redundant genes

In [129]:
duplicated_genes_stad = stad_df.columns[stad_df.columns.duplicated()]

In [130]:
duplicated_genes_stad, duplicated_genes_stad.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object', name='Hugo_Symbol'),
 15)

In [131]:
stad_df[duplicated_genes_stad]

Hugo_Symbol,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,-0.7213,-0.0476,-0.4197,0.5738,1.9092,0.3839,-0.7445,-0.5713,1.3757,0.0801,...,-3.2493,0.4687,0.4918,1.4726,-0.6244,-2.0684,1.0512,-0.1022,-1.2156,-0.7482
1,-0.4327,-1.174,-0.5882,2.3129,0.4957,0.5025,0.8667,1.0446,-1.5019,-0.2769,...,0.5398,0.0982,0.684,1.732,-1.9579,-1.6886,0.0302,-0.4616,0.9032,0.5119
2,-2.0692,-0.5451,-2.025,-1.8408,-2.0463,-2.1871,-2.0502,-2.4433,-0.5144,-0.8153,...,0.7921,0.2044,0.7974,-0.4959,1.8293,1.9895,-0.4633,-0.1258,-1.7685,-1.586
3,-1.7893,1.2804,-2.025,-1.6184,-0.0313,-1.518,0.5713,-0.138,0.0757,-1.1072,...,-0.389,-2.3356,0.3345,-2.4915,0.0411,-0.5577,-0.1264,-0.9343,0.4631,-0.7932
4,-0.7019,-0.0425,0.1162,-0.0209,-0.5508,-0.4267,0.8168,0.5726,-0.8202,0.2309,...,0.0854,1.9097,0.3384,0.6278,-0.3757,-0.5889,0.7707,-0.2734,1.5118,1.6133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,0.7609,0.806,-0.124,-0.4711,0.8494,1.3856,-0.2233,-0.2027,0.3931,1.4558,...,-0.759,-1.3358,-0.6521,-0.3135,0.4088,0.7978,-0.7991,-0.9181,-0.6429,-0.1391
411,0.4456,0.1285,-1.0832,0.7982,-0.2681,1.9316,0.0044,0.3843,-0.7013,0.0017,...,0.2371,1.4108,0.1133,0.6459,-1.0516,-1.7784,0.5834,-0.2208,-0.4625,-1.0733
412,0.4934,0.2687,0.425,1.129,0.6916,-0.6315,0.9903,1.2026,1.1903,0.6493,...,0.0723,0.3642,1.924,2.1089,1.0197,1.2328,0.8724,1.0168,-0.0374,0.0445
413,0.526,-0.1961,-0.1482,0.9679,0.3137,0.6496,0.7447,0.9211,-0.2343,0.4948,...,0.4474,-0.6356,-0.7741,-0.3871,-0.3849,-0.1648,0.0377,-0.3196,0.7109,0.1881


In [132]:
for gene in duplicated_genes_stad:
    var_gene = np.var(stad_df[gene])
    print(var_gene)
    print(stad_df[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        print(var_gene.values[1])
        print(stad_df[gene].iloc[:, 1])
        stad_df[gene] = stad_df[gene].iloc[:, 1]
    else:
        print(var_gene.values[0])
        print(stad_df[gene].iloc[:, 0])
        stad_df[gene] = stad_df[gene].iloc[:, 0]

Hugo_Symbol
UBE2Q2P3    1.158559
UBE2Q2P3    1.096635
dtype: float64
Hugo_Symbol UBE2Q2P3 UBE2Q2P3
0            -0.7213  -0.0476
1            -0.4327   -1.174
2            -2.0692  -0.5451
3            -1.7893   1.2804
4            -0.7019  -0.0425
..               ...      ...
410           0.7609    0.806
411           0.4456   0.1285
412           0.4934   0.2687
413            0.526  -0.1961
414           0.0013  -1.0547

[415 rows x 2 columns]
1.1585585821440558
0     -0.7213
1     -0.4327
2     -2.0692
3     -1.7893
4     -0.7019
        ...  
410    0.7609
411    0.4456
412    0.4934
413     0.526
414    0.0013
Name: UBE2Q2P3, Length: 415, dtype: object
Hugo_Symbol
CC2D2B    1.469793
CC2D2B    1.193551
dtype: float64
Hugo_Symbol  CC2D2B  CC2D2B
0           -0.4197  0.5738
1           -0.5882  2.3129
2            -2.025 -1.8408
3            -2.025 -1.6184
4            0.1162 -0.0209
..              ...     ...
410          -0.124 -0.4711
411         -1.0832  0.7982
412           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stad_df[gene] = stad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stad_df[gene] = stad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stad_df[gene] = stad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [133]:
stad_df = stad_df.iloc[:, ~stad_df.columns.duplicated()]

In [134]:
stad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-3M-AB46,-1.8526,-0.7213,0.5935,2.5965,,0.8529,1.311,-1.1795,-3.7535,...,-0.9832,-0.4358,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581
1,TCGA-3M-AB47,-1.8526,-0.4327,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,-3.7535,...,-1.1052,0.5985,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596
2,TCGA-B7-5816,-1.8526,-2.0692,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,-3.7535,...,-0.0172,-0.9204,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333
3,TCGA-B7-5818,-1.8526,-1.7893,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,-3.7535,...,-2.5807,-5.509,1.3616,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333
4,TCGA-B7-A5TI,-0.6312,-0.7019,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,-3.7535,...,-1.5956,-1.0806,-0.4311,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,TCGA-VQ-AA6I,-1.8526,0.7609,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,-3.7535,...,0.7544,0.727,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193
411,TCGA-VQ-AA6J,-1.0166,0.4456,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,-3.7535,...,0.3473,0.0553,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978
412,TCGA-VQ-AA6K,-0.5402,0.4934,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,-3.7535,...,0.1119,-0.0878,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168
413,TCGA-ZA-A8F6,-1.8526,0.526,0.0491,-1.056,,0.699,-1.9838,-1.1795,-3.7535,...,0.8879,0.5272,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333


---

### sex data

In [135]:
stad_sex_info_df = pd.read_csv(data_path + "stad_tcga/Sex.txt", sep ='\t')

In [136]:
stad_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,stad_tcga,TCGA-3M-AB46,Male
1,stad_tcga,TCGA-3M-AB47,Male
2,stad_tcga,TCGA-B7-5816,Female
3,stad_tcga,TCGA-B7-5818,Male
4,stad_tcga,TCGA-B7-A5TI,Male
...,...,...,...
438,stad_tcga,TCGA-VQ-AA6I,Male
439,stad_tcga,TCGA-VQ-AA6J,Male
440,stad_tcga,TCGA-VQ-AA6K,Male
441,stad_tcga,TCGA-ZA-A8F6,Male


In [137]:
stad_sex_info_df["Sex"] = stad_sex_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [138]:
stad_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,stad_tcga,TCGA-3M-AB46,1
1,stad_tcga,TCGA-3M-AB47,1
2,stad_tcga,TCGA-B7-5816,0
3,stad_tcga,TCGA-B7-5818,1
4,stad_tcga,TCGA-B7-A5TI,1
...,...,...,...
438,stad_tcga,TCGA-VQ-AA6I,1
439,stad_tcga,TCGA-VQ-AA6J,1
440,stad_tcga,TCGA-VQ-AA6K,1
441,stad_tcga,TCGA-ZA-A8F6,1


---

### merge gene expression, clincic, and sex data

In [139]:
stad_df_with_sex = pd.merge(stad_df, stad_sex_info_df, how = 'inner', on = 'Patient ID')

In [140]:
stad_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex
0,TCGA-3M-AB46,-1.8526,-0.7213,0.5935,2.5965,,0.8529,1.311,-1.1795,-3.7535,...,-1.1083,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581,stad_tcga,1
1,TCGA-3M-AB47,-1.8526,-0.4327,-0.1311,0.1598,,0.4518,-0.3653,-1.1795,-3.7535,...,-0.7442,-0.6947,0.3452,-0.1038,-0.0593,0.4956,-0.7959,0.0596,stad_tcga,1
2,TCGA-B7-5816,-1.8526,-2.0692,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,-3.7535,...,-1.1864,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333,stad_tcga,0
3,TCGA-B7-5818,-1.8526,-1.7893,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,-3.7535,...,1.3616,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333,stad_tcga,1
4,TCGA-B7-A5TI,-0.6312,-0.7019,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,-3.7535,...,-0.4311,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333,stad_tcga,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,TCGA-VQ-AA6I,-1.8526,0.7609,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,-3.7535,...,-0.1272,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193,stad_tcga,1
411,TCGA-VQ-AA6J,-1.0166,0.4456,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,-3.7535,...,0.0201,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978,stad_tcga,1
412,TCGA-VQ-AA6K,-0.5402,0.4934,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,-3.7535,...,1.5786,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168,stad_tcga,1
413,TCGA-ZA-A8F6,-1.8526,0.526,0.0491,-1.056,,0.699,-1.9838,-1.1795,-3.7535,...,-0.5264,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333,stad_tcga,1


In [141]:
stad_df_with_sex["Sex"].isna().sum()

0

In [142]:
stad_df_with_sex = pd.merge(stad_df_with_sex, stad_survival_info_df, how = 'inner', on = 'Patient ID')

In [143]:
stad_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-3M-AB46,-1.8526,-0.7213,0.5935,2.5965,,0.8529,1.311,-1.1795,-3.7535,...,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581,stad_tcga,1,0,57.98
1,TCGA-B7-5816,-1.8526,-2.0692,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,-3.7535,...,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333,stad_tcga,0,0,26.68
2,TCGA-B7-5818,-1.8526,-1.7893,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,-3.7535,...,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333,stad_tcga,1,0,11.7
3,TCGA-B7-A5TI,-0.6312,-0.7019,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,-3.7535,...,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333,stad_tcga,1,0,19.55
4,TCGA-B7-A5TJ,-1.8526,0.253,1.0339,-0.2055,,2.1552,-0.4697,-1.1795,-3.7535,...,-1.01,0.1977,-0.1539,-0.708,-0.9311,-1.6333,stad_tcga,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,TCGA-VQ-AA6I,-1.8526,0.7609,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,-3.7535,...,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193,stad_tcga,1,1,16.13
403,TCGA-VQ-AA6J,-1.0166,0.4456,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,-3.7535,...,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978,stad_tcga,1,0,27.53
404,TCGA-VQ-AA6K,-0.5402,0.4934,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,-3.7535,...,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168,stad_tcga,1,1,12.42
405,TCGA-ZA-A8F6,-1.8526,0.526,0.0491,-1.056,,0.699,-1.9838,-1.1795,-3.7535,...,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333,stad_tcga,1,0,17.25


In [144]:
stad_df_with_sex = stad_df_with_sex.drop(["Patient ID", "Study ID"], axis = 1)

In [145]:
stad_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-1.8526,-0.7213,0.5935,2.5965,,0.8529,1.311,-1.1795,-3.7535,-0.8286,...,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581,1,0,57.98
1,-1.8526,-2.0692,-2.5289,-0.2734,,-1.9825,-1.9838,-1.1795,-3.7535,-0.8286,...,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333,0,0,26.68
2,-1.8526,-1.7893,-0.9824,-0.2798,,-0.1831,-1.9838,-1.1795,-3.7535,-0.8286,...,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333,1,0,11.7
3,-0.6312,-0.7019,0.0122,1.3114,,-0.8833,-1.9838,-1.1795,-3.7535,-0.8286,...,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333,1,0,19.55
4,-1.8526,0.253,1.0339,-0.2055,,2.1552,-0.4697,-1.1795,-3.7535,-0.8286,...,-0.8524,-1.01,0.1977,-0.1539,-0.708,-0.9311,-1.6333,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,-1.8526,0.7609,0.0332,-1.2663,,-0.8076,-1.9838,-1.1795,-3.7535,-0.8286,...,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193,1,1,16.13
403,-1.0166,0.4456,0.4787,-0.4243,,-0.3396,-0.9228,-1.1795,-3.7535,-0.5334,...,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978,1,0,27.53
404,-0.5402,0.4934,-0.5134,-0.4518,,1.2167,-0.5341,-1.1795,-3.7535,-0.8286,...,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168,1,1,12.42
405,-1.8526,0.526,0.0491,-1.056,,0.699,-1.9838,-1.1795,-3.7535,-0.1675,...,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333,1,0,17.25


In [146]:
stad_df_with_sex = stad_df_with_sex.iloc[:, ~(stad_df_with_sex.isna().sum() / stad_df_with_sex.shape[0]  > 0.8).values]

In [147]:
stad_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,EFCAB8,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-1.8526,-0.7213,0.5935,2.5965,0.8529,1.311,-1.1795,-3.7535,-0.8286,-0.3149,...,0.5648,0.5564,-0.7957,-1.6589,0.2303,-0.28,0.2581,1,0,57.98
1,-1.8526,-2.0692,-2.5289,-0.2734,-1.9825,-1.9838,-1.1795,-3.7535,-0.8286,-1.452,...,-1.5819,-0.9146,-0.0503,-0.3562,-1.461,0.4494,-1.6333,0,0,26.68
2,-1.8526,-1.7893,-0.9824,-0.2798,-0.1831,-1.9838,-1.1795,-3.7535,-0.8286,-1.452,...,-1.5819,-1.6672,0.4228,-1.5959,-2.6963,-0.4898,-1.6333,1,0,11.7
3,-0.6312,-0.7019,0.0122,1.3114,-0.8833,-1.9838,-1.1795,-3.7535,-0.8286,-1.452,...,-0.9994,-1.002,-0.4783,-0.5992,-0.0146,0.0321,-1.6333,1,0,19.55
4,-1.8526,0.253,1.0339,-0.2055,2.1552,-0.4697,-1.1795,-3.7535,-0.8286,-1.0438,...,-0.8524,-1.01,0.1977,-0.1539,-0.708,-0.9311,-1.6333,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,-1.8526,0.7609,0.0332,-1.2663,-0.8076,-1.9838,-1.1795,-3.7535,-0.8286,-0.1831,...,-1.0442,0.0923,-0.8232,0.1935,-0.1842,0.2265,-0.5193,1,1,16.13
403,-1.0166,0.4456,0.4787,-0.4243,-0.3396,-0.9228,-1.1795,-3.7535,-0.5334,0.2885,...,-0.2148,-0.9514,-0.4662,-0.1865,1.2026,0.0231,-0.0978,1,0,27.53
404,-0.5402,0.4934,-0.5134,-0.4518,1.2167,-0.5341,-1.1795,-3.7535,-0.8286,1.0507,...,-0.4408,-0.6375,-0.5852,0.5316,-1.0976,-0.7294,0.4168,1,1,12.42
405,-1.8526,0.526,0.0491,-1.056,0.699,-1.9838,-1.1795,-3.7535,-0.1675,-1.0255,...,-0.6147,0.8498,1.053,1.1045,-0.019,1.4144,-1.6333,1,0,17.25


In [148]:
stad_df_with_sex.isna().sum().sum()

0

---

### Remove non pathway-genes from STAD

In [149]:
unique_gene_in_pathway

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype='<U13')

In [150]:
stad_df_with_sex.columns.values[:-3], stad_df_with_sex.columns.values[:-3].size

(array(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', ..., 'ZZZ3', 'TPTEP1',
        'AKR1C6P'], dtype=object),
 20258)

In [151]:
np.unique(stad_df_with_sex.columns), np.unique(stad_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 20261)

In [152]:
removable_gene_stad = np.setdiff1d(stad_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

In [153]:
removable_gene_stad, removable_gene_stad.size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15889)

In [154]:
np.unique(removable_gene_stad), np.unique(removable_gene_stad).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15889)

In [155]:
stad_df_with_sex = stad_df_with_sex.drop(removable_gene_stad, axis = 1)

In [156]:
stad_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,0.5935,-0.7905,0.1913,0.7205,-0.9637,0.9801,-1.7089,-2.251,-0.5928,-2.2211,...,0.5274,-1.074,-0.6108,-0.2185,0.0939,2.7476,-0.7957,1,0,57.98
1,-2.5289,-0.4229,-0.7856,0.1786,-1.0378,-0.2856,-0.8256,-1.3174,-0.2596,-0.6966,...,-0.2306,-0.3609,0.6905,1.8023,0.3047,-0.8814,-0.0503,0,0,26.68
2,-0.9824,-1.2408,1.1243,-0.2285,1.3105,-0.1549,-2.0963,-1.45,-2.0049,0.1831,...,-1.2383,-0.8456,-1.8184,1.31,-0.9642,-1.2401,0.4228,1,0,11.7
3,0.0122,0.3973,2.1612,-1.3275,-1.6946,0.8537,-1.1514,1.2557,-0.8907,0.1589,...,0.4294,0.4894,-0.076,0.4733,0.4611,-0.6828,-0.4783,1,0,19.55
4,1.0339,-0.7171,-0.0049,0.2223,-1.6946,-0.2764,-0.4209,-1.716,-0.2034,0.6382,...,-1.4298,-1.1947,1.3031,-0.6918,-0.4172,-0.0998,0.1977,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,0.0332,-0.3452,0.1883,0.6936,-0.4248,-0.1127,0.5221,0.1307,0.773,-0.9549,...,-0.2239,1.0356,0.4915,0.2803,-0.8648,-1.9597,-0.8232,1,1,16.13
403,0.4787,0.0216,0.3006,0.7963,0.6076,-0.11,-0.4227,0.0208,1.3625,0.5721,...,-0.9005,-1.186,-1.5501,0.3959,-0.4625,-0.703,-0.4662,1,0,27.53
404,-0.5134,0.7968,0.222,0.5102,-0.333,-0.3603,-1.0079,0.7964,0.3475,0.0812,...,-0.3561,-1.2356,-1.0319,1.8559,0.5439,0.2765,-0.5852,1,1,12.42
405,0.0491,1.4773,-0.5391,-0.0735,-0.2259,0.1212,-1.1962,0.0108,0.7556,0.64,...,1.2759,0.4438,1.1157,-0.4324,0.7854,0.3452,1.053,1,0,17.25


In [157]:
20261 - 15889

4372

In [158]:
np.intersect1d(stad_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [159]:
np.intersect1d(stad_df_with_sex.columns.values[:-3], unique_gene_in_pathway).size

4369

In [160]:
stad_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,0.5935,-0.7905,0.1913,0.7205,-0.9637,0.9801,-1.7089,-2.251,-0.5928,-2.2211,...,0.5274,-1.074,-0.6108,-0.2185,0.0939,2.7476,-0.7957,1,0,57.98
1,-2.5289,-0.4229,-0.7856,0.1786,-1.0378,-0.2856,-0.8256,-1.3174,-0.2596,-0.6966,...,-0.2306,-0.3609,0.6905,1.8023,0.3047,-0.8814,-0.0503,0,0,26.68
2,-0.9824,-1.2408,1.1243,-0.2285,1.3105,-0.1549,-2.0963,-1.45,-2.0049,0.1831,...,-1.2383,-0.8456,-1.8184,1.31,-0.9642,-1.2401,0.4228,1,0,11.7
3,0.0122,0.3973,2.1612,-1.3275,-1.6946,0.8537,-1.1514,1.2557,-0.8907,0.1589,...,0.4294,0.4894,-0.076,0.4733,0.4611,-0.6828,-0.4783,1,0,19.55
4,1.0339,-0.7171,-0.0049,0.2223,-1.6946,-0.2764,-0.4209,-1.716,-0.2034,0.6382,...,-1.4298,-1.1947,1.3031,-0.6918,-0.4172,-0.0998,0.1977,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,0.0332,-0.3452,0.1883,0.6936,-0.4248,-0.1127,0.5221,0.1307,0.773,-0.9549,...,-0.2239,1.0356,0.4915,0.2803,-0.8648,-1.9597,-0.8232,1,1,16.13
403,0.4787,0.0216,0.3006,0.7963,0.6076,-0.11,-0.4227,0.0208,1.3625,0.5721,...,-0.9005,-1.186,-1.5501,0.3959,-0.4625,-0.703,-0.4662,1,0,27.53
404,-0.5134,0.7968,0.222,0.5102,-0.333,-0.3603,-1.0079,0.7964,0.3475,0.0812,...,-0.3561,-1.2356,-1.0319,1.8559,0.5439,0.2765,-0.5852,1,1,12.42
405,0.0491,1.4773,-0.5391,-0.0735,-0.2259,0.1212,-1.1962,0.0108,0.7556,0.64,...,1.2759,0.4438,1.1157,-0.4324,0.7854,0.3452,1.053,1,0,17.25


In [161]:
stad_df_with_sex.to_csv("TCGA_STAD_gene_expression_data.csv", index = False, header = True)

---

## Pathway Mask

In [162]:
stad_df_with_sex.columns.values[:-3], stad_df_with_sex.columns.values[:-3].size

(array(['HMGB1P1', 'A2M', 'AACS', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
       dtype=object),
 4369)

In [163]:
gene_name = stad_df_with_sex.columns.values[:-3]

In [164]:
gene_name.size, np.unique(gene_name).size

(4369, 4369)

In [165]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [166]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4369)

In [167]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [168]:
pathway_sparse_mat.sum().sum()

0.0

In [169]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4369))

In [170]:
pathway_sparse_mat.sum()

10893.0

---

In [171]:
data_type = "STAD"

In [172]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"TCGA_{data_type}_Pathway_Mask.npz", pathway_sparse_mat)

---

# TCGA - LUAD

## clinical data

In [173]:
luad_survival_info_df = pd.read_csv(data_path + "luad_tcga/data_bcr_clinical_data_patient.txt", sep = '\t')

In [174]:
luad_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,Neoplasm Histologic Type Name,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Sex,Tumor Site,Race Category,Ethnicity Category,...,Number of lymphnodes positive by ihc,Lymph node location positive pathology name,Project code,Stage Other,Adjuvant Postoperative Targeted Therapy Administered Indicator,Tissue Source Site,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,Text term for the structural pattern of cancer...,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,Sex,Tumor Site,The text for reporting information about race.,The text for reporting information about ethni...,...,Number of lymphnodes positive by ihc,Lymph node location positive pathology name,Project code,Stage Other,Text term to signify postoperative adjuvant ca...,"A Tissue Source Site collects samples (tissue,...",Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,HISTOLOGICAL_DIAGNOSIS,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,TUMOR_SITE,RACE,ETHNICITY,...,NUMBER_OF_LYMPHNODES_POSITIVE_BY_IHC,POS_LYMPH_NODE_LOCATION,PROJECT_CODE,STAGE_OTHER,TARGETED_MOLECULAR_THERAPY,TISSUE_SOURCE_SITE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,34040b83-7e8a-4264-a551-b16621843e28,TCGA-05-4244,7/22/10,Lung Adenocarcinoma,NO,YES,MALE,Lung,[Not Available],[Not Available],...,[Not Available],[Not Available],[Not Available],[Not Available],[Not Available],5,0:LIVING,0,0:DiseaseFree,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,8AB8C4BA-311E-41D0-85EA-A0245ACA1BDD,TCGA-NJ-A55O,9/27/13,Lung Adenocarcinoma,[Not Available],[Not Available],Female,Lung,WHITE,NOT HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],[Not Available],NO,NJ,0:LIVING,0.43,0:DiseaseFree,0.43
522,7C7D777B-BF00-4C24-97FC-922580D5605F,TCGA-NJ-A55R,9/27/13,Lung Adenocarcinoma,[Not Available],[Not Available],Male,Lung,WHITE,NOT HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],[Not Available],NO,NJ,0:LIVING,19.81,0:DiseaseFree,19.81
523,01E9888D-B5B9-48F1-8BA6-8A89AF108A04,TCGA-NJ-A7XG,5/22/14,Lung Adenocarcinoma,[Not Available],[Not Available],Male,Lung,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],[Not Available],YES,NJ,0:LIVING,20.27,0:DiseaseFree,20.27
524,42432463-8E92-4F25-B72A-F03953527AA5,TCGA-O1-A52J,4/11/13,Lung Adenocarcinoma,NO,YES,Female,Lung,WHITE,NOT HISPANIC OR LATINO,...,[Not Available],[Not Available],[Not Available],[Not Available],NO,O1,1:DECEASED,59.07,1:Recurred/Progressed,29.5


In [175]:
luad_survival_info_df = luad_survival_info_df[["Patient Identifier", "Overall Survival Status", "Overall Survival (Months)"]]

In [176]:
luad_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,NUMBER
2,1,1,1
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-05-4244,0:LIVING,0
...,...,...,...
521,TCGA-NJ-A55O,0:LIVING,0.43
522,TCGA-NJ-A55R,0:LIVING,19.81
523,TCGA-NJ-A7XG,0:LIVING,20.27
524,TCGA-O1-A52J,1:DECEASED,59.07


In [177]:
luad_survival_info_df = luad_survival_info_df.loc[3:]

In [178]:
luad_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-05-4244,0:LIVING,0
5,TCGA-05-4245,0:LIVING,23.98
6,TCGA-05-4249,0:LIVING,50.03
7,TCGA-05-4250,1:DECEASED,3.98
...,...,...,...
521,TCGA-NJ-A55O,0:LIVING,0.43
522,TCGA-NJ-A55R,0:LIVING,19.81
523,TCGA-NJ-A7XG,0:LIVING,20.27
524,TCGA-O1-A52J,1:DECEASED,59.07


In [179]:
luad_survival_info_df = luad_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [180]:
luad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-05-4244,0:LIVING,0
1,TCGA-05-4245,0:LIVING,23.98
2,TCGA-05-4249,0:LIVING,50.03
3,TCGA-05-4250,1:DECEASED,3.98
4,TCGA-05-4382,0:LIVING,19.94
...,...,...,...
517,TCGA-NJ-A55O,0:LIVING,0.43
518,TCGA-NJ-A55R,0:LIVING,19.81
519,TCGA-NJ-A7XG,0:LIVING,20.27
520,TCGA-O1-A52J,1:DECEASED,59.07


In [181]:
luad_survival_info_df = luad_survival_info_df[~luad_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [182]:
luad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-05-4244,0:LIVING,0
1,TCGA-05-4245,0:LIVING,23.98
2,TCGA-05-4249,0:LIVING,50.03
3,TCGA-05-4250,1:DECEASED,3.98
4,TCGA-05-4382,0:LIVING,19.94
...,...,...,...
508,TCGA-NJ-A55O,0:LIVING,0.43
509,TCGA-NJ-A55R,0:LIVING,19.81
510,TCGA-NJ-A7XG,0:LIVING,20.27
511,TCGA-O1-A52J,1:DECEASED,59.07


In [183]:
luad_survival_info_df["OS_STATUS"][luad_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(184,)

In [184]:
luad_survival_info_df["OS_STATUS"][luad_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(329,)

In [185]:
184 + 329

513

In [186]:
luad_survival_info_df["OS_STATUS"] = luad_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [187]:
luad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-05-4244,0,0
1,TCGA-05-4245,0,23.98
2,TCGA-05-4249,0,50.03
3,TCGA-05-4250,1,3.98
4,TCGA-05-4382,0,19.94
...,...,...,...
508,TCGA-NJ-A55O,0,0.43
509,TCGA-NJ-A55R,0,19.81
510,TCGA-NJ-A7XG,0,20.27
511,TCGA-O1-A52J,1,59.07


---

## gene expression data

In [188]:
luad_df = pd.read_csv(data_path + "luad_tcga/data_RNA_Seq_v2_mRNA_median_all_sample_Zscores.txt", sep = '\t')

In [189]:
luad_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-05-4244-01,TCGA-05-4249-01,TCGA-05-4250-01,TCGA-05-4382-01,TCGA-05-4384-01,TCGA-05-4389-01,TCGA-05-4390-01,TCGA-05-4395-01,...,TCGA-NJ-A4YG-01,TCGA-NJ-A4YI-01,TCGA-NJ-A4YP-01,TCGA-NJ-A4YQ-01,TCGA-NJ-A55A-01,TCGA-NJ-A55O-01,TCGA-NJ-A55R-01,TCGA-NJ-A7XG-01,TCGA-O1-A52J-01,TCGA-S2-AA1A-01
0,LOC100130426,100130426,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,...,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883,-2.2883
1,UBE2Q2P3,100133144,0.0380,-0.3514,-0.3435,0.1873,-1.2251,-1.0032,-0.3656,-1.1610,...,0.2277,-1.0944,-1.5393,0.6355,0.9113,0.5729,-0.1679,1.8645,0.4532,0.9225
2,UBE2Q2P3,100134869,0.0691,0.1971,-0.7239,-0.4402,-1.3555,0.3319,-0.4390,0.1641,...,0.1449,-0.1182,-0.3188,-0.4470,0.6294,1.0176,-0.0462,2.7613,1.0870,-0.2293
3,HMGB1P1,10357,-1.9057,-0.2950,-1.9091,-0.5333,-0.8895,-1.1277,0.2097,-1.5319,...,1.3004,0.6957,0.0196,-0.5466,-0.0156,-0.0218,-0.8099,-0.4522,-1.3473,-0.7319
4,TIMM23,10431,-0.0395,0.1945,0.7761,-0.1787,-1.1778,1.2262,0.7946,0.2882,...,0.1800,-2.0916,0.1793,0.4005,-1.3046,0.0408,-0.3206,-0.6611,0.7679,-1.1610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYX,7791,-0.4967,-0.8791,0.8248,1.2704,-0.1648,-0.7019,0.7560,0.6336,...,0.0674,0.5368,0.2105,0.2937,0.3504,0.2930,-0.3319,-1.2127,0.4326,0.7905
20527,FLJ10821,23140,-0.2737,0.3183,0.3805,0.3781,1.2559,-0.8254,-0.5176,-1.6532,...,-0.9521,-0.1136,-0.1859,0.6566,1.0621,0.6698,1.5561,0.8153,0.9605,1.0496
20528,ZZZ3,26009,-0.4482,0.0984,-0.6777,-0.0188,0.2385,0.3594,-0.9222,0.5367,...,-0.7431,-1.4964,0.0259,0.1293,-0.0123,-0.3969,-0.2637,-1.3431,-1.0035,-0.8107
20529,TPTEP1,387590,-1.0407,1.0324,-1.0698,-0.0218,1.5166,-0.7404,-1.1213,-1.4285,...,0.0388,-0.4165,-0.6336,-0.4154,0.4191,1.7447,1.9706,-1.1463,1.0542,1.4044


In [190]:
luad_df_columns = luad_df["Hugo_Symbol"]

In [191]:
luad_df_columns

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [192]:
luad_df = luad_df.drop(["Hugo_Symbol", "Entrez_Gene_Id"], axis = 1).T

In [193]:
luad_df.columns = luad_df_columns

In [194]:
luad_df = luad_df.reset_index().rename(columns = {"index" : "Patient ID"})

In [195]:
luad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-05-4244-01,-2.2883,0.0380,0.0691,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,...,-0.2092,-0.4448,-0.2293,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966
1,TCGA-05-4249-01,-2.2883,-0.3514,0.1971,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,...,0.7816,0.8157,0.4243,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966
2,TCGA-05-4250-01,-2.2883,-0.3435,-0.7239,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,...,0.7041,0.0911,-0.6346,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966
3,TCGA-05-4382-01,-2.2883,0.1873,-0.4402,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,...,-0.1734,-0.0352,-0.2710,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966
4,TCGA-05-4384-01,-2.2883,-1.2251,-1.3555,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,...,1.3474,1.1341,0.7826,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-NJ-A55O-01,-2.2883,0.5729,1.0176,-0.0218,0.0408,,0.6719,0.7543,-1.2501,...,-0.1517,-0.1613,0.2250,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966
513,TCGA-NJ-A55R-01,-2.2883,-0.1679,-0.0462,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,...,0.3282,0.1494,1.1546,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663
514,TCGA-NJ-A7XG-01,-2.2883,1.8645,2.7613,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,...,-2.0722,-1.7646,0.5444,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966
515,TCGA-O1-A52J-01,-2.2883,0.4532,1.0870,-1.3473,0.7679,,0.1428,2.9555,-1.2501,...,2.5448,1.1317,0.7244,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735


In [196]:
luad_df["Patient ID"] = luad_df["Patient ID"].apply(lambda x : x[:-3])

In [197]:
luad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-05-4244,-2.2883,0.0380,0.0691,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,...,-0.2092,-0.4448,-0.2293,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966
1,TCGA-05-4249,-2.2883,-0.3514,0.1971,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,...,0.7816,0.8157,0.4243,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966
2,TCGA-05-4250,-2.2883,-0.3435,-0.7239,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,...,0.7041,0.0911,-0.6346,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966
3,TCGA-05-4382,-2.2883,0.1873,-0.4402,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,...,-0.1734,-0.0352,-0.2710,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966
4,TCGA-05-4384,-2.2883,-1.2251,-1.3555,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,...,1.3474,1.1341,0.7826,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-NJ-A55O,-2.2883,0.5729,1.0176,-0.0218,0.0408,,0.6719,0.7543,-1.2501,...,-0.1517,-0.1613,0.2250,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966
513,TCGA-NJ-A55R,-2.2883,-0.1679,-0.0462,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,...,0.3282,0.1494,1.1546,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663
514,TCGA-NJ-A7XG,-2.2883,1.8645,2.7613,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,...,-2.0722,-1.7646,0.5444,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966
515,TCGA-O1-A52J,-2.2883,0.4532,1.0870,-1.3473,0.7679,,0.1428,2.9555,-1.2501,...,2.5448,1.1317,0.7244,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735


### remove nan column

In [198]:
luad_df.iloc[:, luad_df.columns.isna()]

Hugo_Symbol,NaN
0,0.9733
1,0.3935
2,0.8307
3,0.5365
4,0.2817
...,...
512,0.5067
513,0.4204
514,-0.5186
515,-1.2191


In [199]:
luad_df = luad_df.iloc[:, ~luad_df.columns.isna()]

In [200]:
luad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-05-4244,-2.2883,0.0380,0.0691,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,...,-0.2092,-0.4448,-0.2293,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966
1,TCGA-05-4249,-2.2883,-0.3514,0.1971,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,...,0.7816,0.8157,0.4243,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966
2,TCGA-05-4250,-2.2883,-0.3435,-0.7239,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,...,0.7041,0.0911,-0.6346,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966
3,TCGA-05-4382,-2.2883,0.1873,-0.4402,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,...,-0.1734,-0.0352,-0.2710,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966
4,TCGA-05-4384,-2.2883,-1.2251,-1.3555,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,...,1.3474,1.1341,0.7826,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-NJ-A55O,-2.2883,0.5729,1.0176,-0.0218,0.0408,,0.6719,0.7543,-1.2501,...,-0.1517,-0.1613,0.2250,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966
513,TCGA-NJ-A55R,-2.2883,-0.1679,-0.0462,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,...,0.3282,0.1494,1.1546,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663
514,TCGA-NJ-A7XG,-2.2883,1.8645,2.7613,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,...,-2.0722,-1.7646,0.5444,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966
515,TCGA-O1-A52J,-2.2883,0.4532,1.0870,-1.3473,0.7679,,0.1428,2.9555,-1.2501,...,2.5448,1.1317,0.7244,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735


### check redundant genes

In [201]:
duplicated_genes_luad = luad_df.columns[luad_df.columns.duplicated()]

In [202]:
duplicated_genes_luad, duplicated_genes_luad.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object', name='Hugo_Symbol'),
 15)

In [203]:
luad_df[duplicated_genes_luad]

Hugo_Symbol,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,0.0380,0.0691,0.8393,-0.2076,1.0499,0.1448,-0.5281,-0.0507,0.1083,0.4121,...,-0.5466,-0.9311,-0.4489,0.1163,1.0489,1.3858,-0.2279,-0.7205,-0.4156,-1.8652
1,-0.3514,0.1971,-2.1608,-0.1224,-0.1711,-1.4454,0.6107,0.6147,0.2227,-0.5627,...,-0.3266,-1.3090,0.6799,0.2304,-0.1161,0.9117,0.9888,1.0532,0.4379,-0.5588
2,-0.3435,-0.7239,-2.1608,-1.7550,-0.9605,-0.5302,-2.5804,-2.4011,-0.8030,-2.0498,...,1.1324,-0.8193,-1.2351,-1.2264,-0.4695,-1.0472,0.9490,1.9918,-2.1351,-1.8652
3,0.1873,-0.4402,-0.8182,0.9324,0.2852,0.5464,0.4917,0.3188,0.7330,-0.1046,...,0.4636,0.0130,-0.4765,-0.9528,1.1563,0.0658,0.3129,0.4907,0.5894,0.2868
4,-1.2251,-1.3555,-0.5240,0.6347,0.5575,-0.4249,0.5496,0.7018,-0.0049,0.5876,...,0.5459,-0.6517,-0.7388,-0.6822,0.7372,0.4127,-0.1261,-0.5530,-0.4773,-0.7123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,0.5729,1.0176,1.3123,0.3949,1.1708,0.1947,-2.5804,-2.4011,-0.1986,0.7522,...,0.6420,-0.3673,-1.1950,-1.3188,-0.3392,0.8019,0.3441,0.2914,-2.1351,-1.8652
513,-0.1679,-0.0462,-2.1608,1.0445,0.4464,-0.8050,0.6217,1.0077,0.0968,0.6818,...,0.4202,1.8817,0.1571,0.7827,-1.2096,-1.2372,-0.5801,-1.6960,0.1902,-0.2337
514,1.8645,2.7613,-0.8666,0.2172,0.9368,0.7518,0.3522,0.8083,-0.7043,0.4329,...,-2.5261,-1.9469,0.0406,-0.3012,0.7136,-0.8449,0.9455,1.1508,0.6561,1.0790
515,0.4532,1.0870,-2.1608,-0.7717,0.1589,0.4240,-2.5804,-2.4011,1.1789,0.5875,...,0.4592,-0.2009,-1.1271,-0.7308,1.0879,1.4126,0.1794,0.6040,-2.1351,-1.8652


In [204]:
for gene in duplicated_genes_luad:
    var_gene = np.var(luad_df[gene])
    print(var_gene)
    print(luad_df[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        print(var_gene.values[1])
        print(luad_df[gene].iloc[:, 1])
        luad_df[gene] = luad_df[gene].iloc[:, 1]
    else:
        print(var_gene.values[0])
        print(luad_df[gene].iloc[:, 0])
        luad_df[gene] = luad_df[gene].iloc[:, 0]

Hugo_Symbol
UBE2Q2P3    1.246222
UBE2Q2P3    1.106644
dtype: float64
Hugo_Symbol  UBE2Q2P3  UBE2Q2P3
0              0.0380    0.0691
1             -0.3514    0.1971
2             -0.3435   -0.7239
3              0.1873   -0.4402
4             -1.2251   -1.3555
..                ...       ...
512            0.5729    1.0176
513           -0.1679   -0.0462
514            1.8645    2.7613
515            0.4532    1.0870
516            0.9225   -0.2293

[517 rows x 2 columns]
1.2462216066738252
0      0.0380
1     -0.3514
2     -0.3435
3      0.1873
4     -1.2251
        ...  
512    0.5729
513   -0.1679
514    1.8645
515    0.4532
516    0.9225
Name: UBE2Q2P3, Length: 517, dtype: float64
Hugo_Symbol
CC2D2B    1.718643
CC2D2B    1.135668
dtype: float64
Hugo_Symbol  CC2D2B  CC2D2B
0            0.8393 -0.2076
1           -2.1608 -0.1224
2           -2.1608 -1.7550
3           -0.8182  0.9324
4           -0.5240  0.6347
..              ...     ...
512          1.3123  0.3949
513         -2.16

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  luad_df[gene] = luad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  luad_df[gene] = luad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  luad_df[gene] = luad_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [205]:
luad_df = luad_df.iloc[:, ~luad_df.columns.duplicated()]

In [206]:
luad_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-05-4244,-2.2883,0.0380,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,,...,-0.2092,-0.4448,-0.2293,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966
1,TCGA-05-4249,-2.2883,-0.3514,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,,...,0.7816,0.8157,0.4243,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966
2,TCGA-05-4250,-2.2883,-0.3435,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,,...,0.7041,0.0911,-0.6346,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966
3,TCGA-05-4382,-2.2883,0.1873,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,,...,-0.1734,-0.0352,-0.2710,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966
4,TCGA-05-4384,-2.2883,-1.2251,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,,...,1.3474,1.1341,0.7826,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-NJ-A55O,-2.2883,0.5729,-0.0218,0.0408,,0.6719,0.7543,-1.2501,,...,-0.1517,-0.1613,0.2250,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966
513,TCGA-NJ-A55R,-2.2883,-0.1679,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,,...,0.3282,0.1494,1.1546,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663
514,TCGA-NJ-A7XG,-2.2883,1.8645,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,,...,-2.0722,-1.7646,0.5444,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966
515,TCGA-O1-A52J,-2.2883,0.4532,-1.3473,0.7679,,0.1428,2.9555,-1.2501,,...,2.5448,1.1317,0.7244,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735


---

### sex data

In [207]:
luad_sex_info_df = pd.read_csv(data_path + "luad_tcga/Sex.txt", sep ='\t')

In [208]:
luad_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,luad_tcga,TCGA-05-4244,MALE
1,luad_tcga,TCGA-05-4245,Male
2,luad_tcga,TCGA-05-4249,Male
3,luad_tcga,TCGA-05-4250,Female
4,luad_tcga,TCGA-05-4382,Male
...,...,...,...
517,luad_tcga,TCGA-NJ-A55O,Female
518,luad_tcga,TCGA-NJ-A55R,Male
519,luad_tcga,TCGA-NJ-A7XG,Male
520,luad_tcga,TCGA-O1-A52J,Female


In [209]:
luad_sex_info_df["Sex"] = luad_sex_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [210]:
luad_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,luad_tcga,TCGA-05-4244,1
1,luad_tcga,TCGA-05-4245,1
2,luad_tcga,TCGA-05-4249,1
3,luad_tcga,TCGA-05-4250,0
4,luad_tcga,TCGA-05-4382,1
...,...,...,...
517,luad_tcga,TCGA-NJ-A55O,0
518,luad_tcga,TCGA-NJ-A55R,1
519,luad_tcga,TCGA-NJ-A7XG,1
520,luad_tcga,TCGA-O1-A52J,0


---

### merge gene expression, clincic, and sex data

In [211]:
luad_df_with_sex = pd.merge(luad_df, luad_sex_info_df, how = 'inner', on = 'Patient ID')

In [212]:
luad_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex
0,TCGA-05-4244,-2.2883,0.0380,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,,...,-0.2293,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966,luad_tcga,1
1,TCGA-05-4249,-2.2883,-0.3514,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,,...,0.4243,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966,luad_tcga,1
2,TCGA-05-4250,-2.2883,-0.3435,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,,...,-0.6346,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966,luad_tcga,0
3,TCGA-05-4382,-2.2883,0.1873,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,,...,-0.2710,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966,luad_tcga,1
4,TCGA-05-4384,-2.2883,-1.2251,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,,...,0.7826,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966,luad_tcga,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-NJ-A55O,-2.2883,0.5729,-0.0218,0.0408,,0.6719,0.7543,-1.2501,,...,0.2250,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966,luad_tcga,0
513,TCGA-NJ-A55R,-2.2883,-0.1679,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,,...,1.1546,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663,luad_tcga,1
514,TCGA-NJ-A7XG,-2.2883,1.8645,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,,...,0.5444,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966,luad_tcga,1
515,TCGA-O1-A52J,-2.2883,0.4532,-1.3473,0.7679,,0.1428,2.9555,-1.2501,,...,0.7244,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735,luad_tcga,0


In [213]:
luad_df_with_sex["Sex"].isna().sum()

0

In [214]:
luad_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-05-4244,0,0
1,TCGA-05-4245,0,23.98
2,TCGA-05-4249,0,50.03
3,TCGA-05-4250,1,3.98
4,TCGA-05-4382,0,19.94
...,...,...,...
508,TCGA-NJ-A55O,0,0.43
509,TCGA-NJ-A55R,0,19.81
510,TCGA-NJ-A7XG,0,20.27
511,TCGA-O1-A52J,1,59.07


In [215]:
luad_df_with_sex = pd.merge(luad_df_with_sex, luad_survival_info_df, how = 'inner', on = 'Patient ID')

In [216]:
luad_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-05-4244,-2.2883,0.0380,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,,...,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966,luad_tcga,1,0,0
1,TCGA-05-4249,-2.2883,-0.3514,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,,...,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966,luad_tcga,1,0,50.03
2,TCGA-05-4250,-2.2883,-0.3435,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,,...,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966,luad_tcga,0,1,3.98
3,TCGA-05-4382,-2.2883,0.1873,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,,...,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966,luad_tcga,1,0,19.94
4,TCGA-05-4384,-2.2883,-1.2251,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,,...,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966,luad_tcga,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,TCGA-NJ-A55O,-2.2883,0.5729,-0.0218,0.0408,,0.6719,0.7543,-1.2501,,...,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966,luad_tcga,0,0,0.43
504,TCGA-NJ-A55R,-2.2883,-0.1679,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,,...,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663,luad_tcga,1,0,19.81
505,TCGA-NJ-A7XG,-2.2883,1.8645,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,,...,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966,luad_tcga,1,0,20.27
506,TCGA-O1-A52J,-2.2883,0.4532,-1.3473,0.7679,,0.1428,2.9555,-1.2501,,...,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735,luad_tcga,0,1,59.07


In [217]:
luad_df_with_sex = luad_df_with_sex.drop(["Patient ID", "Study ID"], axis = 1)

In [218]:
luad_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.2883,0.0380,-1.9057,-0.0395,,1.0624,0.5387,-1.2501,,-0.9569,...,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966,1,0,0
1,-2.2883,-0.3514,-0.2950,0.1945,,-0.0690,1.4599,-1.2501,,-0.9569,...,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966,1,0,50.03
2,-2.2883,-0.3435,-1.9091,0.7761,,-1.4074,-2.1796,-1.2501,,-0.9569,...,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966,0,1,3.98
3,-2.2883,0.1873,-0.5333,-0.1787,,0.5870,-0.6958,-0.9868,,1.1884,...,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966,1,0,19.94
4,-2.2883,-1.2251,-0.8895,-1.1778,,0.7614,-0.3706,-1.2501,,-0.9569,...,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,-2.2883,0.5729,-0.0218,0.0408,,0.6719,0.7543,-1.2501,,-0.9569,...,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966,0,0,0.43
504,-2.2883,-0.1679,-0.8099,-0.3206,,1.1540,-0.1200,-1.2501,,0.1089,...,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663,1,0,19.81
505,-2.2883,1.8645,-0.4522,-0.6611,,1.0220,-2.1796,-1.2501,,-0.9569,...,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966,1,0,20.27
506,-2.2883,0.4532,-1.3473,0.7679,,0.1428,2.9555,-1.2501,,0.5042,...,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735,0,1,59.07


In [219]:
luad_df_with_sex = luad_df_with_sex.iloc[:, ~(luad_df_with_sex.isna().sum() / luad_df_with_sex.shape[0] > 0.8).values]

In [220]:
luad_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,LOC155060,RNU12-2P,SSX9,EZHIP,EFCAB8,SRP14P1,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.2883,0.0380,-1.9057,-0.0395,1.0624,0.5387,-1.2501,-0.9569,1.3884,0.1027,...,-1.8833,0.9850,-0.4967,-0.2737,-0.4482,-1.0407,-1.2966,1,0,0
1,-2.2883,-0.3514,-0.2950,0.1945,-0.0690,1.4599,-1.2501,-0.9569,-0.2834,-0.0119,...,-0.1788,-0.1119,-0.8791,0.3183,0.0984,1.0324,-1.2966,1,0,50.03
2,-2.2883,-0.3435,-1.9091,0.7761,-1.4074,-2.1796,-1.2501,-0.9569,-1.6633,-0.0611,...,-0.0054,-1.8251,0.8248,0.3805,-0.6777,-1.0698,-1.2966,0,1,3.98
3,-2.2883,0.1873,-0.5333,-0.1787,0.5870,-0.6958,-0.9868,1.1884,-0.1789,-0.2600,...,0.5825,0.3717,1.2704,0.3781,-0.0188,-0.0218,-1.2966,1,0,19.94
4,-2.2883,-1.2251,-0.8895,-1.1778,0.7614,-0.3706,-1.2501,-0.9569,-0.3323,0.2313,...,-0.7635,-0.1493,-0.1648,1.2559,0.2385,1.5166,-1.2966,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,-2.2883,0.5729,-0.0218,0.0408,0.6719,0.7543,-1.2501,-0.9569,-1.6633,-0.1497,...,-0.2732,-0.3860,0.2930,0.6698,-0.3969,1.7447,-1.2966,0,0,0.43
504,-2.2883,-0.1679,-0.8099,-0.3206,1.1540,-0.1200,-1.2501,0.1089,2.9233,-1.7395,...,0.4441,-0.2421,-0.3319,1.5561,-0.2637,1.9706,0.5663,1,0,19.81
505,-2.2883,1.8645,-0.4522,-0.6611,1.0220,-2.1796,-1.2501,-0.9569,0.7316,-1.6127,...,-0.6192,-1.4443,-1.2127,0.8153,-1.3431,-1.1463,-1.2966,1,0,20.27
506,-2.2883,0.4532,-1.3473,0.7679,0.1428,2.9555,-1.2501,0.5042,-0.1792,-0.1152,...,-1.9766,-0.1344,0.4326,0.9605,-1.0035,1.0542,-0.3735,0,1,59.07


In [221]:
luad_df_with_sex.isna().sum().sum()

0

### Remove non pathway-genes from LUAD

In [222]:
luad_df_with_sex.columns.values[:-3], luad_df_with_sex.columns.values[:-3].size

(array(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', ..., 'ZZZ3', 'TPTEP1',
        'AKR1C6P'], dtype=object),
 20097)

In [223]:
np.unique(luad_df_with_sex.columns), np.unique(luad_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 20100)

In [224]:
removable_gene_luad = np.setdiff1d(luad_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

In [225]:
removable_gene_luad, removable_gene_luad.size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15732)

In [226]:
np.unique(removable_gene_luad), np.unique(removable_gene_luad).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15732)

In [227]:
luad_df_with_sex = luad_df_with_sex.drop(removable_gene_luad, axis = 1)

In [228]:
luad_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-1.9057,-0.7328,-0.0599,-0.4275,-1.9441,0.3717,-1.9669,1.2450,-0.4371,0.5023,...,0.4099,0.0758,0.7671,1.9043,0.2518,1.0164,-0.4967,1,0,0
1,-0.2950,0.4726,0.4150,0.9225,-1.1738,-0.9029,0.3265,-0.0974,-0.0565,0.6207,...,0.6071,0.6687,0.0146,-1.1770,0.8868,0.7726,-0.8791,1,0,50.03
2,-1.9091,-0.0511,-0.1980,0.0388,-0.5402,-0.2299,0.0953,-0.0171,-0.8013,-1.2674,...,-0.5641,-0.6107,0.7475,0.4486,-0.1294,-0.7576,0.8248,0,1,3.98
3,-0.5333,0.3503,0.5821,-0.5648,-0.4509,-0.2470,-0.4774,-0.1876,-0.6723,-0.1605,...,-0.2524,0.4919,-1.4625,-0.8091,0.4065,0.5852,1.2704,1,0,19.94
4,-0.8895,1.2646,-0.3292,0.2626,-1.9441,-0.2941,0.3080,0.1383,1.1152,1.0806,...,1.4413,0.2123,-1.7023,0.6045,0.8609,0.2940,-0.1648,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,-0.0218,0.2995,-0.4215,-0.4627,-1.9441,0.2866,-0.6996,-0.0004,0.1431,-0.4726,...,-0.1856,1.0863,-1.7023,-0.8183,-1.2910,1.2719,0.2930,0,0,0.43
504,-0.8099,0.5823,2.7798,-0.0093,-0.0856,0.8445,-0.3696,0.7827,-0.1377,2.6733,...,-0.4354,0.5556,-0.2406,-0.2837,0.5286,0.9093,-0.3319,1,0,19.81
505,-0.4522,-1.7762,-0.7533,1.4659,-0.1499,0.4272,0.0790,-1.2886,1.3519,0.4950,...,-0.3309,-0.4957,-1.3037,0.8470,-1.2156,0.6139,-1.2127,1,0,20.27
506,-1.3473,0.4958,0.5675,0.9068,2.6611,0.4739,-0.6699,0.4860,0.9761,3.1634,...,0.7868,0.4064,-1.4625,1.3708,-0.6834,-0.2513,0.4326,0,1,59.07


In [229]:
20097 - 15732

4365

In [230]:
np.intersect1d(luad_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [231]:
np.intersect1d(luad_df_with_sex.columns.values[:-3], unique_gene_in_pathway).size

4365

In [233]:
luad_df_with_sex.to_csv("TCGA_LUAD_gene_expression_data.csv", index = False, header = True)

---

## Pathway Mask

In [234]:
luad_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-1.9057,-0.7328,-0.0599,-0.4275,-1.9441,0.3717,-1.9669,1.2450,-0.4371,0.5023,...,0.4099,0.0758,0.7671,1.9043,0.2518,1.0164,-0.4967,1,0,0
1,-0.2950,0.4726,0.4150,0.9225,-1.1738,-0.9029,0.3265,-0.0974,-0.0565,0.6207,...,0.6071,0.6687,0.0146,-1.1770,0.8868,0.7726,-0.8791,1,0,50.03
2,-1.9091,-0.0511,-0.1980,0.0388,-0.5402,-0.2299,0.0953,-0.0171,-0.8013,-1.2674,...,-0.5641,-0.6107,0.7475,0.4486,-0.1294,-0.7576,0.8248,0,1,3.98
3,-0.5333,0.3503,0.5821,-0.5648,-0.4509,-0.2470,-0.4774,-0.1876,-0.6723,-0.1605,...,-0.2524,0.4919,-1.4625,-0.8091,0.4065,0.5852,1.2704,1,0,19.94
4,-0.8895,1.2646,-0.3292,0.2626,-1.9441,-0.2941,0.3080,0.1383,1.1152,1.0806,...,1.4413,0.2123,-1.7023,0.6045,0.8609,0.2940,-0.1648,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,-0.0218,0.2995,-0.4215,-0.4627,-1.9441,0.2866,-0.6996,-0.0004,0.1431,-0.4726,...,-0.1856,1.0863,-1.7023,-0.8183,-1.2910,1.2719,0.2930,0,0,0.43
504,-0.8099,0.5823,2.7798,-0.0093,-0.0856,0.8445,-0.3696,0.7827,-0.1377,2.6733,...,-0.4354,0.5556,-0.2406,-0.2837,0.5286,0.9093,-0.3319,1,0,19.81
505,-0.4522,-1.7762,-0.7533,1.4659,-0.1499,0.4272,0.0790,-1.2886,1.3519,0.4950,...,-0.3309,-0.4957,-1.3037,0.8470,-1.2156,0.6139,-1.2127,1,0,20.27
506,-1.3473,0.4958,0.5675,0.9068,2.6611,0.4739,-0.6699,0.4860,0.9761,3.1634,...,0.7868,0.4064,-1.4625,1.3708,-0.6834,-0.2513,0.4326,0,1,59.07


In [235]:
luad_df_with_sex.columns.values[:-3], luad_df_with_sex.columns.values[:-3].size

(array(['HMGB1P1', 'A2M', 'AACS', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
       dtype=object),
 4365)

In [236]:
gene_name = luad_df_with_sex.columns.values[:-3]

In [237]:
gene_name.size, np.unique(gene_name).size

(4365, 4365)

In [238]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [239]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4365)

In [240]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [241]:
pathway_sparse_mat.sum().sum()

0.0

In [242]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4365))

In [243]:
pathway_sparse_mat.sum()

10879.0

---

In [244]:
data_type = "LUAD"

In [245]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"TCGA_{data_type}_Pathway_Mask.npz", pathway_sparse_mat)

---

# TCGA - LUSC

## clinical data

In [246]:
lusc_survival_info_df = pd.read_csv(data_path + "lusc_tcga/data_clinical_patient.txt", sep = '\t')

In [247]:
lusc_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,Neoplasm Histologic Type Name,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Sex,Race Category,Ethnicity Category,Primary Tumor Laterality,...,Lymph node location positive pathology name,Project code,Stage Other,Adjuvant Postoperative Targeted Therapy Administered Indicator,Tissue Source Site,Tumor Tissue Site,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,Text term for the structural pattern of cancer...,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,Sex,The text for reporting information about race.,The text for reporting information about ethni...,"For tumors in paired organs, designates the si...",...,Lymph node location positive pathology name,Project code,Stage Other,Text term to signify postoperative adjuvant ca...,"A Tissue Source Site collects samples (tissue,...",Text term that describes the anatomic site of ...,Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,HISTOLOGICAL_DIAGNOSIS,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,RACE,ETHNICITY,LATERALITY,...,POS_LYMPH_NODE_LOCATION,PROJECT_CODE,STAGE_OTHER,TARGETED_MOLECULAR_THERAPY,TISSUE_SOURCE_SITE,SITE_OF_TUMOR_TISSUE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,95b83006-02c9-4c4d-bf84-a45115f7d86d,TCGA-18-3406,3/9/11,Lung Squamous Cell Carcinoma,NO,YES,Male,WHITE,NOT HISPANIC OR LATINO,[Not Available],...,[Not Available],[Not Available],[Not Available],[Not Available],18,Lung,1:DECEASED,12.19,1:Recurred/Progressed,11.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,E80A4625-1B5A-4ABC-8AB9-978D1488B961,TCGA-O2-A52S,4/11/13,Lung Squamous Cell Carcinoma,NO,YES,Female,WHITE,NOT HISPANIC OR LATINO,[Not Available],...,[Not Available],[Not Available],[Not Available],NO,O2,Lung,1:DECEASED,12.71,1:Recurred/Progressed,8.08
504,6A2CE004-8A2D-419F-9738-E352D298A315,TCGA-O2-A52V,5/19/13,Lung Squamous Cell Carcinoma,NO,YES,Female,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,[Not Available],...,[Not Available],[Not Available],[Not Available],NO,O2,Lung,1:DECEASED,43.86,1:Recurred/Progressed,22.67
505,1A49A131-D714-40D7-85B6-C4822C0C6264,TCGA-O2-A52W,4/11/13,Lung Squamous Cell Carcinoma,NO,YES,Male,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,[Not Available],...,[Not Available],[Not Available],[Not Available],NO,O2,Lung,1:DECEASED,8.57,[Not Available],[Not Available]
506,8BE10CE9-220D-4816-A575-8E8E7F041114,TCGA-O2-A5IB,5/19/13,Lung Squamous Cell Carcinoma,NO,YES,Female,WHITE,NOT HISPANIC OR LATINO,[Not Available],...,[Not Available],[Not Available],[Not Available],NO,O2,Lung,1:DECEASED,11.17,1:Recurred/Progressed,8.31


In [248]:
lusc_survival_info_df = lusc_survival_info_df[["Patient Identifier", "Overall Survival Status", "Overall Survival (Months)"]]

In [249]:
lusc_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,NUMBER
2,1,1,1
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-18-3406,1:DECEASED,12.19
...,...,...,...
503,TCGA-O2-A52S,1:DECEASED,12.71
504,TCGA-O2-A52V,1:DECEASED,43.86
505,TCGA-O2-A52W,1:DECEASED,8.57
506,TCGA-O2-A5IB,1:DECEASED,11.17


In [250]:
lusc_survival_info_df = lusc_survival_info_df.loc[3:]

In [251]:
lusc_survival_info_df

Unnamed: 0,Patient Identifier,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,OS_STATUS,OS_MONTHS
4,TCGA-18-3406,1:DECEASED,12.19
5,TCGA-18-3407,1:DECEASED,4.47
6,TCGA-18-3408,1:DECEASED,75.69
7,TCGA-18-3409,0:LIVING,123.09
...,...,...,...
503,TCGA-O2-A52S,1:DECEASED,12.71
504,TCGA-O2-A52V,1:DECEASED,43.86
505,TCGA-O2-A52W,1:DECEASED,8.57
506,TCGA-O2-A5IB,1:DECEASED,11.17


In [252]:
lusc_survival_info_df = lusc_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [253]:
lusc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-18-3406,1:DECEASED,12.19
1,TCGA-18-3407,1:DECEASED,4.47
2,TCGA-18-3408,1:DECEASED,75.69
3,TCGA-18-3409,0:LIVING,123.09
4,TCGA-18-3410,1:DECEASED,4.8
...,...,...,...
499,TCGA-O2-A52S,1:DECEASED,12.71
500,TCGA-O2-A52V,1:DECEASED,43.86
501,TCGA-O2-A52W,1:DECEASED,8.57
502,TCGA-O2-A5IB,1:DECEASED,11.17


In [254]:
lusc_survival_info_df = lusc_survival_info_df[~lusc_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [255]:
lusc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-18-3406,1:DECEASED,12.19
1,TCGA-18-3407,1:DECEASED,4.47
2,TCGA-18-3408,1:DECEASED,75.69
3,TCGA-18-3409,0:LIVING,123.09
4,TCGA-18-3410,1:DECEASED,4.8
...,...,...,...
493,TCGA-O2-A52S,1:DECEASED,12.71
494,TCGA-O2-A52V,1:DECEASED,43.86
495,TCGA-O2-A52W,1:DECEASED,8.57
496,TCGA-O2-A5IB,1:DECEASED,11.17


In [256]:
lusc_survival_info_df["OS_STATUS"][lusc_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(215,)

In [257]:
lusc_survival_info_df["OS_STATUS"][lusc_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(283,)

In [258]:
215 + 283

498

In [259]:
lusc_survival_info_df["OS_STATUS"] = lusc_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [260]:
lusc_survival_info_df

Unnamed: 0,Patient ID,OS_STATUS,OS_MONTHS
0,TCGA-18-3406,1,12.19
1,TCGA-18-3407,1,4.47
2,TCGA-18-3408,1,75.69
3,TCGA-18-3409,0,123.09
4,TCGA-18-3410,1,4.8
...,...,...,...
493,TCGA-O2-A52S,1,12.71
494,TCGA-O2-A52V,1,43.86
495,TCGA-O2-A52W,1,8.57
496,TCGA-O2-A5IB,1,11.17


---

## gene expression data

In [261]:
lusc_df = pd.read_csv(data_path + "lusc_tcga/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep = '\t')

In [262]:
lusc_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3409-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,...,TCGA-NK-A5CX-01,TCGA-NK-A5D1-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,LOC100130426,100130426,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,...,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669,-4.8669
1,UBE2Q2P3,100133144,1.8381,-0.1497,-0.7525,-1.9600,0.0195,-0.2745,1.2762,-0.3315,...,1.0768,1.0530,0.2849,0.7271,-1.4785,0.2314,0.7217,1.4227,1.7917,1.2017
2,UBE2Q2P3,100134869,0.7733,-0.8979,-1.7401,-0.8701,0.4233,0.0199,0.5100,0.7883,...,0.2622,0.5634,0.1394,-1.4982,-1.6511,1.6105,0.0976,1.1415,1.4597,0.6750
3,HMGB1P1,10357,1.4899,0.5958,1.2436,0.1109,0.6288,1.4684,2.3784,0.6934,...,-0.2606,1.0269,-0.4230,-0.4644,-0.0644,0.2982,0.4682,1.3014,0.0438,-1.8559
4,TIMM23,10431,1.2254,0.0224,1.1598,-0.0455,0.5015,0.4147,-0.2948,0.1901,...,-0.0715,-0.5728,2.3142,-1.2397,0.3754,0.5761,0.7089,-0.5316,-1.9878,-0.4753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYX,7791,-1.5127,0.1076,-4.1585,0.1869,-0.5724,-0.7101,-0.5911,0.2225,...,0.5227,-0.8450,-1.6918,1.2123,1.6464,-0.8226,0.8601,-0.9170,-2.4961,0.5752
20527,FLJ10821,23140,-2.6774,-0.5487,-0.9323,-0.2452,-0.4479,-2.2587,-1.4435,-1.8266,...,-0.0922,0.0397,-0.3541,-0.3689,-0.7372,0.3708,-0.6727,-0.4644,-0.1755,1.1268
20528,ZZZ3,26009,-0.0367,0.7938,0.0432,0.9468,0.5313,0.8125,-0.6525,0.2861,...,-0.7462,0.5022,-1.7251,-0.7472,0.3204,0.1480,0.1300,0.0616,1.9430,-0.4403
20529,TPTEP1,387590,-0.7324,-0.4687,-0.8272,0.3779,-0.8654,-0.5865,1.9244,1.9068,...,-0.8328,1.0957,-1.1411,1.7581,-0.3768,-0.8781,-1.3218,-0.4238,-1.5718,1.7542


In [263]:
lusc_df_columns = lusc_df["Hugo_Symbol"]

In [264]:
lusc_df_columns

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [265]:
lusc_df = lusc_df.drop(["Hugo_Symbol", "Entrez_Gene_Id"], axis = 1).T

In [266]:
lusc_df.columns = lusc_df_columns

In [267]:
lusc_df = lusc_df.reset_index().rename(columns = {"index" : "Patient ID"})

In [268]:
lusc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-18-3406-01,-4.8669,1.8381,0.7733,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,...,1.1436,0.5836,-2.9833,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332
1,TCGA-18-3407-01,-4.8669,-0.1497,-0.8979,0.5958,0.0224,,1.0199,-1.9720,-1.2958,...,-0.0999,-0.1863,-0.1186,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623
2,TCGA-18-3408-01,-4.8669,-0.7525,-1.7401,1.2436,1.1598,,-1.3415,1.5868,-1.2958,...,0.2821,0.3986,-0.8985,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546
3,TCGA-18-3409-01,-4.8669,-1.9600,-0.8701,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,...,1.4444,1.7315,-0.5215,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332
4,TCGA-18-3410-01,-4.8669,0.0195,0.4233,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,...,-0.2171,-0.8273,-0.6553,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,TCGA-O2-A52S-01,-4.8669,0.2314,1.6105,0.2982,0.5761,,0.6465,1.4279,-1.2958,...,0.8256,1.2019,0.1243,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563
497,TCGA-O2-A52V-01,-4.8669,0.7217,0.0976,0.4682,0.7089,,0.0837,-0.7508,-1.2958,...,0.3632,0.3226,-0.0041,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332
498,TCGA-O2-A52W-01,-4.8669,1.4227,1.1415,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,...,0.0150,0.0921,1.0822,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332
499,TCGA-O2-A5IB-01,-4.8669,1.7917,1.4597,0.0438,-1.9878,,1.8530,0.9744,-1.2958,...,1.5595,1.5834,1.1752,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332


In [269]:
lusc_df["Patient ID"] = lusc_df["Patient ID"].apply(lambda x : x[:-3])

In [270]:
lusc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-18-3406,-4.8669,1.8381,0.7733,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,...,1.1436,0.5836,-2.9833,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332
1,TCGA-18-3407,-4.8669,-0.1497,-0.8979,0.5958,0.0224,,1.0199,-1.9720,-1.2958,...,-0.0999,-0.1863,-0.1186,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623
2,TCGA-18-3408,-4.8669,-0.7525,-1.7401,1.2436,1.1598,,-1.3415,1.5868,-1.2958,...,0.2821,0.3986,-0.8985,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546
3,TCGA-18-3409,-4.8669,-1.9600,-0.8701,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,...,1.4444,1.7315,-0.5215,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332
4,TCGA-18-3410,-4.8669,0.0195,0.4233,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,...,-0.2171,-0.8273,-0.6553,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,TCGA-O2-A52S,-4.8669,0.2314,1.6105,0.2982,0.5761,,0.6465,1.4279,-1.2958,...,0.8256,1.2019,0.1243,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563
497,TCGA-O2-A52V,-4.8669,0.7217,0.0976,0.4682,0.7089,,0.0837,-0.7508,-1.2958,...,0.3632,0.3226,-0.0041,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332
498,TCGA-O2-A52W,-4.8669,1.4227,1.1415,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,...,0.0150,0.0921,1.0822,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332
499,TCGA-O2-A5IB,-4.8669,1.7917,1.4597,0.0438,-1.9878,,1.8530,0.9744,-1.2958,...,1.5595,1.5834,1.1752,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332


### remove nan column

In [271]:
lusc_df.iloc[:, lusc_df.columns.isna()]

Hugo_Symbol,NaN
0,0.5429
1,-0.5327
2,0.3373
3,-0.8741
4,1.0070
...,...
496,0.6913
497,0.5035
498,-0.4389
499,1.2603


In [272]:
lusc_df = lusc_df.iloc[:, ~lusc_df.columns.isna()]

In [273]:
lusc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-18-3406,-4.8669,1.8381,0.7733,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,...,1.1436,0.5836,-2.9833,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332
1,TCGA-18-3407,-4.8669,-0.1497,-0.8979,0.5958,0.0224,,1.0199,-1.9720,-1.2958,...,-0.0999,-0.1863,-0.1186,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623
2,TCGA-18-3408,-4.8669,-0.7525,-1.7401,1.2436,1.1598,,-1.3415,1.5868,-1.2958,...,0.2821,0.3986,-0.8985,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546
3,TCGA-18-3409,-4.8669,-1.9600,-0.8701,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,...,1.4444,1.7315,-0.5215,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332
4,TCGA-18-3410,-4.8669,0.0195,0.4233,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,...,-0.2171,-0.8273,-0.6553,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,TCGA-O2-A52S,-4.8669,0.2314,1.6105,0.2982,0.5761,,0.6465,1.4279,-1.2958,...,0.8256,1.2019,0.1243,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563
497,TCGA-O2-A52V,-4.8669,0.7217,0.0976,0.4682,0.7089,,0.0837,-0.7508,-1.2958,...,0.3632,0.3226,-0.0041,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332
498,TCGA-O2-A52W,-4.8669,1.4227,1.1415,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,...,0.0150,0.0921,1.0822,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332
499,TCGA-O2-A5IB,-4.8669,1.7917,1.4597,0.0438,-1.9878,,1.8530,0.9744,-1.2958,...,1.5595,1.5834,1.1752,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332


### check redundant genes

In [274]:
lusc_df.columns

Index(['Patient ID', 'LOC100130426', 'UBE2Q2P3', 'UBE2Q2P3', 'HMGB1P1',
       'TIMM23', 'MOXD2', 'LOC155060', 'RNU12-2P', 'SSX9',
       ...
       'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3',
       'TPTEP1', 'AKR1C6P'],
      dtype='object', name='Hugo_Symbol', length=20531)

In [275]:
duplicated_genes_lusc = lusc_df.columns[lusc_df.columns.duplicated()]

In [276]:
duplicated_genes_lusc, duplicated_genes_lusc.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object', name='Hugo_Symbol'),
 15)

In [277]:
lusc_df[duplicated_genes_lusc]

Hugo_Symbol,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,1.8381,0.7733,-0.2066,1.1206,0.4199,1.4826,0.5928,0.0590,0.0685,-2.5687,...,-0.1327,-0.3618,0.2619,0.7760,-0.9272,-0.9965,-1.8098,-2.1165,-0.3336,-1.5344
1,-0.1497,-0.8979,-0.7348,0.3800,-1.4092,1.3364,0.4656,0.6128,-0.0340,-0.1502,...,0.4101,0.2272,-0.2353,0.0757,-0.2042,0.4746,0.6911,-0.3314,-0.0537,-0.2075
2,-0.7525,-1.7401,-1.8074,-1.4351,1.0304,-0.5122,-1.9075,-2.1686,-1.3716,-2.5317,...,0.2104,0.8148,-1.1873,0.2645,-0.1969,-1.0054,-1.8268,-3.3165,-1.7906,-1.5344
3,-1.9600,-0.8701,-0.6663,-0.5895,-0.1934,0.1229,0.7020,0.2968,-0.6068,-0.8230,...,0.6872,0.8995,-1.2366,-0.4564,-1.2655,-0.3923,-2.4891,-2.7913,0.8687,-0.6555
4,0.0195,0.4233,-1.8074,-0.3049,0.6378,0.4886,1.3882,1.0634,-0.4035,-0.0912,...,0.1851,-0.4655,-0.1101,0.3944,-1.7580,-1.3565,-0.1310,-1.3956,1.9991,1.6414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.2314,1.6105,-0.8527,0.1888,-1.4808,-0.8905,-2.6540,-2.7103,0.5345,0.2133,...,0.0254,0.6361,-1.3010,-1.3292,-0.5200,-0.1310,-1.1856,-0.9651,-1.7906,-1.5344
497,0.7217,0.0976,-0.8405,-0.2261,-0.6075,0.5693,-2.6540,-2.7103,1.0045,0.1901,...,0.1767,-0.1433,0.7622,0.0137,0.2870,-0.0700,0.4081,-0.1717,-1.7906,-1.5344
498,1.4227,1.1415,-0.6221,0.0879,-0.0326,1.4674,-0.5876,0.0121,-0.1944,0.7763,...,-0.7859,0.2775,0.7149,1.2631,-0.3125,-1.5949,0.4712,0.5391,-0.6001,-1.5344
499,1.7917,1.4597,-1.8074,-0.0611,3.8000,0.3890,-2.6540,-2.7103,-1.0509,1.6253,...,0.3823,1.3168,1.5739,1.4555,-0.6177,0.1608,-1.6141,-1.5805,-1.7906,-1.5344


In [278]:
for gene in duplicated_genes_lusc:
    var_gene = np.var(lusc_df[gene])
    print(var_gene)
    print(lusc_df[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        print(var_gene.values[1])
        print(lusc_df[gene].iloc[:, 1])
        lusc_df[gene] = lusc_df[gene].iloc[:, 1]
    else:
        print(var_gene.values[0])
        print(lusc_df[gene].iloc[:, 0])
        lusc_df[gene] = lusc_df[gene].iloc[:, 0]

Hugo_Symbol
UBE2Q2P3    1.335605
UBE2Q2P3    1.132913
dtype: float64
Hugo_Symbol  UBE2Q2P3  UBE2Q2P3
0              1.8381    0.7733
1             -0.1497   -0.8979
2             -0.7525   -1.7401
3             -1.9600   -0.8701
4              0.0195    0.4233
..                ...       ...
496            0.2314    1.6105
497            0.7217    0.0976
498            1.4227    1.1415
499            1.7917    1.4597
500            1.2017    0.6750

[501 rows x 2 columns]
1.33560460938562
0      1.8381
1     -0.1497
2     -0.7525
3     -1.9600
4      0.0195
        ...  
496    0.2314
497    0.7217
498    1.4227
499    1.7917
500    1.2017
Name: UBE2Q2P3, Length: 501, dtype: float64
Hugo_Symbol
CC2D2B    1.393202
CC2D2B    1.261126
dtype: float64
Hugo_Symbol  CC2D2B  CC2D2B
0           -0.2066  1.1206
1           -0.7348  0.3800
2           -1.8074 -1.4351
3           -0.6663 -0.5895
4           -1.8074 -0.3049
..              ...     ...
496         -0.8527  0.1888
497         -0.8405

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lusc_df[gene] = lusc_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lusc_df[gene] = lusc_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lusc_df[gene] = lusc_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [279]:
lusc_df = lusc_df.iloc[:, ~lusc_df.columns.duplicated()]

In [280]:
lusc_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-18-3406,-4.8669,1.8381,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,,...,1.1436,0.5836,-2.9833,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332
1,TCGA-18-3407,-4.8669,-0.1497,0.5958,0.0224,,1.0199,-1.9720,-1.2958,,...,-0.0999,-0.1863,-0.1186,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623
2,TCGA-18-3408,-4.8669,-0.7525,1.2436,1.1598,,-1.3415,1.5868,-1.2958,,...,0.2821,0.3986,-0.8985,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546
3,TCGA-18-3409,-4.8669,-1.9600,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,,...,1.4444,1.7315,-0.5215,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332
4,TCGA-18-3410,-4.8669,0.0195,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,,...,-0.2171,-0.8273,-0.6553,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,TCGA-O2-A52S,-4.8669,0.2314,0.2982,0.5761,,0.6465,1.4279,-1.2958,,...,0.8256,1.2019,0.1243,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563
497,TCGA-O2-A52V,-4.8669,0.7217,0.4682,0.7089,,0.0837,-0.7508,-1.2958,,...,0.3632,0.3226,-0.0041,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332
498,TCGA-O2-A52W,-4.8669,1.4227,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,,...,0.0150,0.0921,1.0822,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332
499,TCGA-O2-A5IB,-4.8669,1.7917,0.0438,-1.9878,,1.8530,0.9744,-1.2958,,...,1.5595,1.5834,1.1752,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332


---

### sex data

In [281]:
lusc_sex_info_df = pd.read_csv(data_path + "lusc_tcga/Sex.txt", sep ='\t')

In [282]:
lusc_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,lusc_tcga,TCGA-18-3406,Male
1,lusc_tcga,TCGA-18-3407,Male
2,lusc_tcga,TCGA-18-3408,Female
3,lusc_tcga,TCGA-18-3409,Male
4,lusc_tcga,TCGA-18-3410,Male
...,...,...,...
499,lusc_tcga,TCGA-O2-A52S,Female
500,lusc_tcga,TCGA-O2-A52V,Female
501,lusc_tcga,TCGA-O2-A52W,Male
502,lusc_tcga,TCGA-O2-A5IB,Female


In [283]:
lusc_sex_info_df["Sex"] = lusc_sex_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [284]:
lusc_sex_info_df

Unnamed: 0,Study ID,Patient ID,Sex
0,lusc_tcga,TCGA-18-3406,1
1,lusc_tcga,TCGA-18-3407,1
2,lusc_tcga,TCGA-18-3408,0
3,lusc_tcga,TCGA-18-3409,1
4,lusc_tcga,TCGA-18-3410,1
...,...,...,...
499,lusc_tcga,TCGA-O2-A52S,0
500,lusc_tcga,TCGA-O2-A52V,0
501,lusc_tcga,TCGA-O2-A52W,1
502,lusc_tcga,TCGA-O2-A5IB,0


---

### merge gene expression, clincic, and sex data

In [285]:
lusc_df_with_sex = pd.merge(lusc_df, lusc_sex_info_df, how = 'inner', on = 'Patient ID')

In [286]:
lusc_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex
0,TCGA-18-3406,-4.8669,1.8381,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,,...,-2.9833,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332,lusc_tcga,1
1,TCGA-18-3407,-4.8669,-0.1497,0.5958,0.0224,,1.0199,-1.9720,-1.2958,,...,-0.1186,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623,lusc_tcga,1
2,TCGA-18-3408,-4.8669,-0.7525,1.2436,1.1598,,-1.3415,1.5868,-1.2958,,...,-0.8985,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546,lusc_tcga,0
3,TCGA-18-3409,-4.8669,-1.9600,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,,...,-0.5215,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332,lusc_tcga,1
4,TCGA-18-3410,-4.8669,0.0195,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,,...,-0.6553,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092,lusc_tcga,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,TCGA-O2-A52S,-4.8669,0.2314,0.2982,0.5761,,0.6465,1.4279,-1.2958,,...,0.1243,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563,lusc_tcga,0
497,TCGA-O2-A52V,-4.8669,0.7217,0.4682,0.7089,,0.0837,-0.7508,-1.2958,,...,-0.0041,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332,lusc_tcga,0
498,TCGA-O2-A52W,-4.8669,1.4227,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,,...,1.0822,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332,lusc_tcga,1
499,TCGA-O2-A5IB,-4.8669,1.7917,0.0438,-1.9878,,1.8530,0.9744,-1.2958,,...,1.1752,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332,lusc_tcga,0


In [287]:
lusc_df_with_sex = pd.merge(lusc_df_with_sex, lusc_survival_info_df, how = 'inner', on = 'Patient ID')

In [288]:
lusc_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Study ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-18-3406,-4.8669,1.8381,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,,...,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332,lusc_tcga,1,1,12.19
1,TCGA-18-3407,-4.8669,-0.1497,0.5958,0.0224,,1.0199,-1.9720,-1.2958,,...,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623,lusc_tcga,1,1,4.47
2,TCGA-18-3408,-4.8669,-0.7525,1.2436,1.1598,,-1.3415,1.5868,-1.2958,,...,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546,lusc_tcga,0,1,75.69
3,TCGA-18-3409,-4.8669,-1.9600,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,,...,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332,lusc_tcga,1,0,123.09
4,TCGA-18-3410,-4.8669,0.0195,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,,...,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092,lusc_tcga,1,1,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,TCGA-O2-A52S,-4.8669,0.2314,0.2982,0.5761,,0.6465,1.4279,-1.2958,,...,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563,lusc_tcga,0,1,12.71
491,TCGA-O2-A52V,-4.8669,0.7217,0.4682,0.7089,,0.0837,-0.7508,-1.2958,,...,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332,lusc_tcga,0,1,43.86
492,TCGA-O2-A52W,-4.8669,1.4227,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,,...,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332,lusc_tcga,1,1,8.57
493,TCGA-O2-A5IB,-4.8669,1.7917,0.0438,-1.9878,,1.8530,0.9744,-1.2958,,...,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332,lusc_tcga,0,1,11.17


In [289]:
lusc_df_with_sex["Sex"].isna().sum()

0

In [290]:
lusc_df_with_sex = lusc_df_with_sex.drop(["Patient ID", "Study ID"], axis = 1)

In [291]:
lusc_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-4.8669,1.8381,1.4899,1.2254,,-2.2792,-1.9720,-1.2958,,-0.9506,...,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332,1,1,12.19
1,-4.8669,-0.1497,0.5958,0.0224,,1.0199,-1.9720,-1.2958,,-0.9506,...,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623,1,1,4.47
2,-4.8669,-0.7525,1.2436,1.1598,,-1.3415,1.5868,-1.2958,,-0.9506,...,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546,0,1,75.69
3,-4.8669,-1.9600,0.1109,-0.0455,,-0.5129,0.4852,-1.2958,,-0.2293,...,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332,1,0,123.09
4,-4.8669,0.0195,0.6288,0.5015,,-0.1756,-1.9720,-1.2958,,-0.1663,...,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092,1,1,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,-4.8669,0.2314,0.2982,0.5761,,0.6465,1.4279,-1.2958,,-0.5606,...,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563,0,1,12.71
491,-4.8669,0.7217,0.4682,0.7089,,0.0837,-0.7508,-1.2958,,0.1157,...,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332,0,1,43.86
492,-4.8669,1.4227,1.3014,-0.5316,,2.3862,-1.9720,-1.2958,,-0.6725,...,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332,1,1,8.57
493,-4.8669,1.7917,0.0438,-1.9878,,1.8530,0.9744,-1.2958,,-0.5710,...,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332,0,1,11.17


In [292]:
lusc_df_with_sex = lusc_df_with_sex.iloc[:, ~(lusc_df_with_sex.isna().sum() / lusc_df_with_sex.shape[0]  > 0.8).values]

In [293]:
lusc_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,LOC155060,RNU12-2P,SSX9,EZHIP,EFCAB8,SRP14P1,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-4.8669,1.8381,1.4899,1.2254,-2.2792,-1.9720,-1.2958,-0.9506,0.5534,1.5247,...,-1.5172,1.4713,-1.5127,-2.6774,-0.0367,-0.7324,-1.1332,1,1,12.19
1,-4.8669,-0.1497,0.5958,0.0224,1.0199,-1.9720,-1.2958,-0.9506,-0.5806,-0.3746,...,-1.0225,0.9700,0.1076,-0.5487,0.7938,-0.4687,-0.6623,1,1,4.47
2,-4.8669,-0.7525,1.2436,1.1598,-1.3415,1.5868,-1.2958,-0.9506,-1.2852,0.9556,...,-0.7883,-0.0203,-4.1585,-0.9323,0.0432,-0.8272,2.4546,0,1,75.69
3,-4.8669,-1.9600,0.1109,-0.0455,-0.5129,0.4852,-1.2958,-0.2293,-1.2852,0.9506,...,-1.7184,2.0602,0.1869,-0.2452,0.9468,0.3779,-1.1332,1,0,123.09
4,-4.8669,0.0195,0.6288,0.5015,-0.1756,-1.9720,-1.2958,-0.1663,1.0158,-0.7022,...,1.2613,0.3334,-0.5724,-0.4479,0.5313,-0.8654,1.5092,1,1,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,-4.8669,0.2314,0.2982,0.5761,0.6465,1.4279,-1.2958,-0.5606,0.0300,-2.5366,...,1.0844,-0.3779,-0.8226,0.3708,0.1480,-0.8781,-0.1563,0,1,12.71
491,-4.8669,0.7217,0.4682,0.7089,0.0837,-0.7508,-1.2958,0.1157,0.7477,-1.0702,...,-0.9618,0.9350,0.8601,-0.6727,0.1300,-1.3218,-1.1332,0,1,43.86
492,-4.8669,1.4227,1.3014,-0.5316,2.3862,-1.9720,-1.2958,-0.6725,0.8308,0.4377,...,0.9300,-0.1833,-0.9170,-0.4644,0.0616,-0.4238,-1.1332,1,1,8.57
493,-4.8669,1.7917,0.0438,-1.9878,1.8530,0.9744,-1.2958,-0.5710,-0.0828,0.0429,...,1.2090,2.0212,-2.4961,-0.1755,1.9430,-1.5718,-1.1332,0,1,11.17


In [294]:
lusc_df_with_sex.isna().sum().sum()

0

### Remove non pathway-genes from LUSC

In [296]:
lusc_df_with_sex.columns.values[:-3], lusc_df_with_sex.columns.values[:-3].size

(array(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', ..., 'ZZZ3', 'TPTEP1',
        'AKR1C6P'], dtype=object),
 20169)

In [297]:
np.unique(lusc_df_with_sex.columns), np.unique(lusc_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 20172)

In [298]:
removable_gene_lusc = np.setdiff1d(lusc_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

In [299]:
removable_gene_lusc, removable_gene_lusc.size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15803)

In [300]:
np.unique(removable_gene_lusc), np.unique(removable_gene_lusc).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15803)

In [301]:
lusc_df_with_sex = lusc_df_with_sex.drop(removable_gene_lusc, axis = 1)

In [302]:
lusc_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,1.4899,0.2806,1.2640,-1.1910,-1.9130,-3.1176,1.5903,2.0014,-0.3019,-0.1855,...,-1.5068,-0.5023,0.0084,1.9210,0.5612,1.1802,-1.5127,1,1,12.19
1,0.5958,0.5284,0.7721,-0.1655,-0.8774,-0.8649,0.1732,0.7866,1.3584,1.3587,...,0.3832,0.3592,0.6552,0.0093,0.7833,-1.4013,0.1076,1,1,4.47
2,1.2436,0.8217,-1.8103,1.8121,-1.9130,-2.9060,1.1801,1.1786,0.1547,0.6600,...,-0.0217,0.5434,0.2672,0.1989,2.8537,-0.0450,-4.1585,0,1,75.69
3,0.1109,1.4265,-1.0411,0.8031,-1.9130,-0.5093,0.8880,1.2401,-0.1967,0.7333,...,2.0952,0.7145,-2.5363,0.9940,0.2669,0.3938,0.1869,1,0,123.09
4,0.6288,-0.5615,1.4504,0.0493,-0.3669,1.5842,0.6747,1.3838,0.1108,-0.9052,...,-0.5310,-0.3965,-1.9010,-1.2751,-0.3862,0.2625,-0.5724,1,1,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,0.2982,-0.7502,-0.6965,0.6680,-1.9130,0.0325,0.6547,-0.2071,-0.8912,-1.3756,...,0.3423,0.7051,-0.2711,-1.7719,0.3876,-0.9890,-0.8226,0,1,12.71
491,0.4682,-0.3021,-0.6245,-0.1061,0.2575,-0.2706,-0.3664,-1.0547,1.2710,1.1411,...,-1.4025,0.7756,0.3390,-0.2501,1.1855,-2.0965,0.8601,0,1,43.86
492,1.3014,-0.3246,0.1157,-0.3372,-0.7686,1.5318,-0.8293,-0.6591,0.5701,0.6028,...,-1.9555,-0.5944,1.4217,-0.5923,-1.3152,-0.5711,-0.9170,1,1,8.57
493,0.0438,0.9554,0.8300,2.0655,-1.2849,2.1550,0.0948,0.8630,-0.1724,-0.5642,...,0.6429,0.8057,-0.8409,1.7818,-0.4635,0.8710,-2.4961,0,1,11.17


In [303]:
20172 - 15803

4369

In [304]:
np.intersect1d(lusc_df_with_sex.columns.values[:-3], unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [305]:
np.intersect1d(lusc_df_with_sex.columns.values[:-3], unique_gene_in_pathway).size

4366

In [306]:
lusc_df_with_sex.to_csv("TCGA_LUSC_gene_expression_data.csv", index = False, header = True)

---

## Pathway Mask

In [307]:
lusc_df_with_sex

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,1.4899,0.2806,1.2640,-1.1910,-1.9130,-3.1176,1.5903,2.0014,-0.3019,-0.1855,...,-1.5068,-0.5023,0.0084,1.9210,0.5612,1.1802,-1.5127,1,1,12.19
1,0.5958,0.5284,0.7721,-0.1655,-0.8774,-0.8649,0.1732,0.7866,1.3584,1.3587,...,0.3832,0.3592,0.6552,0.0093,0.7833,-1.4013,0.1076,1,1,4.47
2,1.2436,0.8217,-1.8103,1.8121,-1.9130,-2.9060,1.1801,1.1786,0.1547,0.6600,...,-0.0217,0.5434,0.2672,0.1989,2.8537,-0.0450,-4.1585,0,1,75.69
3,0.1109,1.4265,-1.0411,0.8031,-1.9130,-0.5093,0.8880,1.2401,-0.1967,0.7333,...,2.0952,0.7145,-2.5363,0.9940,0.2669,0.3938,0.1869,1,0,123.09
4,0.6288,-0.5615,1.4504,0.0493,-0.3669,1.5842,0.6747,1.3838,0.1108,-0.9052,...,-0.5310,-0.3965,-1.9010,-1.2751,-0.3862,0.2625,-0.5724,1,1,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,0.2982,-0.7502,-0.6965,0.6680,-1.9130,0.0325,0.6547,-0.2071,-0.8912,-1.3756,...,0.3423,0.7051,-0.2711,-1.7719,0.3876,-0.9890,-0.8226,0,1,12.71
491,0.4682,-0.3021,-0.6245,-0.1061,0.2575,-0.2706,-0.3664,-1.0547,1.2710,1.1411,...,-1.4025,0.7756,0.3390,-0.2501,1.1855,-2.0965,0.8601,0,1,43.86
492,1.3014,-0.3246,0.1157,-0.3372,-0.7686,1.5318,-0.8293,-0.6591,0.5701,0.6028,...,-1.9555,-0.5944,1.4217,-0.5923,-1.3152,-0.5711,-0.9170,1,1,8.57
493,0.0438,0.9554,0.8300,2.0655,-1.2849,2.1550,0.0948,0.8630,-0.1724,-0.5642,...,0.6429,0.8057,-0.8409,1.7818,-0.4635,0.8710,-2.4961,0,1,11.17


In [308]:
lusc_df_with_sex.columns.values[:-3], lusc_df_with_sex.columns.values[:-3].size

(array(['HMGB1P1', 'A2M', 'AACS', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
       dtype=object),
 4366)

In [309]:
gene_name = lusc_df_with_sex.columns.values[:-3]

In [310]:
gene_name.size, np.unique(gene_name).size

(4366, 4366)

In [311]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [312]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4366)

In [313]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [314]:
pathway_sparse_mat.sum().sum()

0.0

In [315]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4366))

In [316]:
pathway_sparse_mat.sum()

10880.0

---

In [317]:
data_type = "LUSC"

In [318]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"TCGA_{data_type}_Pathway_Mask.npz", pathway_sparse_mat)

---

# TCGA - GBM

## clinical data

In [319]:
gbm_survival_info_df = pd.read_csv(data_path + "gbm_tcga/data_clinical_patient.txt", sep = '\t')

In [320]:
gbm_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,History lgg dx of brain tissue,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Sex,Race Category,Ethnicity Category,Prior Cancer Diagnosis Occurence,...,"International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code","International Classification of Diseases for Oncology, Third Edition ICD-O-3 Site Code",Informed consent verified,Project code,Tissue Source Site,Tumor Tissue Site,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,History lgg dx of brain tissue,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,Sex,The text for reporting information about race.,The text for reporting information about ethni...,Text term to describe the patient's history of...,...,The third edition of the International Classif...,The third edition of the International Classif...,Informed consent verified,Project code,"A Tissue Source Site collects samples (tissue,...",Text term that describes the anatomic site of ...,Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,HISTORY_LGG_DX_OF_BRAIN_TISSUE,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,RACE,ETHNICITY,HISTORY_OTHER_MALIGNANCY,...,ICD_O_3_HISTOLOGY,ICD_O_3_SITE,INFORMED_CONSENT_VERIFIED,PROJECT_CODE,TISSUE_SOURCE_SITE,SITE_OF_TUMOR_TISSUE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,30a1fe5e-5b12-472c-aa86-c2db8167ab23,TCGA-02-0001,12/16/08,NO,[Not Available],[Not Available],Female,WHITE,NOT HISPANIC OR LATINO,[Not Available],...,9440/3,C71.9,YES,[Not Available],2,Brain,1:DECEASED,11.76,1:Recurred/Progressed,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,ed879063-e603-4151-bf5e-22a4ee210281,TCGA-87-5896,7/13/11,NO,[Not Available],[Not Available],Female,WHITE,NOT HISPANIC OR LATINO,[Not Available],...,9440/3,C71.9,YES,[Not Available],87,Brain,0:LIVING,26.28,0:DiseaseFree,26.28
596,0FC32169-8EB9-405F-BECC-ED38D7064A58,TCGA-OX-A56R,2/27/13,NO,YES,NO,Male,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,9440/3,C71.2,YES,[Not Available],OX,Brain,1:DECEASED,5.91,[Not Available],[Not Available]
597,225F3689-221D-4296-8472-D8C21EEDAF8D,TCGA-RR-A6KA,8/18/14,NO,NO,YES,Female,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,9440/3,C71.1,YES,[Not Available],RR,Brain,1:DECEASED,6.27,[Not Available],[Not Available]
598,6F1B1011-D7A4-4A05-B6FF-288183E7254B,TCGA-RR-A6KB,8/18/14,NO,NO,YES,Male,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,9440/3,C71.1,YES,[Not Available],RR,Brain,0:LIVING,0,0:DiseaseFree,0


In [321]:
gbm_survival_info_df = gbm_survival_info_df[["Patient Identifier", "Sex", "Overall Survival Status", "Overall Survival (Months)"]]

In [322]:
gbm_survival_info_df

Unnamed: 0,Patient Identifier,Sex,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Sex,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,STRING,NUMBER
2,1,1,1,1
3,PATIENT_ID,SEX,OS_STATUS,OS_MONTHS
4,TCGA-02-0001,Female,1:DECEASED,11.76
...,...,...,...,...
595,TCGA-87-5896,Female,0:LIVING,26.28
596,TCGA-OX-A56R,Male,1:DECEASED,5.91
597,TCGA-RR-A6KA,Female,1:DECEASED,6.27
598,TCGA-RR-A6KB,Male,0:LIVING,0


In [323]:
gbm_survival_info_df = gbm_survival_info_df.loc[3:]

In [324]:
gbm_survival_info_df

Unnamed: 0,Patient Identifier,Sex,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,SEX,OS_STATUS,OS_MONTHS
4,TCGA-02-0001,Female,1:DECEASED,11.76
5,TCGA-02-0003,Male,1:DECEASED,4.73
6,TCGA-02-0004,Male,1:DECEASED,11.33
7,TCGA-02-0006,Female,1:DECEASED,18.33
...,...,...,...,...
595,TCGA-87-5896,Female,0:LIVING,26.28
596,TCGA-OX-A56R,Male,1:DECEASED,5.91
597,TCGA-RR-A6KA,Female,1:DECEASED,6.27
598,TCGA-RR-A6KB,Male,0:LIVING,0


In [325]:
gbm_survival_info_df = gbm_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [326]:
gbm_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-02-0001,Female,1:DECEASED,11.76
1,TCGA-02-0003,Male,1:DECEASED,4.73
2,TCGA-02-0004,Male,1:DECEASED,11.33
3,TCGA-02-0006,Female,1:DECEASED,18.33
4,TCGA-02-0007,Female,1:DECEASED,23.16
...,...,...,...,...
591,TCGA-87-5896,Female,0:LIVING,26.28
592,TCGA-OX-A56R,Male,1:DECEASED,5.91
593,TCGA-RR-A6KA,Female,1:DECEASED,6.27
594,TCGA-RR-A6KB,Male,0:LIVING,0


In [327]:
gbm_survival_info_df = gbm_survival_info_df[~gbm_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [328]:
gbm_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-02-0001,Female,1:DECEASED,11.76
1,TCGA-02-0003,Male,1:DECEASED,4.73
2,TCGA-02-0004,Male,1:DECEASED,11.33
3,TCGA-02-0006,Female,1:DECEASED,18.33
4,TCGA-02-0007,Female,1:DECEASED,23.16
...,...,...,...,...
589,TCGA-87-5896,Female,0:LIVING,26.28
590,TCGA-OX-A56R,Male,1:DECEASED,5.91
591,TCGA-RR-A6KA,Female,1:DECEASED,6.27
592,TCGA-RR-A6KB,Male,0:LIVING,0


In [329]:
gbm_survival_info_df["OS_STATUS"][gbm_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(492,)

In [330]:
gbm_survival_info_df["OS_STATUS"][gbm_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(102,)

In [331]:
492 + 102

594

In [332]:
gbm_survival_info_df["OS_STATUS"] = gbm_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [333]:
gbm_survival_info_df["Sex"] = gbm_survival_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [334]:
gbm_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-02-0001,0,1,11.76
1,TCGA-02-0003,1,1,4.73
2,TCGA-02-0004,1,1,11.33
3,TCGA-02-0006,0,1,18.33
4,TCGA-02-0007,0,1,23.16
...,...,...,...,...
589,TCGA-87-5896,0,0,26.28
590,TCGA-OX-A56R,1,1,5.91
591,TCGA-RR-A6KA,0,1,6.27
592,TCGA-RR-A6KB,1,0,0


---

## gene expression data

In [335]:
gbm_df = pd.read_csv(data_path + "gbm_tcga/data_mrna_seq_v2_rsem_zscores_ref_all_samples.txt", sep = '\t')

In [336]:
gbm_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-02-0047-01,TCGA-02-0055-01,TCGA-02-2483-01,TCGA-02-2485-01,TCGA-02-2486-01,TCGA-06-0125-01,TCGA-06-0125-02,TCGA-06-0129-01,...,TCGA-41-3915-01,TCGA-41-4097-01,TCGA-41-5651-01,TCGA-76-4925-01,TCGA-76-4926-01,TCGA-76-4927-01,TCGA-76-4928-01,TCGA-76-4929-01,TCGA-76-4931-01,TCGA-76-4932-01
0,LOC100130426,100130426,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-0.7002,-2.9316,-2.9316,...,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316,-2.9316
1,UBE2Q2P3,100133144,-0.7830,-2.8886,-0.8303,1.3587,-0.0683,0.6961,-0.6756,1.8330,...,0.4431,1.2928,-1.6936,0.9816,0.6469,-1.0496,0.7296,0.4828,0.0627,-2.8886
2,UBE2Q2P3,100134869,-0.3913,1.0067,0.8038,0.9240,-0.9213,0.5028,-1.4326,1.1258,...,0.2808,0.7987,0.3041,1.1799,0.8073,-0.4719,-0.4583,-1.4904,-0.1297,0.2368
3,HMGB1P1,10357,-0.0933,-1.9056,0.2292,1.6140,0.3466,-0.2502,-0.9890,0.9990,...,-1.4999,-2.3187,0.7218,2.1552,0.7440,-0.8803,-0.9232,0.3303,0.1450,0.2213
4,TIMM23,10431,1.1789,1.1769,2.2680,-0.9031,-0.4093,-1.4989,-1.2780,0.1221,...,-0.6190,-0.1390,-0.6500,-0.4970,-0.1679,0.0104,-0.4859,0.2549,-0.9855,-0.7034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYX,7791,-0.9779,1.0307,-0.4795,-0.4707,0.3169,1.4637,-0.0917,-0.9919,...,0.0618,-0.6086,-1.3431,0.7940,-0.4091,0.2369,1.7484,-1.4953,0.1723,0.7019
20527,FLJ10821,23140,0.8058,-1.1217,-0.3903,0.3354,-1.0416,0.7283,0.8050,1.5915,...,-0.0878,1.2188,-0.5975,-0.6217,0.2386,0.5382,0.0908,1.4699,1.0625,-0.4955
20528,ZZZ3,26009,-0.3770,0.3210,0.2492,0.2209,-0.9185,0.3872,0.0107,2.0857,...,0.6459,-0.4051,-0.8422,1.2740,0.4751,-0.8139,-1.0322,-0.1884,-0.6144,0.4886
20529,TPTEP1,387590,-0.7354,-0.2794,2.5862,-0.0890,1.1239,-0.8091,0.0049,2.4425,...,-0.1530,-0.1907,-0.3284,-0.3007,-0.4146,-0.1998,-0.6212,-0.3962,-0.9278,-2.4620


In [337]:
gbm_df_columns = gbm_df["Hugo_Symbol"]

In [338]:
gbm_df_columns

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [339]:
gbm_df = gbm_df.drop(["Hugo_Symbol", "Entrez_Gene_Id"], axis = 1).T

In [340]:
gbm_df.columns = gbm_df_columns

In [341]:
gbm_df = gbm_df.reset_index().rename(columns = {"index" : "Patient ID"})

In [342]:
gbm_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-02-0047-01,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,,-0.4838,-2.0864,,...,0.5403,0.0360,0.4028,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194
1,TCGA-02-0055-01,-2.9316,-2.8886,1.0067,-1.9056,1.1769,,-0.9794,-0.7341,,...,-0.4143,-0.7361,-1.6643,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194
2,TCGA-02-2483-01,-2.9316,-0.8303,0.8038,0.2292,2.2680,,-1.2271,-2.0864,,...,-2.1522,0.4548,-0.4459,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194
3,TCGA-02-2485-01,-2.9316,1.3587,0.9240,1.6140,-0.9031,,0.3144,1.1536,,...,0.1094,-0.8058,1.0686,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194
4,TCGA-02-2486-01,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,,-0.2510,-2.0864,,...,-1.0731,-1.8484,-1.4203,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,TCGA-76-4927-01,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,,-0.9245,-0.2951,,...,-0.1382,-0.8198,-0.1896,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194
162,TCGA-76-4928-01,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,,-0.7389,-0.9823,,...,-0.7678,-1.2737,-0.5106,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194
163,TCGA-76-4929-01,-2.9316,0.4828,-1.4904,0.3303,0.2549,,-0.0078,-0.7386,,...,1.2540,0.7121,0.7392,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194
164,TCGA-76-4931-01,-2.9316,0.0627,-0.1297,0.1450,-0.9855,,1.2868,-0.9146,,...,-0.1036,-0.0523,0.8455,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194


In [343]:
gbm_df["Patient ID"] = gbm_df["Patient ID"].apply(lambda x : x[:-3])

In [344]:
gbm_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-02-0047,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,,-0.4838,-2.0864,,...,0.5403,0.0360,0.4028,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194
1,TCGA-02-0055,-2.9316,-2.8886,1.0067,-1.9056,1.1769,,-0.9794,-0.7341,,...,-0.4143,-0.7361,-1.6643,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194
2,TCGA-02-2483,-2.9316,-0.8303,0.8038,0.2292,2.2680,,-1.2271,-2.0864,,...,-2.1522,0.4548,-0.4459,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194
3,TCGA-02-2485,-2.9316,1.3587,0.9240,1.6140,-0.9031,,0.3144,1.1536,,...,0.1094,-0.8058,1.0686,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194
4,TCGA-02-2486,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,,-0.2510,-2.0864,,...,-1.0731,-1.8484,-1.4203,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,TCGA-76-4927,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,,-0.9245,-0.2951,,...,-0.1382,-0.8198,-0.1896,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194
162,TCGA-76-4928,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,,-0.7389,-0.9823,,...,-0.7678,-1.2737,-0.5106,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194
163,TCGA-76-4929,-2.9316,0.4828,-1.4904,0.3303,0.2549,,-0.0078,-0.7386,,...,1.2540,0.7121,0.7392,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194
164,TCGA-76-4931,-2.9316,0.0627,-0.1297,0.1450,-0.9855,,1.2868,-0.9146,,...,-0.1036,-0.0523,0.8455,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194


---

### merge gene expression, clincic, and sex data

In [345]:
gbm_df_with_sex = pd.merge(gbm_df, gbm_survival_info_df, how = 'inner', on = 'Patient ID')

In [346]:
gbm_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,TCGA-02-0047,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,,-0.4838,-2.0864,,...,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194,1,1,14.72
1,TCGA-02-0055,-2.9316,-2.8886,1.0067,-1.9056,1.1769,,-0.9794,-0.7341,,...,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194,0,1,2.5
2,TCGA-02-2483,-2.9316,-0.8303,0.8038,0.2292,2.2680,,-1.2271,-2.0864,,...,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194,1,0,15.31
3,TCGA-02-2485,-2.9316,1.3587,0.9240,1.6140,-0.9031,,0.3144,1.1536,,...,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194,1,0,15.44
4,TCGA-02-2486,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,,-0.2510,-2.0864,,...,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,TCGA-76-4927,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,,-0.9245,-0.2951,,...,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194,1,1,17.58
160,TCGA-76-4928,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,,-0.7389,-0.9823,,...,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194,0,1,3.09
161,TCGA-76-4929,-2.9316,0.4828,-1.4904,0.3303,0.2549,,-0.0078,-0.7386,,...,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194,0,1,3.65
162,TCGA-76-4931,-2.9316,0.0627,-0.1297,0.1450,-0.9855,,1.2868,-0.9146,,...,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194,0,1,9.17


In [347]:
gbm_df_with_sex["Sex"].isna().sum()

0

In [348]:
gbm_df_with_sex = gbm_df_with_sex.drop(["Patient ID"], axis = 1)

In [349]:
gbm_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,,-0.4838,-2.0864,,,...,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194,1,1,14.72
1,-2.9316,-2.8886,1.0067,-1.9056,1.1769,,-0.9794,-0.7341,,,...,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194,0,1,2.5
2,-2.9316,-0.8303,0.8038,0.2292,2.2680,,-1.2271,-2.0864,,,...,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194,1,0,15.31
3,-2.9316,1.3587,0.9240,1.6140,-0.9031,,0.3144,1.1536,,,...,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194,1,0,15.44
4,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,,-0.2510,-2.0864,,,...,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,,-0.9245,-0.2951,,,...,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194,1,1,17.58
160,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,,-0.7389,-0.9823,,,...,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194,0,1,3.09
161,-2.9316,0.4828,-1.4904,0.3303,0.2549,,-0.0078,-0.7386,,,...,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194,0,1,3.65
162,-2.9316,0.0627,-0.1297,0.1450,-0.9855,,1.2868,-0.9146,,,...,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194,0,1,9.17


In [350]:
gbm_df_with_sex = gbm_df_with_sex.iloc[:, ~(gbm_df_with_sex.isna().sum() / gbm_df_with_sex.shape[0]  > 0.8).values]

In [351]:
gbm_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,LOC155060,RNU12-2P,EZHIP,EFCAB8,SRP14P1,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,-0.4838,-2.0864,0.4560,-2.1239,-0.9802,...,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194,1,1,14.72
1,-2.9316,-2.8886,1.0067,-1.9056,1.1769,-0.9794,-0.7341,-2.1923,0.5035,0.9036,...,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194,0,1,2.5
2,-2.9316,-0.8303,0.8038,0.2292,2.2680,-1.2271,-2.0864,-1.0381,-2.1239,-0.5348,...,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194,1,0,15.31
3,-2.9316,1.3587,0.9240,1.6140,-0.9031,0.3144,1.1536,-0.1815,-0.3420,-1.2954,...,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194,1,0,15.44
4,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,-0.2510,-2.0864,-0.6889,-0.0005,-0.9907,...,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,-0.9245,-0.2951,-2.1923,-2.1239,-0.6087,...,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194,1,1,17.58
160,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,-0.7389,-0.9823,-0.0042,-0.4478,0.4492,...,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194,0,1,3.09
161,-2.9316,0.4828,-1.4904,0.3303,0.2549,-0.0078,-0.7386,2.1370,-2.1239,0.8959,...,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194,0,1,3.65
162,-2.9316,0.0627,-0.1297,0.1450,-0.9855,1.2868,-0.9146,-0.9415,0.2208,0.3021,...,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194,0,1,9.17


In [352]:
gbm_df_with_sex.isna().sum().sum()

0

---

### remove nan column

In [353]:
gbm_df_with_sex.iloc[:, gbm_df_with_sex.columns.isna()]

Unnamed: 0,NaN
0,0.1201
1,0.1719
2,-2.2764
3,-0.4513
4,0.4676
...,...
159,-0.3390
160,-0.5051
161,-2.2410
162,0.0269


In [354]:
gbm_df_with_sex = gbm_df_with_sex.iloc[:, ~gbm_df_with_sex.columns.isna()]

In [355]:
gbm_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,LOC155060,RNU12-2P,EZHIP,EFCAB8,SRP14P1,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.9316,-0.7830,-0.3913,-0.0933,1.1789,-0.4838,-2.0864,0.4560,-2.1239,-0.9802,...,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194,1,1,14.72
1,-2.9316,-2.8886,1.0067,-1.9056,1.1769,-0.9794,-0.7341,-2.1923,0.5035,0.9036,...,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194,0,1,2.5
2,-2.9316,-0.8303,0.8038,0.2292,2.2680,-1.2271,-2.0864,-1.0381,-2.1239,-0.5348,...,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194,1,0,15.31
3,-2.9316,1.3587,0.9240,1.6140,-0.9031,0.3144,1.1536,-0.1815,-0.3420,-1.2954,...,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194,1,0,15.44
4,-2.9316,-0.0683,-0.9213,0.3466,-0.4093,-0.2510,-2.0864,-0.6889,-0.0005,-0.9907,...,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-2.9316,-1.0496,-0.4719,-0.8803,0.0104,-0.9245,-0.2951,-2.1923,-2.1239,-0.6087,...,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194,1,1,17.58
160,-2.9316,0.7296,-0.4583,-0.9232,-0.4859,-0.7389,-0.9823,-0.0042,-0.4478,0.4492,...,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194,0,1,3.09
161,-2.9316,0.4828,-1.4904,0.3303,0.2549,-0.0078,-0.7386,2.1370,-2.1239,0.8959,...,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194,0,1,3.65
162,-2.9316,0.0627,-0.1297,0.1450,-0.9855,1.2868,-0.9146,-0.9415,0.2208,0.3021,...,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194,0,1,9.17


---

### check redundant genes

In [356]:
gbm_df_with_sex.columns

Index(['LOC100130426', 'UBE2Q2P3', 'UBE2Q2P3', 'HMGB1P1', 'TIMM23',
       'LOC155060', 'RNU12-2P', 'EZHIP', 'EFCAB8', 'SRP14P1',
       ...
       'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3', 'TPTEP1', 'AKR1C6P',
       'Sex', 'OS_STATUS', 'OS_MONTHS'],
      dtype='object', length=19825)

In [357]:
duplicated_genes_gbm = gbm_df_with_sex.columns[gbm_df_with_sex.columns.duplicated()]

In [358]:
duplicated_genes_gbm, duplicated_genes_gbm.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object'),
 15)

In [359]:
gbm_df_with_sex[duplicated_genes_gbm]

Unnamed: 0,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,-0.7830,-0.3913,0.1553,-0.0974,0.5078,0.1255,0.4789,0.2052,-0.6247,0.2793,...,0.6681,0.9222,-2.2153,-0.0003,0.8143,0.6132,0.9011,0.5587,0.0117,0.6399
1,-2.8886,1.0067,-0.9053,-1.1062,-0.5593,0.5462,-1.5965,-1.9123,1.8563,-0.2887,...,1.7519,0.1633,-2.2153,-2.0679,1.1756,1.6133,-0.3582,-1.0249,-3.3583,-2.5868
2,-0.8303,0.8038,0.2045,0.0571,-0.0326,0.1190,0.5166,0.2362,0.4573,-1.0614,...,-0.7183,0.4947,-2.2153,1.1872,0.5575,0.3224,-0.6203,-0.3972,0.4221,0.0560
3,1.3587,0.9240,0.4100,-0.8151,0.5568,-1.6803,0.6055,0.6613,0.7491,-0.2479,...,0.8704,-1.2014,-2.2153,-1.0634,0.2897,-1.1751,-0.0003,0.4579,-0.4521,-0.4997
4,-0.0683,-0.9213,-0.8130,-0.2922,-2.4443,-0.3738,0.6198,0.1504,0.5982,-0.6383,...,0.5883,-0.8075,-2.2153,-0.8400,0.3991,0.5473,0.8670,-1.3306,-0.0458,-0.4405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-1.0496,-0.4719,1.4190,-0.6671,0.6371,-1.5273,0.5128,0.1500,0.5185,-0.1499,...,0.3606,0.1662,-2.2153,-2.0679,0.3088,0.1376,-0.0761,0.2267,-0.1741,0.4140
160,0.7296,-0.4583,-0.3572,-0.7260,-1.2417,0.1295,-1.9505,-2.1713,1.4236,-0.0544,...,0.5623,-0.2204,-0.9032,-0.4447,1.7151,2.7152,0.4252,1.0629,-3.3583,-3.2572
161,0.4828,-1.4904,-0.9097,-1.8214,1.4561,-0.0653,-1.9505,-2.1713,-1.4772,0.9307,...,-0.6439,0.5043,-2.2153,-2.0679,0.8496,0.1575,-1.0706,0.5576,-3.3583,-3.2572
162,0.0627,-0.1297,-1.0820,-0.3028,-1.7074,-0.3941,-1.9505,-2.1713,0.5100,0.8710,...,0.1493,0.9499,-2.2153,-2.0679,-0.5156,-0.7910,-0.9536,-0.9084,-3.3583,-3.2572


In [360]:
for gene in duplicated_genes_gbm:
    var_gene = np.var(gbm_df_with_sex[gene])
    #print(var_gene)
    #print(gbm_df_with_sex[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        #print(var_gene.values[1])
        #print(gbm_df_with_sex[gene].iloc[:, 1])
        gbm_df_with_sex[gene] = gbm_df_with_sex[gene].iloc[:, 1]
    else:
        #print(var_gene.values[0])
        #print(gbm_df_with_sex[gene].iloc[:, 0])
        gbm_df_with_sex[gene] = gbm_df_with_sex[gene].iloc[:, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gbm_df_with_sex[gene] = gbm_df_with_sex[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gbm_df_with_sex[gene] = gbm_df_with_sex[gene].iloc[:, 1]


In [361]:
gbm_df_with_sex = gbm_df_with_sex.iloc[:, ~gbm_df_with_sex.columns.duplicated()]

In [362]:
gbm_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,LOC155060,RNU12-2P,EZHIP,EFCAB8,SRP14P1,LOC391343,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-2.9316,-0.7830,-0.0933,1.1789,-0.4838,-2.0864,0.4560,-2.1239,-0.9802,-1.3281,...,-0.8701,0.4574,-0.9779,0.8058,-0.3770,-0.7354,-2.194,1,1,14.72
1,-2.9316,-2.8886,-1.9056,1.1769,-0.9794,-0.7341,-2.1923,0.5035,0.9036,-1.3281,...,0.9106,-0.7419,1.0307,-1.1217,0.3210,-0.2794,-2.194,0,1,2.5
2,-2.9316,-0.8303,0.2292,2.2680,-1.2271,-2.0864,-1.0381,-2.1239,-0.5348,-1.3281,...,3.6280,0.4629,-0.4795,-0.3903,0.2492,2.5862,-2.194,1,0,15.31
3,-2.9316,1.3587,1.6140,-0.9031,0.3144,1.1536,-0.1815,-0.3420,-1.2954,-1.3281,...,-1.5164,-0.0848,-0.4707,0.3354,0.2209,-0.0890,-2.194,1,0,15.44
4,-2.9316,-0.0683,0.3466,-0.4093,-0.2510,-2.0864,-0.6889,-0.0005,-0.9907,-1.3281,...,-1.5164,-2.0586,0.3169,-1.0416,-0.9185,1.1239,-2.194,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-2.9316,-1.0496,-0.8803,0.0104,-0.9245,-0.2951,-2.1923,-2.1239,-0.6087,-1.3281,...,-1.5164,-0.4434,0.2369,0.5382,-0.8139,-0.1998,-2.194,1,1,17.58
160,-2.9316,0.7296,-0.9232,-0.4859,-0.7389,-0.9823,-0.0042,-0.4478,0.4492,-1.3281,...,-1.5164,-1.1488,1.7484,0.0908,-1.0322,-0.6212,-2.194,0,1,3.09
161,-2.9316,0.4828,0.3303,0.2549,-0.0078,-0.7386,2.1370,-2.1239,0.8959,-1.3281,...,0.2814,1.0287,-1.4953,1.4699,-0.1884,-0.3962,-2.194,0,1,3.65
162,-2.9316,0.0627,0.1450,-0.9855,1.2868,-0.9146,-0.9415,0.2208,0.3021,-1.3281,...,-0.6217,0.2066,0.1723,1.0625,-0.6144,-0.9278,-2.194,0,1,9.17


In [363]:
gbm_df_with_sex.to_csv("TCGA_GBM_gene_expression_data.csv", index = False, header = True)

---

# TCGA - LGG

## clinical data

In [364]:
lgg_survival_info_df = pd.read_csv(data_path + "lgg_tcga/data_bcr_clinical_data_patient.txt", sep = '\t')

In [365]:
lgg_survival_info_df

Unnamed: 0,#Other Patient ID,Patient Identifier,Form completion date,Neoplasm Histologic Type Name,Neoplasm Histologic Grade,Primary Tumor Laterality,Tumor Site,Supratentorial Localization,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,...,"International Classification of Diseases for Oncology, Third Edition ICD-O-3 Site Code",Informed consent verified,Project code,Adjuvant Postoperative Targeted Therapy Administered Indicator,Tissue Source Site,Tumor Tissue Site,Overall Survival Status,Overall Survival (Months),Disease Free Status,Disease Free (Months)
0,#Legacy DMP patient identifier (DMPnnnn),Identifier to uniquely specify a patient.,Form completion date,Text term for the structural pattern of cancer...,Numeric value to express the degree of abnorma...,"For tumors in paired organs, designates the si...",Tumor Site,Supratentorial localization.,Text indicator for the time frame of tissue pr...,Text indicator for the time frame of tissue pr...,...,The third edition of the International Classif...,Informed consent verified,Project code,Text term to signify postoperative adjuvant ca...,"A Tissue Source Site collects samples (tissue,...",Text term that describes the anatomic site of ...,Overall patient survival status.,Overall survival in months since initial diago...,Disease free status since initial treatment.,Disease free (months) since initial treatment.
1,#STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,...,STRING,STRING,STRING,STRING,STRING,STRING,STRING,NUMBER,STRING,NUMBER
2,#1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,HISTOLOGICAL_DIAGNOSIS,GRADE,LATERALITY,TUMOR_SITE,SUPRATENTORIAL_LOCALIZATION,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,...,ICD_O_3_SITE,INFORMED_CONSENT_VERIFIED,PROJECT_CODE,TARGETED_MOLECULAR_THERAPY,TISSUE_SOURCE_SITE,SITE_OF_TUMOR_TISSUE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
4,334f715e-08dc-4a29-b8e4-b010b829c478,TCGA-CS-4938,2/15/12,Astrocytoma,G2,Right,"Supratentorial, Frontal Lobe",Cerebral Cortex,NO,YES,...,C71.9,YES,[Not Available],[Not Available],CS,Central nervous system,0:LIVING,117.41,0:DiseaseFree,117.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,09552B53-4393-4811-A671-A9EF6EFF4790,TCGA-WY-A85A,2/27/14,Astrocytoma,G2,Right,"Supratentorial, Frontal Lobe",Not listed in Medical Record,NO,YES,...,C71.0,YES,[Not Available],NO,WY,Central nervous system,0:LIVING,43.36,0:DiseaseFree,43.36
515,CB041720-0EFA-4EB6-A9A5-B6E138BB1A99,TCGA-WY-A85B,2/20/14,Astrocytoma,G2,Right,"Supratentorial, Temporal Lobe",Not listed in Medical Record,NO,YES,...,C71.0,YES,[Not Available],NO,WY,Central nervous system,0:LIVING,45.76,0:DiseaseFree,45.76
516,F62BE2B8-88C4-4A7E-8C80-8176FB5F6C56,TCGA-WY-A85C,2/26/14,Astrocytoma,G2,Right,"Supratentorial, Frontal Lobe",Not listed in Medical Record,NO,YES,...,C71.0,YES,[Not Available],NO,WY,Central nervous system,0:LIVING,46.85,1:Recurred/Progressed,41.43
517,0D5A4C63-C3FB-4295-A433-D90CDBFC4ED6,TCGA-WY-A85D,2/26/14,Oligoastrocytoma,G2,Right,"Supratentorial, Frontal Lobe",Not listed in Medical Record,NO,YES,...,C71.0,YES,[Not Available],NO,WY,Central nervous system,0:LIVING,37.68,1:Recurred/Progressed,29.14


In [366]:
lgg_survival_info_df = lgg_survival_info_df[["Patient Identifier", "Sex", "Overall Survival Status", "Overall Survival (Months)"]]

In [367]:
lgg_survival_info_df

Unnamed: 0,Patient Identifier,Sex,Overall Survival Status,Overall Survival (Months)
0,Identifier to uniquely specify a patient.,Sex,Overall patient survival status.,Overall survival in months since initial diago...
1,STRING,STRING,STRING,NUMBER
2,1,1,1,1
3,PATIENT_ID,SEX,OS_STATUS,OS_MONTHS
4,TCGA-CS-4938,Female,0:LIVING,117.41
...,...,...,...,...
514,TCGA-WY-A85A,Male,0:LIVING,43.36
515,TCGA-WY-A85B,Male,0:LIVING,45.76
516,TCGA-WY-A85C,Male,0:LIVING,46.85
517,TCGA-WY-A85D,Male,0:LIVING,37.68


In [368]:
lgg_survival_info_df = lgg_survival_info_df.loc[3:]

In [369]:
lgg_survival_info_df

Unnamed: 0,Patient Identifier,Sex,Overall Survival Status,Overall Survival (Months)
3,PATIENT_ID,SEX,OS_STATUS,OS_MONTHS
4,TCGA-CS-4938,Female,0:LIVING,117.41
5,TCGA-CS-4941,Male,1:DECEASED,7.69
6,TCGA-CS-4942,Female,1:DECEASED,43.86
7,TCGA-CS-4943,Male,1:DECEASED,36.33
...,...,...,...,...
514,TCGA-WY-A85A,Male,0:LIVING,43.36
515,TCGA-WY-A85B,Male,0:LIVING,45.76
516,TCGA-WY-A85C,Male,0:LIVING,46.85
517,TCGA-WY-A85D,Male,0:LIVING,37.68


In [370]:
lgg_survival_info_df = lgg_survival_info_df.rename(columns = {"Patient Identifier" : "Patient ID", "Overall Survival Status" : "OS_STATUS", "Overall Survival (Months)" : "OS_MONTHS"}).drop(3).reset_index(drop = True)

In [371]:
lgg_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-CS-4938,Female,0:LIVING,117.41
1,TCGA-CS-4941,Male,1:DECEASED,7.69
2,TCGA-CS-4942,Female,1:DECEASED,43.86
3,TCGA-CS-4943,Male,1:DECEASED,36.33
4,TCGA-CS-4944,Male,0:LIVING,60.05
...,...,...,...,...
510,TCGA-WY-A85A,Male,0:LIVING,43.36
511,TCGA-WY-A85B,Male,0:LIVING,45.76
512,TCGA-WY-A85C,Male,0:LIVING,46.85
513,TCGA-WY-A85D,Male,0:LIVING,37.68


In [372]:
lgg_survival_info_df = lgg_survival_info_df[~lgg_survival_info_df["OS_MONTHS"].str.contains("Not")].reset_index(drop = True)

In [373]:
lgg_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-CS-4938,Female,0:LIVING,117.41
1,TCGA-CS-4941,Male,1:DECEASED,7.69
2,TCGA-CS-4942,Female,1:DECEASED,43.86
3,TCGA-CS-4943,Male,1:DECEASED,36.33
4,TCGA-CS-4944,Male,0:LIVING,60.05
...,...,...,...,...
510,TCGA-WY-A85A,Male,0:LIVING,43.36
511,TCGA-WY-A85B,Male,0:LIVING,45.76
512,TCGA-WY-A85C,Male,0:LIVING,46.85
513,TCGA-WY-A85D,Male,0:LIVING,37.68


In [374]:
lgg_survival_info_df["OS_STATUS"][lgg_survival_info_df["OS_STATUS"] == "1:DECEASED"].shape

(126,)

In [375]:
lgg_survival_info_df["OS_STATUS"][lgg_survival_info_df["OS_STATUS"] == "0:LIVING"].shape

(389,)

In [376]:
126 + 389

515

In [377]:
lgg_survival_info_df["OS_STATUS"] = lgg_survival_info_df["OS_STATUS"].apply(lambda x : 1 if x.find('1:') != -1 else 0)

In [378]:
lgg_survival_info_df["Sex"] = lgg_survival_info_df["Sex"].apply(lambda x : 0 if x.find('Female') != -1 else 1)

In [379]:
lgg_survival_info_df

Unnamed: 0,Patient ID,Sex,OS_STATUS,OS_MONTHS
0,TCGA-CS-4938,0,0,117.41
1,TCGA-CS-4941,1,1,7.69
2,TCGA-CS-4942,0,1,43.86
3,TCGA-CS-4943,1,1,36.33
4,TCGA-CS-4944,1,0,60.05
...,...,...,...,...
510,TCGA-WY-A85A,1,0,43.36
511,TCGA-WY-A85B,1,0,45.76
512,TCGA-WY-A85C,1,0,46.85
513,TCGA-WY-A85D,1,0,37.68


---

## gene expression data

In [380]:
lgg_df = pd.read_csv(data_path + "lgg_tcga/data_RNA_Seq_v2_mRNA_median_all_sample_Zscores.txt", sep = '\t')

In [381]:
lgg_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-CS-4938-01,TCGA-CS-4941-01,TCGA-CS-4942-01,TCGA-CS-4943-01,TCGA-CS-4944-01,TCGA-CS-5390-01,TCGA-CS-5393-01,TCGA-CS-5394-01,...,TCGA-VW-A8FI-01,TCGA-W9-A837-01,TCGA-WH-A86K-01,TCGA-WY-A858-01,TCGA-WY-A859-01,TCGA-WY-A85A-01,TCGA-WY-A85B-01,TCGA-WY-A85C-01,TCGA-WY-A85D-01,TCGA-WY-A85E-01
0,LOC100130426,100130426,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,...,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197,-6.6197
1,UBE2Q2P3,100133144,0.1204,2.0184,0.5099,0.1055,-3.0774,-0.4801,-0.8757,0.0602,...,1.1453,0.5913,0.4153,-0.0150,-1.0138,-0.4550,0.0533,0.4764,-0.4688,0.2880
2,UBE2Q2P3,100134869,0.7989,0.6812,-0.3681,-1.5361,1.2697,1.1344,-0.7323,0.1688,...,0.7430,1.0817,0.0405,0.2944,-0.4851,0.2445,0.5637,0.4807,-0.6996,-0.9732
3,HMGB1P1,10357,1.9011,0.2737,0.7771,1.9152,1.2474,0.3322,0.9827,1.1365,...,-0.3417,-0.8797,1.0924,0.7652,-0.3940,-0.4091,-0.2036,-0.5069,-0.8678,-0.0018
4,TIMM23,10431,1.1246,-1.7890,-0.0370,1.0794,0.9732,-0.3172,-0.4838,-0.1201,...,-0.4000,-0.1482,-0.3566,0.7249,1.2531,0.1796,0.2048,-0.6743,0.4390,0.6643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYX,7791,-0.8185,0.9425,-0.1734,-0.2491,-0.7057,-0.4204,-0.6408,-0.5755,...,2.3508,-1.0426,-0.1939,0.3549,-0.4512,-0.9482,-0.8485,-0.4482,-0.6900,-0.3323
20527,FLJ10821,23140,-1.2024,0.1612,-0.4970,0.5798,-2.9164,0.8985,0.6959,1.3148,...,-1.7017,0.2747,-0.4945,-0.9292,-0.6323,-0.1755,0.1146,0.7445,-0.4828,-0.4703
20528,ZZZ3,26009,1.0014,1.4876,0.5814,0.5183,-0.5097,-0.5046,1.0285,-0.7013,...,0.2014,-0.7861,0.2733,-0.4453,-0.2173,0.4268,1.0037,1.2134,0.7991,0.2792
20529,TPTEP1,387590,0.8724,-1.0025,1.2741,0.3283,-0.9851,-0.6017,0.9183,0.3433,...,-0.4500,-1.3303,0.9266,0.7370,-0.3345,-1.4466,2.0298,1.5652,1.2293,1.0979


In [382]:
lgg_df_columns = lgg_df["Hugo_Symbol"]

In [383]:
lgg_df_columns

0        LOC100130426
1            UBE2Q2P3
2            UBE2Q2P3
3             HMGB1P1
4              TIMM23
             ...     
20526             ZYX
20527        FLJ10821
20528            ZZZ3
20529          TPTEP1
20530         AKR1C6P
Name: Hugo_Symbol, Length: 20531, dtype: object

In [384]:
lgg_df = lgg_df.drop(["Hugo_Symbol", "Entrez_Gene_Id"], axis = 1).T

In [385]:
lgg_df.columns = lgg_df_columns

In [386]:
lgg_df = lgg_df.reset_index().rename(columns = {"index" : "Patient ID"})

In [387]:
lgg_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-CS-4938-01,-6.6197,0.1204,0.7989,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,...,-0.0068,0.7328,0.2305,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248
1,TCGA-CS-4941-01,-6.6197,2.0184,0.6812,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,...,1.0027,0.3329,-0.0679,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248
2,TCGA-CS-4942-01,-6.6197,0.5099,-0.3681,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,...,1.1726,0.2307,-0.5447,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248
3,TCGA-CS-4943-01,-6.6197,0.1055,-1.5361,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,...,0.6514,0.5074,0.1509,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248
4,TCGA-CS-4944-01,-6.6197,-3.0774,1.2697,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,...,-0.8384,-0.9739,-0.9234,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,TCGA-WY-A85A-01,-6.6197,-0.4550,0.2445,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248
526,TCGA-WY-A85B-01,-6.6197,0.0533,0.5637,-0.2036,0.2048,,0.6325,1.0021,-9.1723,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248
527,TCGA-WY-A85C-01,-6.6197,0.4764,0.4807,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248
528,TCGA-WY-A85D-01,-6.6197,-0.4688,-0.6996,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248


In [388]:
lgg_df["Patient ID"] = lgg_df["Patient ID"].apply(lambda x : x[:-3])

In [389]:
lgg_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-CS-4938,-6.6197,0.1204,0.7989,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,...,-0.0068,0.7328,0.2305,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248
1,TCGA-CS-4941,-6.6197,2.0184,0.6812,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,...,1.0027,0.3329,-0.0679,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248
2,TCGA-CS-4942,-6.6197,0.5099,-0.3681,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,...,1.1726,0.2307,-0.5447,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248
3,TCGA-CS-4943,-6.6197,0.1055,-1.5361,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,...,0.6514,0.5074,0.1509,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248
4,TCGA-CS-4944,-6.6197,-3.0774,1.2697,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,...,-0.8384,-0.9739,-0.9234,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,TCGA-WY-A85A,-6.6197,-0.4550,0.2445,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248
526,TCGA-WY-A85B,-6.6197,0.0533,0.5637,-0.2036,0.2048,,0.6325,1.0021,-9.1723,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248
527,TCGA-WY-A85C,-6.6197,0.4764,0.4807,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248
528,TCGA-WY-A85D,-6.6197,-0.4688,-0.6996,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248


---

### remove nan column

In [390]:
lgg_df.iloc[:, lgg_df.columns.isna()]

Hugo_Symbol,NaN
0,-0.4529
1,1.0253
2,0.4451
3,-1.4521
4,-0.5060
...,...
525,-0.8852
526,-1.6553
527,-1.4157
528,-1.1880


In [391]:
lgg_df = lgg_df.iloc[:, ~lgg_df.columns.isna()]

In [392]:
lgg_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,UBE2Q2P3.1,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-CS-4938,-6.6197,0.1204,0.7989,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,...,-0.0068,0.7328,0.2305,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248
1,TCGA-CS-4941,-6.6197,2.0184,0.6812,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,...,1.0027,0.3329,-0.0679,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248
2,TCGA-CS-4942,-6.6197,0.5099,-0.3681,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,...,1.1726,0.2307,-0.5447,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248
3,TCGA-CS-4943,-6.6197,0.1055,-1.5361,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,...,0.6514,0.5074,0.1509,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248
4,TCGA-CS-4944,-6.6197,-3.0774,1.2697,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,...,-0.8384,-0.9739,-0.9234,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,TCGA-WY-A85A,-6.6197,-0.4550,0.2445,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248
526,TCGA-WY-A85B,-6.6197,0.0533,0.5637,-0.2036,0.2048,,0.6325,1.0021,-9.1723,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248
527,TCGA-WY-A85C,-6.6197,0.4764,0.4807,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248
528,TCGA-WY-A85D,-6.6197,-0.4688,-0.6996,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248


---

### check redundant genes

In [393]:
lgg_df.columns

Index(['Patient ID', 'LOC100130426', 'UBE2Q2P3', 'UBE2Q2P3', 'HMGB1P1',
       'TIMM23', 'MOXD2', 'LOC155060', 'RNU12-2P', 'SSX9',
       ...
       'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3',
       'TPTEP1', 'AKR1C6P'],
      dtype='object', name='Hugo_Symbol', length=20531)

In [394]:
duplicated_genes_lgg = lgg_df.columns[lgg_df.columns.duplicated()]

In [395]:
duplicated_genes_lgg, duplicated_genes_lgg.size

(Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
        'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2', 'PLEKHG7', 'QSOX1',
        'SH3D20', 'NCRNA00185'],
       dtype='object', name='Hugo_Symbol'),
 15)

In [396]:
lgg_df[duplicated_genes_lgg]

Hugo_Symbol,UBE2Q2P3,UBE2Q2P3.1,CC2D2B,CC2D2B.1,CCDC7,CCDC7.1,CYorf15B,CYorf15B.1,C1orf84,C1orf84.1,...,PALM2AKAP2,PALM2AKAP2.1,PLEKHG7,PLEKHG7.1,QSOX1,QSOX1.1,SH3D20,SH3D20.1,NCRNA00185,NCRNA00185.1
0,0.1204,0.7989,-0.2730,-0.2082,-1.0192,-0.4367,-2.5373,-2.7056,0.8659,-0.1468,...,-0.8288,-0.3835,-3.0508,0.7448,-0.8033,-1.4011,-1.3242,-2.0559,-3.5938,-2.6452
1,2.0184,0.6812,-0.5477,-1.7176,-1.4780,0.2740,0.6163,-0.0679,0.7573,0.4831,...,1.1722,0.0258,-3.0508,0.4260,-0.5981,0.3112,1.0871,2.3972,-0.7434,-0.2244
2,0.5099,-0.3681,-0.5439,-1.0661,-1.0685,0.2829,-2.3644,-2.7056,0.3092,-0.3960,...,0.0528,-0.1908,-3.0508,0.4304,-0.5141,-0.2807,0.6645,-0.0401,-3.5938,-2.6452
3,0.1055,-1.5361,1.9447,-1.2863,-0.9731,-0.2479,0.3417,-0.3550,-0.9484,0.0901,...,0.0400,-1.7515,-3.0508,1.2947,-0.9189,-0.6872,-0.5982,-0.6881,-0.4286,0.7366
4,-3.0774,1.2697,0.0083,-2.0670,-2.5467,-1.2908,0.4522,-0.6072,-0.4851,-0.9383,...,-0.5810,0.7429,-3.0508,-0.0327,-0.0813,0.3495,-2.1547,-2.0559,-0.3352,-0.0868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,-0.4550,0.2445,-0.5213,1.5519,1.9821,1.6671,0.3644,0.5690,-0.0508,0.2154,...,-0.2969,-0.8576,-3.0508,-1.9180,-0.1676,-0.9884,0.5174,0.0606,0.9545,0.9596
526,0.0533,0.5637,0.9423,0.9712,-0.4330,0.2049,0.5451,0.8440,0.3364,0.4922,...,-0.8748,-2.1362,-3.0508,-1.9180,-1.3380,-0.6535,0.5393,0.0346,1.0186,0.9108
527,0.4764,0.4807,0.1328,0.8347,-0.4891,1.6403,0.4723,0.8464,0.6590,1.3885,...,-1.3222,-2.4402,-3.0508,1.4308,-0.2889,-0.7901,0.3661,0.7359,1.3712,0.8436
528,-0.4688,-0.6996,-0.6119,0.7039,0.4369,1.0790,0.2293,0.4692,0.3473,0.4148,...,-3.0371,-2.7429,-3.0508,-1.9180,-1.0498,-1.4052,0.6880,-0.5682,0.7497,-0.1135


In [397]:
for gene in duplicated_genes_lgg:
    var_gene = np.var(lgg_df[gene])
    #print(var_gene)
    #print(lgg_df[gene])
    #print(var_gene.values)
    #print(np.where(var_gene.values[0]))
    #print(var_gene.values[0])
    #print(var_gene.values[1])
    if var_gene.values[0] < var_gene.values[1]:
        #print(var_gene.values[1])
        #print(lgg_df[gene].iloc[:, 1])
        lgg_df[gene] = lgg_df[gene].iloc[:, 1]
    else:
        #print(var_gene.values[0])
        #print(lgg_df[gene].iloc[:, 0])
        lgg_df[gene] = lgg_df[gene].iloc[:, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lgg_df[gene] = lgg_df[gene].iloc[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lgg_df[gene] = lgg_df[gene].iloc[:, 1]


In [398]:
lgg_df = lgg_df.iloc[:, ~lgg_df.columns.duplicated()]

In [399]:
lgg_df

Hugo_Symbol,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P
0,TCGA-CS-4938,-6.6197,0.1204,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,,...,-0.0068,0.7328,0.2305,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248
1,TCGA-CS-4941,-6.6197,2.0184,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,,...,1.0027,0.3329,-0.0679,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248
2,TCGA-CS-4942,-6.6197,0.5099,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,,...,1.1726,0.2307,-0.5447,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248
3,TCGA-CS-4943,-6.6197,0.1055,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,,...,0.6514,0.5074,0.1509,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248
4,TCGA-CS-4944,-6.6197,-3.0774,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,,...,-0.8384,-0.9739,-0.9234,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,TCGA-WY-A85A,-6.6197,-0.4550,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248
526,TCGA-WY-A85B,-6.6197,0.0533,-0.2036,0.2048,,0.6325,1.0021,-9.1723,,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248
527,TCGA-WY-A85C,-6.6197,0.4764,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248
528,TCGA-WY-A85D,-6.6197,-0.4688,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248


---

### merge gene expression, clincic, and sex data

In [400]:
lgg_df_with_sex = pd.merge(lgg_df, lgg_survival_info_df, how = 'inner', on = 'Patient ID')

In [401]:
lgg_df_with_sex

Unnamed: 0,Patient ID,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,TCGA-CS-4938,-6.6197,0.1204,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,,...,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248,0,0,117.41
1,TCGA-CS-4941,-6.6197,2.0184,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,,...,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248,1,1,7.69
2,TCGA-CS-4942,-6.6197,0.5099,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,,...,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248,0,1,43.86
3,TCGA-CS-4943,-6.6197,0.1055,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,,...,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248,1,1,36.33
4,TCGA-CS-4944,-6.6197,-3.0774,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,,...,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248,1,0,60.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,TCGA-WY-A85A,-6.6197,-0.4550,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,,...,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248,1,0,43.36
525,TCGA-WY-A85B,-6.6197,0.0533,-0.2036,0.2048,,0.6325,1.0021,-9.1723,,...,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248,1,0,45.76
526,TCGA-WY-A85C,-6.6197,0.4764,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,,...,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248,1,0,46.85
527,TCGA-WY-A85D,-6.6197,-0.4688,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,,...,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248,1,0,37.68


In [402]:
lgg_df_with_sex["Sex"].isna().sum()

0

In [403]:
lgg_df_with_sex = lgg_df_with_sex.drop(["Patient ID"], axis = 1)

In [404]:
lgg_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,MOXD2,LOC155060,RNU12-2P,SSX9,LOC317712,EZHIP,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-6.6197,0.1204,1.9011,1.1246,,-1.3087,-1.1762,-9.1723,,-0.8881,...,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248,0,0,117.41
1,-6.6197,2.0184,0.2737,-1.7890,,-0.3812,0.6052,-9.1723,,-0.2914,...,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248,1,1,7.69
2,-6.6197,0.5099,0.7771,-0.0370,,-1.8385,-2.1332,-9.1723,,-1.0961,...,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248,0,1,43.86
3,-6.6197,0.1055,1.9152,1.0794,,-2.5604,-1.3623,-9.1723,,-1.3753,...,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248,1,1,36.33
4,-6.6197,-3.0774,1.2474,0.9732,,-2.6144,-2.1332,-9.1723,,-0.4565,...,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248,1,0,60.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,-6.6197,-0.4550,-0.4091,0.1796,,0.4373,-0.1649,-9.1723,,0.7523,...,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248,1,0,43.36
525,-6.6197,0.0533,-0.2036,0.2048,,0.6325,1.0021,-9.1723,,1.3671,...,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248,1,0,45.76
526,-6.6197,0.4764,-0.5069,-0.6743,,1.1433,1.8313,-9.1723,,1.6516,...,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248,1,0,46.85
527,-6.6197,-0.4688,-0.8678,0.4390,,1.0804,-1.0383,-9.1723,,0.7536,...,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248,1,0,37.68


In [405]:
lgg_df_with_sex = lgg_df_with_sex.iloc[:, ~(lgg_df_with_sex.isna().sum() / lgg_df_with_sex.shape[0] > 0.8).values]

In [406]:
lgg_df_with_sex

Unnamed: 0,LOC100130426,UBE2Q2P3,HMGB1P1,TIMM23,LOC155060,RNU12-2P,SSX9,EZHIP,EFCAB8,SRP14P1,...,ZYG11A,ZYG11B,ZYX,FLJ10821,ZZZ3,TPTEP1,AKR1C6P,Sex,OS_STATUS,OS_MONTHS
0,-6.6197,0.1204,1.9011,1.1246,-1.3087,-1.1762,-9.1723,-0.8881,-0.7434,1.5350,...,-1.1888,0.4558,-0.8185,-1.2024,1.0014,0.8724,-2.9248,0,0,117.41
1,-6.6197,2.0184,0.2737,-1.7890,-0.3812,0.6052,-9.1723,-0.2914,-0.8872,1.9309,...,-0.7353,0.9001,0.9425,0.1612,1.4876,-1.0025,-2.9248,1,1,7.69
2,-6.6197,0.5099,0.7771,-0.0370,-1.8385,-2.1332,-9.1723,-1.0961,-0.8855,1.8880,...,-0.7343,0.6212,-0.1734,-0.4970,0.5814,1.2741,-2.9248,0,1,43.86
3,-6.6197,0.1055,1.9152,1.0794,-2.5604,-1.3623,-9.1723,-1.3753,-1.6784,1.1446,...,1.0283,0.9145,-0.2491,0.5798,0.5183,0.3283,-2.9248,1,1,36.33
4,-6.6197,-3.0774,1.2474,0.9732,-2.6144,-2.1332,-9.1723,-0.4565,-1.6784,2.8420,...,-0.5641,-0.1065,-0.7057,-2.9164,-0.5097,-0.9851,-2.9248,1,0,60.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,-6.6197,-0.4550,-0.4091,0.1796,0.4373,-0.1649,-9.1723,0.7523,1.3508,-0.2127,...,-1.1888,0.4482,-0.9482,-0.1755,0.4268,-1.4466,-2.9248,1,0,43.36
525,-6.6197,0.0533,-0.2036,0.2048,0.6325,1.0021,-9.1723,1.3671,-1.6784,0.7501,...,-0.2390,0.3560,-0.8485,0.1146,1.0037,2.0298,-2.9248,1,0,45.76
526,-6.6197,0.4764,-0.5069,-0.6743,1.1433,1.8313,-9.1723,1.6516,-0.7640,0.4344,...,0.8775,1.0635,-0.4482,0.7445,1.2134,1.5652,-2.9248,1,0,46.85
527,-6.6197,-0.4688,-0.8678,0.4390,1.0804,-1.0383,-9.1723,0.7536,-0.6087,1.5737,...,-1.1888,-0.1115,-0.6900,-0.4828,0.7991,1.2293,-2.9248,1,0,37.68


In [407]:
lgg_df_with_sex.isna().sum().sum()

0

In [408]:
lgg_df_with_sex.to_csv("TCGA_LGG_gene_expression_data.csv", index = False, header = True)

---

## Combine GBM with LGG

In [481]:
gbm_df_with_sex.columns

Index(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', 'TIMM23', 'LOC155060',
       'RNU12-2P', 'EZHIP', 'EFCAB8', 'SRP14P1', 'LOC391343',
       ...
       'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3', 'TPTEP1', 'AKR1C6P',
       'Sex', 'OS_STATUS', 'OS_MONTHS'],
      dtype='object', length=19810)

In [484]:
np.unique(gbm_df_with_sex.columns), np.unique(gbm_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 19810)

In [485]:
lgg_df_with_sex.columns

Index(['LOC100130426', 'UBE2Q2P3', 'HMGB1P1', 'TIMM23', 'LOC155060',
       'RNU12-2P', 'SSX9', 'EZHIP', 'EFCAB8', 'SRP14P1',
       ...
       'ZYG11A', 'ZYG11B', 'ZYX', 'FLJ10821', 'ZZZ3', 'TPTEP1', 'AKR1C6P',
       'Sex', 'OS_STATUS', 'OS_MONTHS'],
      dtype='object', length=20154)

In [486]:
np.unique(lgg_df_with_sex.columns), np.unique(lgg_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 20154)

In [487]:
np.intersect1d(gbm_df_with_sex.columns, lgg_df_with_sex.columns), np.intersect1d(gbm_df_with_sex.columns, lgg_df_with_sex.columns).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 19790)

In [488]:
common_genes = np.intersect1d(lgg_df_with_sex.columns.astype('str'), gbm_df_with_sex.columns.astype('str'))

In [489]:
np.unique(common_genes), np.unique(common_genes).shape

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 (19790,))

In [490]:
common_genes = np.delete(np.unique(common_genes), np.where(np.unique(common_genes) == 'nan'))
common_genes

array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
      dtype=object)

In [491]:
common_genes.shape

(19790,)

In [495]:
common_gbm_df_with_sex = pd.concat([gbm_df_with_sex[common_genes].drop(["Sex", "OS_STATUS", "OS_MONTHS"], axis = 1), gbm_df_with_sex[["Sex", "OS_STATUS", "OS_MONTHS"]]], axis = 1)

In [496]:
common_gbm_df_with_sex

Unnamed: 0,133K02,5T4,A-C1,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZZ3,Sex,OS_STATUS,OS_MONTHS
0,0.1576,0.2193,0.5925,-0.2759,0.1873,-5.2251,0.9457,1.7523,0.0029,-0.1983,...,0.5403,0.0360,0.4028,-0.8701,0.4574,-0.9779,-0.3770,1,1,14.72
1,1.5141,2.3847,-0.6062,1.0956,0.8677,-5.2251,1.3224,1.1842,-1.8813,-1.0060,...,-0.4143,-0.7361,-1.6643,0.9106,-0.7419,1.0307,0.3210,0,1,2.5
2,-0.9668,-0.1414,1.1859,0.6560,0.2520,-5.2251,0.1660,-0.0379,-1.6928,-2.0141,...,-2.1522,0.4548,-0.4459,3.6280,0.4629,-0.4795,0.2492,1,0,15.31
3,0.6316,0.5721,-0.2321,-0.7516,-1.1399,-5.2251,-1.4495,-1.7727,1.1083,-1.1320,...,0.1094,-0.8058,1.0686,-1.5164,-0.0848,-0.4707,0.2209,1,0,15.44
4,0.2750,0.8880,0.3817,-0.4479,-0.4035,-5.2251,1.2485,1.1915,0.6243,-2.0141,...,-1.0731,-1.8484,-1.4203,-1.5164,-2.0586,0.3169,-0.9185,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-0.7411,0.0733,0.0588,0.0398,0.0137,-5.2251,0.8316,0.2546,0.6240,-2.0141,...,-0.1382,-0.8198,-0.1896,-1.5164,-0.4434,0.2369,-0.8139,1,1,17.58
160,-0.0057,-0.7445,0.3318,-0.6426,-1.4120,-5.2251,0.9852,0.4493,-0.7233,-1.1910,...,-0.7678,-1.2737,-0.5106,-1.5164,-1.1488,1.7484,-1.0322,0,1,3.09
161,-0.7944,-0.9368,0.2811,0.2424,0.4920,-5.2251,-0.3080,-0.3670,1.5556,-0.3157,...,1.2540,0.7121,0.7392,0.2814,1.0287,-1.4953,-0.1884,0,1,3.65
162,-0.3734,-0.8762,-0.2960,-1.3948,-1.0064,-5.2251,-1.5342,0.3392,0.9148,0.7261,...,-0.1036,-0.0523,0.8455,-0.6217,0.2066,0.1723,-0.6144,0,1,9.17


In [497]:
common_lgg_df_with_sex = pd.concat([lgg_df_with_sex[common_genes].drop(["Sex", "OS_STATUS", "OS_MONTHS"], axis = 1), lgg_df_with_sex[["Sex", "OS_STATUS", "OS_MONTHS"]]], axis = 1)

In [498]:
common_lgg_df_with_sex

Unnamed: 0,133K02,5T4,A-C1,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZZ3,Sex,OS_STATUS,OS_MONTHS
0,0.3511,-1.4556,-0.6086,0.2608,0.6771,-2.3768,0.5027,-1.0804,0.3046,-1.8419,...,-0.0068,0.7328,0.2305,-1.1888,0.4558,-0.8185,1.0014,0,0,117.41
1,1.0612,1.6396,0.7407,-0.0775,-1.1849,-2.3768,0.8361,1.5687,1.4047,0.1129,...,1.0027,0.3329,-0.0679,-0.7353,0.9001,0.9425,1.4876,1,1,7.69
2,-1.0613,1.4825,1.0719,-0.0389,-0.5575,-2.3768,0.9586,-1.6784,0.4578,-1.0245,...,1.1726,0.2307,-0.5447,-0.7343,0.6212,-0.1734,0.5814,0,1,43.86
3,0.6761,-0.8206,0.0608,-1.1907,-1.3667,-2.3768,0.1032,-1.6856,0.4803,0.4545,...,0.6514,0.5074,0.1509,1.0283,0.9145,-0.2491,0.5183,1,1,36.33
4,0.1584,-1.5733,-0.1886,-1.4321,-1.7757,-2.3768,-0.0224,-0.5889,0.3801,-0.7184,...,-0.8384,-0.9739,-0.9234,-0.5641,-0.1065,-0.7057,-0.5097,1,0,60.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,-0.1732,-1.0319,0.4339,0.0259,0.1579,-2.3768,-0.2927,0.8416,0.9789,-0.6760,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,0.4268,1,0,43.36
525,0.0843,-1.1553,0.3367,0.1412,0.3205,-2.3768,0.8444,-0.4447,-0.8213,0.9120,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,1.0037,1,0,45.76
526,-0.3955,-0.9626,0.1863,0.6345,0.8045,-0.2991,-0.0160,-0.4782,-0.9126,0.3594,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,1.2134,1,0,46.85
527,-2.0450,-0.7424,0.8448,1.7572,1.0045,-2.3768,0.0948,0.5077,0.8312,-1.8419,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,0.7991,1,0,37.68


In [499]:
combined_gbm_lgg_with_sex = pd.concat([common_gbm_df_with_sex, common_lgg_df_with_sex]).reset_index(drop = True)

In [500]:
combined_gbm_lgg_with_sex

Unnamed: 0,133K02,5T4,A-C1,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZZ3,Sex,OS_STATUS,OS_MONTHS
0,0.1576,0.2193,0.5925,-0.2759,0.1873,-5.2251,0.9457,1.7523,0.0029,-0.1983,...,0.5403,0.0360,0.4028,-0.8701,0.4574,-0.9779,-0.3770,1,1,14.72
1,1.5141,2.3847,-0.6062,1.0956,0.8677,-5.2251,1.3224,1.1842,-1.8813,-1.0060,...,-0.4143,-0.7361,-1.6643,0.9106,-0.7419,1.0307,0.3210,0,1,2.5
2,-0.9668,-0.1414,1.1859,0.6560,0.2520,-5.2251,0.1660,-0.0379,-1.6928,-2.0141,...,-2.1522,0.4548,-0.4459,3.6280,0.4629,-0.4795,0.2492,1,0,15.31
3,0.6316,0.5721,-0.2321,-0.7516,-1.1399,-5.2251,-1.4495,-1.7727,1.1083,-1.1320,...,0.1094,-0.8058,1.0686,-1.5164,-0.0848,-0.4707,0.2209,1,0,15.44
4,0.2750,0.8880,0.3817,-0.4479,-0.4035,-5.2251,1.2485,1.1915,0.6243,-2.0141,...,-1.0731,-1.8484,-1.4203,-1.5164,-2.0586,0.3169,-0.9185,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,-0.1732,-1.0319,0.4339,0.0259,0.1579,-2.3768,-0.2927,0.8416,0.9789,-0.6760,...,-0.3595,-0.6869,0.0627,-1.1888,0.4482,-0.9482,0.4268,1,0,43.36
689,0.0843,-1.1553,0.3367,0.1412,0.3205,-2.3768,0.8444,-0.4447,-0.8213,0.9120,...,-0.1344,0.5929,0.3519,-0.2390,0.3560,-0.8485,1.0037,1,0,45.76
690,-0.3955,-0.9626,0.1863,0.6345,0.8045,-0.2991,-0.0160,-0.4782,-0.9126,0.3594,...,0.3438,0.7846,0.9941,0.8775,1.0635,-0.4482,1.2134,1,0,46.85
691,-2.0450,-0.7424,0.8448,1.7572,1.0045,-2.3768,0.0948,0.5077,0.8312,-1.8419,...,-1.2175,-1.1043,0.6974,-1.1888,-0.1115,-0.6900,0.7991,1,0,37.68


In [501]:
combined_gbm_lgg_with_sex.isna().sum().sum()

0

---

### Remove non pathway-genes from GBM & LGG

In [502]:
combined_gbm_lgg_with_sex.columns.values[:-3], combined_gbm_lgg_with_sex.columns.values[:-3].size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 19787)

In [503]:
np.unique(combined_gbm_lgg_with_sex.columns.values[:-3]), np.unique(combined_gbm_lgg_with_sex.columns.values[:-3]).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 19787)

In [504]:
combined_gbm_lgg_with_sex.columns.values, combined_gbm_lgg_with_sex.columns.values.size

(array(['133K02', '5T4', 'A-C1', ..., 'Sex', 'OS_STATUS', 'OS_MONTHS'],
       dtype=object),
 19790)

In [505]:
np.unique(combined_gbm_lgg_with_sex.columns.values), np.unique(combined_gbm_lgg_with_sex.columns.values).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11B', 'ZYX', 'ZZZ3'],
       dtype=object),
 19790)

In [506]:
removable_gene_gbm_lgg = np.setdiff1d(combined_gbm_lgg_with_sex.columns.values[:-3], unique_gene_in_pathway)

In [507]:
removable_gene_gbm_lgg, removable_gene_gbm_lgg.size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15437)

In [508]:
np.unique(removable_gene_gbm_lgg), np.unique(removable_gene_gbm_lgg).size

(array(['133K02', '5T4', 'A-C1', ..., 'ZYG11A', 'ZYG11B', 'ZZZ3'],
       dtype=object),
 15437)

In [509]:
combined_gbm_lgg_with_sex = combined_gbm_lgg_with_sex.drop(removable_gene_gbm_lgg, axis = 1)

In [510]:
combined_gbm_lgg_with_sex

Unnamed: 0,A2M,AACS,AADAT,AANAT,AARS2,AASDH,AASDHPPT,AASS,ABAT,ABCA10,...,ZFYVE16,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,0.9457,0.2310,0.4380,0.5911,-0.5742,-0.3994,0.1820,-0.3404,1.0665,0.3920,...,0.8032,0.7179,-1.8458,-0.7945,0.8365,-0.3184,-0.9779,1,1,14.72
1,1.3224,-1.1385,-1.6921,-0.8731,-0.2268,0.1284,-1.8299,-1.3493,-3.2963,-0.5713,...,0.1094,-0.5526,-1.0222,-0.2260,-0.8981,0.5467,1.0307,0,1,2.5
2,0.1660,0.3063,0.6997,-1.1499,-0.1483,-0.5233,1.2384,-1.0504,-0.4303,-2.2010,...,-1.5679,0.5098,0.2495,0.0916,-2.6503,1.5333,-0.4795,1,0,15.31
3,-1.4495,-1.2758,-0.0801,-1.8551,-0.0765,-0.5345,-0.7625,0.3655,0.9717,1.4854,...,-0.0324,0.5402,1.2708,0.7992,-1.7368,-0.7350,-0.4707,1,0,15.44
4,1.2485,-0.4453,0.2542,0.4176,-0.1170,-0.3279,-0.0754,-0.8850,-1.8596,-0.6940,...,-1.1768,-0.2062,0.0956,1.6558,-0.6314,-0.4226,0.3169,1,1,20.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,-0.2927,-0.0071,1.3750,0.1424,-0.2869,0.6011,0.0011,0.2679,0.2561,-1.0454,...,0.8332,0.3819,0.3507,0.0267,0.0615,1.0639,-0.9482,1,0,43.36
689,0.8444,-0.8818,1.7158,-0.8667,0.6668,0.8342,0.8409,0.6754,-0.2144,-0.1674,...,2.1262,0.2318,-0.7200,0.5324,-1.0072,1.2301,-0.8485,1,0,45.76
690,-0.0160,-0.5032,1.7822,0.3375,0.6621,0.7183,1.2917,0.8670,0.1620,0.1908,...,1.7647,1.0268,1.2121,0.2512,-1.2149,2.0410,-0.4482,1,0,46.85
691,0.0948,0.2616,0.4672,1.5114,-0.1265,0.0345,-1.1829,0.2674,0.4877,-0.0156,...,0.6297,-0.0323,0.9821,0.4749,-1.0888,1.2793,-0.6900,1,0,37.68


In [511]:
19787 - 15437

4350

In [512]:
np.intersect1d(combined_gbm_lgg_with_sex.columns.values, unique_gene_in_pathway)

array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
      dtype=object)

In [513]:
np.intersect1d(combined_gbm_lgg_with_sex.columns.values, unique_gene_in_pathway).size

4350

In [514]:
combined_gbm_lgg_with_sex.to_csv("TCGA_GBM_&_LGG_gene_expression_data.csv", index = False, header = True)

---

## Pathway Mask

In [515]:
combined_gbm_lgg_with_sex.columns.values[:-3], combined_gbm_lgg_with_sex.columns.values[:-3].size

(array(['A2M', 'AACS', 'AADAT', ..., 'ZMAT3', 'ZNF274', 'ZYX'],
       dtype=object),
 4350)

In [516]:
gene_name = combined_gbm_lgg_with_sex.columns.values[:-3]

In [517]:
gene_name.size, np.unique(gene_name).size

(4350, 4350)

In [518]:
pathway_name, pathway_name.size

(0              KEGG_N_GLYCAN_BIOSYNTHESIS
 1           KEGG_OTHER_GLYCAN_DEGRADATION
 2              KEGG_O_GLYCAN_BIOSYNTHESIS
 3      KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
 4            KEGG_GLYCEROLIPID_METABOLISM
                       ...                
 168                           KEGG_ASTHMA
 169       KEGG_AUTOIMMUNE_THYROID_DISEASE
 170              KEGG_ALLOGRAFT_REJECTION
 171        KEGG_GRAFT_VERSUS_HOST_DISEASE
 172                KEGG_VIRAL_MYOCARDITIS
 Name: Name, Length: 173, dtype: object,
 173)

In [519]:
pathway_sparse_mat = sparse.coo_matrix((pathway_name.size, gene_name.size)).toarray()
pathway_sparse_mat.shape

(173, 4350)

In [520]:
pathway_sparse_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [521]:
pathway_sparse_mat.sum().sum()

0.0

In [522]:
for i in range(len(pathway_name)):
    pathway_sparse_mat[i, np.argwhere(np.isin(gene_name, set_pathway['gene'][i])).reshape((-1, ))] = 1.
            
pathway_sparse_mat, pathway_sparse_mat.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (173, 4350))

In [523]:
pathway_sparse_mat.sum()

10792.0

---

In [524]:
data_type = "GBM_&_LGG"

In [525]:
pathway_sparse_mat = sparse.coo_matrix(pathway_sparse_mat)
sparse.save_npz(f"TCGA_{data_type}_Pathway_Mask.npz", pathway_sparse_mat)