In [1]:
import pandas as pd

In [2]:
coad_df = pd.read_csv("COAD_df.csv")

In [3]:
"""
TNK2 = ENSG00000061938
filter TNK2
"""
coad_tnk2 = coad_df.loc[coad_df['Gene'].str.startswith('ENSG00000061938')]

In [4]:
"""
calculate the mean of all the genes
"""

coad_df['mean'] = coad_df.mean(axis=1)
coad_df['median'] = coad_df.median(axis=1)

In [5]:
# coad_df.head(2)

In [6]:
"""
TCGA-A6-A565 - TNK2 Q571Sfs*3
TCGA-AD-6964 - TNK2 Q571Sfs*3
TCGA-G4-6586 - TNK2 D558Rfs*40
TCGA-F4-6856 - TNK2 Q909Rfs*2
"""

'\nTCGA-A6-A565 - TNK2 Q571Sfs*3\nTCGA-AD-6964 - TNK2 Q571Sfs*3\nTCGA-G4-6586 - TNK2 D558Rfs*40\nTCGA-F4-6856 - TNK2 Q909Rfs*2\n'

In [7]:
"""
Identify patients with ACK1 truncation (TNK2 Q571Sfs*3)
"""
#finds columns starts with 
columns_starting_with_TC = coad_tnk2.filter(regex='^(TCGA-G4-6586|TCGA-AD-6964|TCGA-A6-A565|TCGA-F4-6856)').columns

In [8]:
columns_to_filter = ['Gene'] + list(columns_starting_with_TC)

In [9]:
"""
filter
"""
coad_tnk2_trunc = coad_tnk2.filter(items = columns_to_filter)

In [10]:
coad_tnk2_trunc.head(2)

Unnamed: 0,Gene,TCGA-A6-A565-01A-31R-A28H-07,TCGA-AD-6964-01A-11R-1928-07,TCGA-F4-6856-01A-11R-1928-07,TCGA-G4-6586-01A-11R-1774-07
835,ENSG00000061938.20,22.7127,12.3871,16.9094,26.3835


In [11]:
"""
1. transpose DF
2. remove 1st raw
"""
coad_tnk2_trunc_t = coad_tnk2_trunc.T.reset_index()
coad_tnk2_trunc_t = coad_tnk2_trunc_t.drop(0)

In [12]:
coad_tnk2_trunc_t.columns

Index(['index', 835], dtype='object')

In [13]:
#rename 
coad_tnk2_trunc_t = coad_tnk2_trunc_t.rename(columns={'index':'ID', 835:'COAD_trunc'})

In [14]:
coad_tnk2_trunc_t

Unnamed: 0,ID,COAD_trunc
1,TCGA-A6-A565-01A-31R-A28H-07,22.7127
2,TCGA-AD-6964-01A-11R-1928-07,12.3871
3,TCGA-F4-6856-01A-11R-1928-07,16.9094
4,TCGA-G4-6586-01A-11R-1774-07,26.3835


In [22]:
# create dictionary mapping IDs to labels
id_to_label = {
    'TCGA-A6-A565': 'Q571Sfs*3',
    'TCGA-AD-6964': 'Q571Sfs*3',
    'TCGA-G4-6586': 'D558Rfs*40',
    'TCGA-F4-6856': 'Q909Rfs*2'
}

# define function to map IDs to labels
def map_id_to_label(id):
    for key in id_to_label:
        if id.startswith(key):
            return id_to_label[key]
    return ''

# apply function to create label column
coad_tnk2_trunc_t['label'] = coad_tnk2_trunc_t['ID'].apply(map_id_to_label)




In [23]:
coad_tnk2_trunc_t

Unnamed: 0,ID,COAD_trunc,label
1,TCGA-A6-A565-01A-31R-A28H-07,22.7127,Q571Sfs*3
2,TCGA-AD-6964-01A-11R-1928-07,12.3871,Q571Sfs*3
3,TCGA-F4-6856-01A-11R-1928-07,16.9094,Q909Rfs*2
4,TCGA-G4-6586-01A-11R-1774-07,26.3835,D558Rfs*40


In [17]:
"""
1. select ACK1 none truncations
"""
#filter columns of df1 not in df2
cols_to_keep = coad_tnk2.columns[~coad_tnk2.columns.isin(coad_tnk2_trunc.columns)]
coad_tnk2_no_trunc = coad_tnk2[cols_to_keep]

In [18]:
#traspose
coad_tnk2_no_trunc_t =  coad_tnk2_no_trunc.T.reset_index()
#rename 
coad_tnk2_no_trunc_t = coad_tnk2_no_trunc_t.rename(columns={'index':'ID', 835:'COAD_No_trunc'})

In [33]:
coad_tnk2_no_trunc_t.head(2)

Unnamed: 0,ID,COAD_No_trunc
0,TCGA-D5-6540-01A-11R-1723-07,16.602
1,TCGA-AA-3525-11A-01R-A32Z-07,6.9256


In [34]:
len(coad_tnk2_no_trunc_t)

520

In [30]:
# concatenate dataframes vertically
coad = pd.concat([coad_tnk2_no_trunc_t, coad_tnk2_trunc_t]).reset_index(drop=True)

In [31]:
coad

Unnamed: 0,ID,COAD_No_trunc,COAD_trunc,label
0,TCGA-D5-6540-01A-11R-1723-07,16.6020,,
1,TCGA-AA-3525-11A-01R-A32Z-07,6.9256,,
2,TCGA-AA-3525-01A-02R-0826-07,6.8027,,
3,TCGA-AA-3815-01A-01R-1022-07,23.4176,,
4,TCGA-D5-6923-01A-11R-A32Z-07,19.7164,,
...,...,...,...,...
519,TCGA-AA-3814-01A-01R-0905-07,7.2063,,
520,TCGA-A6-A565-01A-31R-A28H-07,,22.7127,Q571Sfs*3
521,TCGA-AD-6964-01A-11R-1928-07,,12.3871,Q571Sfs*3
522,TCGA-F4-6856-01A-11R-1928-07,,16.9094,Q909Rfs*2


# STAD

In [36]:
stad_df = pd.read_csv("TCGA-STAD_df.csv")

In [37]:
"""
TNK2 = ENSG00000061938
filter TNK2
"""
stad_tnk2 = stad_df.loc[stad_df['Gene'].str.startswith('ENSG00000061938')]

In [None]:
"""
TCGA-BR-8078 - TNK2 Q909Rfs*2
TCGA-VQ-A8PT - TNK2 Q909Rfs*2
TCGA-RD-A8NB - TNK2 331*

"""

In [47]:
"""
Identify patients with ACK1 truncation
"""
#finds columns starts with 
columns_starting_with_TC_stad = stad_tnk2.filter(regex='^(TCGA-VQ-A8PT|TCGA-BR-8078|TCGA-RD-A8NB)').columns

In [48]:
columns_starting_with_TC_stad

Index(['TCGA-RD-A8NB-01A-12R-A39E-31', 'TCGA-BR-8078-01A-11R-2343-13'], dtype='object')

In [43]:
columns_to_filter_stad = ['Gene'] + list(columns_starting_with_TC_stad)

In [44]:
"""
filter
"""
stad_tnk2_trunc = stad_tnk2.filter(items = columns_to_filter_stad)

In [45]:
stad_tnk2_trunc

Unnamed: 0,Gene,TCGA-RD-A8NB-01A-12R-A39E-31,TCGA-BR-8078-01A-11R-2343-13
835,ENSG00000061938.20,24.2963,22.5231


In [51]:
"""
1. transpose DF
2. remove 1st raw
"""
stad_tnk2_trunc_t = stad_tnk2_trunc.T.reset_index()
stad_tnk2_trunc_t = stad_tnk2_trunc_t.drop(0)
#rename 
stad_tnk2_trunc_t = stad_tnk2_trunc_t.rename(columns={'index':'ID', 835:'STAD_trunc'})

In [52]:
stad_tnk2_trunc_t

Unnamed: 0,ID,STAD_trunc
1,TCGA-RD-A8NB-01A-12R-A39E-31,24.2963
2,TCGA-BR-8078-01A-11R-2343-13,22.5231


In [53]:
# create dictionary mapping IDs to labels
id_to_label = {
    'TCGA-BR-8078': 'Q909Rfs*2',
    'TCGA-VQ-A8PT': 'Q909Rfs*2',
    'TCGA-RD-A8NB': '331*'
}

# define function to map IDs to labels
def map_id_to_label(id):
    for key in id_to_label:
        if id.startswith(key):
            return id_to_label[key]
    return ''

# apply function to create label column
stad_tnk2_trunc_t['label'] = stad_tnk2_trunc_t['ID'].apply(map_id_to_label)




In [54]:
stad_tnk2_trunc_t

Unnamed: 0,ID,STAD_trunc,label
1,TCGA-RD-A8NB-01A-12R-A39E-31,24.2963,331*
2,TCGA-BR-8078-01A-11R-2343-13,22.5231,Q909Rfs*2


In [55]:
"""
1. select ACK1 none truncations
"""
#filter columns of df1 not in df2
cols_to_keep = stad_tnk2.columns[~stad_tnk2.columns.isin(stad_tnk2_trunc.columns)]
stad_tnk2_no_trunc = stad_tnk2[cols_to_keep]

In [57]:
#traspose
stad_tnk2_no_trunc_t =  stad_tnk2_no_trunc.T.reset_index()
#rename 
stad_tnk2_no_trunc_t = stad_tnk2_no_trunc_t.rename(columns={'index':'ID', 835:'STAD_No_trunc'})

In [58]:
stad_tnk2_no_trunc_t

Unnamed: 0,ID,STAD_No_trunc
0,TCGA-BR-8362-01A-11R-2343-13,14.0465
1,TCGA-3M-AB46-01A-11R-A414-31,4.6331
2,TCGA-BR-4357-01A-01R-1157-13,11.7281
3,TCGA-D7-A748-01A-12R-A32D-31,8.6376
4,TCGA-CD-A487-01A-21R-A24K-31,15.4969
...,...,...
441,TCGA-CD-A48C-01A-11R-A24K-31,30.2312
442,TCGA-HU-A4GN-11A-12R-A251-31,9.9068
443,TCGA-HU-A4GP-01A-11R-A251-31,38.6135
444,TCGA-HU-A4GP-11A-21R-A251-31,9.9396


In [59]:
# concatenate dataframes vertically
stad = pd.concat([stad_tnk2_no_trunc_t, stad_tnk2_trunc_t]).reset_index(drop=True)

In [60]:
stad

Unnamed: 0,ID,STAD_No_trunc,STAD_trunc,label
0,TCGA-BR-8362-01A-11R-2343-13,14.0465,,
1,TCGA-3M-AB46-01A-11R-A414-31,4.6331,,
2,TCGA-BR-4357-01A-01R-1157-13,11.7281,,
3,TCGA-D7-A748-01A-12R-A32D-31,8.6376,,
4,TCGA-CD-A487-01A-21R-A24K-31,15.4969,,
...,...,...,...,...
443,TCGA-HU-A4GP-01A-11R-A251-31,38.6135,,
444,TCGA-HU-A4GP-11A-21R-A251-31,9.9396,,
445,TCGA-IN-A7NR-01A-11R-A354-31,7.6221,,
446,TCGA-RD-A8NB-01A-12R-A39E-31,,24.2963,331*


In [61]:
# concatenate dataframes vertically
stad_coad = pd.concat([stad, coad]).reset_index(drop=True)

In [62]:
stad_coad

Unnamed: 0,ID,STAD_No_trunc,STAD_trunc,label,COAD_No_trunc,COAD_trunc
0,TCGA-BR-8362-01A-11R-2343-13,14.0465,,,,
1,TCGA-3M-AB46-01A-11R-A414-31,4.6331,,,,
2,TCGA-BR-4357-01A-01R-1157-13,11.7281,,,,
3,TCGA-D7-A748-01A-12R-A32D-31,8.6376,,,,
4,TCGA-CD-A487-01A-21R-A24K-31,15.4969,,,,
...,...,...,...,...,...,...
967,TCGA-AA-3814-01A-01R-0905-07,,,,7.2063,
968,TCGA-A6-A565-01A-31R-A28H-07,,,Q571Sfs*3,,22.7127
969,TCGA-AD-6964-01A-11R-1928-07,,,Q571Sfs*3,,12.3871
970,TCGA-F4-6856-01A-11R-1928-07,,,Q909Rfs*2,,16.9094


In [63]:
"""
Save as csv
"""
cols=stad_coad.columns.to_list()
stad_coad.to_csv('stad_coad.csv', columns= cols, index= False)