In [15]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from Processing_Functions import remove_redun, map_IDs

Import National Institute of Health Genomic Data Commons (GDA) and Human Protein Atlas (HPA) Data 

In [16]:
# Process the gdc dataset to include only desired columns and scores
gdc = pd.read_csv("../data/raw_data/gdc_luad_genes.csv")

featureNameColumns = ['# SSM Affected Cases in Cohort', '# CNV Gain', '# CNV Loss']
for i in featureNameColumns:
    gdc[['1', '2', '3', '4']] = gdc[i].replace({',':''}, regex=True).str.split(' ', 3, expand=True)
    gdc[i] = gdc['1'].astype(float)/gdc['3'].astype(float)
    gdc = gdc.drop(['1', '2', '3', '4'], axis=1)

gdc.drop(['Symbol', 'Name', 'Cytoband', 'Type', 'Annotations', 'Survival'], axis=1, inplace=True)

gdc[['1', '2', '3']] = gdc['# SSM Affected Cases Across the GDC'].replace({',':''}, regex=True).str.split(' ', 2, expand=True)
gdc['# SSM Affected Cases Across the GDC'] = gdc['1'].astype(float)/gdc['3'].astype(float)
gdc = gdc.drop(['1', '2', '3'], axis=1)

gdc = gdc.rename({'# SSM Affected Cases in Cohort': 'nih_ssm_in_cohort', '# SSM Affected Cases Across the GDC':'nih_ssm_across_gdc',
    '# CNV Gain':'nih_cnv_gain', '# CNV Loss':'nih_cnv_loss', 'Gene ID':'ensembl', '# Mutations':'nih_tot_mutations'}, axis=1)

gdc

Unnamed: 0,ensembl,nih_ssm_in_cohort,nih_ssm_across_gdc,nih_cnv_gain,nih_cnv_loss,nih_tot_mutations
0,ENSG00000147481,0.144621,0.028155,0.076023,0.056530,92
1,ENSG00000105877,0.144621,0.084772,0.074074,0.007797,105
2,ENSG00000188107,0.144621,0.068585,0.111111,0.056530,95
3,ENSG00000125414,0.144621,0.062601,0.005848,0.042885,95
4,ENSG00000009694,0.144621,0.071653,0.052632,0.025341,110
...,...,...,...,...,...,...
21087,ENSG00000146648,0.146384,0.052244,0.105263,0.019493,51
21088,ENSG00000133958,0.146384,0.059916,0.025341,0.046784,110
21089,ENSG00000134376,0.146384,0.056617,0.085770,0.007797,93
21090,ENSG00000109061,0.146384,0.057691,0.009747,0.040936,93


In [17]:
hpa = pd.read_csv('../data/hpa_gene_features.tsv', sep='\t').drop_duplicates(subset='Gene')

identifiers = [
    "Gene",
    "Ensembl"
]
discrete_features = [
    "Protein class",
    "Biological process",
    "Molecular function",
    "Disease involvement",
    "Subcellular location",
]
continuous_features = [
    "Tissue RNA - lung [NX]",
    "Single Cell Type RNA - Mucus-secreting cells [NX]"
]

hpa_features = hpa.iloc[:, hpa.columns.isin(identifiers+discrete_features+continuous_features)]

hpa_features["Tissue RNA - lung [NX]"] = (hpa_features["Tissue RNA - lung [NX]"] - hpa_features["Tissue RNA - lung [NX]"].mean()) / hpa_features["Tissue RNA - lung [NX]"].std()
col = "Single Cell Type RNA - Mucus-secreting cells [NX]"
hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()

def explode(feature) :
    return feature.apply(lambda x: x.replace(' ', '').split(','))

hpa_clean = hpa.fillna('')
for ft in discrete_features :
    hpa_clean[ft] = explode(hpa_clean[ft])

protein_class = hpa_clean["Protein class"].explode().unique()
biological_process = hpa_clean["Biological process"].explode().unique()
molecular_function = hpa_clean["Molecular function"].explode().unique()
disease_involvement = hpa_clean["Disease involvement"].explode().unique()
subcellular_location = hpa_clean["Subcellular location"].explode().unique()
GO_features = np.concatenate([protein_class, biological_process, molecular_function, disease_involvement, subcellular_location])

RowFeatures = pd.DataFrame(data = 0,index = hpa_clean['Ensembl'],columns=GO_features)
counter = 0

for index, row in RowFeatures.iterrows() :
    features = hpa_clean.iloc[counter][['Protein class', 'Biological process', 'Molecular function', 'Disease involvement', 'Subcellular location']].to_list()
    flattened = [item for sublist in features for item in sublist if item]
    for t in flattened :
        row[t] = 1
    counter +=1 

n_comp = 100
svd = TruncatedSVD(n_components = n_comp)
svdModel = svd.fit(RowFeatures)
visits_emb = svdModel.transform(RowFeatures)
hpa = pd.DataFrame(data=visits_emb, index=RowFeatures.index).reset_index().rename(columns={"Ensembl":"ensembl"})
hpa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpa_features["Tissue RNA - lung [NX]"] = (hpa_features["Tissue RNA - lung [NX]"] - hpa_features["Tissue RNA - lung [NX]"].mean()) / hpa_features["Tissue RNA - lung [NX]"].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()


Unnamed: 0,ensembl,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,ENSG00000175899,0.443505,0.313574,0.430307,-0.285337,-0.351412,1.545059,-0.450936,-0.444200,-0.116774,...,-0.018373,0.094550,0.009357,-0.039162,0.032721,-0.079607,0.503267,0.576245,0.444453,-0.574186
1,ENSG00000128274,0.532289,0.789331,-0.122159,0.019000,1.427162,-0.065659,0.001669,0.386983,-0.028900,...,-0.007169,0.257972,-0.085797,0.044028,0.166343,-0.110665,0.140087,-0.089804,0.000472,0.025348
2,ENSG00000094914,1.605229,0.936681,0.345886,0.183699,-1.001764,-0.846346,0.550995,0.368205,0.202731,...,0.339739,0.019335,-0.186211,-0.270075,-0.205805,0.027753,-0.048839,-0.054099,-0.007369,0.078670
3,ENSG00000081760,0.467196,0.690415,-0.066723,0.335478,1.017124,0.029579,-0.131714,0.212387,0.075120,...,-0.102811,-0.352452,0.199631,-0.049215,-0.092627,0.010099,0.120493,0.138474,-0.167452,0.051508
4,ENSG00000114771,0.956718,0.202132,-0.490047,-0.524460,0.467132,-0.207775,-0.494652,0.192615,0.077836,...,0.003828,-0.002623,0.002410,0.000992,-0.009565,-0.015291,0.011153,-0.007147,-0.007970,0.007936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15008,ENSG00000198455,1.436813,-1.455513,0.510199,0.224208,0.323736,-0.171141,-0.060697,0.088626,0.103071,...,-0.038890,0.041711,-0.017337,-0.032425,-0.000801,0.011581,-0.021367,0.014229,-0.007050,0.008374
15009,ENSG00000070476,1.140170,-1.162868,0.551409,0.127004,0.119115,-0.120434,-0.421260,0.798186,0.041917,...,-0.029933,0.029976,-0.026192,-0.024701,0.009399,0.022374,-0.019030,0.024306,-0.011823,0.017133
15010,ENSG00000162378,0.782655,-0.057857,-0.397211,0.017498,-0.135123,-0.248882,-0.438176,0.113363,-0.295955,...,0.010618,0.012477,-0.010456,0.005255,0.017619,0.049873,0.018236,-0.051592,0.049855,-0.009883
15011,ENSG00000159840,1.042306,0.183392,-0.327697,0.065425,-0.459893,0.507171,-0.610148,0.067066,0.368216,...,0.318299,0.066721,0.485114,-0.249986,0.321037,-0.002111,-0.092111,-0.032771,0.033117,-0.052496


In [18]:
master = hpa.merge(gdc, on="ensembl")
master

Unnamed: 0,ensembl,0,1,2,3,4,5,6,7,8,...,95,96,97,98,99,nih_ssm_in_cohort,nih_ssm_across_gdc,nih_cnv_gain,nih_cnv_loss,nih_tot_mutations
0,ENSG00000175899,0.443505,0.313574,0.430307,-0.285337,-0.351412,1.545059,-0.450936,-0.444200,-0.116774,...,-0.079607,0.503267,0.576245,0.444453,-0.574186,0.049383,0.031607,0.042885,0.066277,29
1,ENSG00000128274,0.532289,0.789331,-0.122159,0.019000,1.427162,-0.065659,0.001669,0.386983,-0.028900,...,-0.110665,0.140087,-0.089804,0.000472,0.025348,0.001764,0.009436,0.048733,0.025341,1
2,ENSG00000094914,1.605229,0.936681,0.345886,0.183699,-1.001764,-0.846346,0.550995,0.368205,0.202731,...,0.027753,-0.048839,-0.054099,-0.007369,0.078670,0.005291,0.008822,0.068226,0.019493,3
3,ENSG00000081760,0.467196,0.690415,-0.066723,0.335478,1.017124,0.029579,-0.131714,0.212387,0.075120,...,0.010099,0.120493,0.138474,-0.167452,0.051508,0.031746,0.015036,0.038986,0.052632,19
4,ENSG00000114771,0.956718,0.202132,-0.490047,-0.524460,0.467132,-0.207775,-0.494652,0.192615,0.077836,...,-0.015291,0.011153,-0.007147,-0.007970,0.007936,0.010582,0.010817,0.066277,0.011696,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14893,ENSG00000198455,1.436813,-1.455513,0.510199,0.224208,0.323736,-0.171141,-0.060697,0.088626,0.103071,...,0.011581,-0.021367,0.014229,-0.007050,0.008374,0.024691,0.023168,0.033138,0.011696,16
14894,ENSG00000070476,1.140170,-1.162868,0.551409,0.127004,0.119115,-0.120434,-0.421260,0.798186,0.041917,...,0.022374,-0.019030,0.024306,-0.011823,0.017133,0.014109,0.013195,0.033138,0.015595,10
14895,ENSG00000162378,0.782655,-0.057857,-0.397211,0.017498,-0.135123,-0.248882,-0.438176,0.113363,-0.295955,...,0.049873,0.018236,-0.051592,0.049855,-0.009883,0.022928,0.011891,0.035088,0.048733,14
14896,ENSG00000159840,1.042306,0.183392,-0.327697,0.065425,-0.459893,0.507171,-0.610148,0.067066,0.368216,...,-0.002111,-0.092111,-0.032771,0.033117,-0.052496,0.012346,0.011047,0.064327,0.023392,8


Import STRING Dataset

In [20]:
el_map = pd.read_csv('../data/raw_data/9606.protein.aliases.v11.5.txt', sep="\t")
el = pd.read_csv('../data/raw_data/9606.protein.links.detailed.v11.5.txt', sep=" ")
el_map.loc[el_map.source == 'Ensembl_gene']

el = remove_redun(el, True)
el = map_IDs(el, el_map, verbose=True)

el_map

Original Size:  11938498


In [None]:
print(el.index.unique)

<bound method Index.unique of Int64Index([      0,       1,       2,       3,       4,       5,       6,
                  7,       8,       9,
            ...
            5969239, 5969240, 5969241, 5969242, 5969243, 5969244, 5969245,
            5969246, 5969247, 5969248],
           dtype='int64', length=5969249)>


Disease Gene Network Merge

In [None]:
dgn = pd.read_csv("../data/raw_data/gda_disease_summary_luad.csv")
dgn_dict = pd.read_csv("../data/raw_data/gda_dictionary.csv")

score_threshold = 0.02
ei_threshold = 0.7

dgn = dgn[['Gene', 'EI_gda', 'Score_gda']]
dgn = dgn.loc[dgn['Score_gda'] >= score_threshold]
dgn = dgn.loc[dgn['EI_gda'] > ei_threshold]
dgn.rename({'Score_gda':'gda_score'}, axis=1, inplace=True)
dgn = dgn.merge(dgn_dict, on="Gene").drop(['Unnamed: 0', 'Unnamed: 0.1', 'ensembl'], axis=1)
dgn

Unnamed: 0,Gene,EI_gda,gda_score
0,NAT2,1.0,0.02
1,ADA,1.0,0.02
2,CDH2,1.0,0.02
3,CDKN2B-AS1,1.0,0.12
4,HOTAIR,1.0,0.02
...,...,...,...
978,ZEB2,1.0,0.04
979,CDC20,1.0,0.03
980,MFN2,1.0,0.03
981,MVP,1.0,0.03
