In [1]:
import pandas as pd
import re
import networkx as nx
import csv
import matplotlib.pyplot as plt
import numpy as np

## 1.1) Explore information sources and compile the seed gene list:
a) Get the list of human genes (i.e. the **seed list**) involved in the disease **Cardiomyopathy, Dilated** from the dataset *“Curated gene-disease associations”* (from *https://www.disgenet.org/downloads*)

In [2]:
root = "C:/Users/clara/Documents/Bio/"
path = "C:/Users/clara/Documents/Bio/curated_gene_disease_associations.tsv"
curated_gene_desease_association_DF = pd.read_csv(path, sep = '\t', compression = 'infer')

In [3]:
cardio_DF = curated_gene_desease_association_DF.loc[curated_gene_desease_association_DF['diseaseId'] == 'C0007193']
print('Number of detected genes involved in the desease "Cardiomyopathy, Dilated": ', len(cardio_DF))
cardio_DF.head(5)

Number of detected genes involved in the desease "Cardiomyopathy, Dilated":  48


Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
502,58,ACTA1,0.54,0.769,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.4,1.0,2006.0,2013.0,2,0,GENOMICS_ENGLAND
610,70,ACTC1,0.61,0.538,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.65,1.0,2006.0,2019.0,0,3,CTD_human
1414,153,ADRB1,0.555,0.769,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.58,1.0,1998.0,2019.0,1,0,CTD_human
1444,154,ADRB2,0.442,0.923,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.51,1.0,2002.0,2008.0,1,0,CTD_human
3663,355,FAS,0.372,0.923,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.51,1.0,1999.0,2007.0,1,0,CTD_human


In [4]:
seed_list = list(cardio_DF['geneId'])
print("List of Entrez genes:\n", *seed_list)

List of Entrez genes:
 58 70 153 154 355 356 472 948 1440 1482 1499 1756 1956 2194 2876 3688 4000 4306 4624 4625 4878 4879 5318 5663 5664 5879 5894 5973 6331 6389 6443 6462 6584 6648 6934 7112 7137 7139 7273 7350 7840 8313 10060 55759 64651 137735 150094 347273


In [30]:
print("List of gene symbols:\n", *list(cardio_DF['geneSymbol']))
seed_genes_symbols = list(cardio_DF['geneSymbol'])

List of gene symbols:
 ACTA1 ACTC1 ADRB1 ADRB2 FAS FASLG ATM CD36 CSF3 NKX2-5 CTNNB1 DMD EGFR FASN GPX1 ITGB1 LMNA NR3C2 MYH6 MYH7 NPPA NPPB PKP2 PSEN1 PSEN2 RAC1 RAF1 RENBP SCN5A SDHA SGCB SHBG SLC22A5 SOD2 TCF7L2 TMPO TNNI3 TNNT2 TTN UCP1 ALMS1 AXIN2 ABCC9 WDR12 CSRNP1 ABRA SIK1 CAVIN4


b) Check if gene symbols for all genes in the seed gene list are updated and approved on the *HGNC* website (from *https://www.genenames.org/tools/multi-symbol-checker/*)

In [6]:
# Upload this csv in the multi-symbol checker tool of the HGNC: 
cardio_DF['geneSymbol'].to_csv('gene_symbols.csv', index = False, header = False, line_terminator = ',')

In [7]:
cardio_DF['geneId'].to_csv('geneId.csv', index = False, header = False, line_terminator = ',')

The multi-symbol checker on the *HGNC* database reports that all the gene symbols in our seed list are updated and approved. Three of them match both an approved and an alias symbol, namely ***FAS***, ***RAC1*** and ***RAF1***

For each protein in our seed list we want to collect the following information from the *Uniprot* website:
* official (primary) **gene symbol** --> *Gene names (primary)*
* **Uniprot AC**, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’) --> *Entry*
* **protein name** (the main one only, do not report the aliases)
* **Entrez Gene ID** (a.k.a. ‘GeneID’) --> *geneID* from disgenet
* very brief description of its function (keep it very short, i.e. max 20 words)
* notes related to the above information, if any and if relevant

**NOTE**: With regards to the gene symbol **TMPO** only the entry corresponding to the protein *Thymopoietin, isoforms alpha* (P42166) has been kept since its information on the *HGNG* only refer to this one and not to the isoform beta/gamma

In [8]:
path = root + "uniprot-list-with-ids.csv"
uniprot_DF = pd.read_csv(path, sep = '\t')
uniprot_DF['ProteinName'] = uniprot_DF['ProteinName'].str.split(r"\(|\[").str[0]

In [9]:
uniprot_DF[['GeneName','UniprotAC', 'ProteinName', 'GeneId']].head(5)

Unnamed: 0,GeneName,UniprotAC,ProteinName,GeneId
0,ACTA1,P68133,"Actin, alpha skeletal muscle",58
1,ACTC1,P68032,"Actin, alpha cardiac muscle 1",70
2,ADRB1,P08588,Beta-1 adrenergic receptor,153
3,ADRB2,P07550,Beta-2 adrenergic receptor,154
4,FAS,P25445,Tumor necrosis factor receptor superfamily mem...,355


## 1.2) Collect interaction data

a) For each seed gene, collect all binary protein interactions from the *Biogrid Human*.

In [10]:
biogrid_path = root + "BIOGRID.tab3.txt"
biogrid_full_DF = pd.read_csv(biogrid_path, sep = '\t', na_values='-', keep_default_na=True)
biogrid_full_DF.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416.0,2318,112315,108607,,,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,,,,,,,Homo sapiens,Homo sapiens
1,117,84665.0,88,124185,106603,,,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,,,,,,,Homo sapiens,Homo sapiens
2,183,90.0,2339,106605,108625,,,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,,NP_002018,,,,,,,Homo sapiens,Homo sapiens
3,278,2624.0,5371,108894,111384,,,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,,NP_150250|NP_150253|NP_150252|NP_150247|NP_150...,,,,,,,Homo sapiens,Homo sapiens
4,418,6118.0,6774,112038,112651,RP4-547C9.3,,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,,NP_644805|NP_003141|NP_001356447|NP_001356443|...,,,,,,,Homo sapiens,Homo sapiens


We select only the interactions among organisms being both *Homo Sapiens*

In [11]:
biogrid_full_DF = biogrid_full_DF[(biogrid_full_DF['Organism ID Interactor A'] == 9606) & (biogrid_full_DF['Organism ID Interactor B'] == 9606)]
biogrid_full_DF.astype({'Entrez Gene Interactor A': 'int64', 'Entrez Gene Interactor B': 'int64'})

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416,2318,112315,108607,,,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,,,,,,,Homo sapiens,Homo sapiens
1,117,84665,88,124185,106603,,,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,,,,,,,Homo sapiens,Homo sapiens
2,183,90,2339,106605,108625,,,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,,NP_002018,,,,,,,Homo sapiens,Homo sapiens
3,278,2624,5371,108894,111384,,,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,,NP_150250|NP_150253|NP_150252|NP_150247|NP_150...,,,,,,,Homo sapiens,Homo sapiens
4,418,6118,6774,112038,112651,RP4-547C9.3,,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,,NP_644805|NP_003141|NP_001356447|NP_001356443|...,,,,,,,Homo sapiens,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696765,2875130,9564,9768,114934,115114,hCG_1980470,L5,BCAR1,KIAA0101,CAS|CAS1|CASS1|CRKAS|P130Cas,...,,NP_001025160|NP_055551,,,,,,,Homo sapiens,Homo sapiens
696766,2875131,9564,65263,114934,122418,hCG_1980470,,BCAR1,PYCRL,CAS|CAS1|CASS1|CRKAS|P130Cas,...,,NP_001316795|NP_075566,,,,,,,Homo sapiens,Homo sapiens
696767,2875132,9564,79691,114934,122812,hCG_1980470,,BCAR1,QTRTD1,CAS|CAS1|CASS1|CRKAS|P130Cas,...,,NP_078914|NP_001243764|NP_001243765|NP_001243766,,,,,,,Homo sapiens,Homo sapiens
696768,2875133,9564,85465,114934,124549,hCG_1980470,,BCAR1,EPT1,CAS|CAS1|CASS1|CRKAS|P130Cas,...,,NP_277040,,,,,,,Homo sapiens,Homo sapiens


In [12]:
total_seed_list = set(biogrid_full_DF['Entrez Gene Interactor B']).union(set(biogrid_full_DF['Entrez Gene Interactor A']))

In [35]:
total_protein_symbol_list = set(biogrid_full_DF['Official Symbol Interactor B']).union(set(biogrid_full_DF['Official Symbol Interactor A']))
tot_non_seed_pr_list = total_protein_symbol_list.difference(seed_genes_symbols)
len(total_protein_symbol_list - tot_non_seed_pr_list)
for i in seed_genes_symbols:
    if i not in total_protein_symbol_list:
        print(i)

CAVIN4


In [38]:
print(len(total_seed_list.difference(seed_list)), len(tot_non_seed_pr_list))

19045 19044


We also check if all the seed genes are also present in the *Biogrid* database:

In [13]:
print('Seed genes extracted from DisGeNET: ', len(seed_list))
print('Seed genes also present in Biogrid: ', len(total_seed_list.intersection(seed_list)))

Seed genes extracted from DisGeNET:  48
Seed genes also present in Biogrid:  48


In [14]:
biogrid_seed_genes_interactions = biogrid_full_DF[((biogrid_full_DF['Entrez Gene Interactor A'].isin(seed_list)) | (biogrid_full_DF['Entrez Gene Interactor B'].isin(seed_list)) )]
print("Total number of interactions involving at least one seed gene: ", biogrid_seed_genes_interactions.shape[0])

Total number of interactions involving at least one seed gene:  12081


In [15]:
seed_genes_interactions_ONLY = biogrid_seed_genes_interactions[((biogrid_seed_genes_interactions['Entrez Gene Interactor A'].isin(seed_list)) & (biogrid_seed_genes_interactions['Entrez Gene Interactor B'].isin(seed_list)) )]
print("Total number of interactions involving only seed genes: ", seed_genes_interactions_ONLY.shape[0])

Total number of interactions involving only seed genes:  180


In [39]:
total_geneId_list = set(biogrid_seed_genes_interactions['Entrez Gene Interactor B']).union(set(biogrid_seed_genes_interactions['Entrez Gene Interactor A']))
not_seed_genes = total_geneId_list.difference(seed_list)

total_protein_symbol_list = set(biogrid_seed_genes_interactions['Official Symbol Interactor B']).union(set(biogrid_seed_genes_interactions['Official Symbol Interactor A']))
not_seed_genes_symbols = total_protein_symbol_list.difference(seed_genes_symbols)

print("Numeber of genes involved in the PPI interactions with the seed genes: ", len(not_seed_genes), len(not_seed_genes_symbols))

Numeber of genes involved in the PPI interactions with the seed genes:  4886 4887


In [41]:
for i in seed_genes_symbols:
    if i not in total_protein_symbol_list:
        print(i)

CAVIN4


The protein *CAVIN4* seems to compare in no interaction, but to be sure we filter the table by its GeneId:

In [45]:
cavin4_ID = uniprot_DF.loc[uniprot_DF['GeneName']=='CAVIN4', 'GeneId'].item()
cavin4_ID

347273

In [46]:
biogrid_seed_genes_interactions[(biogrid_seed_genes_interactions['Entrez Gene Interactor A']==cavin4_ID) | (biogrid_seed_genes_interactions['Entrez Gene Interactor B']==cavin4_ID)]

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
118129,738323,347273.0,351,131419,106848,,,MURC,APP,CAVIN4|cavin-4,...,B4DGD0|E9PG40|B4DJT9,NP_958817|NP_958816|NP_001191230|NP_001191231|...,,,,,,,Homo sapiens,Homo sapiens
566648,2714823,705.0,347273,107167,131419,RP5-973N23.2,,BYSL,MURC,BYSTIN,...,,NP_001018126,,,,,,,Homo sapiens,Homo sapiens
584673,2732848,55093.0,347273,120405,131419,,,WDYHV1,MURC,C8orf32,...,,NP_001018126,,,,,,,Homo sapiens,Homo sapiens
591215,2739390,347273.0,84619,131419,124150,,RP4-583P15.3,MURC,ZGPAT,CAVIN4|cavin-4,...,,NP_001076582|NP_115916|NP_001182582|NP_0011825...,,,,,,,Homo sapiens,Homo sapiens
625353,2784767,2547.0,347273,108822,131419,CTA-216E10.7,,XRCC6,MURC,CTC75|CTCBF|G22P1|KU70|ML8|TLAA,...,,NP_001018126,,,,,,,Homo sapiens,Homo sapiens
628391,2787805,3159.0,347273,109402,131419,RP11-513I15.2,,HMGA1,MURC,HMG-R|HMGA1A|HMGIY,...,,NP_001018126,,,,,,,Homo sapiens,Homo sapiens


According to the *BioGrid* database, the official symbol for the gene 347273 is *MURC*, whereas *CAVIN4* is one of its aliases. Thus, we modify our seed list to meet this convention

In [54]:
seed_genes_symbols.append('MURC')
seed_genes_symbols.remove('CAVIN4')

total_protein_symbol_list = set(biogrid_seed_genes_interactions['Official Symbol Interactor B']).union(set(biogrid_seed_genes_interactions['Official Symbol Interactor A']))
not_seed_genes_symbols = total_protein_symbol_list.difference(seed_genes_symbols)

print("Numeber of genes involved in the PPI interactions with the seed genes: ", len(not_seed_genes_symbols))

Numeber of genes involved in the PPI interactions with the seed genes:  4886 4886


Select the interactions among the non-seed genes from the full dataset

In [55]:
biogrid_non_seed_genes_interactions_ONLY = biogrid_full_DF[(biogrid_full_DF['Official Symbol Interactor A'].isin(not_seed_genes_symbols)) & (biogrid_full_DF['Official Symbol Interactor B'].isin(not_seed_genes_symbols)) ]
print("Total number of interactions involving only seed genes: ", biogrid_non_seed_genes_interactions_ONLY.shape[0])

Total number of interactions involving only seed genes:  259756


In [18]:
print("In total, the PPI interactions collected are ", biogrid_non_seed_genes_interactions_ONLY.shape[0]+biogrid_seed_genes_interactions.shape[0])

In total, the PPI interactions collected are  271837


b) Merge in a single table the data gathered from *BioGrid* and remove useless columns

In [91]:
ppi_df = pd.concat([biogrid_seed_genes_interactions, biogrid_non_seed_genes_interactions_ONLY])
ppi_df.shape

(271837, 37)

We remove: the four columns reletad to organism of each interactor, since we selected only rows corresponding to *Homo Sapiens*, the column *'Tags'* that contains null values and the column *'Source Database'* that is always equal to *BIOGRID*

In [92]:
ppi_df.drop(columns=['Organism Name Interactor A', 'Organism Name Interactor B', 'Organism ID Interactor A', 'Organism ID Interactor B', 'Source Database', 'Tags'], inplace=True)

In [93]:
ppi_df['Throughput'].unique()

array(['Low Throughput', 'High Throughput',
       'High Throughput|Low Throughput'], dtype=object)

In [94]:
tot = ppi_df.shape[0]
for col in ppi_df:
    nan = ppi_df[col].isna().sum()
    print(col, nan, "\t", round(nan/tot*100, 2), '%\t', ppi_df[col].nunique())

#BioGRID Interaction ID 0 	 0.0 %	 271837
Entrez Gene Interactor A 0 	 0.0 %	 4574
Entrez Gene Interactor B 0 	 0.0 %	 4905
BioGRID ID Interactor A 0 	 0.0 %	 4574
BioGRID ID Interactor B 0 	 0.0 %	 4905
Systematic Name Interactor A 194979 	 71.73 %	 1367
Systematic Name Interactor B 194766 	 71.65 %	 1462
Official Symbol Interactor A 0 	 0.0 %	 4574
Official Symbol Interactor B 0 	 0.0 %	 4905
Synonyms Interactor A 14939 	 5.5 %	 4154
Synonyms Interactor B 15293 	 5.63 %	 4420
Experimental System 0 	 0.0 %	 27
Experimental System Type 0 	 0.0 %	 2
Author 0 	 0.0 %	 18998
Publication Source 0 	 0.0 %	 20068
Throughput 0 	 0.0 %	 3
Score 224406 	 82.55 %	 29908
Modification 264601 	 97.34 %	 18
Qualifications 152254 	 56.01 %	 6250
SWISS-PROT Accessions Interactor A 586 	 0.22 %	 4535
TREMBL Accessions Interactor A 126099 	 46.39 %	 2180
REFSEQ Accessions Interactor A 183 	 0.07 %	 4560
SWISS-PROT Accessions Interactor B 692 	 0.25 %	 4831
TREMBL Accessions Interactor B 127274 	 46.82 %

Some columns also have a very high percentage of missing values (higher than 90%), namely: *Modification, Ontology Term IDs, Ontology Term Names, Ontology Term Categories, Ontology Term Qualifier IDs, Ontology Term Qualifier Names, Ontology Types*. For this reason we can also remove them from the final database.

In [95]:
ppi_df.drop(columns=['Modification', 'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories', 'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names', 'Ontology Term Types'], inplace=True)

In [96]:
ppi_df.to_csv('PPIs.csv', index = False, header = True, line_terminator = '\n', sep=',')

c) Summarize the main results in a table reporting:
    1. no. of seed genes collected in Disgenet and no. of seed genes found in Biogrid (some seed genes may be missing in the Biogrid);
    2. total no. of interacting genes/proteins found, including seed genes;
    3. total no. of interactions found.

In [60]:
ppi_summary = {
    "seed_genes (DISGNET)": len(seed_list),
    "seed_genes (BIOGRID)": len(seed_list),
    "interacting proteins": len(seed_list) + len(not_seed_genes),
    "interactions": biogrid_non_seed_genes_interactions_ONLY.shape[0]+biogrid_seed_genes_interactions.shape[0]
}
pd.DataFrame.from_dict(ppi_summary, orient='index', columns = ['Tot number'])

Unnamed: 0,Tot number
seed_genes (DISGNET),48
seed_genes (BIOGRID),48
interacting proteins,4934
interactions,271837


## 1.3) Arrange interaction data
a) **Seed genes interactome**: interactions that involve seed genes only

In [26]:
add_uniprotAC_A = seed_genes_interactions_ONLY.join(uniprot_DF[['UniprotAC', 'GeneId']].set_index('GeneId'), on = 'Entrez Gene Interactor A').rename(columns={"UniprotAC": "UniprotAC Interactor A"})
add_uniprotAC_B = add_uniprotAC_A.join(uniprot_DF[['UniprotAC', 'GeneId']].set_index('GeneId'), on = 'Entrez Gene Interactor B').rename(columns={"UniprotAC": "UniprotAC Interactor B"})
final_ppi_seed_genes = add_uniprotAC_B[['Official Symbol Interactor A', 'Official Symbol Interactor B', 'UniprotAC Interactor A' ,'UniprotAC Interactor B' ]].drop_duplicates().reset_index(drop=True)
final_ppi_seed_genes.head()

Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,UniprotAC Interactor A,UniprotAC Interactor B
0,FASN,FASN,P49327,P49327
1,PSEN1,CTNNB1,P49768,P35222
2,FAS,FASLG,P25445,P48023
3,EGFR,CTNNB1,P00533,P35222
4,ATM,ATM,Q13315,Q13315


In [27]:
print('Total number of interacting couple of proteins among the seed list: ', final_ppi_seed_genes.shape[0])

Total number of interacting couple of proteins among the seed list:  59


In [28]:
final_ppi_seed_genes.to_csv('seed_genes_interactome.csv', index = False, header = True, line_terminator = '\n', sep=',')

In [25]:
# same as above, but keeping the BioGRID Interaction ID
add_uniprotAC_A = seed_genes_interactions_ONLY.join(uniprot_DF[['UniprotAC', 'GeneId']].set_index('GeneId'), on = 'Entrez Gene Interactor A').rename(columns={"UniprotAC": "UniprotAC Interactor A"})
add_uniprotAC_B = add_uniprotAC_A.join(uniprot_DF[['UniprotAC', 'GeneId']].set_index('GeneId'), on = 'Entrez Gene Interactor B').rename(columns={"UniprotAC": "UniprotAC Interactor B"})
final_ppi_seed_genes = add_uniprotAC_B[['#BioGRID Interaction ID','Official Symbol Interactor A', 'Official Symbol Interactor B', 'UniprotAC Interactor A' ,'UniprotAC Interactor B' ]]
final_ppi_seed_genes.head()

Unnamed: 0,#BioGRID Interaction ID,Official Symbol Interactor A,Official Symbol Interactor B,UniprotAC Interactor A,UniprotAC Interactor B
448,19043,FASN,FASN,P49327,P49327
6851,243863,PSEN1,CTNNB1,P49768,P35222
7227,244313,FAS,FASLG,P25445,P48023
7533,244745,EGFR,CTNNB1,P00533,P35222
7796,245113,ATM,ATM,Q13315,Q13315


In [26]:
final_ppi_seed_genes.to_csv('seed_genes_interactome_with_BiogridID.csv', index = False, header = True, line_terminator = '\n', sep=',')

b) **Disease interactome**: all proteins interacting with at least one seed gene

We first need to retrieve the *UniprotAC* identifier of all the non-seed genes involved in the selected interactions. We can accomplish this through the *Retrieve Id/Mapping* tool on the *Uniprot* website.

In [61]:
# mapping based on gene symbols -> 4865 out of 4887 
with open("non_seed_genes_symbol.csv",'w') as file:
    writer = csv.writer(file)
    writer.writerow(list(not_seed_genes_symbols))

The mapping tool reported that 4864 out of 4886 genes were successfully mapped to 4959 UniProtKB IDs, which means that 22 proteins are not mapped in *Uniprot*.

In [80]:
uniprot_not_seed_mapping = pd.read_csv(root + "uniprot-not-seed-symbol.csv", sep = '\t', na_values='-', keep_default_na=True)
uniprot_not_seed_mapping['Entry'].nunique()

4959

In [81]:
uniprot_not_seed_mapping['GeneName'].nunique()

4874

In [82]:
not_mapped = [i for i in not_seed_genes_symbols if i not in uniprot_not_seed_mapping['GeneName'].unique()]
print('The following proteins are also not been mapped to their UniprotAC identifier: \n', *not_mapped)

The following proteins are also not been mapped to their UniprotAC identifier: 
 HIST1H2BE COX3 LOC400499 HIST1H2BG HIST1H4H HIST1H4J HBA1 HIST1H2AB HIST1H4L HIST1H2BF HIST1H2BC RPL17-C18orf32


After a double check on the *Uniprot* website, we found out that the proteins listed above are unreviewed, therefore we keep them not mapped

In [85]:
print(uniprot_not_seed_mapping.shape[0])
nan = np.nan
for gene in not_mapped:
    df = pd.DataFrame([[nan, gene]], columns=['Entry', 'GeneName'])
    uniprot_not_seed_mapping = uniprot_not_seed_mapping.append(df, ignore_index=True)
print(uniprot_not_seed_mapping.shape[0])

5021
5033


In [86]:
uniprot_not_seed_mapping.rename(columns={'Entry' :'UniprotAC'}, inplace = True)
uniprot_full = pd.concat([uniprot_DF[['UniprotAC', 'GeneName']], uniprot_not_seed_mapping], ignore_index=True)
print(uniprot_full.shape[0])
uniprot_full.head()

5081


Unnamed: 0,UniprotAC,GeneName
0,P68133,ACTA1
1,P68032,ACTC1
2,P08588,ADRB1
3,P07550,ADRB2
4,P25445,FAS


Another problem is that some GeneName are associated to multiple UniprotAC identifiers, namely:

In [137]:
g = uniprot_full.groupby('GeneName')['UniprotAC'].apply(list) 
g = pd.DataFrame(g)
agg_uniprot = g[g['UniprotAC'].str.len() > 1]

In [139]:
agg_uniprot.reset_index(inplace=True)
agg_uniprot.head()

Unnamed: 0,GeneName,UniprotAC
0,ACAT1,"[P24752, P35610]"
1,ACAT2,"[O75908, Q9BWD1]"
2,ADRA1A,"[P25100, P35348]"
3,AIP,"[O00170, Q9NWT8]"
4,AK3,"[P27144, Q9UIJ7]"


In [141]:
multiple_geneName = list(agg_uniprot['GeneName'])
len(multiple_geneName)

130

We remove from the *Uniprot_full* db the corresponding rows and we add the aggregation table defined above

In [152]:
uniprot_full = uniprot_full[-uniprot_full['GeneName'].isin(multiple_geneName)]
uniprot_full = uniprot_full.append(agg_uniprot)
uniprot_full.shape

(4934, 2)

In [153]:
uniprot_full.to_csv('uniprot_full.csv', index = False, header = True, line_terminator = '\n', sep=',')

Now we can join the table with all the interactions with the uniprot table

In [157]:
ppi_df = ppi_df[['Official Symbol Interactor A', 'Official Symbol Interactor B']].drop_duplicates().reset_index(drop=True)
print(ppi_df.shape)
ppi_df.head()

(212556, 2)


Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B
0,ADRB1,GIPC1
1,PSEN2,CAPN1
2,CAPN3,TTN
3,MAGI1,CTNNB1
4,DCN,EGFR


In [169]:
ppi_df = ppi_df.merge(uniprot_full, left_on ='Official Symbol Interactor A', right_on='GeneName', how = 'left', left_index=False, right_index=False).rename(columns={"UniprotAC": "UniprotAC Interactor A"})
ppi_df = ppi_df.drop(columns = ['GeneName'])
ppi_df = ppi_df.merge(uniprot_full, left_on ='Official Symbol Interactor B', right_on='GeneName', how = 'left', left_index=False, right_index=False).rename(columns={"UniprotAC": "UniprotAC Interactor B"})
ppi_df = ppi_df.drop(columns = ['GeneName'])

print(ppi_df.shape)
ppi_df.head(10)

(212556, 4)


Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,UniprotAC Interactor A,UniprotAC Interactor B
0,ADRB1,GIPC1,P08588,O14908
1,PSEN2,CAPN1,P49810,P07384
2,CAPN3,TTN,P20807,Q8WZ42
3,MAGI1,CTNNB1,"[Q6P9H4, Q96QZ7]",P35222
4,DCN,EGFR,P07585,P00533
5,SUMO1,FAS,P63165,P25445
6,FLNA,ITGB1,P21333,P05556
7,LMNA,LMNB1,P02545,P20700
8,LMNA,SREBF1,P02545,P36956
9,LMNA,NARF,P02545,"[Q8WVD3, Q9UHQ1]"


Since not all the gene symbols are mapped, we still have some null values in the columns corresponding to the UniprotAC identifier, namely:

In [172]:
print("Null values corresponding to the UniprotAC Interactor A: ", ppi_df['UniprotAC Interactor A'].isnull().sum())
print("Null values corresponding to the UniprotAC Interactor B: ", ppi_df['UniprotAC Interactor B'].isnull().sum())

Null values corresponding to the UniprotAC Interactor A:  1832
Null values corresponding to the UniprotAC Interactor B:  908


In [173]:
ppi_df.to_csv('desease_interactome.csv', index = False, header = True, line_terminator = '\n', sep=',')