In [16]:
import pandas as pd
import numpy as np
import csv 

In [17]:
PATH = "./disgenet/"
filename = "./disgenet/curated_gene_disease_associations.tsv"

In [15]:
df = pd.read_csv(filename, sep = '\t')
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.857,0.172,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,,2015.0,2015.0,1,0,CTD_human
2,2,A2M,0.564,0.724,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.4,0.848485,1998.0,2016.0,3,0,CTD_human
3,2,A2M,0.564,0.724,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human
4,2,A2M,0.564,0.724,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human


In [16]:
print(len(df), df.columns)

81746 Index(['geneId', 'geneSymbol', 'DSI', 'DPI', 'diseaseId', 'diseaseName',
       'diseaseType', 'diseaseClass', 'diseaseSemanticType', 'score', 'EI',
       'YearInitial', 'YearFinal', 'NofPmids', 'NofSnps', 'source'],
      dtype='object')


#find row associated to disease Malignant Mesothelioma with id C0345967

The columns in the files are:
* geneId 		-> NCBI Entrez Gene Identifier
* geneSymbol	-> Official Gene Symbol
* DSI		-> The Disease Specificity Index for the gene
* DPI		-> The Disease Pleiotropy Index for the gene
* diseaseId 	-> UMLS concept unique identifier
* diseaseName 	-> Name of the disease	
* diseaseType  	-> The DisGeNET disease type: disease, phenotype and group
* diseaseClass	-> The MeSH disease class(es)
* diseaseSemanticType	-> The UMLS Semantic Type(s) of the disease
* score		-> DisGENET score for the Gene-Disease association
* EI		-> The Evidence Index for the Gene-Disease association
* YearInitial	-> First time that the Gene-Disease association was reported
* YearFinal	-> Last time that the Gene-Disease association was reported
* NofPmids	-> Total number of publications reporting the Gene-Disease association
* NofSnps		-> Total number of SNPs associated to the Gene-Disease association
* source		-> Original source reporting the Gene-Disease association

In [17]:
target = df.loc[df['diseaseName'] == 'Malignant mesothelioma']
target.to_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [8]:
t = pd.read_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [9]:
t = t.drop('Unnamed: 0', axis = 1)
t.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,1,0,CTD_human
1,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,1,0,CTD_human
2,302,ANXA2,0.485,0.793,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2010.0,2010.0,1,0,CTD_human
3,335,APOA1,0.463,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2013.0,2013.0,1,0,CTD_human
4,596,BCL2,0.312,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.31,1.0,2006.0,2010.0,1,0,CTD_human


### Explore the DisGeNet dataset, find the disease of interest and get the list of human genes involved.

In [18]:
curated = pd.read_csv("./disgenet/browser_source_genes_summary_CURATED.tsv", sep = '\t')
curated.head()            

Unnamed: 0,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,pLI,DSI_g,DPI_g,N_diseases,N_SNPs
0,NAT2,10,P11245,N-acetyltransferase 2,transferase,4e-06,0.466,0.828,37,1
1,PARP1,142,P09874,poly(ADP-ribose) polymerase 1,,0.000552,0.432,0.862,66,2
2,ANXA2,302,P07355,annexin A2,,0.000482,0.485,0.793,26,0
3,APOA1,335,P02647,apolipoprotein A1,,0.000549,0.463,0.759,54,15
4,BCL2,596,P10415,"BCL2, apoptosis regulator",signaling molecule,0.56705,0.312,0.862,137,8


In [19]:
#now let's save the gene symbols, entrez ID and names in arrays
geneSymbol=[]
geneID=[]
geneName=[]
for i in range(len(curated)):
    geneSymbol.append(curated['Gene'][i])
    geneID.append(curated['Gene_id'][i])
    geneName.append(curated['Gene_Full_Name'][i])    
#we check on HGNC to see if we need to change genes name

b) For all genes in the seed gene list, collect the following basic information from the Uniprot:

* official (primary) gene symbol (check if the symbols are updated and approved on the HGNC website; report any issue/lack of data/potential misinterpretation)

* Uniprot AC, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’)

* protein name (the main one only, do not report the aliases)

* Entrez Gene ID (a.k.a. ‘GeneID’) very brief description of its function (keep it very short, i.e. max 20 words)

* notes related to the above information, if any and if relevant

Store the data gathered in a table in an easily accessible format of your choice (csv, tab,
excel, etc).

In [5]:
print(geneSymbol)

['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']


In [27]:
#fino a COL12A1 approved

In [6]:
'''import urllib.request
import time

site = "https://www.genenames.org/tools/search/#!/all?query="
for i in range(0, len(geneSymbol)):
    symbol = geneSymbol[i]
    url = site+symbol
    print(symbol)
    response = urllib.request.urlopen(url)
    print(response)
    time.sleep(2)'''

'import urllib.request\nimport time\n\nsite = "https://www.genenames.org/tools/search/#!/all?query="\nfor i in range(0, len(geneSymbol)):\n    symbol = geneSymbol[i]\n    url = site+symbol\n    print(symbol)\n    response = urllib.request.urlopen(url)\n    print(response)\n    time.sleep(2)'

In [7]:
# printing original list  
print("The original list is : " + str(geneSymbol))
  
# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(geneSymbol)) 

# deleted quotes to pass the list into site
#https://www.genenames.org/tools/multi-symbol-checker/

The original list is : ['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']
The formatted output is : 
NAT2, 

# all gene symbols were approved by HGNC 

In [20]:
uniprotAC=[]
for i in range(len(curated)):
    uniprotAC.append(curated['UniProt'][i])

In [37]:
results = pd.DataFrame(list(zip(geneSymbol,geneName,geneID,uniprotAC)), columns=['Symbol','Name','ID','UniprotAC'])

In [38]:
results.to_csv('mesothelioma-curated-genes.csv')
results.head()

Unnamed: 0,Symbol,Name,ID,UniprotAC
0,NAT2,N-acetyltransferase 2,10,P11245
1,PARP1,poly(ADP-ribose) polymerase 1,142,P09874
2,ANXA2,annexin A2,302,P07355
3,APOA1,apolipoprotein A1,335,P02647
4,BCL2,"BCL2, apoptosis regulator",596,P10415


### Exercise 1.2

For each seed gene, collect all binary protein interactions from two different PPI sources:
* Biogrid Human, latest release available
* IID Integrated Interactions Database (experimental data only, all tissues, unless stated otherwise in further instruction)

Note: once you got the list of the proteins interacting with at least one seed gene, you must
also retrieve and include in your interactome the interactions among these non-seed
proteins

In [21]:
#open biogrid DB
biogrid=pd.read_csv('BIOGRID-ALL-3.5.179.tab2.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
biogrid.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [22]:
len(biogrid)

1730436

In [12]:
#let's select what we want from the biogrid dataset.
#select only human PPI
biogrid.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')

In [47]:
#I took the interactions for NAT2 gene in humans 
example = pd.read_csv('./biogrid/BIOGRID-GENE-106528-3.5.179.tab2.txt', sep='\t')
example.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,739497,10,351,106528,106848,-,-,NAT2,APP,AAC2|NAT-2|PNAT,...,21832049,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1,1180805,10,56910,106528,121238,-,-,NAT2,STARD7,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.99999353,-,-,Interaction also contained in second paper aft...,-,BIOGRID
2,1180806,10,23567,106528,117109,-,-,NAT2,ZNF346,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.999988426,-,-,Interaction also contained in second paper aft...,-,BIOGRID
3,1180807,10,25818,106528,117346,-,UNQ570/PRO1132,NAT2,KLK5,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.99345277,-,-,Interaction also contained in second paper aft...,-,BIOGRID
4,1417575,324,10,106821,106528,-,-,APC,NAT2,BTPS2|DP2|DP2.5|DP3|GS|PPP1R46,...,25640309,9606,9606,High Throughput,-,-,-,-,-,BIOGRID


In [23]:
#select only human genes
biogrid_human=biogrid.loc[(biogrid['Organism Interactor A']==9606) & (biogrid['Organism Interactor B']==9606)]
biogrid_human

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730424,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730425,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730426,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730427,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [14]:
# look for the genes which interacts with at least one seed genes
biogrid_seed_genes = biogrid_human.loc[(biogrid_human['Official Symbol Interactor A'].isin(geneSymbol)) | (biogrid_human['Official Symbol Interactor B'].isin(geneSymbol))]
biogrid_seed_genes

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
30,3024,9223,1499,114655,107880,RP11-88H12.2,OK/SW-cl.35,MAGI1,CTNNB1,AIP-3|AIP3|BAIAP1|BAP-1|BAP1|MAGI-1|Magi1d|TNR...,...,10772923,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
32,3224,7251,1026,113102,107460,-,-,TSG101,CDKN1A,TSG10|VPS23,...,11943869,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
54,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
63,4867,1634,1956,108002,108276,-,-,DCN,EGFR,CSCD|DSPG2|PG40|PGII|PGS2|SLRR1B,...,12105206,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
131,7994,3064,29072,109314,118845,-,HSPC069,HTT,SETD2,HD|IT15,...,9700202,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729703,2620953,636,302,107105,106799,RP11-817I4.1,-,BICD1,ANXA2,BICD,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1729866,2621116,23299,302,116891,106799,RP11-476B13.3,-,BICD2,ANXA2,SMALED2|bA526D8.1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1729988,2621238,23299,1654,116891,108020,RP11-476B13.3,-,BICD2,DDX3X,SMALED2|bA526D8.1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1730086,2621336,51361,302,119496,106799,-,-,HOOK1,ANXA2,HK1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID


## from our seed interaction let's search non-seeds genes that interacts with at least one seed gene

In [15]:
#Interactor A is not a seed gene but Interactor B is a seed gene
non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
seed_B = non_seed_df_A.loc[(non_seed_df_A ['Official Symbol Interactor B'].isin(geneSymbol))]

In [17]:
# interactor B is not a seed genes but Interactor A is a seed gene
non_seed_df_B = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor B'].isin(geneSymbol))]
non_seed_df_B

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
54,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
145,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
160,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
233,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
253,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1725655,2616344,3611,6386,109824,112287,-,-,ILK,SDCBP,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1725656,2616345,3611,3987,109824,110175,-,-,ILK,LIMS1,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1725775,2616467,7157,5888,113010,111825,-,-,TP53,RAD51,BCC7|LFS1|P53|TRP53,...,11948396,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1725819,2616562,1499,6925,107880,112787,OK/SW-cl.35,-,CTNNB1,TCF4,CTNNB|MRD19|armadillo,...,19114997,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [18]:
# Reset Index, otherwise they don't work with list
non_seed_df_A=non_seed_df_A.reset_index(drop=True)
non_seed_df_B=non_seed_df_B.reset_index(drop=True)

In [19]:
non_seed_df_B

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9448,2616344,3611,6386,109824,112287,-,-,ILK,SDCBP,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9449,2616345,3611,3987,109824,110175,-,-,ILK,LIMS1,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9450,2616467,7157,5888,113010,111825,-,-,TP53,RAD51,BCC7|LFS1|P53|TRP53,...,11948396,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9451,2616562,1499,6925,107880,112787,OK/SW-cl.35,-,CTNNB1,TCF4,CTNNB|MRD19|armadillo,...,19114997,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [20]:
#build a list with only non seed genes that interacts with at least one seed gene
non_seed_list=[]
for i in range(0, len(non_seed_df_A)):
    non_seed_list.append(non_seed_df_A['Official Symbol Interactor A'][i])

In [21]:
for i in range(0, len(non_seed_df_B)):
    non_seed_list.append(non_seed_df_B['Official Symbol Interactor B'][i])

In [22]:
#drop duplicates
non_seed_list = list(dict.fromkeys(non_seed_list))

In [23]:
# Verify Correctness
for i in range(0, len(geneSymbol)):
    for j in range(0, len(non_seed_list)):
        if(geneSymbol[i] == non_seed_list[j]):
            print("Something is wrong")
print("Ok")

Ok


In [24]:
print(len(non_seed_list))

4945


In [25]:
#now search for non seed interactions from the human DB
#Return positions of non-seed genes that interacts with a non seed gene but both interacts with at least one seed gene
biogrid_human = biogrid_human.reset_index(drop=True)
#create a list of index of the original matrix with non seed genes interacting each others
list_of_idx=[]
#fill the list
for i in range(len(biogrid_human)):
    if biogrid_human['Official Symbol Interactor A'][i] in non_seed_list and biogrid_human['Official Symbol Interactor B'][i] in non_seed_list:
        list_of_idx.append(i)

In [26]:
#biogrid_seed_genes
biogrid_non_seed = biogrid_human.loc[list_of_idx]
biogrid_non_seed = biogrid_non_seed.reset_index(drop=True)
biogrid_non_seed

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,586,375,23163,106870,116775,-,-,ARF1,GGA3,-,...,10747089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,917,333,1600,106830,107970,-,RP6-239D12.2,APLP1,DAB1,APLP,...,10460257,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,1161,2033,7020,108347,112878,RP1-85F18.1,RP1-290I10.1,EP300,TFAP2A,KAT3B|RSTS2|p300,...,12586840,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229600,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229601,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229602,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229603,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [111]:
#make a table in which interactor 1 is seed and interactor 2 can be seed or non-seed
biogrid_seed_df = biogrid_seed_genes.loc[(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
interactome = pd.concat([biogrid_seed_df, biogrid_non_seed], ignore_index=True)
#interactome.to_csv("interactome-biogrid.txt", sep='\t')

In [27]:
interactome = pd.read_csv("interactome-biogrid.txt", sep= '\t')
interactome = interactome.drop(['Unnamed: 0'], axis = 1)
interactome

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239447,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239448,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239449,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239450,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [17]:
iid = pd.read_csv('human_annotated_PPIs.txt', sep='\t')
iid

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,methods,pmids,dbs,evidence type,adipose tissue,adrenal gland,...,arteriosclerosis,lymphocytic leukemia,enzymes,ion channels,receptors,transporters,drug targets,targeting drugs,orthologs are drug targets,drugs targeting orthologs
0,Q9NUX5,Q9NVM4,POT1,PRMT7,bimolecular fluorescence complementation;two h...,21044950,biogrid;intact,exp,0,0,...,0,0,1,0,0,0,0,-,0,-
1,Q96JY6,Q9NPC6,PDLIM2,MYOZ2,-,21836163,iid-pred,pred,0,1,...,0,0,0,0,0,0,0,-,0,-
2,Q15414,Q32P51,-,HNRNPA1L2,-,23023127,iid-pred,pred,?,?,...,0,0,0,0,0,0,0,-,0,-
3,P62633,Q99729,CNBP,HNRNPAB,affinity chromatography technology;anti bait c...,17353931,biogrid;intact,exp,1,1,...,0,0,0,0,0,0,0,-,0,-
4,P29353,P42685,SHC1,FRK,-,25402006,iid-pred,pred,0,0,...,0,0,1,0,0,0,1,Dasatinib;Regorafenib,0,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975872,O00506,P06239,STK25,LCK,-,23023127,iid-pred,pred,1,0,...,0,0,1,0,0,0,1,{4-[(2S)-2-Acetamido-3-({(1S)-1-[3-carbamoyl-4...,0,-
975873,P13500,P51681,CCL2,CCR5,biochemical;competition binding,10477718;16082366;21836163;25402006,dip;hprd;iid-pred;innatedb,exp;pred,1,1,...,1,0,1,0,1,0,1,AMD-070;Ibalizumab;INCB-9471;Maraviroc;Vicrivi...,0,-
975874,Q86X19,Q96J84,TMEM17,KIRREL1,bioid;proximity-dependent biotin identification,26638075,biogrid;intact,exp,0,1,...,0,0,0,0,0,0,0,-,0,-
975875,P40227,Q7Z6Z7,CCT6A,HUWE1,affinity chromatography technology,25147182,biogrid,exp,1,1,...,0,0,1,0,0,0,0,-,0,-


In [18]:
#select only rows with evidence type exp, drop some columns.
iid = iid.loc[iid['evidence type'] == 'exp']
icols = ['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']
iid = iid.reset_index(drop=True)
iid = iid[['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']]

In [19]:
iid

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,Q9NUX5,Q9NVM4,POT1,PRMT7,exp,0
1,P62633,Q99729,CNBP,HNRNPAB,exp,0
2,O43715,O95817,TRIAP1,BAG3,exp,0
3,Q13285,Q9UKM9,NR5A1,RALY,exp,0
4,P60174,Q9H0W5,TPI1,CCDC8,exp,0
...,...,...,...,...,...,...
272487,O60341,Q8NEZ2,KDM1A,VPS37A,exp,0
272488,Q00987,Q8TAQ2,MDM2,SMARCC2,exp,0
272489,P22314,Q5VTR2,UBA1,RNF20,exp,0
272490,Q86X19,Q96J84,TMEM17,KIRREL1,exp,0


In [56]:
iid.to_csv('iid.txt', sep = '\t')

In [48]:
#SAME PROCEDURE. 

In [70]:
iid_seed_genes = iid.loc[(iid['symbol1'].isin(geneSymbol)) | (iid['symbol2'].isin(geneSymbol))]
iid_seed_genes

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
101,Q7RTS6,Q9UKV8,OTOP2,AGO2,exp,0
106,P21741,Q02543,MDK,RPL18A,exp,0
151,O75915,P13598,ARL6IP5,ICAM2,exp,0
161,P13598,P55085,ICAM2,F2RL1,exp,0
187,P23508,P42771,MCC,CDKN2A,exp,0
...,...,...,...,...,...,...
272242,P01374,Q6AI12,LTA,ANKRD40,exp,0
272264,P02751,Q13148,FN1,TARDBP,exp,1
272321,P00533,Q13200,EGFR,PSMD2,exp,1
272372,P01375,Q9UKV8,TNF,AGO2,exp,1


In [71]:
non_seed_1 = iid_seed_genes.loc[~(iid_seed_genes['symbol1'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
non_seed_1

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
101,Q7RTS6,Q9UKV8,OTOP2,AGO2,exp,0
151,O75915,P13598,ARL6IP5,ICAM2,exp,0
187,P23508,P42771,MCC,CDKN2A,exp,0
432,O15234,P42771,CASC3,CDKN2A,exp,0
452,P05412,Q9Y678,JUN,COPG1,exp,0
...,...,...,...,...,...,...
271849,P04626,P11245,ERBB2,NAT2,exp,1
271857,O15392,Q9ULZ3,BIRC5,PYCARD,exp,1
271966,Q5W0B1,Q9UKV8,RNF219,AGO2,exp,0
272022,O43617,P00533,TRAPPC3,EGFR,exp,0


In [72]:
non_seed_2 = iid_seed_genes.loc[~(iid_seed_genes['symbol2'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
non_seed_2

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
106,P21741,Q02543,MDK,RPL18A,exp,0
161,P13598,P55085,ICAM2,F2RL1,exp,0
509,P10415,Q8IZY5,BCL2,BLID,exp,1
540,P13501,P31431,CCL5,SDC4,exp,1
629,P35222,Q16658,CTNNB1,FSCN1,exp,1
...,...,...,...,...,...,...
272084,P08670,Q96HL8,VIM,SH3YL1,exp,0
272242,P01374,Q6AI12,LTA,ANKRD40,exp,0
272264,P02751,Q13148,FN1,TARDBP,exp,1
272321,P00533,Q13200,EGFR,PSMD2,exp,1


In [73]:
non_seed_1=non_seed_1.reset_index(drop=True)
non_seed_2=non_seed_2.reset_index(drop=True)

In [117]:
#build a list with only non seed genes that interacts with at least one seed gene
non_seed=[]
#non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
for i in range(0, len(non_seed_1)):
    non_seed.append(non_seed_1['symbol1'][i])
for i in range(0, len(non_seed_2)):
    non_seed.append(non_seed_2['symbol2'][i])
#drop duplicates
non_seed = list(dict.fromkeys(non_seed))

In [119]:
len(non_seed)

4285

In [122]:
iid=iid.reset_index(drop=True)
list_of_idx=[]
#fill the list
for i in range(len(iid_seed_genes)):
    if iid['symbol1'][i] in non_seed and iid['symbol2'][i] in non_seed:
        list_of_idx.append(i)

In [123]:
#biogrid_seed_genes
iid_non_seed = iid.loc[list_of_idx]
iid_non_seed = iid_non_seed.reset_index(drop=True)
iid_non_seed

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,P62633,Q99729,CNBP,HNRNPAB,exp,0
1,P60174,Q9H0W5,TPI1,CCDC8,exp,0
2,P82650,Q99728,MRPS22,BARD1,exp,0
3,P11388,Q9H0W5,TOP2A,CCDC8,exp,0
4,P24539,Q99816,ATP5F1,TSG101,exp,0
...,...,...,...,...,...,...
2368,P49674,P67870,CSNK1E,CSNK2B,exp,0
2369,P10412,P15880,HIST1H1E,RPS2,exp,0
2370,Q15025,Q9Y2D8,TNIP1,SSX2IP,exp,0
2371,Q01844,Q86UU0,EWSR1,BCL9L,exp,1


In [125]:
iid_seed_df = iid_seed_genes.loc[(iid_seed_genes['symbol1'].isin(geneSymbol))]
interactome2 = pd.concat([iid_seed_df, iid_non_seed], ignore_index=True)
interactome2.to_csv("interactome-iid.txt", sep='\t')

In [29]:
interactome2 = pd.read_csv("interactome-iid.txt", sep='\t')

In [32]:
interactome2 = interactome2.drop(['Unnamed: 0'], axis = 1)

In [33]:
interactome2

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,P21741,Q02543,MDK,RPL18A,exp,0
1,P13598,P55085,ICAM2,F2RL1,exp,0
2,P10415,Q8IZY5,BCL2,BLID,exp,1
3,P13501,P31431,CCL5,SDC4,exp,1
4,P35222,Q16658,CTNNB1,FSCN1,exp,1
...,...,...,...,...,...,...
7449,P49674,P67870,CSNK1E,CSNK2B,exp,0
7450,P10412,P15880,HIST1H1E,RPS2,exp,0
7451,Q15025,Q9Y2D8,TNIP1,SSX2IP,exp,0
7452,Q01844,Q86UU0,EWSR1,BCL9L,exp,1


In [34]:
interactome

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239447,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239448,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239449,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239450,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


### Summarize the main results in a table reporting:
* no. of seed genes found in each different DBs (some seed genes may be missing in the DBs);
* total no. of interacting proteins, including seed genes, for each DB;
* total no. of interactions found in each DB.

In [36]:
len(geneSymbol)

109

In [37]:
seed_B = non_seed_df_A.loc[(non_seed_df_A ['Official Symbol Interactor B'].isin(geneSymbol))]
seed_A = non_seed_df_B.loc[(non_seed_df_B ['Official Symbol Interactor A'].isin(geneSymbol))]

In [42]:
def find_unique_genes(dataframe, column_name):
    genes_found = []
    for index in range(len(dataframe)):
        if dataframe[column_name][index] not in genes_found:
            genes_found.append(dataframe[column_name][index])
    return genes_found

In [63]:
# Genes from Biogrid

la = find_unique_genes(seed_B, 'Official Symbol Interactor B')
lb = find_unique_genes(seed_A, 'Official Symbol Interactor A')
l_tot = la + lb
#drop duplicates
total_genes = list(dict.fromkeys(l_tot))
len(total_genes)
missing_gene = []
for index in range(len(geneSymbol)):
    if geneSymbol[index] not in total_genes:
        missing_gene.append(geneSymbol[index])
print("Genes missing in Biogrid\n", missing_gene)

Genes missing in Biogrid
 ['CCL27', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']


In [77]:
# Genes from IID 
seed1 = non_seed_2.loc[(non_seed_2['symbol1'].isin(geneSymbol))]
seed2 = non_seed_1.loc[(non_seed_1['symbol2'].isin(geneSymbol))]
# interactor 1 is not a seed genes and interacts with a seed gene
#B is a seed gene
l1 = find_unique_genes(seed1, 'symbol1')
l2 = find_unique_genes(seed2, 'symbol2')
l = l1+l2
#drop duplicates
i_genes = list(dict.fromkeys(l))
iid_missing = []
for index in range(len(geneSymbol)):
    if geneSymbol[index] not in i_genes:
        iid_missing.append(geneSymbol[index])
print("Genes missing from IID DB \n", iid_missing)

Genes missing from IID DB 
 ['GPR27', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']


total no. of interacting proteins, including seed genes, for each DB;

In [83]:
interactors= []
for index in range(len(biogrid_human)):
    if biogrid_human['Official Symbol Interactor A'][index] not in interactors:
        interactors.append(biogrid_human['Official Symbol Interactor A'][index])
    if biogrid_human['Official Symbol Interactor B'][index] not in interactors:
        interactors.append(biogrid_human['Official Symbol Interactor B'][index])

In [84]:
len(interactors)

17899

In [87]:
def count_interactors(dataframe, column1, column2):
    interactors= []
    for index in range(len(dataframe)):
        if dataframe[column1][index] not in interactors:
            interactors.append(dataframe[column1][index])
        if dataframe[column2][index] not in interactors:
            interactors.append(dataframe[column2][index])
    return len(interactors)

In [88]:
iid_len = count_interactors(iid, 'symbol1', 'symbol2')

In [90]:
print(iid_len)

17278


In [91]:
len(iid)

272492

In [92]:
# total no. of interactions found in each DB.

In [16]:
countbio = biogrid_human['#BioGRID Interaction ID'].nunique()
print(countbio)

485380


In [30]:
uniquesyms = []
for index in range(0, len(iid)):
    sym1 = iid['symbol1'][index]
    sym2 = iid['symbol2'][index]
    uniquesyms.append(sym1+sym2)
#drop duplicates

In [29]:
uniquesyms = list(dict.fromkeys(uniquesyms))
print(len(uniquesyms))

270230


### Build and store three tables:

* seed genes interactome: interactions that involve seed genes only, from all DBs, in the format:
  interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B
  Uniprot AC, database source
* union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.
* intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the       format: interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B Uniprot AC

Always check that interactors are both human (i.e. organism ID is always 9606, Homo
Sapiens)

In [11]:
uniprot_human = pd.read_csv("./uniprot/HUMAN_9606_idmapping.dat", sep = '\t')

In [13]:
uniprot_human

Unnamed: 0,P31946,UniProtKB-ID,1433B_HUMAN
0,P31946,Gene_Name,YWHAB
1,P31946,GI,4507949
2,P31946,GI,377656702
3,P31946,GI,67464628
4,P31946,GI,1345590
...,...,...,...
6288915,A0A0K0L4Y8,EMBL,KF833265
6288916,A0A0K0L4Y8,EMBL-CDS,AIS72444.1
6288917,A0A0K0L4Y8,NCBI_TaxID,9606
6288918,A0A0K0L4Y8,ChiTaRS,PNPLA2


In [25]:
#save a list of all the symbols in order to search their uniprot
sym_to_fix=[]
sym_to_fix.extend(biogrid_human['Official Symbol Interactor A'])
sym_to_fix.extend(biogrid_human['Official Symbol Interactor B'])

#remove duplicates
sym_to_fix=list(set(sym_to_fix))

# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(sym_to_fix)) 
##print in order to search on uniprot.com

The formatted output is : 
STARD10, GALE, DMXL1, ZDHHC12, SCN2B, ZNF574, MYL3, MAGEB2, DOK2, TNFSF11, LOC101929876, REEP4, RPS4XP17, RUFY2, LDHA, PAQR6, TRPV2, PAFAH1B2, BNIP3, MCMBP, CEP350, DHX9, FGF3, TMEM92, EPB41L4A-AS2, MRPL2, SHH, PRKY, YPEL1, FAM63B, CA2, PRSS48, ETV2, ZCCHC13, SYVN1, ARHGEF11, EMP2, CCDC114, DTNA, MRVI1, RNY4, DEFB104B, SMC5, C14orf166, TMEM9, TAS2R62P, CHD9, TMEM194B, HNRNPH3, BANK1, MIR16-1, GJA3, KREMEN2, FCGR2A, ZFAS1, FAM74A1, KRCC1, TMEM183B, LIMS1, SNAR-B2, DOK1, WASL, LOC100272216, RNF185, AMFR, HTR1A, AHRR, CHURC1, PCDHA4, CTNND2, NDUFA1, TINF2, INS, CGRRF1, RIMKLB, TAB3, DMP1, PMS2, KPTN, AUNIP, RNF122, GABRE, ISX, COMP, RPL12P1, MTHFD1L, NES, MPHOSPH9, PPIH, THAP11, PPFIA1, MED25, UCKL1, MIR199A1, TRPM8, STK31, CKAP2L, CORO2A, SACS, RAB35, HERC4, ZSCAN5A, GCGR, KLHL40, ONECUT1, MIB2, EXPH5, AVPR1B, TNNC2, FAM76A, BOLA2B, KNDC1, TECRL, OR52L1, SUMO1P1, MGST1, CRHR2, FCER2, DR1, METTL5, TIMM21, MYOG, KCNE1L, STAM, ANKIB1, TMTC2, STK32A, SELP, AVEN, IF

SNORA63
MAGEA8
GINS1
EIF6
IPP
STON1
AGT
AGO3
VPS41
SP6
CTAGE6
AMHR2
ENPP3
ANKS1A
CHST10
C12orf45
PHLDA3
BMP5
SNORD116-24
NTN3
AICDA
MTA1
BET1
MYLPF
RPS26
PRDM10
BDKRB2
FAM132B
UPP1
FAM126A
ATP6V1H
LINC00518
C1QTNF8
CDH11
C9orf156
BMP6
KIAA1731NL
CCDC73
ALOX5AP
C12orf57
PDS5A
PCDHGB4
LRRC59
SPDEF
RBM4
SLC31A1
PPP1R15B
FAM135A
VSTM2A
BRCA1
SUZ12
SAMD1
PLCL2
ZFAND3
SLC9A2
RBFOX3
BEST1
ZNF443
PDIA4
GPER1
DNAAF5
TNNT1
NPHS2
DHRS3
HAVCR2
ODAM
TMEM236
CCDC50
RPL8
VASN
TFR2
CHRM1
THPO
DPCR1
VCL
TRIM33
CRLS1
BIN2
CCBL2
IFFO2
SHISA2
CLU
ATP5J
GMIP
FAM206A
AGBL1
SLC1A4
UBA52
ZGPAT
CRELD2
FSD1L
PPAPDC1A
UNC50
LIF
CREBBP
LSR
COPB1
PPIAP11
GLIPR1L2
C4orf46
PGK1
EEFSEC
OR6C70
RARS2
NLN
MSANTD2
SARS2
NPR3
SUN1
RNF224
CPN1
DUS2
OXR1
IGSF10
SNORD15A
NUP43
SLC24A5
PHKG1
HBZ
PGLS
TONSL
SPINK6
AP1AR
ARAP3
WFDC10A
NCF1
UBE2L3
TRIP11
C6orf25
CEP85L
OSTC
CHRM2
LARP1B
RNF5
MEF2A
MAT2B
AKR7L
UBE2MP1
MAST3
NFKB2
SPECC1L
TRBV12-3
ADSSL1
MIR128-1
MGAT1
ALG9-IT1
TAS2R60
SLC24A1
ABHD5
MTRR
ALG6
ZBTB5
SERPINA3
ABLIM3

STARD8
PCYT2
GSTT2
TRIM32
PWWP2A
LOC407835
LUZP4
PCDHB12
FBP2
METTL22
DLAT
DNAJB2
KRTAP10-11
ESCO2
IQCH
AATF
COQ4
SLC37A4
EFCAB3
GAS8
SEMA3A
SFRP4
BRAT1
OTC
RETSAT
SIPA1L1
CXCL12
TMEM207
GLIS1
FAM27E3
FOXA1
ABI3BP
CCDC57
ZC3H3
C2orf48
SLC6A4
GUCY2D
APOOL
ZKSCAN2
ATXN7
UBE2A
RORB
APOBEC3H
SV2B
GSTA2
PTPRN
ADRB1
LMO4
CST3
BAIAP3
GAD1
MINOS1-NBL1
RNA5S16
ZNF415
MIR9-3
ZBED3
GJB1
CD34
CXCR2
LRIG2
GSG1
SERF2-C15ORF63
METAP1D
PMS2P1
HMHA1
CYP8B1
EFNA5
ZSWIM8
ARL11
RASSF4
NCK1
KRT8P3
ZNF425
CRIPT
FABP2
ANKRD30B
KRTAP10-7
PRR23B
GMPS
OR8D4
SRXN1
CYP3A5
GLT8D2
ABTB1
C1QL1
HEATR5A
VCY
ORC2
RAB11FIP1
SAA2
PTGER2
AGPAT1
MIR138-2
KAT6B
SCYL1
GAPDH
IFNA4
NUP107
CGB
GPX4
PAPL
MANSC4
BCL3
SNORD16
TGFBR2
CS
LRRN2
DARS2
TTBK1
RASGRF1
KDM4B
ITM2C
OR9A4
SLC20A1
PAPOLG
C4orf32
HSPB6
IGKV1-27
DLX2
ARNTL
RASGRP2
ZNF519
POM121C
ZNF256
ATF5
NOS3
TTC29
NXPE4
ZNF562
SLC15A3
SLC26A8
RAB29
FBXO9
PRB3
HNRNPA3
SUZ12P1
AKR1C2
ITGA2B
FAM210B
FAM50B
NRG2
MID1IP1
LOC260334
MAPK14
AKR1C1
ITGA9
HYDIN2
PRF1
TSPY1
KIAA0430


CIB3
YAF2
HEPACAM2
RTN3
PDC
DUSP22
PPM1H
ALDH1L2
WNT9B
FGF11
KRBA2
CDK11A
EXOC3L2
KRTAP3-2
REST
KIF18B
GPM6B
NPNT
HTR2B
TGM4
CCDC65
ALDH1A2
UBE2Q2P2
SMPD2
TOM1L1
COG8
NR4A3
LOC400927
MYCL
HDAC6
STK19
SV2A
TBL1Y
C7orf34
CRYM
SASS6
RPL32P18
EPHB4
DEPDC5
ZNF761
RSPH3
LRRTM3
FLJ25613
PLA2G15
EGFR
CACNA1F
CDC42EP5
EML1
DHRS9
CFAP57
SLC18A3
HSD17B1
MPZ
CTAGE8
BRD8
TICAM1
RPH3AL
SLC38A10
FAM166A
PDK3
HNRNPCL2
TRNV
FAM58A
SCYL2
TSC22D4
TMED4
SEC62
SLC25A25
FARS2
CPT1A
RBFOX2
ICT1
VCAM1
CASS4
PLXNC1
MED14
GK2
UBTD2
BCL2L14
RBM39
OSBPL10
QTRT1
CA5B
AGPAT2
MMP7
SLK
TRIM41
NPM2
NXN
BCL9L
ACSL6
RAB5B
FAM219B
CLEC2L
RDH12
POLI
ANG
SOAT1
NDUFA8
COL7A1
TRMT1L
GLRB
UBE2C
ARHGDIG
NOM1
THEMIS2
SH3BP4
TMPRSS12
UGT1A6
CD58
CCDC36
IFT88
TRIB2
RBM47
ADCY6
SCFD2
TSPAN31
CCND3
PACSIN2
EID2B
PROP1
MDP1
HSPA13
OPLAH
CTSH
ZNF585A
HUNK
AMIGO1
UNC5B
CSRNP2
C9orf85
MORF4L1P2
ALDH3B2
SNORD118
HERC2
UNC13B
KRT3
MIR206
ANKRD49
ERLEC1
CSNK2A3
TRMT12
CCDC102B
XPR1
SPACA6P
GPR161
SEC61G
KCNAB3
TNFRSF1B
IRF8
CLSPN
SP7
RTN2

ZNF362
SPAG1
PCDHGA11
SLC17A4
RAB8A
NUBPL
CHST4
SPIRE1
NBPF20
CRNKL1
ZNF214
KAT2B
TCTEX1D1
PYROXD1
SAP30BP
HSPB9
RND1
TBCK
PPM1L
ZC3H8
SEMA4G
PKD2L1
F8A1
SRSF8
PSG2
SAAL1
TMEM234
SLC44A2
SPTBN4
UAP1
ANAPC1
CYP2S1
CAMKK2
TMEM159
REV3L
PTPLB
TEF
MYADM
RALB
CLSTN1
ZNF469
DMPK
MILR1
COX5B
VPS11
APOL4
LY6G6F
CT45A8
SAMD7
CPNE4
TMEM261
FRMD3
CSNK1A1
TRPC2
VEGFB
LOC401068
P2RX6
CCT8L1P
RPA1
TOP3A
C21orf59
ATP11A
FBXO17
RAD50
NR1H3
DOCK5
IPMK
CCDC6
FAM90A1
RASL11B
NAALADL2
SYPL2
ZNF704
MCC
KPNA3
LSP1P3
TTC5
FAM83B
ADCY1
CRYBG3
ZFC3H1
KCNB1
SORL1
TREML1
NDFIP2
IGF2R
SPPL3
NEK1
YBX2
RBL1
GDNF
RECQL5
NUDCD1
TFAP2C
METTL25
FEZ1
ZNF69
CDC42SE1
USP10
GDPD2
SLC7A8
CCDC69
PDK4
SH2D5
PTPN4
H2AFY
LOC101060521
AARS2
TSTA3
PHLDB2
FANCA
DDX55
POLR2J2
GGT5
ZNF112
TP53I13
BSCL2
TSPYL4
EAF2
ACSF2
TTC30B
SLC12A4
ATOH8
CT45A5
CALN1
C9orf91
NDUFB11
RNU2-1
LOC400682
LRRC2
ZNF484
VPS9D1-AS1
TSPAN6
PHLDB1
HNRNPA1
KXD1
GAMT
POLE2
MAPK15
SIMC1
FOXO3
SETD1B
CD300E
RPL36AL
BAG2
TCP10
SULT1A3
MIOS
AGA
C10orf131
CKAP5
SE

DDAH1
TXNIP
RABGGTB
KIAA1586
COX3
FAM129A
PIK3CA
CCDC144NL-AS1
FAM109B
LINC00526
PTPN23
S100A14
ARL13A
SERTAD1
IL26
CEP78
VSIG4
MORN2
DUX4
FCRL5
CNGA1
ATXN7L3B
RNF181
NRN1
LUC7L2
PRPSAP2
DPP8
AHCYL2
PAK6
TULP4
ERG
ZNF440
LRRC8A
LINC00152
ZNF746
SLC22A6
DNAH7
USP15
MRPL27
NETO1
PDCL3
GFM2
SLAMF7
ERI3
ETV3
TDRD5
PRG2
SPATA12
FKBPL
FAM222A
OSBPL9
MMP25
LIN7A
MORF4L2
GOLGA7B
C1orf127
GLI2
ZNF45
SIPA1L2
ZNF783
TGFB1
AGO4
VAMP4
TTN
APOH
EDC4
COA1
RAB37
ACTR5
PRTFDC1
MMAB
CTSO
PLB1
NRL
DHX15
TRIP10
OSGEP
GABRA4
CD93
SLC52A3
SPEF1
KLKB1
DNPH1
COL20A1
DPEP3
ARL4A
UNC119
PSG9
ZNF580
WASH3P
TMUB2
SCUBE2
COL4A3
IZUMO2
GRIK1
SIRPG
KRT1
ERF
GAMTP1
RNA5SP259
ERRFI1
COPS7B
PIWIL4
THRA
CKB
TMEM43
KIF9
ACTR2
TMCO6
MIDN
ABHD17A
CTU2
CA4
SVIL
GOLGA7
KRT7
DIDO1
CHCHD10
ZNF675
PTPRE
SERPINA12
PUF60
POLR3C
PRDM15
SLC26A1
GPN1
RSL1D1
SNAI1
SEC14L2
NDRG2
TAOK2
MKX
PROSER2
NCBP2-AS2
CLK2
PSMG2
CSTF3
TDGF1
PECR
ZFP91-CNTF
PRL
EIF4H
OSBPL11
RBMY1F
ZZEF1
USH2A
SLC27A1
CCR10
CCDC137
MAPK1IP1L
CACUL1
ACMSD
NHLRC1
NQ

LRP5
MAGEA2B
C9orf173
GAST
ARPC2
CDKAL1
RSPH14
PKHD1
CRAMP1L
TMEM39A
ARL15
GP1BA
MYO7A
LRCH2
EXOC8
PHKG2
HCN4
MED19
TRAP
C15orf27
NFIC
GFRA3
KLHL23
TEX14
IFT81
PPAN-P2RY11
ARNTL2
SGSM3
POTEE
ANXA9
IMP3
FCN3
SAYSD1
ZNF85
FSTL1
FMNL3
COX8A
GOLGA1
CLOCK
FAM98B
NDUFA11
OR2T6
DOPEY1
C15orf52
MED26
PPP1R3A
ZNF563
TMX2-CTNND1
FGF22
NOG
UBE2D1
ZKSCAN3
TMEM41B
SNX9
FOXL1
ASXL1
SIRPD
VCP
TRIM34
GFPT2
GSKIP
EP300
PMPCB
HUWE1
LGALS13
HOXD8
BTRC
TSPAN15
POLR2E
PDE6G
ADCYAP1R1
DDX19A
UBE2U
PNCK
LGALS8
KCTD8
JAM3
COL6A4P1
BCOR
SCIN
AP1M2
TNS1
NSMCE4A
QKI
WDR5
GTF2IRD1P1
TBCB
LY6E
LY96
ESPL1
SLC25A13
DCAF13
ATP5J2-PTCD1
PHB
IQCF3
USP9Y
FAHD2A
LYPD3
TFAP2A
BMPR1A
CACNA2D2
USP17L20
BBC3
PRDM13
BTG3
SULT1B1
GABPA
B4GALT4
MED17
MRPL1
ZBTB25
GRK4
WDR82
SEMA3B
TCOF1
MYCN
PEAK1
PPAT
TLCD2
MAPK3
CSAG3
ZFAT
SCN8A
CPSF4
CEBPG
RAX
SSX2B
AMY1B
THAP4
CELA2B
ZNF230
IGHV3-23
SPATA24
GRB7
OR7A10
TTI1
COL11A2
WNK1
CCDC71
GPX8
SMIM14
CHST11
HEATR1
PNLDC1
FRG1
ELAC1
EIF4G2
TIMP1
LGSN
IGKV2-30
TAF3
HOXD11
USP38
SEC31A
MS

LATS1
USP7
SCGB1D2
AHCYL1
APH1B
ACHE
ARHGEF35
ST6GALNAC4
UNC5A
CSTA
HIST1H2BM
TLX3
LYPLA2
UBTF
UACA
ANKRD22
ZNF622
XRCC2
MFSD10
RTN4RL2
SLC2A1
IGHV4-31
PARD3
KRTAP1-3
SPTB
PTPN9
EN1
PYROXD2
MAP7D2
CACNA1G
SLC25A51
SPOCK3
SNRNP25
ZSCAN29
C11orf98
VARS2
LHX6
UTP6
ZMYM3
CACNA1H
FCGR3A
SNRPB2
KLK4
SP8
GRN
NT5C1B
FNDC5
HIST2H2BC
AFAP1L2
CEP250
FAM167A
EMX1
B3GNT2
SNAI2
KRT33B
NPAS3
CCDC28A
RBM20
HDGFRP2
MEX3D
POU2F2
SLC7A5P2
KPNA4
RPS6KA2
OCIAD2
ZNF805
OGFR
ARPC3P1
MESDC1
CCDC170
B3GNTL1
LRPAP1
ZBED1
ZNF843
AP3M2
WFDC2
KLHL9
CDH6
WDR35
S100A7
TES
GJC2
CDRT15L2
SEPHS1
DSCAM
XAGE2
DUSP19
RNF144A
CDYL
EXOSC2
PRPF4B
KLK5
MRPL51
RCVRN
TNFRSF12A
SLC1A5
FBXO28
CLK3
FBXO18
C15orf39
COL8A2
FEM1C
KIR2DL3
CEP57L1
LOC283922
PINX1
SSC5D
CCP110
BNIP3P6
DERA
BAG4
MED20
RNU2-4P
SIGIRR
TAF12
ENTPD3
XDH
ERCC6L2
ADAM11
PPM1D
PDIA2
LAMA2
TRMT61B
CTNND1
NPAS1
SGK1
STAU1
STOML3
MDC1
ABCC8
WHAMM
RSRC2
BSG
ZNF628
NEUROG2
DRAM1
NDEL1
GLTSCR1L
ZNF701
ARHGAP30
PDPK1
FAM188A
LINC00898
TMEM25
ITFG2
HARBI1
ZBTB20
SRGAP3

COLEC10
RPL7P32
TMEM143
DNAJB6
HOXB13
CCL3L1
CRLF1
CYSRT1
SRSF3
ARHGEF7
ZNF830
CYSLTR1
RGMA
EDEM2
LCLAT1
VTRNA1-1
PRKX
ZNF23
ZNF513
OAS3
IGF2BP3
DDI2
CRTAP
PHOX2B
SSX6
BUB3
ZNF844
MMP28
FEZF1
GRIN3A
TOB1
SCARNA3
TRPT1
GOLGA8G
PRRC2A
ARFGAP1
RECQL4
FAM177A1
TK2
CLSTN2
P2RY14
CBX1
LAMTOR3
KIF20B
SLC12A8
IPO13
HSP90AB4P
MPND
SETD2
HSPB8
SLC22A24
WBP2
KCNG3
SOX1
PTX3
SLC15A4
RRP36
FAM122C
MRPS10
THY1
CCNB2
ELAVL3
MCM9
SS18
IGHG3
DYNC1I2
LINC01558
ZNF559
RBCK1
ACADSB
COQ9
RECK
UBE2T
AIFM3
KRT8P39
PDGFD
CCDC103
PICALM
CFC1
CADPS
ZNF732
SCAND2P
CNOT1
SNORD3A
FAM69A
BARD1
FAM72A
FOXP4-AS1
MIS12
DYNLL1
MIR663B
PASD1
SETD9
CERS1
MIR363
RNASEK-C17orf49
MAGEA9
RPL32P3
PET117
FAXC
FAIM
TBC1D3B
ZNF385B
GTF2H2B
COL9A1
TAF7
CMTR1
KCND2
KIF2B
RBAK
KSR2
RNU5E-6P
RPSAP26
WBP2NL
SLC2A2
SLC22A5
TTLL5
APITD1
TMEM184A
NHSL2
IER5
MCM6
NTMT1
FAM135B
FGFR3
PAX8
PRRT2
REXO1
MTMR1
TUBD1
EEPD1
IL12B
NEFM
FRMD5
TFF3
CDCA7L
OAZ2
FAM47B
MCL1
TMED10
EIF4EBP3
CAD
NAV3
ZNF316
TMEM255B
DAP
PCDHGA8
ARMC6
FZD8
ASCC2
VDR
MY

NameError: name 'fix_dic' is not defined

In [37]:
#upload the uniprot fixing file
unigene = pd.read_csv("./uniprot/uniprot-geneid-mapping.tab", sep = '\t')

In [39]:
#create a dictionary that maps symbol with its uniprot
unigene=unigene.rename(columns={"yourlist:M202001056746803381A1F0E0DB47453E0216320D5454DDB": "symbol"})
unigene=pd.Series(unigene.Entry.values, index=unigene.symbol).to_dict()
biogrid_human['UniprotAC interactor A']= biogrid_human['Official Symbol Interactor A'].map(unigene)
biogrid_human['UniprotAC interactor B']= biogrid_human['Official Symbol Interactor B'].map(unigene)
biogrid_human

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database,UniprotAC interactor A,UniprotAC interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,R4GN68,Q14315
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86TC9,P35609
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q53SV1,P49354
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P23769,Q9UE85
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q5TEJ7,Q9BXH2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730424,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86VS8,O43896
1730425,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
1730426,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
1730427,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q9NZM3,Q12929


In [65]:
iid_human = pd.read_csv('iid.txt', sep = '\t')
iid_human.drop(['Unnamed: 0'], axis = 1)

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,Q9NUX5,Q9NVM4,POT1,PRMT7,exp,0
1,P62633,Q99729,CNBP,HNRNPAB,exp,0
2,O43715,O95817,TRIAP1,BAG3,exp,0
3,Q13285,Q9UKM9,NR5A1,RALY,exp,0
4,P60174,Q9H0W5,TPI1,CCDC8,exp,0
...,...,...,...,...,...,...
272487,O60341,Q8NEZ2,KDM1A,VPS37A,exp,0
272488,Q00987,Q8TAQ2,MDM2,SMARCC2,exp,0
272489,P22314,Q5VTR2,UBA1,RNF20,exp,0
272490,Q86X19,Q96J84,TMEM17,KIRREL1,exp,0


In [68]:
#SEED GENES INTERACTOME
def build_first_table(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    t = pd.DataFrame(columns=['interactorA', 'interactorB', 
                                        'interactorA_Uniprot_AC', 'interactorB_Uniprot_AC', 'db_source'])
    for i in range(len(biogrid_human)):
        sa = biogrid_human['Official Symbol Interactor A'][i]
        sb = biogrid_human['Official Symbol Interactor B'][i]
        uniprota = biogrid_human['UniprotAC interactor A'][i]
        uniprotb = biogrid_human['UniprotAC interactor B'][i]
        if sa in geneSymbol and sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}
                         , ignore_index=True)
    for i in range(len(iid_human)):
        sa = iid_human['symbol1'][i]
        sb = iid_human['symbol2'][i]
        uniprota = iid_human['uniprot1'][i]
        uniprotb = iid_human['uniprot2'][i]
        if sa in geneSymbol and sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}
                         , ignore_index=True)
    t.to_csv("seed_genes_interactome.tsv", sep = '\t')

In [69]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
build_first_table(biogrid_human, iid_human)

In [70]:
interactome_seed = pd.read_csv("seed_genes_interactome.tsv", sep = '\t')
interactome_seed.drop(['Unnamed: 0'], axis = 1)

Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC,db_source
0,ILK,DDX3X,Q13418,O00571,Biogrid Human
1,ILK,COPG1,Q13418,Q9Y678,Biogrid Human
2,CDKN2A,CDK6,Q9UPB7,Q00534,Biogrid Human
3,RXRA,CTNNB1,Q6P3U7,P35222,Biogrid Human
4,MUC1,CTNNB1,Q7Z551,P35222,Biogrid Human
...,...,...,...,...,...
475,TXN,CDKN1A,P10599,P38936,Integrated Interactions Database experimental ...
476,TRAF7,TRAF7,Q6Q0C0,Q6Q0C0,Integrated Interactions Database experimental ...
477,RXRA,IL12B,P19793,P29460,Integrated Interactions Database experimental ...
478,C9,FN1,P02748,P02751,Integrated Interactions Database experimental ...


union interactome: all proteins interacting with at least one seed gene, from all DBs, same format as above.

In [71]:
biogrid_human

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database,UniprotAC interactor A,UniprotAC interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,R4GN68,Q14315
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86TC9,P35609
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q53SV1,P49354
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,P23769,Q9UE85
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q5TEJ7,Q9BXH2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485375,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q86VS8,O43896
485376,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
485377,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q12929,Q9NZM3
485378,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,9606,Low Throughput,-,-,-,-,-,BIOGRID,Q9NZM3,Q12929


In [100]:
def build_union_interactome(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    t = pd.DataFrame(columns=['interactorA', 'interactorB', 
                                        'interactorA_Uniprot_AC', 'interactorB_Uniprot_AC', 'db_source'])
    for i in range(len(biogrid_human)):
        sa = biogrid_human['Official Symbol Interactor A'][i]
        sb = biogrid_human['Official Symbol Interactor B'][i]
        uniprota = biogrid_human['UniprotAC interactor A'][i]
        uniprotb = biogrid_human['UniprotAC interactor B'][i]
        if sa in geneSymbol or sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db1}
                         , ignore_index=True)
    for i in range(len(iid_human)):
        sa = iid_human['symbol1'][i]
        sb = iid_human['symbol2'][i]
        uniprota = iid_human['uniprot1'][i]
        uniprotb = iid_human['uniprot2'][i]
        if sa in geneSymbol or sb in geneSymbol:
            t = t.append({'interactorA':sa, 'interactorB':sb, 
                          'interactorA_Uniprot_AC':uniprota, 'interactorB_Uniprot_AC':uniprotb, 'db_source': db2}
                         , ignore_index=True)
    t.to_csv("union_interactome.tsv", sep = '\t')

In [101]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
build_union_interactome(biogrid_human, iid_human)

In [104]:
unionint = pd.read_csv("union_interactome.tsv", sep = '\t')

In [105]:
unionint

Unnamed: 0.1,Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC,db_source
0,0,MAGI1,CTNNB1,H7C5T8,P35222,Biogrid Human
1,1,TSG101,CDKN1A,Q99816,P38936,Biogrid Human
2,2,TXN,COL1A1,P10599,T1RTD8,Biogrid Human
3,3,DCN,EGFR,Q6FH10,Q9H3D0,Biogrid Human
4,4,HTT,SETD2,X5DPA1,Q9BYW2,Biogrid Human
...,...,...,...,...,...,...
25034,25034,LTA,ANKRD40,P01374,Q6AI12,Integrated Interactions Database experimental ...
25035,25035,FN1,TARDBP,P02751,Q13148,Integrated Interactions Database experimental ...
25036,25036,EGFR,PSMD2,P00533,Q13200,Integrated Interactions Database experimental ...
25037,25037,TNF,AGO2,P01375,Q9UKV8,Integrated Interactions Database experimental ...


### intersection interactome: all proteins interacting with at least one seed gene confirmed by both DBs, in the       format: interactor A gene symbol, interactor B gene symbol, interactor A Uniprot AC, interactor B Uniprot AC


In [96]:
def build_intersection_interactome(biogrid_human, iid_human):
    db1 = 'Biogrid Human'
    db2 = 'Integrated Interactions Database experimental data'
    union = pd.read_csv("union_interactome.tsv", sep = '\t')
    
    union_biogrid = union.loc[(union['db_source'] == 'Biogrid Human')]
    union_biogrid = union_biogrid.drop(['Unnamed: 0', 'db_source'], axis = 1)
    
    union_iid = union.loc[(union['db_source'] == 'Integrated Interactions Database experimental data')]
    union_iid = union_iid.drop(['Unnamed: 0', 'db_source'], axis = 1)
    
    intersect = pd.merge(union_biogrid, union_iid)
    intersect.dropna(inplace=True)
    intersect.to_csv("intersection_interactome.tsv", sep = '\t')

In [97]:
biogrid_human=biogrid_human.reset_index(drop=True)
iid_human=iid_human.reset_index(drop=True)
build_intersection_interactome(biogrid_human, iid_human)

In [98]:
intersect = pd.read_csv("intersection_interactome.tsv", sep = '\t')

In [99]:
intersect

Unnamed: 0.1,Unnamed: 0,interactorA,interactorB,interactorA_Uniprot_AC,interactorB_Uniprot_AC
0,0,MIF,GORASP2,P14174,Q9H8Y8
1,1,CDKN1A,TEX11,P38936,Q8IYF3
2,2,CDKN1A,TEX11,P38936,Q8IYF3
3,3,CDKN1A,TEX11,P38936,Q8IYF3
4,4,CDKN1A,CCDC85B,P38936,Q15834
...,...,...,...,...,...
702,702,CTNNB1,DDB1,P35222,Q16531
703,703,CTNNB1,RANBP2,P35222,P49792
704,704,BCL10,COG6,O95999,Q9Y2V7
705,705,CTNNB1,CA9,P35222,Q16790
