In [1]:
import pandas as pd
import numpy as np
import csv 

In [2]:
PATH = "./disgenet/"
filename = "./disgenet/curated_gene_disease_associations.tsv"

In [15]:
df = pd.read_csv(filename, sep = '\t')
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.857,0.172,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,,2015.0,2015.0,1,0,CTD_human
2,2,A2M,0.564,0.724,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.4,0.848485,1998.0,2016.0,3,0,CTD_human
3,2,A2M,0.564,0.724,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human
4,2,A2M,0.564,0.724,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human


In [16]:
print(len(df), df.columns)

81746 Index(['geneId', 'geneSymbol', 'DSI', 'DPI', 'diseaseId', 'diseaseName',
       'diseaseType', 'diseaseClass', 'diseaseSemanticType', 'score', 'EI',
       'YearInitial', 'YearFinal', 'NofPmids', 'NofSnps', 'source'],
      dtype='object')


#find row associated to disease Malignant Mesothelioma with id C0345967

The columns in the files are:
* geneId 		-> NCBI Entrez Gene Identifier
* geneSymbol	-> Official Gene Symbol
* DSI		-> The Disease Specificity Index for the gene
* DPI		-> The Disease Pleiotropy Index for the gene
* diseaseId 	-> UMLS concept unique identifier
* diseaseName 	-> Name of the disease	
* diseaseType  	-> The DisGeNET disease type: disease, phenotype and group
* diseaseClass	-> The MeSH disease class(es)
* diseaseSemanticType	-> The UMLS Semantic Type(s) of the disease
* score		-> DisGENET score for the Gene-Disease association
* EI		-> The Evidence Index for the Gene-Disease association
* YearInitial	-> First time that the Gene-Disease association was reported
* YearFinal	-> Last time that the Gene-Disease association was reported
* NofPmids	-> Total number of publications reporting the Gene-Disease association
* NofSnps		-> Total number of SNPs associated to the Gene-Disease association
* source		-> Original source reporting the Gene-Disease association

In [17]:
df.loc[df['diseaseId'] == 'C0345967']
target = df.loc[df['diseaseName'] == 'Malignant mesothelioma']
target.to_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [8]:
t = pd.read_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [9]:
t = t.drop('Unnamed: 0', axis = 1)
t.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,1,0,CTD_human
1,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,1,0,CTD_human
2,302,ANXA2,0.485,0.793,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2010.0,2010.0,1,0,CTD_human
3,335,APOA1,0.463,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2013.0,2013.0,1,0,CTD_human
4,596,BCL2,0.312,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.31,1.0,2006.0,2010.0,1,0,CTD_human


### Explore the DisGeNet dataset, find the disease of interest and get the list of human genes involved.

In [11]:
curated = pd.read_csv("./disgenet/browser_source_genes_summary_CURATED.tsv", sep = '\t')
curated.head()            

Unnamed: 0,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,pLI,DSI_g,DPI_g,N_diseases,N_SNPs
0,NAT2,10,P11245,N-acetyltransferase 2,transferase,4e-06,0.466,0.828,37,1
1,PARP1,142,P09874,poly(ADP-ribose) polymerase 1,,0.000552,0.432,0.862,66,2
2,ANXA2,302,P07355,annexin A2,,0.000482,0.485,0.793,26,0
3,APOA1,335,P02647,apolipoprotein A1,,0.000549,0.463,0.759,54,15
4,BCL2,596,P10415,"BCL2, apoptosis regulator",signaling molecule,0.56705,0.312,0.862,137,8


In [12]:
#now let's save the gene symbols, entrez ID and names in arrays
geneSymbol=[]
geneID=[]
geneName=[]
for i in range(len(curated)):
    geneSymbol.append(curated['Gene'][i])
    geneID.append(curated['Gene_id'][i])
    geneName.append(curated['Gene_Full_Name'][i])    
#we check on HGNC to see if we need to change genes name

b) For all genes in the seed gene list, collect the following basic information from the Uniprot:

* official (primary) gene symbol (check if the symbols are updated and approved on the HGNC website; report any issue/lack of data/potential misinterpretation)

* Uniprot AC, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’)

* protein name (the main one only, do not report the aliases)

* Entrez Gene ID (a.k.a. ‘GeneID’) very brief description of its function (keep it very short, i.e. max 20 words)

* notes related to the above information, if any and if relevant

Store the data gathered in a table in an easily accessible format of your choice (csv, tab,
excel, etc).

In [13]:
print(geneSymbol)

['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']


In [27]:
#fino a COL12A1 approved

In [28]:
import urllib.request
import time

site = "https://www.genenames.org/tools/search/#!/all?query="
for i in range(0, len(geneSymbol)):
    symbol = geneSymbol[i]
    url = site+symbol
    print(symbol)
    response = urllib.request.urlopen(url)
    print(response)
    time.sleep(2)

NAT2
<http.client.HTTPResponse object at 0x7f2a222b1f98>
PARP1
<http.client.HTTPResponse object at 0x7f2a22c3dcf8>
ANXA2
<http.client.HTTPResponse object at 0x7f2a22c3d8d0>
APOA1
<http.client.HTTPResponse object at 0x7f2a22c3da20>
BCL2
<http.client.HTTPResponse object at 0x7f2a22c3de10>


KeyboardInterrupt: 

In [14]:
# printing original list  
print("The original list is : " + str(geneSymbol))
  
# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(geneSymbol)) 

# deleted quotes to pass the list into site
#https://www.genenames.org/tools/multi-symbol-checker/

The original list is : ['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']
The formatted output is : 
NAT2, 

In [32]:
#all gene symbols were approved by HGNC 

In [15]:
uniprotAC=[]
for i in range(len(curated)):
    uniprotAC.append(curated['UniProt'][i])

In [37]:
results = pd.DataFrame(list(zip(geneSymbol,geneName,geneID,uniprotAC)), columns=['Symbol','Name','ID','UniprotAC'])
results.to_csv('mesothelioma-curated-genes.csv')

In [38]:
results.head()

Unnamed: 0,Symbol,Name,ID,UniprotAC
0,NAT2,N-acetyltransferase 2,10,P11245
1,PARP1,poly(ADP-ribose) polymerase 1,142,P09874
2,ANXA2,annexin A2,302,P07355
3,APOA1,apolipoprotein A1,335,P02647
4,BCL2,"BCL2, apoptosis regulator",596,P10415


### Exercise 1.2

For each seed gene, collect all binary protein interactions from two different PPI sources:
* Biogrid Human, latest release available
* IID Integrated Interactions Database (experimental data only, all tissues, unless stated otherwise in further instruction)

Note: once you got the list of the proteins interacting with at least one seed gene, you must
also retrieve and include in your interactome the interactions among these non-seed
proteins

In [3]:
#open biogrid DB
biogrid=pd.read_csv('BIOGRID-ALL-3.5.179.tab2.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
biogrid.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [43]:
len(biogrid)

1730436

In [5]:
#let's select what we want from the biogrid dataset.
#select only human PPI
biogrid.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')

In [47]:
#I took the interactions for NAT2 gene in humans 
example = pd.read_csv('./biogrid/BIOGRID-GENE-106528-3.5.179.tab2.txt', sep='\t')
example.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,739497,10,351,106528,106848,-,-,NAT2,APP,AAC2|NAT-2|PNAT,...,21832049,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1,1180805,10,56910,106528,121238,-,-,NAT2,STARD7,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.99999353,-,-,Interaction also contained in second paper aft...,-,BIOGRID
2,1180806,10,23567,106528,117109,-,-,NAT2,ZNF346,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.999988426,-,-,Interaction also contained in second paper aft...,-,BIOGRID
3,1180807,10,25818,106528,117346,-,UNQ570/PRO1132,NAT2,KLK5,AAC2|NAT-2|PNAT,...,26186194,9606,9606,High Throughput,0.99345277,-,-,Interaction also contained in second paper aft...,-,BIOGRID
4,1417575,324,10,106821,106528,-,-,APC,NAT2,BTPS2|DP2|DP2.5|DP3|GS|PPP1R46,...,25640309,9606,9606,High Throughput,-,-,-,-,-,BIOGRID


In [6]:
#select only human PPI
biogrid_human=biogrid.loc[(biogrid['Organism Interactor A']==9606) & (biogrid['Organism Interactor B']==9606)]
biogrid_human

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730424,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730425,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730426,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1730427,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [16]:
biogrid_seed_genes = biogrid_human.loc[(biogrid_human['Official Symbol Interactor A'].isin(geneSymbol)) | (biogrid_human['Official Symbol Interactor B'].isin(geneSymbol))]
biogrid_seed_genes

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
30,3024,9223,1499,114655,107880,RP11-88H12.2,OK/SW-cl.35,MAGI1,CTNNB1,AIP-3|AIP3|BAIAP1|BAP-1|BAP1|MAGI-1|Magi1d|TNR...,...,10772923,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
32,3224,7251,1026,113102,107460,-,-,TSG101,CDKN1A,TSG10|VPS23,...,11943869,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
54,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
63,4867,1634,1956,108002,108276,-,-,DCN,EGFR,CSCD|DSPG2|PG40|PGII|PGS2|SLRR1B,...,12105206,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
131,7994,3064,29072,109314,118845,-,HSPC069,HTT,SETD2,HD|IT15,...,9700202,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729703,2620953,636,302,107105,106799,RP11-817I4.1,-,BICD1,ANXA2,BICD,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1729866,2621116,23299,302,116891,106799,RP11-476B13.3,-,BICD2,ANXA2,SMALED2|bA526D8.1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1729988,2621238,23299,1654,116891,108020,RP11-476B13.3,-,BICD2,DDX3X,SMALED2|bA526D8.1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID
1730086,2621336,51361,302,119496,106799,-,-,HOOK1,ANXA2,HK1,...,28718761,9606,9606,High Throughput,-,-,-,-,-,BIOGRID


In [55]:
#from our seed interaction let's search non-seeds genes that interacts with a seed gene

In [17]:
non_seed_df_A = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
#B is a seed gene
seed_B = non_seed_df_A .loc[(non_seed_df_A ['Official Symbol Interactor B'].isin(geneSymbol))]

In [18]:
non_seed_df_B = biogrid_seed_genes.loc[~(biogrid_seed_genes['Official Symbol Interactor B'].isin(geneSymbol))]
#A is a seed gene
seed_A = non_seed_df_B.loc[(non_seed_df_B ['Official Symbol Interactor A'].isin(geneSymbol))]

In [19]:
non_seed_df_A=non_seed_df_A.reset_index(drop=True)
non_seed_df_B=non_seed_df_B.reset_index(drop=True)

In [20]:
non_seed_df_B

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9448,2616344,3611,6386,109824,112287,-,-,ILK,SDCBP,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9449,2616345,3611,3987,109824,110175,-,-,ILK,LIMS1,HEL-S-28|ILK-1|ILK-2|P59|p59ILK,...,26172215,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9450,2616467,7157,5888,113010,111825,-,-,TP53,RAD51,BCC7|LFS1|P53|TRP53,...,11948396,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
9451,2616562,1499,6925,107880,112787,OK/SW-cl.35,-,CTNNB1,TCF4,CTNNB|MRD19|armadillo,...,19114997,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [21]:
non_seed_list=[]
#fill the list
for i in range(0, len(non_seed_df_A)):
    non_seed_list.append(non_seed_df_A['Official Symbol Interactor A'][i])

In [22]:
for i in range(0, len(non_seed_df_B)):
    non_seed_list.append(non_seed_df_B['Official Symbol Interactor B'][i])

In [23]:
#drop duplicates
non_seed_list = list(dict.fromkeys(non_seed_list))

In [24]:
for i in range(0, len(geneSymbol)):
    for j in range(0, len(non_seed_list)):
        if(geneSymbol[i] == non_seed_list[j]):
            print("Something is wrong")
print("Ok")

Ok


In [31]:
print(len(non_seed_list))

4945


In [25]:
#now search for non seed interactions from the human DB
#reset index
biogrid_human=biogrid_human.reset_index(drop=True)
#create a list of index of the original matrix with non seed genes interacting each others
list_of_idx=[]
#fill the list
for i in range(len(biogrid_human)):
    if biogrid_human['Official Symbol Interactor A'][i] in non_seed_list and biogrid_human['Official Symbol Interactor B'][i] in non_seed_list:
        list_of_idx.append(i)

In [57]:
#biogrid_seed_genes
biogrid_non_seed = biogrid_human.loc[list_of_idx]
biogrid_non_seed = biogrid_non_seed.reset_index(drop=True)
biogrid_non_seed

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,586,375,23163,106870,116775,-,-,ARF1,GGA3,-,...,10747089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,917,333,1600,106830,107970,-,RP6-239D12.2,APLP1,DAB1,APLP,...,10460257,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,1161,2033,7020,108347,112878,RP1-85F18.1,RP1-290I10.1,EP300,TFAP2A,KAT3B|RSTS2|p300,...,12586840,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229600,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229601,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229602,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
229603,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [63]:
#make a table in which interactor 1 is seed and interactor 2 can be seed or non-seed
biogrid_seed_df = biogrid_seed_genes.loc[(biogrid_seed_genes['Official Symbol Interactor A'].isin(geneSymbol))]
interactome = pd.concat([biogrid_seed_df, biogrid_non_seed], ignore_index=True)
interactome.to_csv("interactome-biogrid.txt", sep='\t')

In [64]:
interactome = pd.read_csv("interactome-biogrid.txt", sep= '\t')
interactome = interactome.drop(['Unnamed: 0'], axis = 1)
interactome

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,3956,7295,1277,113146,107674,RP11-427L11.1,-,TXN,COL1A1,TRDX|TRX|TRX1,...,12099690,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,8799,3552,4692,109768,110772,-,-,IL1A,NDN,IL-1A|IL1|IL1-ALPHA|IL1F1,...,12913118,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,9270,596,8678,107068,114226,-,-,BCL2,BECN1,Bcl-2|PPP1R50,...,9765397,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,12266,335,5136,106832,111162,-,-,APOA1,PDE1A,-,...,11991719,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,13649,23286,5590,116884,111576,-,RP11-181G12.1,WWC1,PRKCZ,HBEBP3|HBEBP36|KIBRA|MEMRYQTL|PPP1R168,...,15081397,9606,9606,Low Throughput,-,-,-,figure 1,-,BIOGRID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239447,2621674,84376,10749,124068,115972,-,-,HOOK3,KIF1C,HK3,...,28718761,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239448,2621675,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239449,2621676,2059,50618,108373,119098,-,-,EPS8,ITSN2,DFNB102,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
239450,2621677,50618,2059,119098,108373,-,-,ITSN2,EPS8,PRO2015|SH3D1B|SH3P18|SWA|SWAP,...,22449706,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [32]:
iid = pd.read_csv('human_annotated_PPIs.txt', sep='\t')
iid

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,methods,pmids,dbs,evidence type,adipose tissue,adrenal gland,...,arteriosclerosis,lymphocytic leukemia,enzymes,ion channels,receptors,transporters,drug targets,targeting drugs,orthologs are drug targets,drugs targeting orthologs
0,Q9NUX5,Q9NVM4,POT1,PRMT7,bimolecular fluorescence complementation;two h...,21044950,biogrid;intact,exp,0,0,...,0,0,1,0,0,0,0,-,0,-
1,Q96JY6,Q9NPC6,PDLIM2,MYOZ2,-,21836163,iid-pred,pred,0,1,...,0,0,0,0,0,0,0,-,0,-
2,Q15414,Q32P51,-,HNRNPA1L2,-,23023127,iid-pred,pred,?,?,...,0,0,0,0,0,0,0,-,0,-
3,P62633,Q99729,CNBP,HNRNPAB,affinity chromatography technology;anti bait c...,17353931,biogrid;intact,exp,1,1,...,0,0,0,0,0,0,0,-,0,-
4,P29353,P42685,SHC1,FRK,-,25402006,iid-pred,pred,0,0,...,0,0,1,0,0,0,1,Dasatinib;Regorafenib,0,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975872,O00506,P06239,STK25,LCK,-,23023127,iid-pred,pred,1,0,...,0,0,1,0,0,0,1,{4-[(2S)-2-Acetamido-3-({(1S)-1-[3-carbamoyl-4...,0,-
975873,P13500,P51681,CCL2,CCR5,biochemical;competition binding,10477718;16082366;21836163;25402006,dip;hprd;iid-pred;innatedb,exp;pred,1,1,...,1,0,1,0,1,0,1,AMD-070;Ibalizumab;INCB-9471;Maraviroc;Vicrivi...,0,-
975874,Q86X19,Q96J84,TMEM17,KIRREL1,bioid;proximity-dependent biotin identification,26638075,biogrid;intact,exp,0,1,...,0,0,0,0,0,0,0,-,0,-
975875,P40227,Q7Z6Z7,CCT6A,HUWE1,affinity chromatography technology,25147182,biogrid,exp,1,1,...,0,0,1,0,0,0,0,-,0,-


In [52]:
#select only rows with evidence type exp, drop some columns.
iid = iid.loc[iid['evidence type'] == 'exp']
icols = ['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']
iid = iid.reset_index(drop=True)
iid = iid[['uniprot1', 'uniprot2', 'symbol1', 'symbol2' , 'evidence type', 'cancer']]

In [54]:
iid

Unnamed: 0,uniprot1,uniprot2,symbol1,symbol2,evidence type,cancer
0,Q9NUX5,Q9NVM4,POT1,PRMT7,exp,0
1,P62633,Q99729,CNBP,HNRNPAB,exp,0
2,O43715,O95817,TRIAP1,BAG3,exp,0
3,Q13285,Q9UKM9,NR5A1,RALY,exp,0
4,P60174,Q9H0W5,TPI1,CCDC8,exp,0
...,...,...,...,...,...,...
272487,O60341,Q8NEZ2,KDM1A,VPS37A,exp,0
272488,Q00987,Q8TAQ2,MDM2,SMARCC2,exp,0
272489,P22314,Q5VTR2,UBA1,RNF20,exp,0
272490,Q86X19,Q96J84,TMEM17,KIRREL1,exp,0


In [56]:
iid.to_csv('iid.txt', sep = '\t')

In [48]:
#SAME PROCEDURE. 