In [20]:
import pandas as pd
import re

## 1.1) Explore information sources and compile the seed gene list:
a) Get the list of human genes (i.e. the **seed list**) involved in the disease **Cardiomyopathy, Dilated** from the dataset *“Curated gene-disease associations”* (from *https://www.disgenet.org/downloads*)

In [3]:
root = "C:/Users/clara/Documents/Bio/"
path = "C:/Users/clara/Documents/Bio/curated_gene_disease_associations.tsv"
curated_gene_desease_association_DF = pd.read_csv(path, sep = '\t', compression = 'infer')

In [4]:
cardio_DF = curated_gene_desease_association_DF.loc[curated_gene_desease_association_DF['diseaseId'] == 'C0007193']
print('Number of detected genes involved in the desease "Cardiomyopathy, Dilated": ', len(cardio_DF))
cardio_DF.head(5)

Number of detected genes involved in the desease "Cardiomyopathy, Dilated":  48


Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
502,58,ACTA1,0.54,0.769,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.4,1.0,2006.0,2013.0,2,0,GENOMICS_ENGLAND
610,70,ACTC1,0.61,0.538,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.65,1.0,2006.0,2019.0,0,3,CTD_human
1414,153,ADRB1,0.555,0.769,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.58,1.0,1998.0,2019.0,1,0,CTD_human
1444,154,ADRB2,0.442,0.923,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.51,1.0,2002.0,2008.0,1,0,CTD_human
3663,355,FAS,0.372,0.923,C0007193,"Cardiomyopathy, Dilated",group,C14,Disease or Syndrome,0.51,1.0,1999.0,2007.0,1,0,CTD_human


In [5]:
seed_list = list(cardio_DF['geneId'])

b) Check if gene symbols for all genes in the seed gene list are updated and approved on the *HGNC* website

In [6]:
cardio_DF['geneSymbol'].to_csv('gene_symbols.csv', index = False, header = False, line_terminator = ',')
gene_symbols_list = list(cardio_DF['geneSymbol'])
print(gene_symbols_list)

['ACTA1', 'ACTC1', 'ADRB1', 'ADRB2', 'FAS', 'FASLG', 'ATM', 'CD36', 'CSF3', 'NKX2-5', 'CTNNB1', 'DMD', 'EGFR', 'FASN', 'GPX1', 'ITGB1', 'LMNA', 'NR3C2', 'MYH6', 'MYH7', 'NPPA', 'NPPB', 'PKP2', 'PSEN1', 'PSEN2', 'RAC1', 'RAF1', 'RENBP', 'SCN5A', 'SDHA', 'SGCB', 'SHBG', 'SLC22A5', 'SOD2', 'TCF7L2', 'TMPO', 'TNNI3', 'TNNT2', 'TTN', 'UCP1', 'ALMS1', 'AXIN2', 'ABCC9', 'WDR12', 'CSRNP1', 'ABRA', 'SIK1', 'CAVIN4']


In [7]:
cardio_DF['geneId'].to_csv('geneId.csv', index = False, header = False, line_terminator = ',')

The multi-symbol checker on the *HGNC* database reports that all the gene symbols in our seed list are updated and approved. Three of them match both an approved and an alias symbol, namely ***FAS***, ***RAC1*** and ***RAF1***

For each protein in our seed list we want to collect the following information from the *Uniprot* website:
* official (primary) **gene symbol** --> *Gene names (primary)*
* **Uniprot AC**, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’) --> *Entry*
* **protein name** (the main one only, do not report the aliases)
* **Entrez Gene ID** (a.k.a. ‘GeneID’) --> *geneID* from disgenet
* very brief description of its function (keep it very short, i.e. max 20 words)
* notes related to the above information, if any and if relevant

**NOTE**: With regards to the gene symbol **TMPO** only the entry corresponding to the protein *Thymopoietin, isoforms alpha* (P42166) has been kept since its information on the *HGNG* only refer to this one and not to the isoform beta/gamma

In [8]:
path = root + "uniprot-list-with-ids.csv"
uniprot_DF = pd.read_csv(path, sep = '\t')
uniprot_DF['ProteinName'] = uniprot_DF['ProteinName'].str.split(r"\(|\[").str[0]

In [33]:
uniprot_DF[['GeneName','UniprotAC', 'ProteinName', 'GeneId']].head(5)

Unnamed: 0,GeneName,UniprotAC,ProteinName,GeneId
0,ACTA1,P68133,"Actin, alpha skeletal muscle",58
1,ACTC1,P68032,"Actin, alpha cardiac muscle 1",70
2,ADRB1,P08588,Beta-1 adrenergic receptor,153
3,ADRB2,P07550,Beta-2 adrenergic receptor,154
4,FAS,P25445,Tumor necrosis factor receptor superfamily mem...,355


## 1.2) Collect interaction data

a) For each seed gene, collect all binary protein interactions from the *Biogrid Human*.

In [39]:
biogrid_path = root + "BIOGRID.tab3.txt"
biogrid_full_DF = pd.read_csv(biogrid_path, sep = '\t')
biogrid_full_DF.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,-,-,-,-,-,-,Homo sapiens,Homo sapiens
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,-,NP_002018,-,-,-,-,-,-,Homo sapiens,Homo sapiens
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,-,NP_150250|NP_150253|NP_150252|NP_150247|NP_150...,-,-,-,-,-,-,Homo sapiens,Homo sapiens
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,-,NP_644805|NP_003141|NP_001356447|NP_001356443|...,-,-,-,-,-,-,Homo sapiens,Homo sapiens


In [66]:
biogrid_seed_genes_interactions = biogrid_full_DF[((biogrid_full_DF['Official Symbol Interactor A'].isin(gene_symbols_list)) | (biogrid_full_DF['Official Symbol Interactor B'].isin(gene_symbols_list)) ) & (biogrid_full_DF['Organism Name Interactor A'] == 'Homo sapiens') & (biogrid_full_DF['Organism Name Interactor B'] == 'Homo sapiens')]

not_seed_genes = set(biogrid_seed_genes_interactions['Official Symbol Interactor A']).union(set(biogrid_seed_genes_interactions['Official Symbol Interactor B']))
not_seed_genes.difference_update(gene_symbols_list)

In [70]:
biogrid_seed_genes_interactions.shape

(12075, 37)

In [67]:
len(not_seed_genes)

4885

Select the interactions among the non-seed genes from the full dataset

In [69]:
biogrid_non_seed_genes_interactions = biogrid_full_DF[((biogrid_full_DF['Official Symbol Interactor A'].isin(not_seed_genes)) | (biogrid_full_DF['Official Symbol Interactor B'].isin(not_seed_genes)) ) & (biogrid_full_DF['Organism Name Interactor A'] == 'Homo sapiens') & (biogrid_full_DF['Organism Name Interactor B'] == 'Homo sapiens')]
biogrid_non_seed_genes_interactions.shape

(539688, 37)