In [10]:
import pandas as pd
import numpy as np
import csv 

In [14]:
PATH = "./disgenet/"
filename = "./disgenet/curated_gene_disease_associations.tsv"

In [15]:
df = pd.read_csv(filename, sep = '\t')
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.857,0.172,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,,2015.0,2015.0,1,0,CTD_human
2,2,A2M,0.564,0.724,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.4,0.848485,1998.0,2016.0,3,0,CTD_human
3,2,A2M,0.564,0.724,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human
4,2,A2M,0.564,0.724,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human


In [16]:
print(len(df), df.columns)

81746 Index(['geneId', 'geneSymbol', 'DSI', 'DPI', 'diseaseId', 'diseaseName',
       'diseaseType', 'diseaseClass', 'diseaseSemanticType', 'score', 'EI',
       'YearInitial', 'YearFinal', 'NofPmids', 'NofSnps', 'source'],
      dtype='object')


#find row associated to disease Malignant Mesothelioma with id C0345967

The columns in the files are:
* geneId 		-> NCBI Entrez Gene Identifier
* geneSymbol	-> Official Gene Symbol
* DSI		-> The Disease Specificity Index for the gene
* DPI		-> The Disease Pleiotropy Index for the gene
* diseaseId 	-> UMLS concept unique identifier
* diseaseName 	-> Name of the disease	
* diseaseType  	-> The DisGeNET disease type: disease, phenotype and group
* diseaseClass	-> The MeSH disease class(es)
* diseaseSemanticType	-> The UMLS Semantic Type(s) of the disease
* score		-> DisGENET score for the Gene-Disease association
* EI		-> The Evidence Index for the Gene-Disease association
* YearInitial	-> First time that the Gene-Disease association was reported
* YearFinal	-> Last time that the Gene-Disease association was reported
* NofPmids	-> Total number of publications reporting the Gene-Disease association
* NofSnps		-> Total number of SNPs associated to the Gene-Disease association
* source		-> Original source reporting the Gene-Disease association

In [17]:
df.loc[df['diseaseId'] == 'C0345967']
target = df.loc[df['diseaseName'] == 'Malignant mesothelioma']
target.to_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [18]:
t = pd.read_csv(PATH+"malignant_mesothelioma_curated_genes.tsv", sep = '\t')

In [19]:
t = t.drop('Unnamed: 0', axis = 1)
t.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,1,0,CTD_human
1,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,1,0,CTD_human
2,302,ANXA2,0.485,0.793,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2010.0,2010.0,1,0,CTD_human
3,335,APOA1,0.463,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.3,,2013.0,2013.0,1,0,CTD_human
4,596,BCL2,0.312,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.31,1.0,2006.0,2010.0,1,0,CTD_human


### Explore the DisGeNet dataset, find the disease of interest and get the list of human genes involved.

In [21]:
#now let's save the gene symbols, entrez ID and names in arrays
symbol=[]
entrezID=[]
for i in range(len(t)):
#     print(disgenet['Gene'][i]+" ")
    symbol.append(t['geneSymbol'][i])
    entrezID.append(t['geneId'][i]
#we check on HGNC to see if we need to change genes name

In [23]:
curated = pd.read_csv("./disgenet/browser_source_genes_summary_CURATED.tsv", sep = '\t')
curated.head()            

Unnamed: 0,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,pLI,DSI_g,DPI_g,N_diseases,N_SNPs
0,NAT2,10,P11245,N-acetyltransferase 2,transferase,4e-06,0.466,0.828,37,1
1,PARP1,142,P09874,poly(ADP-ribose) polymerase 1,,0.000552,0.432,0.862,66,2
2,ANXA2,302,P07355,annexin A2,,0.000482,0.485,0.793,26,0
3,APOA1,335,P02647,apolipoprotein A1,,0.000549,0.463,0.759,54,15
4,BCL2,596,P10415,"BCL2, apoptosis regulator",signaling molecule,0.56705,0.312,0.862,137,8


In [24]:
#now let's save the gene symbols, entrez ID and names in arrays
geneSymbol=[]
geneID=[]
geneName=[]
for i in range(len(curated)):
    geneSymbol.append(curated['Gene'][i])
    geneID.append(curated['Gene_id'][i])
    geneName.append(curated['Gene_Full_Name'][i])    
#we check on HGNC to see if we need to change genes name

b) For all genes in the seed gene list, collect the following basic information from the Uniprot:

* official (primary) gene symbol (check if the symbols are updated and approved on the HGNC website; report any issue/lack of data/potential misinterpretation)

* Uniprot AC, alphanumeric ‘accession number’ (a.k.a. ’Uniprot entry’)

* protein name (the main one only, do not report the aliases)

* Entrez Gene ID (a.k.a. ‘GeneID’) very brief description of its function (keep it very short, i.e. max 20 words)

* notes related to the above information, if any and if relevant

Store the data gathered in a table in an easily accessible format of your choice (csv, tab,
excel, etc).

In [26]:
print(geneSymbol)

['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']


In [27]:
#fino a COL12A1 approved

In [28]:
import urllib.request
import time

site = "https://www.genenames.org/tools/search/#!/all?query="
for i in range(0, len(geneSymbol)):
    symbol = geneSymbol[i]
    url = site+symbol
    print(symbol)
    response = urllib.request.urlopen(url)
    print(response)
    time.sleep(2)

NAT2
<http.client.HTTPResponse object at 0x7f2a222b1f98>
PARP1
<http.client.HTTPResponse object at 0x7f2a22c3dcf8>
ANXA2
<http.client.HTTPResponse object at 0x7f2a22c3d8d0>
APOA1
<http.client.HTTPResponse object at 0x7f2a22c3da20>
BCL2
<http.client.HTTPResponse object at 0x7f2a22c3de10>


KeyboardInterrupt: 

In [31]:
# printing original list  
print("The original list is : " + str(geneSymbol))
  
# using join() 
# avoiding printing last comma 
print("The formatted output is : ") 
print(', '.join(geneSymbol)) 

# deleted quotes to pass the list into site
#https://www.genenames.org/tools/multi-symbol-checker/

The original list is : ['NAT2', 'PARP1', 'ANXA2', 'APOA1', 'BCL2', 'C9', 'CALB2', 'TNFRSF8', 'CDK6', 'CDKN1A', 'CDKN2A', 'COL12A1', 'CSF1', 'CTNNB1', 'DDX3X', 'EGFR', 'EPHX1', 'F9', 'EFEMP1', 'FCN2', 'FGF9', 'MLANA', 'FN1', 'FTH1', 'GPR27', 'GPR37', 'GSTM1', 'HGF', 'ICAM2', 'IFNG', 'IL1A', 'IL2RA', 'IL3', 'IL4', 'IL6', 'CXCL8', 'IL12B', 'ILK', 'KIT', 'LTA', 'MCAM', 'MDK', 'MET', 'MIF', 'CXCL9', 'MUC1', 'NF2', 'NGF', 'SERPINA4', 'PTPRF', 'RAF1', 'RXRA', 'RYR2', 'CLEC11A', 'CCL3', 'CCL5', 'CCL7', 'CCL23', 'SDC1', 'CXCL12', 'SLC2A1', 'SLC22A5', 'SMARCA2', 'SOD2', 'TF', 'TFDP2', 'TGFB2', 'TP53', 'TXN', 'VIM', 'WNT3', 'WT1', 'BAP1', 'FGF18', 'BCL10', 'MTMR4', 'CCNE2', 'OSMR', 'ADAMTS2', 'ULK2', 'HDAC4', 'HEPH', 'SETDB1', 'MSLN', 'ENOX2', 'SEMA4F', 'CXCL13', 'PDPN', 'IGF2BP3', 'CCL27', 'COPG1', 'LIMCH1', 'WWC1', 'SF3B1', 'CFAP45', 'AGO1', 'AGO2', 'SETD2', 'PYCARD', 'PLLP', 'FGD6', 'TBL1XR1', 'TRAF7', 'DDX51', 'NANOS1', 'MIR125A', 'MIR126', 'MIR484', 'PWAR6']
The formatted output is : 
NAT2, 

In [32]:
#all gene symbols were approved by HGNC 

In [36]:
uniprotAC=[]
for i in range(len(curated)):
    uniprotAC.append(curated['UniProt'][i])

In [37]:
results = pd.DataFrame(list(zip(geneSymbol,geneName,geneID,uniprotAC)), columns=['Symbol','Name','ID','UniprotAC'])
results.to_csv('mesothelioma-curated-genes.csv')

In [38]:
results.head()

Unnamed: 0,Symbol,Name,ID,UniprotAC
0,NAT2,N-acetyltransferase 2,10,P11245
1,PARP1,poly(ADP-ribose) polymerase 1,142,P09874
2,ANXA2,annexin A2,302,P07355
3,APOA1,apolipoprotein A1,335,P02647
4,BCL2,"BCL2, apoptosis regulator",596,P10415


### Exercise 1.2