In [1]:
import pandas as pd
import numpy as np
import csv 
import os

In [2]:
# Directory holding the DisGeNET files; cells build paths as PATH + <file name>, so the trailing slash is required.
PATH = "./disgenet/"
# Name of the tab-separated gene-disease association file that is loaded into `df`.
filename = "all_gene_disease_associations.tsv"

In [3]:
# Load the full gene-disease association table (tab-separated) and preview its first rows.
associations_path = PATH + filename
df = pd.read_csv(associations_path, sep="\t")
df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.857,0.172,C0001418,Adenocarcinoma,group,C04,Neoplastic Process,0.01,,2008.0,2008.0,1,0,LHGDN
1,1,A1BG,0.857,0.172,C0002736,Amyotrophic Lateral Sclerosis,disease,C10;C18,Disease or Syndrome,0.01,1.0,2009.0,2009.0,1,0,BEFREE
2,1,A1BG,0.857,0.172,C0013080,Down Syndrome,disease,C10;C16,Disease or Syndrome,0.01,1.0,2011.0,2011.0,1,0,BEFREE
3,1,A1BG,0.857,0.172,C0017636,Glioblastoma,disease,C04,Neoplastic Process,0.01,1.0,2014.0,2014.0,1,0,BEFREE
4,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human


In [4]:
# Report the number of loaded associations followed by the table's column index.
n_rows = df.shape[0]
print(n_rows, df.columns)

628685 Index(['geneId', 'geneSymbol', 'DSI', 'DPI', 'diseaseId', 'diseaseName',
       'diseaseType', 'diseaseClass', 'diseaseSemanticType', 'score', 'EI',
       'YearInitial', 'YearFinal', 'NofPmids', 'NofSnps', 'source'],
      dtype='object')


# Find the rows associated with the disease Malignant mesothelioma (diseaseId C0345967)

The columns in the file are:
* geneId 		-> NCBI Entrez Gene Identifier
* geneSymbol	-> Official Gene Symbol
* DSI		-> The Disease Specificity Index for the gene
* DPI		-> The Disease Pleiotropy Index for the gene
* diseaseId 	-> UMLS concept unique identifier
* diseaseName 	-> Name of the disease	
* diseaseType  	-> The DisGeNET disease type: disease, phenotype and group
* diseaseClass	-> The MeSH disease class(es)
* diseaseSemanticType	-> The UMLS Semantic Type(s) of the disease
* score		-> DisGeNET score for the Gene-Disease association
* EI		-> The Evidence Index for the Gene-Disease association
* YearInitial	-> First time that the Gene-Disease association was reported
* YearFinal	-> Last time that the Gene-Disease association was reported
* NofPmids	-> Total number of publications reporting the Gene-Disease association
* NofSnps		-> Total number of SNPs associated with the Gene-Disease association
* source		-> Original source reporting the Gene-Disease association

In [6]:
# Boolean mask for the associations whose diseaseId equals 'C0345967', then show the matching rows.
has_target_id = df["diseaseId"] == "C0345967"
df[has_target_id]

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
343,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,4,0,BEFREE;CTD_human
1871,31,ACACA,0.552,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2014.0,2014.0,1,0,BEFREE
6612,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,2,0,BEFREE;CTD_human
8831,182,JAG1,0.467,0.724,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2001.0,2001.0,1,0,BEFREE
11117,207,AKT1,0.355,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2015.0,2015.0,1,0,BEFREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617173,619553,MIR484,0.752,0.241,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.30,,2015.0,2015.0,1,0,CTD_human
619069,693210,MIR625,0.681,0.172,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2012.0,2012.0,1,0,BEFREE
624677,100506965,PWAR6,1.000,0.069,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.30,,2015.0,2015.0,1,0,CTD_human
626381,100862685,ERVK-19,0.573,0.724,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2015.0,2015.0,1,0,BEFREE


In [8]:
# Keep only the associations whose diseaseName is exactly 'Malignant mesothelioma'.
has_target_name = df["diseaseName"] == "Malignant mesothelioma"
target = df[has_target_name]

In [9]:
# Save the filtered associations as a tab-separated file in the data directory.
# The row index is written as well (to_csv default), producing an unnamed first column.
mesothelioma_tsv = PATH + "malignant_mesothelioma_genes.tsv"
target.to_csv(mesothelioma_tsv, sep="\t")

In [10]:
# Reload the saved mesothelioma subset; read_table parses tab-separated text by default.
t = pd.read_table(PATH + "malignant_mesothelioma_genes.tsv")

In [13]:
# Remove the 'Unnamed: 0' column, i.e. the old row index that the TSV round-trip
# turned into a regular column. errors="ignore" makes the cell safe to re-run:
# once the column is gone, a second execution is a no-op instead of a KeyError.
t = t.drop(columns="Unnamed: 0", errors="ignore")
t.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,10,NAT2,0.466,0.828,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.34,0.5,1995.0,2009.0,4,0,BEFREE;CTD_human
1,31,ACACA,0.552,0.759,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2014.0,2014.0,1,0,BEFREE
2,142,PARP1,0.432,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.32,1.0,2011.0,2011.0,2,0,BEFREE;CTD_human
3,182,JAG1,0.467,0.724,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2001.0,2001.0,1,0,BEFREE
4,207,AKT1,0.355,0.862,C0345967,Malignant mesothelioma,disease,C04;C08,Neoplastic Process,0.01,1.0,2015.0,2015.0,1,0,BEFREE


### Explore the DisGeNET dataset, find the disease of interest, and get the list of human genes involved.

In [19]:
# Plain Python list with one gene symbol per row of `t`.
# (Wrapping the column in [...] would instead give a one-element list holding the whole Series.)
genes = t.loc[:, 'geneSymbol'].tolist()

In [21]:
# Build a {geneId: geneSymbol} lookup by pairing the two columns row by row;
# if a geneId occurs more than once, the symbol from its last row is kept.
genes = dict(zip(t["geneId"], t["geneSymbol"]))

In [22]:
# Display the geneId -> geneSymbol dictionary.
genes

{10: 'NAT2',
 31: 'ACACA',
 142: 'PARP1',
 182: 'JAG1',
 207: 'AKT1',
 214: 'ALCAM',
 238: 'ALK',
 302: 'ANXA2',
 324: 'APC',
 335: 'APOA1',
 367: 'AR',
 445: 'ASS1',
 467: 'ATF3',
 595: 'CCND1',
 596: 'BCL2',
 598: 'BCL2L1',
 650: 'BMP2',
 672: 'BRCA1',
 735: 'C9',
 794: 'CALB2',
 834: 'CASP1',
 841: 'CASP8',
 847: 'CAT',
 894: 'CCND2',
 943: 'TNFRSF8',
 959: 'CD40LG',
 960: 'CD44',
 963: 'CD53',
 978: 'CDA',
 999: 'CDH1',
 1001: 'CDH3',
 1017: 'CDK2',
 1019: 'CDK4',
 1021: 'CDK6',
 1026: 'CDKN1A',
 1027: 'CDKN1B',
 1029: 'CDKN2A',
 1030: 'CDKN2B',
 1033: 'CDKN3',
 1303: 'COL12A1',
 1385: 'CREB1',
 1435: 'CSF1',
 1437: 'CSF2',
 1440: 'CSF3',
 1465: 'CSRP1',
 1490: 'CTGF',
 1499: 'CTNNB1',
 1523: 'CUX1',
 1543: 'CYP1A1',
 1588: 'CYP19A1',
 1612: 'DAPK1',
 1633: 'DCK',
 1654: 'DDX3X',
 1803: 'DPP4',
 1917: 'EEF1A2',
 1950: 'EGF',
 1956: 'EGFR',
 1969: 'EPHA2',
 2048: 'EPHB2',
 2052: 'EPHX1',
 2056: 'EPO',
 2057: 'EPOR',
 2067: 'ERCC1',
 2068: 'ERCC2',
 2099: 'ESR1',
 2100: 'ESR2',
 2152