# Combine 3 datasets

From DisGeNET, Human Phenotype Ontology, and DISEASES

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load saved data
dfsaved = pd.read_csv('GeneDiseaseMoreCats.csv')
print(dfsaved.shape)
dfsaved.head(2)

(275, 7)


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
0,canavan disease,4,0.839553,ASPA,aspartoacylase,/condition/canavan-disease,Brain and nervous system
1,cleidocranial dysplasia,3,0.763477,RUNX2,runt-related transcription factor 2,/condition/cleidocranial-dysplasia,"Bones, muscles, and connective tissues"


In [6]:
# Load other datasets
dfhpo = pd.read_csv('hpo.csv', sep='\t')
print(dfhpo.shape)
dfmined = pd.read_csv('mined.csv', sep='\t')
print(dfmined.shape)

(96796, 3)
(710973, 3)


In [52]:
dfhpo.columns=['geneSymbol','diseaseName','score_hpo']
dfhpo['diseaseName']=dfhpo['diseaseName'].str.lower()
dfhpo.head()

Unnamed: 0,geneSymbol,diseaseName,score_hpo
0,CYP7B1,acholic stools,1
1,CYP7B1,hepatic failure,1
2,CYP7B1,steatorrhea,1
3,CYP7B1,pruritus,1
4,CYP7B1,hepatomegaly,1


In [53]:
dfmined.columns = ['geneSymbol','diseaseName','score_conf']
dfmined['diseaseName']=dfmined['diseaseName'].str.lower()
dfmined.head()

Unnamed: 0,geneSymbol,diseaseName,score_conf
0,7SK,human immunodeficiency virus infectious disease,1.5
1,7SK,acquired immunodeficiency syndrome,1.4
2,7SK,viral infectious disease,1.3
3,7SK,disease,1.2
4,7SK,disease by infectious agent,1.1


In [None]:
# dfmined[dfmined.diseaseName=='canavan disease']

In [89]:
dfsavedhpo = pd.merge(dfsaved,dfhpo,how='left',on=['diseaseName','geneSymbol'], 
                      suffixes=['','_hpo'])
print(dfsavedhpo.shape)
dfsavedhpo.tail()

(275, 8)


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category,score_hpo
270,joubert syndrome 6,1,0.72,TMEM67,transmembrane protein 67,/condition/andersen-tawil-syndrome,"Skin, hair, and nails",
271,"charcot-marie-tooth disease, type 2b1",1,0.72,LMNA,lamin A/C,/condition/charcot-marie-tooth-disease,"Bones, muscles, and connective tissues",
272,"deafness, autosomal dominant 25",2,0.72,SLC17A8,solute carrier family 17 (vesicular glutamate ...,/condition/22q112-deletion-syndrome,"Ear, nose, and throat",
273,"usher syndrome, type ig",2,0.72,USH1G,Usher syndrome 1G (autosomal recessive),/condition/pol-iii-related-leukodystrophy,Kidneys and urinary system,
274,muscular dystrophy-dystroglycanopathy (congeni...,1,0.72,LARGE,like-glycosyltransferase,/condition/doors-syndrome,"Bones, muscles, and connective tissues",


In [90]:
# dfthree = pd.merge(dfsavedhpo,dfmined,how='left',on='diseaseName',suffixes=['','_mined'])
# dfthree = pd.merge(dfsavedhpo,dfmined,how='left',on='geneSymbol',suffixes=['','_mined'])

In [None]:
print(dfthree.shape)
dfthree.tail()
# dfsame = dfthree[dfthree.geneSymbol_mined.notnull()]
# print(dfsame.shape)
# dfsame.head()

In [123]:
dfthree = pd.merge(dfsavedhpo,dfmined,how='left',on=['diseaseName','geneSymbol'], suffixes=['','_mined'])
dfthree['score_dgn']=dfthree['score']
dfthree.drop('score',axis=1,inplace=True)
dfthree[['score_dgn','score_hpo','score_conf']] = dfthree[['score_dgn','score_hpo', 
                                                           'score_conf']].fillna(0.0, axis=1)
dfthree['score_total']=dfthree['score_dgn']*0.8+dfthree['score_hpo']*0.1+dfthree['score_conf']*0.25*0.1

In [None]:
dfthree.head()

___
# Update obvious categories

In [34]:
import pandas as pd
dfthree = pd.read_csv('ThreeGDA.tsv',sep='\t')

In [35]:
# Fill in obvious categories
dfalias = pd.read_csv('ConditionAliasAddtl.csv')

# Dictionary of categories and keywords
catdict = {}
catkeys = ['blood','bone','brain','cancer','digest','ent','endocrine','eye',
           'metabolism','heart','immune','kidney','lung','mental','mouth','reproductive','skin']
catvals = sorted(dfalias['Category'].unique())

catdict = dict(zip(catkeys,catvals))

idxs = ['cardi','neur','deaf','retin','musc','cataract','ventri','brain',
        'osteo','diabetes','tooth','albin','arterial','bestrophinopathy','immun', 
       'sterol', 'glyco', 'dactyl','thyro','renal']
newcats = ['heart','brain','ent','eye','bone','eye','heart','brain','bone',
           'metabolism','mouth','skin','heart','eye','immune', 
          'metabolism', 'metabolism', 'bone', 'endocrine','kidney']
for idx,newcat in zip(idxs,newcats):
    mask = dfthree['diseaseName'].str.contains(idx)
    dfthree['category'][mask] = catdict[newcat]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
dfthree.to_csv('ThreeGDA.tsv',sep='\t',index=False)
dfthree.to_csv('../genediseaselink-web/ThreeGDA.tsv',sep='\t',index=False)

In [1]:
# Create disease keyword dictionary

disease_dict = {'digest': 'Digestive system', 'cancer': 'Cancers', 'skin': 'Skin, hair, and nails', 'heart': 'Heart and circulation', 'bone': 'Bones, muscles, and connective tissues', 'lung': 'Lungs and breathing', 'endocrine': 'Endocrine system (hormones)', 'brain': 'Brain and nervous system', 'reproductive': 'Reproductive system', 'kidney': 'Kidneys and urinary system', 'immune': 'Immune system', 'mouth': 'Mouth and teeth', 'metabolism': 'Food, nutrition, and metabolism', 'ent': 'Ear, nose, and throat', 'blood': 'Blood/lymphatic system', 'mental': 'Mental health and behavior', 'eye': 'Eyes and vision'}
dislist = disease_dict.items()

In [4]:
import operator

dissort = sorted(disease_dict.items(), key=operator.itemgetter(1))

for dis in dissort:
    print(dis[0],dis[1])

blood Blood/lymphatic system
bone Bones, muscles, and connective tissues
brain Brain and nervous system
cancer Cancers
digest Digestive system
ent Ear, nose, and throat
endocrine Endocrine system (hormones)
eye Eyes and vision
metabolism Food, nutrition, and metabolism
heart Heart and circulation
immune Immune system
kidney Kidneys and urinary system
lung Lungs and breathing
mental Mental health and behavior
mouth Mouth and teeth
reproductive Reproductive system
skin Skin, hair, and nails


In [27]:
import os
import datetime
t = os.path.getmtime('../genediseaselink-web/app.py')
updated = '{modt:%B} {modt.day}, {modt:%Y}'.format(modt=datetime.date.fromtimestamp(t))
print(updated)

February 2, 2016
