# Phase I goal: 
# Identify the disease(s) with a strong association to a small number of genes

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as mpatches

import requests
from bs4 import BeautifulSoup
import re

%matplotlib inline

## Disease aliases
In order to find the disease category and incidence rate, known aliases of each disease must be determined. This data comes from an NIH Genetics database (<a href="http://ghr.nlm.nih.gov/">http://ghr.nlm.nih.gov/</a>). 

The information on each disease category page will be downloaded with the requests library and a local database constructed.

In [10]:
# 
conditionurl = 'http://ghr.nlm.nih.gov/BrowseConditions'

# Request content from web page
result = requests.get(conditionurl)

# Set as Beautiful Soup Object
soup = BeautifulSoup(result.content, "lxml")

# Condition links are located in the div element with class 'nmx'

links = [[link.contents[0],link.get('href')] for link in soup.select("[class==nmx] a")]
dfcatlinks = pd.DataFrame(links, columns=['Category','Relative Link'])
dfcatlinks.to_csv('BrowseConditions.csv', index=False)

dfcatlinks

Unnamed: 0,Category,Relative Link
0,Blood/lymphatic system,/conditionCategory/blood-lymphatic-system
1,"Bones, muscles, and connective tissues",/conditionCategory/bones-muscles-and-connectiv...
2,Brain and nervous system,/conditionCategory/brain-and-nervous-system
3,Cancers,/conditionCategory/cancers
4,Digestive system,/conditionCategory/digestive-system
5,"Ear, nose, and throat",/conditionCategory/ear-nose-and-throat
6,Endocrine system (hormones),/conditionCategory/endocrine-system-hormones
7,Eyes and vision,/conditionCategory/eyes-and-vision
8,"Food, nutrition, and metabolism",/conditionCategory/food-nutrition-and-metabolism
9,Heart and circulation,/conditionCategory/heart-and-circulation


In [11]:
# Scrape condition category page for conditions and known aliases then save to a dataframe
baseurl = 'http://ghr.nlm.nih.gov'

# Split link into disease and alias
# Condition,Alias = re.split(' <em>see</em> ')

# Request content from web page and set beautiful soup object
condlinks = []
ptrn = re.compile('^\s')

for i in dfcatlinks.index:
    categoryurl = baseurl+dfcatlinks.iloc[i].get('Relative Link')
    result = requests.get(categoryurl)
    soup = BeautifulSoup(result.content, "lxml")
    pageprimary = soup.select("[class==page-primary] li")
    
    items = []
    
    # Save every category, alias, and link for every condition in NIH database
    for li in pageprimary:
        items.append([dfcatlinks.iloc[i].get('Category'), li.a.contents[0], ptrn.sub('',li.contents[-1]), li.a.get('href')])

    condlinks.extend(items)

In [12]:
# Convert conditions and aliases to lower case and save CSV
dfcondlinks = pd.DataFrame(condlinks, columns=['Category','Condition','Alias','Relative Link'])

dfcondlinks['Condition']=dfcondlinks['Condition'].str.lower()
dfcondlinks['Alias']=dfcondlinks['Alias'].str.lower()

dfcondlinks.to_csv('ConditionAlias.csv', index=False)
dfcondlinks.head()

Unnamed: 0,Category,Condition,Alias,Relative Link
0,Blood/lymphatic system,5-oxoprolinemia,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency
1,Blood/lymphatic system,5-oxoprolinuria,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency
2,Blood/lymphatic system,5q minus syndrome,,/condition/5q-minus-syndrome
3,Blood/lymphatic system,8p11 myeloproliferative syndrome,,/condition/8p11-myeloproliferative-syndrome
4,Blood/lymphatic system,11q deletion disorder,jacobsen syndrome,/condition/jacobsen-syndrome


In [13]:
len(dfcondlinks.index)

10145

## Identify category and relative link for each disease

In [14]:
# For list of conditions in dfall.csv find the ConditionAlias.csv dataset 
# then save the category and relative link in dfall.csv.

dfall = pd.read_csv('dfall.csv')
dfcondlinks = pd.read_csv('ConditionAlias.csv')
dfall['relurl']=''
dfall['category']=''

for disease in dfall['diseaseName']:
    if disease in dfcondlinks['Condition'].values:
        link = dfcondlinks['Relative Link'][dfcondlinks['Condition']==disease].iloc[0]
        category = dfcondlinks['Category'][dfcondlinks['Condition']==disease].iloc[0]
    elif disease in dfcondlinks['Alias'].values:
        link = dfcondlinks['Relative Link'][dfcondlinks['Alias']==disease].iloc[0]
        category = dfcondlinks['Category'][dfcondlinks['Alias']==disease].iloc[0]
    else:
        link = np.nan
        category = np.nan
    dfall.set_value(dfall.index[dfall['diseaseName']==disease][0],'relurl', link)
    dfall.set_value(dfall.index[dfall['diseaseName']==disease][0],'category', category)
    
dfall.to_csv('dfall.csv', index=False)
dfall.tail(10)

Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
265,sitosterolemia,2,0.72,ABCG5,"ATP-binding cassette, sub-family G (WHITE), me...",/condition/sitosterolemia,Blood/lymphatic system
266,"parkinson disease 1, autosomal dominant",1,0.72,SNCA,"synuclein, alpha (non A4 component of amyloid ...",,
267,"deafness, autosomal dominant 13",1,0.72,COL11A2,"collagen, type XI, alpha 2",,
268,"cardiomyopathy, dilated, 1g",1,0.72,TTN,titin,,
269,focal segmental glomerulosclerosis 2,1,0.72,TRPC6,"transient receptor potential cation channel, s...",,
270,joubert syndrome 6,1,0.72,TMEM67,transmembrane protein 67,,
271,"charcot-marie-tooth disease, type 2b1",1,0.72,LMNA,lamin A/C,,
272,"deafness, autosomal dominant 25",2,0.72,SLC17A8,solute carrier family 17 (vesicular glutamate ...,,
273,"usher syndrome, type ig",2,0.72,USH1G,Usher syndrome 1G (autosomal recessive),,
274,muscular dystrophy-dystroglycanopathy (congeni...,1,0.72,LARGE,like-glycosyltransferase,,


## Scrape additional info to create more complete alias database

The same NIH Genetics database has a list of aliases on each page for disease in the database. This information will be added to the already existing condition alias database.

The relative links included in the ConditionAlias dataset will be used to generate the url for each condition. Then the aliases will be found and saved to the database under a new column.

In [38]:
baseurl = 'http://ghr.nlm.nih.gov'
dfcondlinks = pd.read_csv('ConditionAlias.csv')
dfcondlinks['OtherAliases'] = ""


for i in range(10142, len(dfcondlinks)):
    condurl = baseurl+dfcondlinks['Relative Link'].iloc[i]
    result = requests.get(condurl)
    soup = BeautifulSoup(result.content, "lxml")
    if soup.find_all("li", class_="othername")==[]:
        aliases = []
    elif type(soup.find_all("li", class_="othername")[0].contents) == list:
        aliases = [s.contents[0] for s in soup.find_all("li", class_="othername")]
    else:
        aliases = [s.contents for s in soup.find_all("li", class_="othername")]
    dfcondlinks.set_value(i,'OtherAliases',aliases)
    if i%20 == 0 or i == len(dfcondlinks)-1:
        dfcondlinks.to_csv('ConditionAliasAddtl.csv', index=False)

In [45]:
dfcondlinks.head()

Unnamed: 0,Category,Condition,Alias,Relative Link,OtherAliases
0,Blood/lymphatic system,5-oxoprolinemia,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency,"[5-oxoprolinemia, 5-oxoprolinuria, deficiency ..."
1,Blood/lymphatic system,5-oxoprolinuria,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency,"[5-oxoprolinemia, 5-oxoprolinuria, deficiency ..."
2,Blood/lymphatic system,5q minus syndrome,,/condition/5q-minus-syndrome,"[5q- syndrome, chromosome 5q deletion syndrome..."
3,Blood/lymphatic system,8p11 myeloproliferative syndrome,,/condition/8p11-myeloproliferative-syndrome,"[8p11 stem cell leukemia/lymphoma syndrome, 8p..."
4,Blood/lymphatic system,11q deletion disorder,jacobsen syndrome,/condition/jacobsen-syndrome,"[11q23 deletion disorder, 11q deletion disorde..."


In [53]:
# Match disease from dfall to disease in condition, alias or other alias column in dfcond
dfall = pd.read_csv('dfall.csv')
dfcondlinks = pd.read_csv('ConditionAliasAddtl.csv')
dfall['relurl']=''
dfall['category']=''

for disease in dfall['diseaseName']:
    if disease in dfcondlinks['Condition'].values:
        link = dfcondlinks['Relative Link'][dfcondlinks['Condition']==disease].iloc[0]
        category = dfcondlinks['Category'][dfcondlinks['Condition']==disease].iloc[0]
    elif disease in dfcondlinks['Alias'].values:
        link = dfcondlinks['Relative Link'][dfcondlinks['Alias']==disease].iloc[0]
        category = dfcondlinks['Category'][dfcondlinks['Alias']==disease].iloc[0]
    elif disease in dfcondlinks['OtherAliases'].values:
        link = dfcondlinks['Relative Link'][dfcondlinks['OtherAliases']==disease].iloc[0]
        category = dfcondlinks['Category'][dfcondlinks['OtherAliases']==disease].iloc[0]
    else:
        link = np.nan
        category = np.nan
    dfall.set_value(dfall.index[dfall['diseaseName']==disease][0],'relurl', link)
    dfall.set_value(dfall.index[dfall['diseaseName']==disease][0],'category', category)

In [49]:
dfcondlinks.head()

Unnamed: 0,Category,Condition,Alias,Relative Link,OtherAliases
0,Blood/lymphatic system,5-oxoprolinemia,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency,"['5-oxoprolinemia', '5-oxoprolinuria', 'defici..."
1,Blood/lymphatic system,5-oxoprolinuria,glutathione synthetase deficiency,/condition/glutathione-synthetase-deficiency,"['5-oxoprolinemia', '5-oxoprolinuria', 'defici..."
2,Blood/lymphatic system,5q minus syndrome,,/condition/5q-minus-syndrome,"['5q- syndrome', 'chromosome 5q deletion syndr..."
3,Blood/lymphatic system,8p11 myeloproliferative syndrome,,/condition/8p11-myeloproliferative-syndrome,"['8p11 stem cell leukemia/lymphoma syndrome', ..."
4,Blood/lymphatic system,11q deletion disorder,jacobsen syndrome,/condition/jacobsen-syndrome,"['11q23 deletion disorder', '11q deletion diso..."


In [54]:
print(dfall['relurl'].count())
dfall.tail()

43


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
270,joubert syndrome 6,1,0.72,TMEM67,transmembrane protein 67,,
271,"charcot-marie-tooth disease, type 2b1",1,0.72,LMNA,lamin A/C,,
272,"deafness, autosomal dominant 25",2,0.72,SLC17A8,solute carrier family 17 (vesicular glutamate ...,,
273,"usher syndrome, type ig",2,0.72,USH1G,Usher syndrome 1G (autosomal recessive),,
274,muscular dystrophy-dystroglycanopathy (congeni...,1,0.72,LARGE,like-glycosyltransferase,,


___
## Less restrictive category search

In [3]:
# Fetch saved data

dfgda = pd.read_csv('GeneDiseaseUnique.csv')
dfalias = pd.read_csv('ConditionAliasAddtl.csv')

print(sorted(dfalias['Category'].unique()))

dfgda.head(5)

['Blood/lymphatic system', 'Bones, muscles, and connective tissues', 'Brain and nervous system', 'Cancers', 'Digestive system', 'Ear, nose, and throat', 'Endocrine system (hormones)', 'Eyes and vision', 'Food, nutrition, and metabolism', 'Heart and circulation', 'Immune system', 'Kidneys and urinary system', 'Lungs and breathing', 'Mental health and behavior', 'Mouth and teeth', 'Reproductive system', 'Skin, hair, and nails']


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
0,canavan disease,4,0.839553,ASPA,aspartoacylase,/condition/canavan-disease,Brain and nervous system
1,cleidocranial dysplasia,3,0.763477,RUNX2,runt-related transcription factor 2,/condition/cleidocranial-dysplasia,"Bones, muscles, and connective tissues"
2,glycogen storage disease type iib,4,0.741969,LAMP2,lysosomal-associated membrane protein 2,/condition/danon-disease,"Bones, muscles, and connective tissues"
3,primary hyperoxaluria type 1,4,0.738144,AGXT,alanine-glyoxylate aminotransferase,,
4,mucopolysaccharidosis vii,4,0.734742,GUSB,"glucuronidase, beta",,


In [78]:
# Join together condition, alias, otheraliases, columns in alias table to speed up search

# Fill in NaN values in Alias column then sum together
dfnewalias = dfalias.fillna('NA')
dfnewalias['aliases']=dfnewalias['Condition']+dfnewalias['Alias']+dfnewalias['OtherAliases']
dfnewalias.drop(['Condition','Alias','OtherAliases'],axis=1,inplace=True)
dfnewalias.head()

Unnamed: 0,Category,Relative Link,aliases
0,Blood/lymphatic system,/condition/glutathione-synthetase-deficiency,5-oxoprolinemiaglutathione synthetase deficien...
1,Blood/lymphatic system,/condition/glutathione-synthetase-deficiency,5-oxoprolinuriaglutathione synthetase deficien...
2,Blood/lymphatic system,/condition/5q-minus-syndrome,"5q minus syndromeNA['5q- syndrome', 'chromosom..."
3,Blood/lymphatic system,/condition/8p11-myeloproliferative-syndrome,8p11 myeloproliferative syndromeNA['8p11 stem ...
4,Blood/lymphatic system,/condition/jacobsen-syndrome,11q deletion disorderjacobsen syndrome['11q23 ...


In [None]:
# Use regex to select first two names in GeneDiseaseUnique.csv and match those to ConditionAliasAddtl.csv
p = re.compile('^(\w+).+?(\w+)')
psub = re.compile(',|[0-9]|type')


# Parse disease names
idxbool = dfgda[dfgda['relurl'].isnull()].index
terms = []
for idx in idxbool:
    term = list(p.findall(dfgda['diseaseName'].iloc[idx])[0])
    term = [s for s in term if len(s)>3]
    term = [s.replace(',','') for s in term]
    term = list(filter(None,[psub.sub('',s) for s in term]))
    terms.append(term)

print(len(terms),'out of',len(dfgda),'links not found.')
# print(terms)

In [299]:
# Search for each term in dfnewaliases.aliases and save link and category

# Set link and category for corresponding disease in dfgda

dfgdafill = dfgda.fillna('nnn')

np.random.seed(1)

for idx,term in zip(idxbool,terms):
    # Check if term is a one word (string) or two words (list)
    if len(term)>1:
        matched = dfnewalias[['Category','Relative Link']][dfnewalias.aliases.str.contains(term[0] and term[1])]
        if len(matched)>0:
            hi = len(matched)
            newcat = matched['Category'].iloc[np.random.randint(0,hi)]
            newurl = matched['Relative Link'].iloc[np.random.randint(0,hi)]
            dfgdafill.set_value(idx,'category',newcat)
            dfgdafill.set_value(idx,'relurl',newurl)
    else:
        matched = dfnewalias[['Category','Relative Link']][dfnewalias.aliases.str.contains(term[0])]
        if len(matched)>0:
            hi = len(matched)
            newcat = matched['Category'].iloc[np.random.randint(0,hi)]
            newurl = matched['Relative Link'].iloc[np.random.randint(0,hi)]
            dfgdafill.set_value(idx,'category',newcat)
            dfgdafill.set_value(idx,'relurl',newurl)
#     print(matched)
# dfgdafill.head()

In [266]:
# Number of gda without category and link

print(len(dfgdafill[dfgdafill['relurl']=='nnn']),'of',len(dfgdafill),'links not found.')
dfgdafill.head()

21 of 275 categories not identified.


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
0,canavan disease,4,0.839553,ASPA,aspartoacylase,/condition/canavan-disease,Brain and nervous system
1,cleidocranial dysplasia,3,0.763477,RUNX2,runt-related transcription factor 2,/condition/cleidocranial-dysplasia,"Bones, muscles, and connective tissues"
2,glycogen storage disease type iib,4,0.741969,LAMP2,lysosomal-associated membrane protein 2,/condition/danon-disease,"Bones, muscles, and connective tissues"
3,primary hyperoxaluria type 1,4,0.738144,AGXT,alanine-glyoxylate aminotransferase,/condition/primary-hyperoxaluria,"Food, nutrition, and metabolism"
4,mucopolysaccharidosis vii,4,0.734742,GUSB,"glucuronidase, beta",/condition/mucopolysaccharidosis-type-vi,"Bones, muscles, and connective tissues"


In [301]:
dfgdafill.replace('nnn',np.nan,inplace=True)
# print(dfgdafill['category'].isnull())

  mask = arr == x


In [1]:
dfgdafill.to_csv('GeneDiseaseMoreCats.csv',index=False,encoding='UTF-8')

NameError: name 'dfgdafill' is not defined

___
## Obvious category assignments

In [2]:
import pandas as pd
# Load saved data
dfsaved = pd.read_csv('GeneDiseaseMoreCats.csv')
dfalias = pd.read_csv('ConditionAliasAddtl.csv')

# Fill in obvious categories

# Dictionary of categories and keywords
catdict = {}
catkeys = ['blood','bone','brain','cancer','digest','ent','endocrine','eye','metabolism','heart','immune','kidney','lung','mental','mouth','reproductive','skin']
catvals = sorted(dfalias['Category'].unique())

catdict = dict(zip(catkeys,catvals))

print(catdict)
dfsaved.head()

{'digest': 'Digestive system', 'cancer': 'Cancers', 'skin': 'Skin, hair, and nails', 'heart': 'Heart and circulation', 'bone': 'Bones, muscles, and connective tissues', 'lung': 'Lungs and breathing', 'endocrine': 'Endocrine system (hormones)', 'brain': 'Brain and nervous system', 'reproductive': 'Reproductive system', 'kidney': 'Kidneys and urinary system', 'immune': 'Immune system', 'mouth': 'Mouth and teeth', 'metabolism': 'Food, nutrition, and metabolism', 'ent': 'Ear, nose, and throat', 'blood': 'Blood/lymphatic system', 'mental': 'Mental health and behavior', 'eye': 'Eyes and vision'}


Unnamed: 0,diseaseName,Number of genes,score,geneSymbol,geneName,relurl,category
0,canavan disease,4,0.839553,ASPA,aspartoacylase,/condition/canavan-disease,Brain and nervous system
1,cleidocranial dysplasia,3,0.763477,RUNX2,runt-related transcription factor 2,/condition/cleidocranial-dysplasia,"Bones, muscles, and connective tissues"
2,glycogen storage disease type iib,4,0.741969,LAMP2,lysosomal-associated membrane protein 2,/condition/danon-disease,"Bones, muscles, and connective tissues"
3,primary hyperoxaluria type 1,4,0.738144,AGXT,alanine-glyoxylate aminotransferase,/condition/primary-hyperoxaluria,"Food, nutrition, and metabolism"
4,mucopolysaccharidosis vii,4,0.734742,GUSB,"glucuronidase, beta",/condition/mucopolysaccharidosis-type-vii,"Bones, muscles, and connective tissues"


In [4]:
# Rules for assigning obvious categories

# Find all rows with diseaseName containing certain search terms and rewrite category

idxs = ['cardi','neur','deaf','retin','musc','cataract','ventri','brain','osteo','diabetes','tooth','albin','arterial']
newcats = ['heart','brain','ent','eye','bone','eye','heart','brain','bone','metabolism','mouth','skin','heart']
for idx,newcat in zip(idxs,newcats):
    mask = dfsaved['diseaseName'].str.contains(idx)
    dfsaved['category'][mask] = catdict[newcat]
    
# Update file with new categories
dfsaved.to_csv('../genediseaselink-web/GeneDiseaseMoreCats.csv',index=False,encoding='UTF-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
