In [1]:
!pip install beautifulsoup4 wikipedia-api multiprocess



In [2]:
import bs4
import wikipediaapi
import pandas as pd
import requests
import json
import multiprocess

In [24]:
def get_info(row):
    """Split one list entry into its disease name, code and article link.

    Parameters
    ----------
    row : object exposing ``select`` and ``getText`` (e.g. a BeautifulSoup tag)
        The entry to parse. Its text is expected to be ``;``-separated with at
        least two fields, and it must contain at least one ``<a>`` element.

    Returns
    -------
    tuple
        ``(first_field, second_field_stripped, href_of_first_anchor)``.

    Raises
    ------
    IndexError
        If the entry has no ``<a>`` element or fewer than two ``;`` fields.
    """
    first_anchor = row.select('a')[0]
    href = first_anchor.get('href')
    fields = row.getText().split(";")
    # Only the second field is stripped; the first is returned exactly as found.
    name, code = fields[0], fields[1].strip()
    return (name, code, href)

def check_link_validity(disease_link):
    """Return True when ``disease_link`` is a site-relative ``/wiki/...`` path.

    The link is later appended to ``https://en.wikipedia.org``, so only paths
    that *start* with ``/wiki/`` are usable. A plain substring test would also
    accept absolute or protocol-relative URLs that merely contain ``/wiki/``
    (e.g. ``https://example.org/wiki/Foo``), and concatenating those yields a
    broken URL.

    Parameters
    ----------
    disease_link : str or None
        The ``href`` value taken from an anchor; ``None`` when the anchor has
        no ``href`` attribute.

    Returns
    -------
    bool
        ``True`` for a string beginning with ``/wiki/``, ``False`` otherwise
        (including non-string values such as ``None``).
    """
    return isinstance(disease_link, str) and disease_link.startswith('/wiki/')

def get_symptoms(info):
    """Fetch the "symptoms" section text of one disease's Wikipedia article.

    The article is first downloaded over HTTP to read its ``<h1>`` heading;
    that heading is then used as the page title for the ``wikipediaapi``
    lookup whose sections are searched.

    Parameters
    ----------
    info : sequence
        ``[code, [disease_name, relative_link]]`` where ``relative_link`` is
        appended to ``https://en.wikipedia.org``.

    Returns
    -------
    list
        ``[code, disease_name, symptoms]``. ``symptoms`` is the text of the
        last section in ``page.sections`` whose title contains "symptoms"
        (case-insensitive), or ``""`` when no section matches.

    Raises
    ------
    requests.HTTPError
        If the article request returns an error status.
    IndexError
        If the downloaded page has no ``<h1>`` element.
    """
    # Imported inside the function, presumably so it is self-contained when it
    # runs in a worker process -- confirm before hoisting to module level.
    import wikipediaapi, requests, bs4
    base_url = "https://en.wikipedia.org"
    wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)
    code, values = info[0], info[1]
    url = base_url + values[1]
    response = requests.get(url)
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup guesses one (and
    # warns), so the parse could differ between machines.
    gkzSoup = bs4.BeautifulSoup(response.text, "html.parser")
    heading = gkzSoup.select('h1')[0].getText()
    page = wiki_wiki.page(heading)
    symptoms = ""
    for section in page.sections:
        if "symptoms" in section.title.lower():
            # No break: a later matching section overrides an earlier one.
            symptoms = section.text
    return [code, values[0], symptoms]

def call_and_insert(row):
    """Append the HPO codes reported for a row's OMIM id to that row.

    The phenotype tree for ``omim:<row[0]>`` is requested from the F29 bio
    API and decoded as JSON. Every key of its ``phenotypes`` mapping that
    starts with ``"HP"`` is collected.

    Parameters
    ----------
    row : list
        A record whose first element is the OMIM id as a string. The list is
        modified in place.

    Returns
    -------
    list
        The same ``row`` object with one extra trailing element: the list of
        HPO keys, or ``[]`` when the lookup fails with any ordinary exception
        (network error, invalid JSON, missing key, ...).

    Notes
    -----
    Only ``Exception`` subclasses are turned into an empty result.
    ``KeyboardInterrupt`` / ``SystemExit`` propagate to the caller; they are
    no longer discarded by a ``return`` inside ``finally``.
    """
    # Imported inside the function, presumably so it is self-contained when it
    # runs in a worker process -- confirm before hoisting to module level.
    import requests, json
    API_STRING = "https://f29bio-dev.northeurope.cloudapp.azure.com/api/BioEntity/disease/phenotypes/en/tree/omim:"
    # Browser-like request headers sent with every API call.
    API_HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
                   "Accept-Encoding":"gzip, deflate",
                   "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1",
                   "Connection":"close", "Upgrade-Insecure-Requests":"1"}
    try:
        omim = row[0]
        html_content = requests.get(API_STRING + omim, headers=API_HEADERS).text
        phenotypes = json.loads(html_content)["omim:" + omim]['phenotypes']
        hpo_codes = [name for name in phenotypes.keys() if name.startswith("HP")]
    except Exception:
        # Best effort: a failed lookup yields an empty code list for this row.
        hpo_codes = []
    row.append(hpo_codes)
    return row

In [4]:
# Download the Wikipedia list of OMIM disorder codes and index its entries by code.
response = requests.get("https://en.wikipedia.org/wiki/List_of_OMIM_disorder_codes")
response.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup guesses one (and warns),
# so the parse could differ between machines.
gkzSoup = bs4.BeautifulSoup(response.text, "html.parser")
lists = gkzSoup.select('ul')
# The entries are read from the first <ul> element found on the page.
codes = lists[0]
listOfCodes = codes.select('li')
# Maps code -> [disease name, article link].
omim_dict = {}
for val in listOfCodes:
    disease, code, link = get_info(val)
    # Keep only the first entry seen for each code, and only when its link is usable.
    if code not in omim_dict and check_link_validity(link):
        omim_dict[code] = [disease, link]

In [5]:
# Fetch the symptoms text for every collected disease in parallel worker processes.
# The `with` block shuts the workers down even when a fetch raises; previously
# `pool.close()` was skipped on error and the pool was never joined or terminated.
with multiprocess.Pool() as pool:
    data = pool.map(get_symptoms, [[omim_code, entry] for omim_code, entry in omim_dict.items()])
print(data[0])

['202110', 'Isolated 17,20-lyase deficiency', 'The symptoms of isolated 17,20-lyase deficiency, in males, include pseudohermaphroditism (i.e., feminized, ambiguous, or mildly underdeveloped (e.g., micropenis, perineal hypospadias, and/or cryptorchidism (undescended testes)) external genitalia), female gender identity, and, in non-complete cases of deficiency where partial virilization occurs, gynecomastia up to Tanner stage V (due to low androgen levels, which results in a lack of suppression of estrogen); in females, amenorrhoea or, in cases of only partial deficiency, merely irregular menses, and enlarged cystic ovaries (due to excessive stimulation by high levels of gonadotropins); and in both sexes, hypergonadotropic hypogonadism (hypogonadism despite high levels of gonadotropins), delayed, impaired, or fully absent adrenarche and puberty with an associated reduction in or complete lack of development of secondary sexual characteristics (sexual infantilism), impaired fertility or c

In [25]:
# Look up the HPO codes for every row in parallel worker processes.
# The `with` block shuts the workers down even when a lookup raises; previously
# `pool.close()` was skipped on error and the pool was never joined or terminated.
with multiprocess.Pool() as pool:
    new_data = pool.map(call_and_insert, data)

In [26]:
# Assemble the collected rows into a table, one column per field of each row,
# then preview the first rows.
df = pd.DataFrame(
    data=new_data,
    columns=[
        "OMIM ID",
        "Disease",
        "Symptoms from Wikipedia",
        "HPO Codes",
    ],
)
df.head()

Unnamed: 0,OMIM ID,Disease,Symptoms from Wikipedia,HPO Codes
0,202110,"Isolated 17,20-lyase deficiency","The symptoms of isolated 17,20-lyase deficienc...","[HP:0000013, HP:0000028, HP:0000033, HP:000003..."
1,300438,17-beta-hydroxysteroid dehydrogenase X deficiency,17-β-Hydroxysteroid dehydrogenase III deficien...,"[HP:0000252, HP:0000365, HP:0000407, HP:000052..."
2,610006,2-methylbutyrylglycinuria,SBCADD is included as a secondary target condi...,"[HP:0000252, HP:0000577, HP:0001250, HP:000125..."
3,231530,3-hydroxyacyl-coa dehydrogenase deficiency,"Typically, initial signs and symptoms of this ...",[]
4,273750,3-M syndrome,,"[HP:0000047, HP:0000179, HP:0000268, HP:000027..."


In [27]:
# Write the table to disk as UTF-8 CSV, leaving out the row index.
df.to_csv(
    "omims_from_wiki.csv",
    index=False,
    encoding="utf-8",
)