In [11]:
# import scispacy  ## Doesnt work on M1

import os
import webscrape as ws
import dill as pickle
import sys

from tqdm.notebook import tqdm

In [43]:
NORD_DATA_BASE_PAGE = "https://rarediseases.org/rare-diseases/page/{page_number}/"
NORD_DATA_BASE_PAGE_MAXIMUM = 3 #87
DATA_PATH = "data"
NORD_DATA_DISEASE_LIST_CACHE_PATH = os.path.join(DATA_PATH, "disease_list_cache.pkl")

In [None]:
class Disease:
    def __init__(self, disease_html):
        self.name = disease_html.contents[0].strip()
        self.article_url = disease_html["href"]
#         self.html = disease_html
        self.fully_populated = False
        
    def fetch_all_info(self):
        """"""
        if self.fully_populated:
            return
        self.article = ws.get_html(self.article_url)
        self.fully_populated = True
        return
    
    def __repr__(self):
        return self.name + f": {self.article_url}"

In [None]:
def cache_write(file_path: str, obj, overwrite: bool = False):
    if overwrite or not os.path.exists(file_path):
        with open(file_path, "wb") as cache:
            pickle.dump(obj, cache)

            
def cache_load(file_path: str):
    """"""
    with open(file_path, "rb") as cache:
        return pickle.load(cache)

In [90]:
def get_all_nord_diseases(overwrite: bool = False):
    """"""
    if overwrite or not os.path.exists(NORD_DATA_DISEASE_LIST_CACHE_PATH):
        diseases = []
        for page_number in tqdm(range(1, NORD_DATA_BASE_PAGE_MAXIMUM + 1),
                                desc="Fetching all NORD Diseases", unit=" Page"):
            # get diseases from page
            page_html = ws.get_html(NORD_DATA_BASE_PAGE.format(page_number=page_number))
            articles = page_html.find_all("article")
            for article in articles:
                diseases.append(Disease(article.find("a")))
        cache_write(NORD_DATA_DISEASE_LIST_CACHE_PATH, diseases, overwrite=True)
        return diseases
    
    print("Loaded NORD Database from Cache.")
    return cache_load(NORD_DATA_DISEASE_LIST_CACHE_PATH)


def disease_list_to_txt(diseases: list, file_name: str):
    """"""
    with open(file_name, "w") as file:
        for disease in diseases:
            file.write(f"{disease}\n")

In [91]:
diseases = get_all_nord_diseases(overwrite=True)

Fetching all NORD Diseases:   0%|          | 0/3 [00:00<?, ? Page/s]

In [92]:
disease = diseases[]

SyntaxError: invalid syntax (3409174139.py, line 1)

In [127]:
incidence_text = []

for disease in tqdm(diseases):
    disease.fetch_all_info()
    text = disease.article.find_all("h3")[4].parent.get_text(strip=True)
    incidence_text.append(text)

  0%|          | 0/45 [00:00<?, ?it/s]

In [96]:
text

'< Previous sectionNext section >Affected populationsAfrican iron overload affects males and females in equal numbers. The exact incidence of the disorder is unknown. It has been reported in numerous countries in sub-Saharan Africa. Researchers believe that the disorder often goes unrecognized and is underdiagnosed, making it difficult to determine its true frequency in the general population. Some estimates suggest that iron overload affects more than 10 percent of the population in sub-Saharan Africa.Inherited forms of iron overload have been reported in natives of other countries who may be of African descent (e.g. African Americans). Whether this may represent the same disease as that seen in sub-Saharan Africa remains unknown.< Previous sectionNext section >'

In [142]:
import re

In [181]:
incidence_text

for text in incidence_text:
    text = ws.str_remover(text, " individuals", " individual", " people", " every", "affected")
    if " per " in text:
#         print(text)
        string = " ".join(text.partition(" per ")[0].split(" ")[-1:]) + " per " + text.partition(" per ")[2].split(" ")[0]
        print(string)
        numbers = re.findall("[-+]?(?:\d*\.*\d+)", string.replace(",", ""))
        
        if "million" in string:
            numbers += ["1000000"]
        print(numbers)

1 per 2,000,000
['1', '2000000']
million per year.
['1000000']
.2-1 per 1,000,000/year
['.2', '-1', '1000000']
5.9 per million
['5.9', '1000000']
3-5 per 100,000
['3', '-5', '100000']
2.2 per million,
['2.2', '1000000']
 per 100,000
['100000']
2 per 1,000
['2', '1000']
1.5 per 9,000,000
['1.5', '9000000']


In [135]:
for text in incidence_text:
    if "per million" in text:
#         print(text)
        print("Stripped: " + " ".join(text.partition(" per million")[0].split(" ")[-2:]) + " in 1,000,000 \n")
    elif "1 in " in text:
#         print(text)
        print("Stripped: 1 in " + text.partition("1 in ")[2].split(" ")[0] + "\n")
    
    elif "1/" in text:
        print("Stripped: 1 in " + text.partition("1/")[2].split(" ")[0].replace(".<", ""))
        print()

Stripped: 1 in 500-1,000

Stripped: 1 in 18,000

Stripped: 1 in 25,000

Stripped: 1 in 1,000,000

Stripped: 1 in 1,000,000.

Stripped: 1 in 40,000-1/60,000

Stripped: 1 in 50,000

Stripped: 1 in 100,000

Stripped: 1 in 500,000

Stripped: be 5.9 in 1,000,000 

Stripped: 2.2 people in 1,000,000 

Stripped: 1 in 20,000



In [31]:
disease_list_to_txt(diseases, "disease_list.txt")

In [33]:
diseases

[47, XXY (Klinefelter Syndrome): https://rarediseases.org/rare-diseases/47-xxy-klinefelter-syndrome/,
 48, XXYY Syndrome: https://rarediseases.org/rare-diseases/48-xxyy-syndrome/,
 Aarskog Syndrome: https://rarediseases.org/rare-diseases/aarskog-syndrome/,
 Abetalipoproteinemia: https://rarediseases.org/rare-diseases/abetalipoproteinemia/,
 Ablepharon-Macrostomia Syndrome: https://rarediseases.org/rare-diseases/ablepharon-macrostomia-syndrome/,
 Acanthocheilonemiasis: https://rarediseases.org/rare-diseases/acanthocheilonemiasis/,
 Aceruloplasminemia: https://rarediseases.org/rare-diseases/aceruloplasminemia/,
 Achalasia: https://rarediseases.org/rare-diseases/achalasia/,
 Achard Thiers Syndrome: https://rarediseases.org/rare-diseases/achard-thiers-syndrome/,
 Achondrogenesis: https://rarediseases.org/rare-diseases/achondrogenesis/,
 Achondroplasia: https://rarediseases.org/rare-diseases/achondroplasia/,
 Acid Sphingomyelinase Deficiency: https://rarediseases.org/rare-diseases/acid-sphi

In [39]:
disease_names = ["Klinefelter Syndrome"] + [disease.name for disease in diseases[2:]]