# Retrieving information from [diagnos-online.ru](http://www.diagnos-online.ru/symp.html)

In [1]:
from selenium.webdriver import Chrome
from rx import Observable
from rx.concurrency import ThreadPoolScheduler
from threading import current_thread
import multiprocessing
from re import search, compile
import requests
from rx.core import Scheduler
from bs4 import BeautifulSoup
from functools import reduce, partial
import sqlite3
optimal_thread_count = multiprocessing.cpu_count() + 1

## Opening chrome and getting to site

In [5]:
driver = Chrome()
driver

<selenium.webdriver.chrome.webdriver.WebDriver (session="f61617897094c03fba5370da7ef24ab2")>

In [6]:
driver.get("http://www.diagnos-online.ru/symp.html")

## Preparation for scraping

### Getting symptom groups

In [7]:
def get_symptom_groups(driver):
    return driver.find_elements_by_css_selector("#List1 option")
get_symptom_groups(driver)

[<selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-2")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-3")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-4")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-5")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-6")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-7")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", el

### Getting symptoms

In [8]:
def get_symptoms(driver):
    return driver.find_elements_by_css_selector("#List2 option")
get_symptoms(driver)

[<selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-25")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-26")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-27")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-28")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-29")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-30")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24ab2", element="0.6674204163755637-31")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f61617897094c03fba5370da7ef24a

### Getting selected symptoms

In [9]:
def get_selected_symptoms(driver):
    return driver.find_elements_by_css_selector("select[name='SelSymp[]'] option")
get_selected_symptoms(driver)

[]

### Getting diagnosis

In [10]:
def get_diagnosis(driver):
    return driver.find_elements_by_css_selector("#List11 option")

### Getting buttons 

In [11]:
def get_buttons(driver):
    return {
        "add": driver.find_element_by_css_selector("button[onclick='addfunc()']"),
        "analyze": driver.find_element_by_css_selector("input[type='submit']")
    }

In [12]:
def get_remove_button(driver):
    return driver.find_element_by_css_selector("button[onclick='delfunc()']")

### Getting description links

In [35]:
def parse_page(url):
    response = request.get(url)
    page = response.content.decode("windows-1251", "ignore")
    return BeautifulSoup(page, "html.parser")

In [36]:
def get_description_links(request):
    bs = parse_page("http://www.diagnos-online.ru/zabolevaniya.html")
    return bs.select("div.submen a")

In [14]:
descriptions = Observable.from_(get_description_links(requests)) \
    .skip(9) \
    .to_dict(lambda e: e.string, lambda e: "http://www.diagnos-online.ru/" + e["href"])
descriptions

<rx.core.anonymousobservable.AnonymousObservable at 0x24198c30048>

## Performing scraping actions

### Performing user actions to get desease data

In [15]:
def clear_symptoms(symptoms, button):
    """ Removes all selected symptoms """
    for symptom in symptoms:
        symptom.click()
        button.click()

In [16]:
def scrape_diagnosis(driver):
    symptom_groups = get_symptom_groups(driver)
    symptom_group_pointer = 0
    symptoms_pointer = 0
    while symptom_group_pointer < len(symptom_groups):
        buttons = get_buttons(driver)
        symptom_groups[symptom_group_pointer].click()
        symptoms = get_symptoms(driver)
        symptoms[symptoms_pointer].click()
        buttons["add"].click()
        buttons["analyze"].click()
        symptom_groups = get_symptom_groups(driver)
        symptom_groups[symptom_group_pointer].click()
        symptoms = get_symptoms(driver)
        yield {
            "diagnosis": get_diagnosis(driver),
            "symptom_group": symptom_groups[symptom_group_pointer],
            "symptom": symptoms[symptoms_pointer]
        }
        clear_symptoms(get_selected_symptoms(driver), get_remove_button(driver))
        symptoms_pointer += 1
        if symptoms_pointer >= len(symptoms):
            symptom_group_pointer += 1
            symptoms_pointer = 0

## Managing scraped data

### Selecting contents of ui elements

In [17]:
data = Observable.from_(scrape_diagnosis(driver)) \
    .take(3) \
    .map(lambda e: {
        "diagnosis": [(d.text.split("    ")[1], float(d.text.split("    ")[0][:-1])) for d in e["diagnosis"]],
        "symptom_group": e["symptom_group"].text.strip(),
        "symptom": e["symptom"].text.strip()
    })
data

<rx.core.anonymousobservable.AnonymousObservable at 0x2419bf33828>

### Diagnoses in app have substrings of how diagnosis named in desease list

In [18]:
def match_link(links, name):
    keys = links.keys()
    for k in keys:
        if k.lower() in name.lower():
            return links[k]
    return ""

### Waiting for description links

In [19]:
data = Observable.concat(descriptions, data).skip(1)

### Matches diseases and links to their pages

In [20]:
diagnoses = Observable.combine_latest(data, descriptions, lambda d, l: {
    "diagnosis": [(i[0], i[1], match_link(l, i[0])) for i in d["diagnosis"]],
    "symptom_group": d["symptom_group"],
    "symptom": d["symptom"]
})

### Retrieving information about diagnosis

## Prepearing database

### Reading scema file

In [6]:
scema = ""
with open("./sqlite/create.sql", "r") as file:
    scema = file.read()
scema

'CREATE TABLE symptom_group (\n    name TEXT PRIMARY KEY\n)\n\nCREATE TABLE symptom (\n    name TEXT PRIMARY KEY,\n    group_name TEXT NOT NULL,\n    FOREIGN KEY (group_name) REFERENCES symptom_group (name)\n)\n\nCREATE TABLE diagnosis (\n    name TEXT PRIMARY KEY,\n    description TEXT\n)\n\nCREATE TABLE symptom_diagnosis (\n    symptom_name TEXT,\n    diagnosis_name TEXT,\n    probability REAL NOT NULL,\n    PRIMARY KEY (symptom_name, diagnosis_name),\n    FOREIGN KEY (symptom_name) REFERENCES symptom (name) ON DELETE CASCADE ON UPDATE NO ACTION,\n    FOREIGN KEY (diagnosis_name) REFERENCES diagnosis (name) ON DELETE CASCADE ON UPDATE NO ACTION\n)'

### Prepearing database

In [47]:
connection = sqlite3.connect("./sqlite/diagnoses.db")
connection

<sqlite3.Connection at 0x20674a2b9d0>

In [48]:
cursor = connection.cursor()
cursor

<sqlite3.Cursor at 0x20674af30a0>

In [49]:
cursor.execute("""DROP TABLE symptom_group""")
cursor.execute("""DROP TABLE symptom""")
cursor.execute("""DROP TABLE diagnosis""")
cursor.execute("""DROP TABLE symptom_diagnosis""")

OperationalError: no such table: symptom_group

In [7]:
for c in scema.split("\n\n"):
    cursor.execute(c)

## Writing data into database

In [8]:
def database_writer(cursor, data):
    check_template = """SELECT EXISTS(SELECT * FROM {0} WHERE name=?)"""
    insert_template = """INSERT INTO {0} {1} VALUES {2}"""
    group_check = cursor.execute(check_template.format("symptom_group"), (data["symptom_group"],)).fetchall()
    symptom_check = cursor.execute(check_template.format("symptom"), (data["symptom"],)).fetchall()
    if not group_check[0][0]:
        cursor.execute(insert_template.format("symptom_group", "(name)", "(?)"), (data["symptom_group"],))
    if not symptom_check[0][0]:
        cursor.execute(insert_template.format("symptom", "(name, group_name)", "(?, ?)"), (data["symptom"], data["symptom_group"]))
    for diagnosis in data["diagnosis"]:
        diag_check = cursor.execute(check_template.format("diagnosis"), (diagnosis[0],)).fetchall()
        if not diag_check[0][0]:
            cursor.execute(insert_template.format("diagnosis", "(name, description)", "(?, ?)"), (diagnosis[0], diagnosis[2]))
        cursor.execute(insert_template.format("symptom_diagnosis", "(symptom_name, diagnosis_name, probability)", "(?, ?, ?)"),
                      (data["symptom"], diagnosis[0], diagnosis[1]))

In [22]:
def scrape_complete(connection, driver):
    connection.commit()
    connection.close()
    driver.close()

In [10]:
diagnoses.subscribe(on_next=partial(database_writer, cursor),
                   on_complete=partial(scrape_complete, connection, driver))

<rx.disposables.anonymousdisposable.AnonymousDisposable at 0x206771bc1d0>

## Writing description contents instead of links

In [37]:
def request_description(data):
    regex = compile("\w+")
    if data[1]:
        soup = parse_page(data[1])
        data = (data[0], reduce(lambda a, v: a + str(v), soup.find_all(["p", "h2"], string=regex), ""))
    return data

In [30]:
def fetch_diagnoses(cursor):
    cursor.execute("""SELECT * FROM diagnosis""")
    return cursor.fetchall()

In [54]:
scheduler = ThreadPoolScheduler(optimal_thread_count)
scheduler

<rx.concurrency.threadpoolscheduler.ThreadPoolScheduler at 0x2419c33ce10>

In [57]:
diagnosis_info = Observable.from_(fetch_diagnoses(cursor)) \
    .select_many(lambda i: Observable.start(lambda: request_description(i), scheduler=scheduler)) \
    .observe_on(Scheduler.event_loop)
diagnosis_info
# scheduler.executor.shutdown()

<rx.core.anonymousobservable.AnonymousObservable at 0x2419c0fb048>

In [32]:
def update_description(cursor, row):
    cursor.execute("""UPDATE diagnosis SET description=? WHERE name=?""", (row[1], row[0]))

In [33]:
def update_complete(connection):
    connection.commit()
    connection.close()

In [None]:
diagnosis_info.subscribe(on_next=partial(update_description, cursor),
                        on_complete=partial(update_complete, connection))