In [27]:
from bs4 import BeautifulSoup
import requests
import json

import sys
sys.path.append("../..")
from functions import save_file
from datetime import datetime
from tqdm.notebook import tqdm

In [28]:
def make_soup(link, timeout=30):
    """Download ``link`` and parse the response body with the html5lib parser.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        The ``BeautifulSoup`` tree built from the raw response content.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status code.
    """
    r = requests.get(link, timeout=timeout)
    # Fail here, with the real cause, instead of parsing an error page and
    # crashing later on a missing element.
    r.raise_for_status()
    return BeautifulSoup(r.content, features="html5lib")

In [29]:
def find_links_to_ratings():
    """Collect the rating-page links of bachelor programmes, grouped by faculty.

    Walks the direct children of the first ``div.pleft`` on the admission
    page. Every ``<h3>`` starts a new faculty entry; every following
    ``<table>`` whose first cell ends with "Бакалавр" adds one course to the
    faculty seen most recently.

    Returns:
        dict mapping faculty name -> {course name: href of the rating page}.
    """
    # Section title that is not a faculty and must not open an entry.
    skipped_heading = "Программы бакалавриата и специалитета, специальности среднего профессионального образования"
    # Title after which nothing else is collected.
    stop_heading = "Программы магистратуры"

    page = make_soup("https://www.nstu.ru/entrance/admission_campaign/entrance")
    container = page.find_all("div", {"class": "pleft"})[0]

    links_by_faculty = {}
    for node in container.children:
        if node.name == "h3":
            faculty_name = node.text.strip()
            if faculty_name == skipped_heading:
                continue
            if faculty_name == stop_heading:
                break
            links_by_faculty[faculty_name] = {}
        elif node.name == "table":
            first_cell_text = node.tbody.tr.td.text
            if first_cell_text.strip().endswith("Бакалавр"):
                # Course name is the part before the first comma, with
                # non-breaking spaces normalised to plain spaces.
                course_name = first_cell_text.split(",")[0].strip().replace("\xa0", " ")
                links_by_faculty[faculty_name][course_name] = node.find("span").a["href"]
    return links_by_faculty

In [30]:
def get_information_about_course(soup):
    """Extract update time, seat counts and ranking scores from one rating page.

    Args:
        soup: Parsed rating page; must contain a ``<main class="page-content">``
            element with the rating ``<table>`` inside it.

    Returns:
        Tuple ``(date, free_cnt, olymp_cnt, scores, group)``:
            date: stripped text taken at a fixed offset after the "Время" marker.
            free_cnt: number of budget places, or 0 when the label is absent.
            olymp_cnt: position number of the first row after the
                "По конкурсу" header minus one; 0 when there is no such header.
            scores: integer scores of the rows after "По конкурсу", sliced as
                ``[olymp_cnt:free_cnt]``.
            group: competition group code with all spaces removed.
    """
    content = soup.find_all("main", {"class": "page-content"})[0]

    # date: the timestamp sits at a fixed character offset after "Время".
    page_text = content.text
    idx = page_text.find("Время")
    date = page_text[idx+49:idx+69].strip()

    # group
    group = content.find_all("b", string="Конкурсная группа: ")[0].next_sibling.strip().replace(" ", "")
    print(group)  # progress trace, one line per processed group

    # free: keep only the digits of the text that follows the label
    free_labels = content.find_all("b", string="Количество бюджетных мест в конкурсной группе по всем условиям поступления: ")
    if free_labels:
        free_cnt = int("".join(ch for ch in free_labels[0].next_sibling if ch.isdigit()))
    else:
        free_cnt = 0

    # rating: the second <tbody> of the first table holds the applicant rows
    rows = content.table.find_all("tbody")[1].find_all("tr")

    # Defaults to 0 so a page without a "По конкурсу" section no longer raises
    # UnboundLocalError at the slice below.
    olymp_cnt = 0
    # 0 = before "По конкурсу", 1 = header just seen, 2 = inside the section
    state = 0
    scores = []
    for row in rows:
        cells = row.find_all("td")
        if not cells:
            # Rows without <td> cells carry no applicant data.
            continue
        bold = cells[0].b
        if bold is not None and bold.i is not None:
            section_title = bold.i.text
            if section_title == "По конкурсу":
                state = 1
                continue
            if section_title == "Не выдержавшие вступительные испытания":
                break
        if state != 0:
            if state == 1:
                # The first competition row's position number tells how many
                # applicants are listed ahead of the competition section.
                olymp_cnt = int(cells[0].text) - 1
                state = 2
            scores.append(int(cells[10].b.text))
    # NOTE(review): `scores` only holds rows after the header, so if position
    # numbers continue across sections this slice skips the first `olymp_cnt`
    # competition rows - confirm whether `scores[:free_cnt - olymp_cnt]` was meant.
    scores = scores[olymp_cnt:free_cnt]

    return (date, free_cnt, olymp_cnt, scores, group)

In [35]:
def _load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object."""
    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


def get_comp17():
    """Return the decoded contents of ``c17.json`` from the working directory."""
    return _load_json('c17.json')


def get_comp18():
    """Return the decoded contents of ``c18.json`` from the working directory."""
    return _load_json('c18.json')


def get_comp19():
    """Return the decoded contents of ``c19.json`` from the working directory."""
    return _load_json('c19.json')


def get_subjects():
    """Return the decoded contents of ``subj.json`` from the working directory."""
    return _load_json('subj.json')

In [36]:
# Build `res`: [timestamp of this run, list of per-course records].
data = find_links_to_ratings()
competition2017 = get_comp17()
competition2018 = get_comp18()
competition2019 = get_comp19()
subjects = get_subjects()
res = [str(datetime.today()), []]
for faculty in tqdm(list(data.keys())):
    for (course, link) in tqdm(data[faculty].items()):
        soup = make_soup(link)
        date_updated, free_places, olymp_cnt, scores, group = get_information_about_course(soup)

        fac_name = faculty + " " + course
        info_about_course = {
            "fac_name": fac_name,
            "date_updated": date_updated,
            "scores": scores,
            # Last score in the collected slice, None when the slice is empty.
            "last_score": scores[-1] if scores else None,
            "free_places": free_places,
            "olymp_cnt": olymp_cnt,
            "subjects": subjects[fac_name],
            # One key per year, None when the group is absent from that year.
            # Previously all three lookups wrote to "prev_years17", so only
            # the 2019 value survived and the 18/19 keys were never created.
            "prev_years17": competition2017.get(group),
            "prev_years18": competition2018.get(group),
            "prev_years19": competition2019.get(group),
        }

        res[1].append(info_about_course)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

АВТФ.1
АВТФ.2
АВТФ.3
АВТФ.4
АВТФ.5
АВТФ.7
АВТФ.8
АВТФ.9



HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

ФЛА.1
ФЛА.3
ФЛА.4
ФЛА.7
ФЛА.8
ФЛА.9
ФЛА.10
ФЛА.12
ФЛА.13



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

МТФ.1
МТФ.2
МТФ.3
МТФ.4
МТФ.5
МТФ.6
МТФ.7
МТФ.8
МТФ.9
МТФ.10



HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

ФМА.1
ФМА.2
ФМА.3
ФМА.1з



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

ФПМИ.1
ФПМИ.2



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

РЭФ.1
РЭФ.2
РЭФ.3
РЭФ.4
РЭФ.5



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

ФТФ.1
ФТФ.2
ФТФ.3
ФТФ.4
ФТФ.5



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

ФЭН.1з
ФЭН.2
ФЭН.3
ФЭН.4
ФЭН.2з



HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

ФБ.1.1
ФБ.1.2
ФБ.2.1
ФБ.2.2
ФБ.2.3
ФБ.2.4
ФБ.3
ФБ.4



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

ФГО.1.1
ФГО.1оз
ФГО.2.1
ФГО.2.2
ФГО.3.1
ФГО.3.2
ФГО.4.1
ФГО.4.2
ФГО.5.1
ФГО.5.2
ФГО.5.3
ФГО.6



HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

ИДО.1
ИДО.2
ИДО.3
ИДО.4
ИДО.5
ИДО.6
ИДО.7
ИДО.8
ИДО.10
ИДО.11
ИДО.12.1
ИДО.12.2
ИДО.14
ИДО.15
ИДО.17
ИДО.18
ИДО.19



HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

ИСТР.1
ИСТР.2
ИСТР.3
ИСТР.4




In [33]:
# Serialise the collected results to JSON text and pass it to the project's
# save_file helper; its return value is shown as the cell output.
serialized_results = json.dumps(res)
save_file(serialized_results)

Saved to 2020-07-30 16:26:40.026894.json


'2020-07-30 16:26:40.026894.json'

## "Парсеры-однодневки"

In [21]:
# One-off scraper for the 2017 competition page: the selected cells come in
# groups of three; the first cell of a group becomes the key and the third
# the value (a dash is stored as None). The result is written to c17.json.
page_2017 = make_soup("https://www.nstu.ru/entrance/competition/competition2017")
table_2017 = page_2017.find("table", {"width": "896"})
cell_texts = [
    cell.div.span.text
    for cell in table_2017.find_all("td", {"width": ["76", "73"]})[3:]
]

c17 = {}
for start in range(0, len(cell_texts), 3):
    value = cell_texts[start + 2]
    c17[cell_texts[start]] = None if value == "–" else value

with open('c17.json', 'w') as outfile:
    json.dump(c17, outfile)

In [22]:
# One-off scraper for the 2018 competition page: the selected cells come in
# pairs; the first cell of a pair supplies the key, the second the value
# (None when the second cell has no <p>). The result is written to c18.json.
c18 = {}
page_2018 = make_soup("https://www.nstu.ru/entrance/competition/competition2018")
table_2018 = page_2018.find("table", {"width": "948"})
pair_cells = table_2018.find_all("td", {"width": ["119", "105"]})[2:]
for start in range(0, len(pair_cells), 2):
    value_cell = pair_cells[start + 1]
    if value_cell.p is not None:
        value = value_cell.p.span.text
    else:
        value = None
    c18[pair_cells[start].p.b.span.text] = value

with open('c18.json', 'w') as outfile:
    json.dump(c18, outfile)

In [23]:
# One-off scraper for the 2019 competition page: every table row after the
# first is read until the "Магистратура" row; the text node before the first
# <br> of the first cell is the key, the second cell parsed as int is the
# value (None when that cell is blank). The result is written to c19.json.
c19 = {}
page_2019 = make_soup("https://www.nstu.ru/entrance/admission_campaign/completition2019")
table_rows = page_2019.find("table", {"width": "861"}).find_all("tr")[1:]
for table_row in table_rows:
    cells = table_row.find_all("td")
    if cells[0].text == "Магистратура":
        break
    raw_value = cells[1].text
    value = None if raw_value.strip() == "" else int(raw_value)
    c19[cells[0].find("br").previous_sibling] = value

with open('c19.json', 'w') as outfile:
    json.dump(c19, outfile)

In [31]:
# One-off scraper for exam subjects: for every course rating page, the first
# three header cells of the second header row are decoded from their
# abbreviations into full subject names. The result is written to subj.json.
subj = {}
data = find_links_to_ratings()
# Abbreviation used in the table header -> full subject name.
decoder = {
    'Био.': 'Биология',
    'Гео.': 'География',
    'ИЯ': 'Иностранный язык',
    'Инф.': 'Информатика и ИКТ',
    'Ист.': 'История',
    'Лит.': 'Литература',
    'Мат.': 'Математика',
    'Общ.': 'Обществознание',
    'РЯ': 'Русский язык',
    'Физ.': 'Физика',
    'Хим.': 'Химия',
}
for faculty in tqdm(list(data)):
    for course, link in tqdm(data[faculty].items()):
        course_page = make_soup(link)
        header_rows = course_page.find("table").thead.find_all("tr")
        subject_cells = header_rows[1].find_all("th")[:3]
        subj[faculty + ' ' + course] = [decoder[cell.text] for cell in subject_cells]

with open('subj.json', 'w') as outfile:
    json.dump(subj, outfile)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))



