This repository has been archived by the owner on Feb 23, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_stalbeton.py
136 lines (105 loc) · 4.11 KB
/
parse_stalbeton.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Takes catalog data from stalbeton.pro site."""
import typing
from functools import lru_cache
from itertools import chain
from urllib.parse import urljoin
import bs4
import requests
from django.core.management.base import BaseCommand
class ThroughElements:
    """Elements presented on every page, e.g. the site header and footer."""

    def __init__(self, page: 'Page'):
        self.page = page

    def roots(self) -> typing.List['RootCategoryPage']:
        """Return the root catalog category pages linked from this page."""
        roots = self.page.soup.select('.catalog-tabs-content__list .catalog-list__link')
        # NOTE(review): assert is stripped under `python -O`; kept to preserve behavior.
        assert roots
        return [RootCategoryPage(path=r['href']) for r in roots]

    def work_doc(self) -> typing.List['CategoryPage']:
        # @todo #741:30m Parse work docs from stalbeton.
        #  Don't create series entity. It's for another task.
        # Bug fix: the original `raise NotImplemented()` raised TypeError,
        # because `NotImplemented` is a constant and is not callable.
        raise NotImplementedError()
class Page:
    """A lazily fetched and lazily parsed page of the stalbeton.pro site."""

    SITE_URL = 'https://stalbeton.pro'

    def __init__(self, path: str):
        # '/catalog/dorozhnoe-stroitelstvo' for example
        self.path = path
        # Per-instance caches. The original used `@lru_cache(maxsize=1)` on
        # instance methods, which keys the cache on `self`: it keeps Page
        # instances alive for the cache's lifetime (ruff B019) and, with
        # maxsize=1, refetches whenever a different Page is touched in between.
        self._response = None  # cached requests.Response
        self._soup = None  # cached bs4.BeautifulSoup

    @property
    def url(self) -> str:
        """Absolute URL of this page."""
        return urljoin(self.SITE_URL.strip('/'), self.path.strip('/'))

    @property
    def page(self) -> 'requests.Response':
        """Fetch the page over HTTP once and cache the response."""
        if self._response is None:
            response = requests.get(self.url)
            # NOTE(review): assert is stripped under `python -O`; an explicit
            # raise would be more robust, but is kept to preserve behavior.
            assert response.status_code == 200, self
            self._response = response
        return self._response

    @property
    def soup(self) -> 'bs4.BeautifulSoup':
        """Parsed DOM of the page, built once per instance."""
        if self._soup is None:
            self._soup = bs4.BeautifulSoup(
                self.page.content.decode('utf-8'),
                'html.parser'
            )
        return self._soup

    def __str__(self):
        return self.path

    @property
    def title(self) -> str:
        return self.soup.find('title').text

    @property
    def h1(self) -> str:
        return self.soup.find('h1').text

    @property
    def description(self) -> str:
        return self.soup.select_one('meta[name="Description"]')['content']
class CategoryPage(Page):
    @property
    def text(self) -> str:
        """
        Unique text of the category page.

        Only a category page carries hand-written text; every other
        page text has autogenerated content.
        """
        description_node = self.soup.select_one('#js-category-description')
        return description_node.text
class RootCategoryPage(CategoryPage):
    # @todo #741:30m Implement parse_stalbeton.Category.children() method.
    #  And reuse it as polymorphic method in subclasses.
    #  The task has pros and cons, so, we'll discuss it for the first.

    def second_level(self) -> typing.List['SecondLevelCategoryPage']:
        """Second-level category pages linked from this root category."""
        links = self.soup.select('h2 > a.catalog-list__link')
        return [SecondLevelCategoryPage(link['href']) for link in links]
class SecondLevelCategoryPage(CategoryPage):
    def third_level(self) -> typing.List['ThirdLevelCategoryPage']:
        """Third-level category pages linked from this second-level category."""
        links = self.soup.select('h2 > a.catalog-list__link')
        return [ThirdLevelCategoryPage(link['href']) for link in links]
class ThirdLevelCategoryPage(CategoryPage):
    def options(self) -> list:
        # @todo #741:60m Parse stalbeton's options.
        # Bug fix: the original `raise NotImplemented()` raised TypeError,
        # because `NotImplemented` is a constant and is not callable.
        raise NotImplementedError()

    # @todo #741:60m Parse series from stalbeton.
    #  Series are already parsed as text strings.
    #  Parse them as separated pages to get options-series relation.
    def series(self) -> typing.List[str]:
        """Text labels of the series documents listed on the page."""
        return [
            item.text for item in self.soup.select(
                '.documentation-block span.documentation-block__item > a'
            )
        ]
def parse():
    """Walk the stalbeton.pro catalog tree starting from the main page.

    Note: the chained generators below are lazy — nothing past the root
    list is fetched until some consumer iterates them.
    """
    home = Page(path='/')
    shared = ThroughElements(page=home)
    root_pages = shared.roots()
    # @todo #741:30m Create parse_stalbeton.Categories class.
    #  And hide children list assembling there.
    #  See PR #758 discussion for example.
    second_pages = chain.from_iterable(root.second_level() for root in root_pages)
    third_pages = chain.from_iterable(second.third_level() for second in second_pages)
    options = chain.from_iterable(third.options() for third in third_pages)  # Ignore PyFlakesBear
    # @todo #741:60m Save parsed stalbeton to a DB.
    #  DB isn't required to be high performance.
    #  It can be sqlite or postgres or pickle lib or whatever else.
    #  DB is required to analyze data without loading stalbeton site every time.
class Command(BaseCommand):
    """Django management command that runs the stalbeton.pro catalog parsing."""

    def handle(self, *args, **options):
        # Entry point invoked by Django's management framework.
        parse()