In [55]:
import requests
from bs4 import BeautifulSoup
import re
import time
from tqdm import tqdm
import numpy as np

In [56]:
PAGES_COUNT = 3152
ARTICLES_PEG_JOURNAL = 10
ALL_JOUNRALS_URL = 'https://www.livejournal.com/ratings/users/authority/?country=ru'

NOL_FORMS = ['ноль', 'ноля', 'нолю', 'нолём', 'нолем', 'ноле', 'ноли', 'нолей', 'нолям','нолями', 'нолях']
NUL_FORMS = [form.replace('о', 'у') for form in NOL_FORMS]
SLEEP_SECONDS = 10 / 1000

def get_soup(url):
    BEAUTIFUL_SOUP_OPTIONS = 'html.parser'
    OK_STATUS = 200
    
    request_result = requests.get(url)
    assert request_result.status_code == OK_STATUS
    return BeautifulSoup(request_result.text, BEAUTIFUL_SOUP_OPTIONS)

In [57]:
nol_count = []
nul_count = []

def count_words(text, forms: list):
    result = 0
    for word in forms:
        # occurencies do not overlap so we can use .count
        result += text.count(word)
    return result
    

def parse_journal(journal_link):
    print(journal_link)
    ARTICLE_URL_PATTERN = f'{journal_link}/[0-9]+.html'
    REGEX_ARTICLE_URL_PATTERN = re.compile(ARTICLE_URL_PATTERN.replace('.', '[.]') + '$')
    
    ARTICLES_PER_PAGE = 10
    
    journal_articles_count = 0
    first_previous_result = ''
    nol_words = 0
    nul_words = 0
    while True:
        if journal_articles_count >= ARTICLES_PEG_JOURNAL:
                break
        print('next page')
        soup = get_soup(journal_link + f'/?skip={i * ARTICLES_PER_PAGE}')
        urls = [item.get('href') for item in soup.find_all('a')]
        article_urls = set(filter(lambda url: url and REGEX_ARTICLE_URL_PATTERN.match(url), urls))
        if not article_urls or next(iter(article_urls)) == first_previous_result:
            break
        first_previous_result = next(iter(article_urls))
        for url in article_urls:
            if journal_articles_count >= ARTICLES_PEG_JOURNAL:
                break
            print(url)
            
            # livejournal may ban us for spamming get requests 
            time.sleep(SLEEP_SECONDS)
            text = get_soup(url).get_text().lower()
            nol_words += count_words(text, NOL_FORMS)
            nul_words += count_words(text, NUL_FORMS)
            journal_articles_count += 1
    
    
    nol_count.append((, nol_words))
    nul_count.append(nul_words)
            

        

In [59]:
for i in tqdm(range(1, PAGES_COUNT + 1)):
    soup = get_soup(ALL_JOUNRALS_URL + f'&page={i}')
    if i % 100 == 0:
        print(f'{i} pages result:', np.array(nul_count).sum(), np.array(nol_count).sum())
    a_tags = soup.find_all('a', {'class': 'rating-journals-item-user-link'})
    for a in a_tags:
        parse_journal(a.get('href'))

  0%|                                                  | 0/3152 [00:00<?, ?it/s]

https://lena-miro.ru
next page
https://lena-miro.ru/206505.html
https://lena-miro.ru/2882501.html
https://lena-miro.ru/2881351.html
https://lena-miro.ru/2881270.html
https://lena-miro.ru/2883078.html
https://lena-miro.ru/2881582.html
https://lena-miro.ru/2883063.html
https://lena-miro.ru/2882650.html
https://lena-miro.ru/2881883.html
https://lena-miro.ru/2880623.html
https://masterok.livejournal.com
next page
https://masterok.livejournal.com/3320604.html
https://masterok.livejournal.com/8955336.html
https://masterok.livejournal.com/8957419.html
https://masterok.livejournal.com/8957456.html
https://masterok.livejournal.com/8955992.html
https://masterok.livejournal.com/8956676.html
https://masterok.livejournal.com/4141970.html
https://masterok.livejournal.com/8956649.html
https://masterok.livejournal.com/8955774.html
https://masterok.livejournal.com/8956165.html
https://amarok-man.livejournal.com
next page
https://amarok-man.livejournal.com/7160906.html
https://amarok-man.livejournal.com

https://irek-murtazin.livejournal.com/3432521.html
https://irek-murtazin.livejournal.com/3433180.html
https://irek-murtazin.livejournal.com/3431515.html
https://irek-murtazin.livejournal.com/1244177.html
https://irek-murtazin.livejournal.com/3432293.html
https://tanjand.livejournal.com
next page
https://tanjand.livejournal.com/3777928.html
https://tanjand.livejournal.com/3777457.html
https://tanjand.livejournal.com/3778815.html
https://tanjand.livejournal.com/3780631.html
https://tanjand.livejournal.com/3781555.html
https://tanjand.livejournal.com/3780417.html
https://tanjand.livejournal.com/3777559.html
https://tanjand.livejournal.com/3780316.html
https://tanjand.livejournal.com/3781678.html
https://tanjand.livejournal.com/3778312.html
https://bmpd.livejournal.com
next page
https://bmpd.livejournal.com/4666978.html
https://bmpd.livejournal.com/4665344.html
https://bmpd.livejournal.com/4665289.html
https://bmpd.livejournal.com/4665849.html
https://bmpd.livejournal.com/4666159.html
http

  0%|                                     | 1/3152 [05:56<312:04:50, 356.55s/it]

https://sovenok101.livejournal.com
next page
https://sovenok101.livejournal.com/423265.html
https://sovenok101.livejournal.com/422130.html
https://sovenok101.livejournal.com/421734.html
https://sovenok101.livejournal.com/421583.html
https://sovenok101.livejournal.com/422163.html
https://sovenok101.livejournal.com/422982.html
https://sovenok101.livejournal.com/422872.html
https://sovenok101.livejournal.com/422582.html
https://sovenok101.livejournal.com/423567.html
https://sovenok101.livejournal.com/421200.html
https://ammo1.livejournal.com
next page
https://ammo1.livejournal.com/1382858.html
https://ammo1.livejournal.com/1380219.html
https://ammo1.livejournal.com/1379083.html
https://ammo1.livejournal.com/1382072.html


  0%|                                     | 1/3152 [06:17<330:16:59, 377.35s/it]


KeyboardInterrupt: 

In [51]:
np.array(nul_count).sum(), np.array(nol_count).sum()

(149, 23)

In [2]:
result = requests.get('https://www.livejournal.com/ratings/users/authority/?country=ru')

In [5]:
result.text



In [9]:
soup = BeautifulSoup(result.text, 'html.parser')

In [16]:
tmp = soup.find_all('a', {'class': 'rating-journals-item-user-link'})

In [47]:
new_url = tmp[0].get('href')

In [48]:
user = requests.get(new_url)

In [49]:
soup2 = BeautifulSoup(user.text, 'html.parser')
tmp2 = soup2.find_all('h3', {'class' 'entryunit__title'})

In [50]:
new_url

'https://lena-miro.ru'

In [53]:
tmp2 = soup2.find_all('a')

[<a class="s-logo-link" href="https://www.livejournal.com">
 <span class="s-logo-title s-logo-title">LiveJournal</span>
 </a>,
 <a class="s-header__nav-ratings s-header-item__link s-header-item__link--ratings s-header-item__link--icon s-header-item__link--fire-icon" href="https://www.livejournal.com/ratings/">Top</a>,
 <a class="s-header-item__link s-header-item__link--main" href="https://www.livejournal.com">Main</a>,
 <a class="s-header-item__link s-header-item__link--ratings s-header-item__link--icon s-header-item__link--fire-icon" href="https://www.livejournal.com/ratings/">
                                     Top
                                 </a>,
 <a class="s-header-item__link s-header-item__link--checklist" href="https://www.livejournal.com/about_checklist/">
                                     500 ideas
                                 </a>,
 <a class="s-header-item__link" href="https://www.livejournal.com/support/">
                                   Help
               

In [123]:
url_pattern = f'{new_url}/[0-9]+.html'
article_pattern = re.compile(url_pattern.replace('.', '[.]') + '$')

In [97]:
bool(article_pattern.match('https://lena-miro.ru/2884786.html'))

False

In [133]:
for item in tmp2:
    url = item.get('href')
    if url and article_pattern.match(url):
        print(url)

https://lena-miro.ru/2884786.html
https://lena-miro.ru/2884786.html
https://lena-miro.ru/206505.html
https://lena-miro.ru/2884605.html
https://lena-miro.ru/2884605.html
https://lena-miro.ru/2884335.html
https://lena-miro.ru/2884335.html
https://lena-miro.ru/2884057.html
https://lena-miro.ru/2870897.html
https://lena-miro.ru/2884057.html
https://lena-miro.ru/2883652.html
https://lena-miro.ru/2883652.html
https://lena-miro.ru/2883492.html
https://lena-miro.ru/2883492.html
https://lena-miro.ru/2883078.html
https://lena-miro.ru/2883078.html
https://lena-miro.ru/2883063.html
https://lena-miro.ru/2880623.html
https://lena-miro.ru/2883063.html
https://lena-miro.ru/2882650.html
https://lena-miro.ru/2882650.html
https://lena-miro.ru/2882501.html
https://lena-miro.ru/2882501.html


In [156]:
article = requests.get('https://lena-miro.ru/2884786.html')
soup3 = BeautifulSoup(article.text, 'html.parser')#'\nTags: '

In [157]:
soup3.get_text()

'\n\n\nВот такая вот! - Лена Миро: — LiveJournal\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLiveJournal\n\nTop\n\n\n\n\n\n\n\nMain\n\n\n\n                                    Top\n                                \n\n\n\n                                    500 ideas\n                                \n\n\n\n                       

In [150]:
tmp4 = soup3.find_all('div', {'class': 'j-e-text'})[0]

IndexError: list index out of range

In [None]:
BeautifulSoup(f'<body>{tmp4}</body>', 'html.parser').get_text()

In [52]:
soup2.find_all('a')

[<a class="s-logo-link" href="https://www.livejournal.com">
 <span class="s-logo-title s-logo-title">LiveJournal</span>
 </a>,
 <a class="s-header__nav-ratings s-header-item__link s-header-item__link--ratings s-header-item__link--icon s-header-item__link--fire-icon" href="https://www.livejournal.com/ratings/">Top</a>,
 <a class="s-header-item__link s-header-item__link--main" href="https://www.livejournal.com">Main</a>,
 <a class="s-header-item__link s-header-item__link--ratings s-header-item__link--icon s-header-item__link--fire-icon" href="https://www.livejournal.com/ratings/">
                                     Top
                                 </a>,
 <a class="s-header-item__link s-header-item__link--checklist" href="https://www.livejournal.com/about_checklist/">
                                     500 ideas
                                 </a>,
 <a class="s-header-item__link" href="https://www.livejournal.com/support/">
                                   Help
               

In [None]:
<h3 class="entryunit__title">
<a href="https://olegmakarenko.ru/2658093.html" target="_self">Закон об иноагентах в Грузии, будет ли Техас свободным и посадки президентов Перу </a>
</h3>

In [None]:
<a href="https://olegmakarenko.ru/2658093.html" target="_self">Закон об иноагентах в Грузии, будет ли Техас свободным и посадки президентов Перу </a>

In [None]:
<a href="https://masterok.livejournal.com/8953863.html" class="subj-link" rel="bookmark" target="_self">Британский наемник Эйден Эслин вернулся на фронт</a>

In [None]:
<dt class="entry-title"><a href="https://masterok.livejournal.com/8953863.html" class="subj-link" rel="bookmark" target="_self">Британский наемник Эйден Эслин вернулся на фронт</a></dt>

Критика подхода: может быть не очень разумно искать подстроки. например, человек мог опечататься в слове "налей".