# Melnikov Evgeny 18PMI


In [6]:
!pip install pymystem3 selenium html2text requests beautifulsoup4 atlas json2xml



In [7]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

from time import sleep
# import tqdm
from tqdm.notebook import tqdm
import json
import time


import requests
from bs4 import BeautifulSoup
from html2text import html2text

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

In [8]:
# --- Notebook configuration ---------------------------------------------------

# Path of the ChromeDriver executable handed to Selenium when the browser is started.
chrome_driver_path = "chromedriver.exe"

# File the scraper writes the collected articles to (and the loader reads back).
json_filename = "data.json"

# Rubric names substituted into the nn.ru listing URL, one scraping pass per entry.
categories = [
    "transport",
    "auto",
    "health",
    "incidents",
]

## Lab1


In [9]:
# The function extracts article data from www.nn.ru and writes it to out_filename as one JSON object per line
# The extracted data is: 
# - article link (article_id) 
# - article title 
# - article category
# - article tags 
# - article text
# The categories and the number of articles to collect per category are specified by categories and article_num respectively

def _extract_article(soup, article_url, category):
    """Build the output record for one parsed article page.

    Args:
        soup: BeautifulSoup document of the article page.
        article_url: link to the article; stored as ``article_id``.
        category: rubric the article was listed under; stored as ``category``.

    Returns:
        dict with the keys ``article_id``, ``title``, ``category``, ``tags`` and ``text``
        (in that order), or ``None`` when the page has no headline or no article body
        and therefore has to be skipped.
    """
    title = soup.find('h1', attrs={'itemprop': 'headline'}, recursive=True)
    if title is None:
        return None

    # the <div> which contains the article itself
    div_article = soup.find('div', attrs={'itemprop': 'articleBody'})
    if div_article is None:
        return None

    # <div class="_2TvYC"> contains the tags of an article; some articles have no tags at all
    tags = []
    div_tags = soup.find('div', attrs={'class': '_2TvYC'}, recursive=True)
    if div_tags is not None:
        tags = [span.getText() for span in div_tags.findChildren('span')]

    # remove all <figure itemscope='itemscope'> tags, which contain images and image-related text
    for figure_tag in div_article.find_all('figure', attrs={'itemscope': 'itemscope'}):
        figure_tag.extract()

    # NOTE(review): the empty class filter looks intended to keep only <p>/<li> elements
    # without a CSS class - confirm against the page markup.
    # Every paragraph is followed by a single space, so double spaces may occur in some places.
    text = ''.join(paragraph.getText() + ' '
                   for paragraph in div_article.find_all(['p', 'li'], attrs={'class': ''}))

    return {
        'article_id': article_url,
        'title': title.getText(),
        'category': category,
        'tags': tags,
        'text': text,
    }


def parse_nnru(article_num, categories, out_filename, progress_bar=True):
    """Scrape articles from www.nn.ru and write them to ``out_filename``, one JSON object per line.

    For every category the listing pages ``https://www.nn.ru/text/?rubric=<category>&page=<n>``
    are opened in Chrome, each previewed article is downloaded with ``requests`` and parsed
    with BeautifulSoup. Articles without a headline or without an article body, and articles
    whose download fails, are skipped and do not count towards ``article_num``.

    Args:
        article_num: number of articles to collect per category. Values <= 0 collect nothing.
        categories: iterable of rubric names used in the listing URL.
        out_filename: path of the output file; it is truncated at the start of the run.
        progress_bar: when True, show tqdm progress bars for categories and articles.

    Side effects:
        Starts a Chrome instance using the module-level ``chrome_driver_path`` and the
        ``adblock.crx`` extension from the working directory, prints the WebDriver
        session id, and always shuts the browser down again, even on error.
    """
    # Locator constants for find_element(s); imported locally to keep this block self-contained.
    from selenium.webdriver.common.by import By

    request_timeout = 30  # seconds; without a timeout requests.get() may block forever

    # progress bar setup
    category_bar = article_bar = None
    if progress_bar:
        category_bar = tqdm(range(len(categories)), desc = 'Category progress')
        article_bar = tqdm(range(article_num), desc = 'Article progress', leave = False)

    driver = None
    try:
        # Create (or truncate) the output file once and keep it open for the whole run.
        with open(out_filename, 'w', encoding="utf-8") as out_file:
            # set Chrome options and preferences
            # NOTE(review): the value 2 presumably means "block" for images and cookies - confirm.
            prefs = {"profile.managed_default_content_settings.images": 2,
                     "profile.default_content_settings.cookies": 2}
            chrome_options = Options()
            chrome_options.add_extension('adblock.crx')
            chrome_options.add_experimental_option("prefs", prefs)
            chrome_options.add_argument('--disable-application-cache')

            # NOTE(review): executable_path / chrome_options are legacy keyword arguments;
            # confirm they are accepted by the installed Selenium version.
            driver = Chrome(executable_path=chrome_driver_path, chrome_options=chrome_options)
            driver.implicitly_wait(10)
            # Close the current window and continue in the first remaining one
            # (presumably the tab opened by the extension - TODO confirm).
            driver.close()
            print("Current session is {}".format(driver.session_id))
            driver.switch_to.window(driver.window_handles[0])

            driver.get('chrome-extension://gighmmpiobklfepjocnamgkkbiglidom/options.html')
            driver.find_element(By.ID, 'acceptable_ads').click() # adblock customization

            for category in categories:
                if progress_bar:
                    article_bar.reset()

                article_count = 0
                page = 1
                while article_count < article_num:
                    driver.get(f'https://www.nn.ru/text/?rubric={category}&page={page}')

                    # tags with this specific class contain article previews
                    previews = driver.find_elements(By.XPATH, "//*[@class='_3-SyJ']")
                    if not previews:
                        # The listing is exhausted: stop instead of requesting pages forever.
                        break

                    for element in previews:
                        # article's link
                        article_url = element.find_element(By.TAG_NAME, 'a').get_attribute('href')

                        try:
                            response = requests.get(article_url, timeout=request_timeout)
                        except requests.RequestException:
                            # One unreachable article must not abort the whole scrape.
                            continue

                        record = _extract_article(BeautifulSoup(response.text, "lxml"),
                                                  article_url, category)
                        if record is None:
                            continue

                        # Write the record immediately and flush, so collected data
                        # reaches the file even if a later step fails.
                        out_file.write(json.dumps(record, ensure_ascii=False) + '\n')
                        out_file.flush()

                        if progress_bar:
                            article_bar.update(1)

                        # stop as soon as the number of articles is sufficient
                        article_count += 1
                        if article_count >= article_num:
                            break

                    page += 1 # fetch articles from the next page (if more are needed)

                # the category counts as done only after it has been processed
                if progress_bar:
                    category_bar.update(1)
    finally:
        # Release the browser and the progress bars on success and on error alike.
        if driver is not None:
            driver.quit()
        for bar in (article_bar, category_bar):
            if bar is not None:
                bar.close()


In [10]:
# Run the scraper: 1000 articles for each configured category, written to json_filename.
parse_nnru(
    article_num=1000,
    categories=categories,
    out_filename=json_filename,
)

Category progress:   0%|          | 0/4 [00:00<?, ?it/s]

Article progress:   0%|          | 0/1000 [00:00<?, ?it/s]

Current session is 6763fc423bfa3e83553e15c01b72e96e


In [13]:
# Read the scraped articles back; the file holds one JSON object per line, hence lines=True.
data = pd.read_json(path_or_buf=json_filename, lines=True)

# Bare last expression so the notebook renders the DataFrame.
data

Unnamed: 0,article_id,title,category,tags,text
0,https://www.nn.ru/text/transport/2022/01/15/70...,"Водитель погиб, 6-летняя девочка пострадала. В...",transport,[ДТП],"Сегодня, 15 января, на 510-м километре трассы ..."
1,https://www.nn.ru/text/transport/2022/01/12/70...,58 платных парковок запустят в Нижнем Новгород...,transport,"[Нижний Новгород, платные автопарковки]",В январе 2022 года в Нижнем Новгороде введут в...
2,https://www.nn.ru/text/transport/2022/01/12/70...,«Транспортный дневник» и подсчет пассажиров. К...,transport,"[маршрутная сеть, транспортная схема]","Новая маршрутная схема, которая должна появить..."
3,https://www.nn.ru/text/gorod/2022/01/11/70368938/,"Апокалипсис сегодня. Фоторепортаж о том, как Н...",transport,"[Нижний Новгород, снегопад, фоторепортаж]",В Нижнем Новгороде снова аномальные снегопады....
4,https://www.nn.ru/text/transport/2022/01/10/70...,Будет больше «зеленых волн». 90 модемов для мо...,transport,"[пробка, светофор, «зеленая волна»]",«Зеленых волн» светофоров в Нижнем Новгороде в...
...,...,...,...,...,...
3995,https://www.nn.ru/text/incidents/2020/05/02/69...,Пока все спали: в Дзержинске сгорели Toyota La...,incidents,"[Дзержинск, поджог, сгорел автомобиль]","Сегодня, 2 мая, в Дзержинске Нижегородской обл..."
3996,https://www.nn.ru/text/incidents/2020/05/01/69...,Из-за паводка в двух районах Нижегородской обл...,incidents,"[весенний паводок, подтопление]","Коронавирус коронавирусом, а во многом жизнь т..."
3997,https://www.nn.ru/text/incidents/2020/05/01/69...,«Он сел за руль КАМАЗа и вез бомбу до полигона...,incidents,"[ГИБДД, Госавтоинспекция, Павел Ржевский, Игор...",Несколько дней назад ушел из жизни начальник р...
3998,https://www.nn.ru/text/incidents/2020/04/30/69...,В Дзержинске полиция заинтересовалась мужчиной...,incidents,"[Дзержинск, дети, самоизоляция]",Весьма странные фотографии появились сегодня в...
