In [1]:
from pprint import pprint
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import html

In [2]:
# Browser-like User-Agent string; the literal is split across lines only for readability.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/104.0.0.0 Safari/537.36'
)

# Front pages of the news sites handled by the parsers in this notebook.
URL_YANDEX_NEWS = 'https://yandex.ru/news/'
URL_LENTA_NEWS = 'https://lenta.ru/'
URL_DZEN_NEWS = 'https://dzen.ru/news/'

In [3]:
# HTTP headers handed to the page-fetching helper; only the User-Agent is set.
headers = {'User-Agent': USER_AGENT}

# Query-string parameters appended to the requested URL.
# NOTE(review): 'sso_failed' looks like a Yandex/Dzen login-redirect flag -- confirm it is still needed.
params = {'sso_failed': ''}

In [4]:
def get_content_dom_from_html_text(url, headers=None, params=None, timeout=10):
    """Download a page and parse its HTML body into an lxml element tree.

    Args:
        url: Address of the page to fetch.
        headers: Optional dict of HTTP headers forwarded to ``requests.get``.
        params: Optional dict of query-string parameters forwarded to
            ``requests.get``.
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout ``requests`` may block indefinitely.

    Returns:
        The root element returned by ``lxml.html.fromstring`` for the
        response text.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: The server did not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as if
    # it were the news page (which would silently yield an empty result).
    response.raise_for_status()
    content_dom = html.fromstring(response.text)

    # Show the final URL (after redirects and with the query string applied).
    print(response.url)

    return content_dom


def parser_yandex_news(content_dom):
    """Collect news items from a parsed page using the Yandex News card layout.

    Every element matched by the ``mg-grid__item`` container query produces
    one dict. A field whose element is absent from the card is stored as
    ``None`` instead of aborting the whole parse with ``IndexError``.

    Args:
        content_dom: Object exposing ``xpath(expression)`` that returns a list
            (e.g. the element returned by ``lxml.html.fromstring``).

    Returns:
        list[dict]: One dict per container with the keys ``new_source``,
        ``new_text``, ``new_link`` and ``new_date``.
    """
    def first_or_none(node, expression):
        # xpath() returns a list; a missing element gives [] rather than an error.
        matches = node.xpath(expression)
        return matches[0] if matches else None

    yandex_news = []
    for new_container in content_dom.xpath("//section//div[contains(@class, 'mg-grid__item')]"):
        new_text = first_or_none(new_container, ".//h2/a/text()")
        if new_text is not None:
            # Headlines contain non-breaking spaces; normalise them to plain spaces.
            new_text = new_text.replace('\xa0', ' ')

        yandex_news.append({
            'new_source': first_or_none(
                new_container, ".//div[@class='mg-card-footer__left']//a/text()"
            ),
            'new_text': new_text,
            'new_link': first_or_none(new_container, ".//h2/a/@href"),
            'new_date': first_or_none(
                new_container, ".//span[@class='mg-card-source__time']/text()"
            ),
        })

    return yandex_news


def parser_dzen_news(content_dom):
    """Collect news items from a parsed page using the Dzen News card layout.

    Every element matched by the ``mg-card `` container query produces one
    dict. A field whose element is absent from the card is stored as ``None``
    instead of aborting the whole parse with ``IndexError``.

    Args:
        content_dom: Object exposing ``xpath(expression)`` that returns a list
            (e.g. the element returned by ``lxml.html.fromstring``).

    Returns:
        list[dict]: One dict per card with the keys ``new_source``,
        ``new_text``, ``new_link`` and ``new_date``.
    """
    def first_or_none(node, expression):
        # xpath() returns a list; a missing element gives [] rather than an error.
        matches = node.xpath(expression)
        return matches[0] if matches else None

    dzen_news = []
    for new_container in content_dom.xpath("//div[contains(@class,'mg-card ')]"):
        new_text = first_or_none(new_container, ".//h2/a/text()")
        if new_text is not None:
            # Headlines contain non-breaking spaces; normalise them to plain spaces.
            new_text = new_text.replace('\xa0', ' ')

        dzen_news.append({
            'new_source': first_or_none(
                new_container, ".//a[@class='mg-card__source-link']/text()"
            ),
            'new_text': new_text,
            'new_link': first_or_none(new_container, ".//h2/a/@href"),
            'new_date': first_or_none(
                new_container, ".//span[@class='mg-card-source__time']/text()"
            ),
        })

    return dzen_news


def parser_lenta_news(content_dom, base_url=None):
    """Collect news items from a parsed page using the Lenta.ru card layout.

    Each ``<a>`` element whose class contains ``card-`` produces one dict.

    Args:
        content_dom: Object exposing ``xpath(expression)`` that returns a list
            (e.g. the element returned by ``lxml.html.fromstring``).
        base_url: URL against which relative ``href`` values are resolved.
            Defaults to the module-level ``URL_LENTA_NEWS``.

    Returns:
        list[dict]: One dict per card with the keys ``new_source``,
        ``new_text``, ``new_link`` and ``new_date``. ``new_text`` and
        ``new_date`` are ``None`` unless exactly one matching text node is
        found; ``new_link`` is ``None`` when the anchor has no ``href``.
    """
    if base_url is None:
        base_url = URL_LENTA_NEWS

    def single_or(values, default):
        # Accept a value only when the query matched exactly one node.
        return values[0] if len(values) == 1 else default

    lenta_news = []
    for new_container in content_dom.xpath("//a[contains(@class, 'card-')]"):
        # The source is taken from the SVG icon reference: a single attribute
        # of the form '...ui-label_<name>' names it; otherwise 'Lenta.ru' is used.
        new_source = 'Lenta.ru'
        icon_ref = single_or(
            new_container.xpath(".//*[name()='svg']/*[name()='use']/attribute::*"),
            None,
        )
        if icon_ref is not None:
            label_parts = icon_ref.split('ui-label_')
            if len(label_parts) == 2:
                new_source = label_parts[1]

        # urljoin resolves root-relative paths without producing a double
        # slash and leaves already-absolute links unchanged, whereas plain
        # string concatenation corrupted both kinds of href.
        hrefs = new_container.xpath("./@href")
        new_link = urljoin(base_url, hrefs[0]) if hrefs else None

        lenta_news.append({
            'new_source': new_source,
            'new_text': single_or(
                new_container.xpath(".//span[contains(@class, 'card-')]/text()"), None
            ),
            'new_link': new_link,
            'new_date': single_or(new_container.xpath(".//time/text()"), None),
        })

    return lenta_news


def get_news(content_dom, parser_name='dzen'):
    """Run the site-specific parser selected by ``parser_name``.

    Args:
        content_dom: Parsed page handed unchanged to the chosen parser.
        parser_name: ``'dzen'`` (default), ``'yandex'`` or ``'lenta'``.

    Returns:
        Whatever the selected parser returns, or ``None`` when
        ``parser_name`` is not one of the recognised names.
    """
    if parser_name == 'dzen':
        return parser_dzen_news(content_dom)
    if parser_name == 'yandex':
        return parser_yandex_news(content_dom)
    if parser_name == 'lenta':
        return parser_lenta_news(content_dom)
    # Unrecognised names fall through without raising.
    return None

In [5]:
# Fetch and parse the Dzen news page, sending the headers and query parameters defined earlier.
dom = get_content_dom_from_html_text(URL_DZEN_NEWS, headers=headers, params=params)

https://dzen.ru/news?issue_tld=kz&sso_failed=


In [6]:
# Run the Dzen parser over the fetched DOM; the bare name on the last line displays the result.
dzen_news = get_news(dom, 'dzen')
dzen_news

[{'new_source': 'Московский Комсомолец',
  'new_text': 'Представитель Казахстана вступил в должность генерального секретаря ОДКБ',
  'new_link': 'https://dzen.ru/news/story/Predstavitel_Kazakhstana_vstupil_vdolzhnost_generalnogo_sekretarya_ODKB--98650ed5741925105cb13d94e042aef4?lang=ru&rubric=index&fan=1&stid=mQMmzLfhS4bIgHdm01c4&t=1672592347&persistent_id=1234564460&story=08c7bf44-82bd-5ef2-937a-7ebd5637fa6f&issue_tld=kz',
  'new_date': '19:06'},
 {'new_source': '24.kz',
  'new_text': 'Список социально-экономических преобразований составили в Правительстве',
  'new_link': 'https://dzen.ru/news/story/Spisok_socialno-ehkonomicheskikh_preobrazovanij_sostavili_vPravitelstve--741a8093d190ddb86798fa96bd83e97a?lang=ru&rubric=index&fan=1&stid=dHTkk7DVoMxi5NP9&t=1672592347&persistent_id=1234564336&story=d4e79d83-e724-58e6-a958-57120782a29d&issue_tld=kz',
  'new_date': '16:10'},
 {'new_source': 'REGNUM',
  'new_text': 'Президент Казахстана подписал закон о развитии агломераций',
  'new_link': '