In [1]:
import asyncio
import datetime as dt
import os
import random
from urllib.parse import quote, urljoin

import aiohttp
import asyncpg
import pandas as pd
import requests
from bs4 import BeautifulSoup
from loguru import logger

In [None]:
def get_url_list(date):
    """Return the article links listed on the newsru.com archive page for ``date``.

    Args:
        date: Object with a ``strftime`` method (e.g. ``datetime.date``). It is
            formatted as ``%d%b%Y`` and lower-cased to build the archive URL.
            ``%b`` is the locale's abbreviated month name, so the URL depends
            on the active locale.

    Returns:
        tuple[str, ...]: ``https://www.newsru.com``-prefixed URLs, one for each
        ``inner-news-item`` whose href does not start with ``http``. The tuple
        is empty when the response is falsy, has status 204, or the expected
        markup is missing (in that last case ``date`` is logged as an error).
    """
    agency_url = 'https://www.newsru.com/allnews/' + date.strftime("%d%b%Y").lower()
    # Read the user-agent pool inside a context manager so the file handle is
    # closed instead of being left to the garbage collector.
    with open('proxy/user-agents.txt') as ua_file:
        user_agents = ua_file.read().splitlines()
    headers = {'User-Agent': random.choice(user_agents)}
    answer = requests.get(agency_url, headers=headers)
    try:
        if answer and answer.status_code != 204:
            soup = BeautifulSoup(answer.text, features="html.parser")
            items = soup.body.find(attrs={'class': 'content-main'}).find_all(attrs={'class': 'inner-news-item'})
            # Hrefs that already start with "http" are skipped; the rest are
            # prefixed with the site root.
            links = tuple(
                'https://www.newsru.com' + el.a.get('href')
                for el in items
                if not el.a.get('href').startswith('http')
            )
        else:
            links = tuple()
    except AttributeError:
        # Raised when <body>, the content container, an item's <a> or its href is missing.
        logger.error(date)
        links = tuple()

    return links

In [2]:
# Sample lenta.ru article URL used by the exploratory cells that follow.
agency_url = "https://lenta.ru/news/2019/12/15/cluchaynost/"

In [3]:
# Pool of User-Agent strings, one per line of the file; the context manager
# closes the file handle once the lines are read.
with open('proxy/user-agents.txt') as ua_file:
    user_agents = ua_file.read().splitlines()

In [4]:
# Fetch the sample article, sending a randomly chosen User-Agent header.
random_user_agent = random.choice(user_agents)
headers = {'User-Agent': random_user_agent}
answer = requests.get(agency_url, headers=headers)

In [5]:
# Parse the downloaded HTML with the built-in html.parser backend.
soup = BeautifulSoup(answer.text, features="html.parser")

In [29]:
# Text of the first element in <body> whose class attribute is
# 'topic-header__item topic-header__time'.
# Presumably looks like "HH:MM, <date>", given that later cells split it on ',' - verify.
raw_time = soup.body.find(attrs={'class': 'topic-header__item topic-header__time'}).text

In [19]:
# Parse the part of raw_time before ', ' as HH:MM. This rebinds raw_time from
# str to datetime.
# NOTE(review): other cells call raw_time.split(...), which needs the string
# form, so running this cell in file order breaks them - confirm whether it
# should be kept.
raw_time = dt.datetime.strptime(raw_time.split(', ')[0], '%H:%M')

In [26]:
# Display the part of the sample URL that follows the first 'news/'.
agency_url.split('news/')[1]

In [12]:
import datetime as dt

In [27]:
import re

# Matches a YYYY/MM/DD run of digits, such as the date segment in agency_url.
regex = re.compile(r'\d{4}/\d{2}/\d{2}')

In [31]:
# Display the current value of raw_time.
raw_time

In [34]:
# First YYYY/MM/DD match in the sample URL; raises IndexError when there is none.
str_date = regex.findall(agency_url)[0]

In [36]:
# Display the extracted date string.
str_date

In [39]:
# Combine the URL date (slashes turned into dashes) with the part of raw_time
# before the first comma and parse the result as an ISO datetime.
# Requires raw_time to be a string here.
dt.datetime.fromisoformat(str_date.replace('/', '-') + ' ' + raw_time.split(',')[0])

In [20]:
# Display the current value of raw_time.
raw_time

In [89]:
def parse_url(agency_url: str) -> tuple:
    """Download one lenta.ru article and extract its timestamp and related links.

    Relies on the notebook-level ``user_agents`` list and ``regex`` pattern.

    Args:
        agency_url: Article URL containing a ``YYYY/MM/DD`` segment.

    Returns:
        tuple: ``(news_date, links)``. ``news_date`` is a ``datetime`` built
        from the date in the URL and the time shown in the article header.
        ``links`` is a list of absolute related-article URLs, empty when the
        related-topics block is absent.

    Raises:
        AttributeError: The page has no ``<body>`` or no time header element.
        IndexError: ``agency_url`` contains no ``YYYY/MM/DD`` segment.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    answer = requests.get(agency_url, headers=headers)
    soup = BeautifulSoup(answer.text, features="html.parser")
    raw_time = soup.body.find(attrs={'class': 'topic-header__item topic-header__time'}).text
    raw_date = regex.findall(agency_url)[0]
    news_date = dt.datetime.fromisoformat(raw_date.replace('/', '-') + ' ' + raw_time.split(',')[0])
    try:
        anchors = soup.body.find(attrs={'class': 'related-topics__list'}).find_all(attrs={'class': 'related-topics__link'})
    except AttributeError:
        # find() returned None: the article has no related-topics block.
        return news_date, []
    # urljoin resolves root-relative and absolute hrefs correctly, whereas
    # plain concatenation produced "https://lenta.ru//..." for hrefs starting
    # with "/". Anchors without an href are skipped instead of discarding
    # every link.
    links = [urljoin('https://lenta.ru/', a.get('href')) for a in anchors if a.get('href')]
    return news_date, links

In [49]:
# Remove the 'date' column from df in place.
# NOTE(review): df is not defined by any earlier cell in file order, and this
# cell is not idempotent - a second run raises KeyError once the column is gone.
df.drop(columns='date', inplace=True)

In [50]:
# Tag every row with the source agency name.
df['agency'] = 'Lenta'

In [48]:
# Display the earliest and latest value of the 'date' column.
# NOTE(review): the 'date' column is dropped by another cell that sits earlier
# in file order; this only works when run before that drop.
min(df.date), max(df.date)

In [51]:
# Preview the first rows of df.
df.head()

In [58]:
# All 'related-topics__link' elements inside the first 'related-topics__list'
# block of the sample page; raises AttributeError when that block is missing.
links = soup.body.find(attrs={'class': 'related-topics__list'}).find_all(attrs={'class': 'related-topics__link'})

In [67]:
# Display the related links with the site root prepended to each href.
# NOTE(review): an href that starts with '/' yields a doubled slash here - confirm the href format.
['https://lenta.ru/' + link.get('href') for link in links]

In [68]:
# Rename the 'text' column to 'news' in place.
df.rename(columns={'text': 'news'}, inplace=True)

In [69]:
# Preview the first rows of df after the rename.
df.head()

In [74]:
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine

In [83]:
# SQLAlchemy URL that selects the psycopg2 driver.
# NOTE(review): the DB_* settings are only assigned in a cell near the end of
# the notebook, so this fails on a fresh kernel run in file order.
engine_str = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [75]:
# NOTE(review): create_async_engine expects an async driver, while engine_str
# as written in this notebook names psycopg2 - confirm which URL this cell was
# run with. `engine` is not referenced by any other cell visible here.
engine = create_async_engine(engine_str)

In [85]:
from sqlalchemy import create_engine

# Synchronous engine passed to df.to_sql; pool_pre_ping checks a pooled
# connection before it is handed out.
conn = create_engine(engine_str, pool_pre_ping=True)

In [87]:
# Write df to the 'news' table without the index.
# if_exists='replace' drops an existing table first, so re-running discards
# whatever the table held.
df.to_sql('news', conn, if_exists='replace', index=False)

In [88]:
# Display (rows, columns) of df.
df.shape

In [97]:
# Rebind `links` to the list of article URLs from df; the main() coroutines
# iterate over this notebook-level list.
links = df.url.tolist()

In [103]:
import pickle

# Persist the URL list to links.pkl.
with open('links.pkl', 'wb') as file:
    pickle.dump(links, file)

In [95]:
from loguru import logger

In [98]:
async def write_to_db(url, date, links):
    """Store the parsed date and related links for one article.

    Opens a dedicated asyncpg connection using the notebook-level ``con`` DSN,
    updates the ``news`` row(s) whose ``url`` matches, closes the connection
    and then sleeps for 0.2 seconds.

    Args:
        url: Value matched against the ``url`` column.
        date: Value written to the ``date`` column.
        links: Value written to the ``links`` column.
    """
    conn = await asyncpg.connect(con)
    try:
        # execute() rather than fetch(): the UPDATE returns no rows to read.
        await conn.execute('UPDATE news SET date = $1, links=$2 WHERE url = $3', date, links, url)
    finally:
        # Close the connection even when the statement fails, so failed
        # updates do not leak server connections.
        await conn.close()
    await asyncio.sleep(0.2)

In [None]:
async def fetch_content(url, session):
    """Download one lenta.ru article, parse it and store the result.

    The publication datetime is built from the ``YYYY/MM/DD`` segment of
    ``url`` and the time shown in the article header. Related-article links
    are collected when present. Both are passed to ``write_to_db``. Relies on
    the notebook-level ``regex`` pattern.

    Args:
        url: Article URL to fetch.
        session: Object whose ``get(url=..., headers=...)`` returns an async
            context manager yielding the response (an ``aiohttp.ClientSession``
            in this notebook).

    Returns:
        None. A page that cannot be parsed is logged and skipped.
    """
    # Context manager closes the user-agent file after each read.
    with open('proxy/user-agents.txt') as ua_file:
        user_agents = ua_file.read().splitlines()
    headers = {'User-Agent': random.choice(user_agents)}

    async with session.get(url=url, headers=headers) as response:
        if not response or response.status == 204:
            return
        answer = await response.text()
        try:
            soup = BeautifulSoup(answer, features="html.parser")
            raw_time = soup.body.find(attrs={'class': 'topic-header__item topic-header__time'}).text
            # Take the date from the URL being fetched. Reading the notebook's
            # sample `agency_url` here would stamp every article with the same date.
            raw_date = regex.findall(url)[0]
            news_date = dt.datetime.fromisoformat(raw_date.replace('/', '-') + ' ' + raw_time.split(',')[0])
            try:
                anchors = soup.body.find(attrs={'class': 'related-topics__list'}).find_all(attrs={'class': 'related-topics__link'})
                # urljoin avoids the doubled slash that concatenation yields
                # for root-relative hrefs; anchors without href are skipped.
                links = [urljoin('https://lenta.ru/', a.get('href')) for a in anchors if a.get('href')]
            except AttributeError:
                # No related-topics block on this page.
                links = []
            await write_to_db(url=url, date=news_date, links=links)
        except (AttributeError, IndexError):
            # Missing <body>/time header, or no date segment in the URL.
            logger.info(f'{url} не был записан')

In [None]:
async def main():
    """Fetch and store every URL in the notebook-level ``links`` list.

    URLs are processed in chunks of 50 over a single shared
    ``aiohttp.ClientSession``. Each chunk is awaited as a group, followed by
    a 0.7 second pause before the next one.

    NOTE(review): a later cell defines ``main`` again; once that cell runs it
    shadows this definition.
    """
    chunk = 50
    async with aiohttp.ClientSession(trust_env=True) as session:
        for start in range(0, len(links), chunk):
            # Build a fresh task list per chunk so gather() only awaits the
            # current chunk instead of every task created so far.
            tasks = [
                asyncio.create_task(fetch_content(url, session))
                for url in links[start:start + chunk]
            ]
            await asyncio.gather(*tasks)
            await asyncio.sleep(0.7)

In [99]:
    async def main():
        """Fetch and store every URL in the notebook-level ``links`` list.

        URLs are processed in chunks of 100, each chunk with its own
        ``aiohttp.ClientSession``. A failure inside a chunk is logged with its
        traceback and processing moves on to the next chunk.
        """
        chunk = 100
        times = len(links) // chunk + 1

        for el in range(times):
            logger.info(f'Обработка {el}/{times}')
            # Derive the slice from the loop index so that a failed chunk is
            # skipped. Advancing a counter only on success made every later
            # iteration re-await the same failed tasks.
            batch = links[el * chunk:(el + 1) * chunk]
            try:
                async with aiohttp.ClientSession() as session:
                    tasks = [asyncio.create_task(fetch_content(url, session)) for url in batch]
                    await asyncio.gather(*tasks)
            except Exception:
                # logger.exception records the traceback along with the message.
                logger.exception(f'Ошибка обработки {el}/{times}')

In [1]:
# Connection settings for the local PostgreSQL instance.
DB_NAME='postgres'
# The password is read from the DB_PASS environment variable so that the
# secret is not stored in the notebook; it is empty when the variable is unset.
DB_PASS = os.environ.get("DB_PASS", "")
DB_USER="postgres"
DB_HOST="localhost"
DB_PORT=5432

In [2]:
# DSN used by asyncpg.connect() and pd.read_sql(). User and password are
# percent-encoded so that reserved characters such as '@', '/' or ':' cannot
# corrupt the URL. The DSN is deliberately not echoed as cell output because
# it embeds the password.
con = f"postgresql://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [4]:
import pandas as pd

In [5]:
# Load the rows of `news` whose date is later than 2019-12-14.
# The f-string prefix has no placeholders to fill.
df = pd.read_sql(f"select * from news where date > '2019-12-14'", con)

In [7]:
# Show five random rows; the selection differs on every run (no seed is set).
df.sample(5)

In [8]:
# Remove the 'agency' column in place; not idempotent - a second run raises KeyError.
df.drop(columns=['agency'], inplace=True)

In [9]:
# Show five random rows after the column drop.
df.sample(5)

In [10]:
# Display (rows, columns) of the frame about to be exported.
df.shape

In [11]:
# Export as gzip-compressed CSV. The codec is named explicitly because pandas
# cannot infer it from the non-standard '.gzip' suffix, and an empty string
# is not a valid compression name.
df.to_csv('lenta_2.csv.gzip', compression='gzip', index=False)