## **Imports**

In [703]:
# db
import aiosqlite

# web
import aiohttp
import asyncio

# analysis 
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# system
import os
import sys
import shutil

## **Database**

In [704]:
DATABASE_NAME = 'sites.db'

In [705]:
async def fill_database(name=DATABASE_NAME, rewrite=True):
    if rewrite and os.path.isfile(name):
        os.remove(name)
    async with aiosqlite.connect(DATABASE_NAME) as db: 
        await db.execute('''
        CREATE TABLE pages (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          url TEXT,
          page_name TEXT,
          
          pure_html TEXT,
          plain_text TEXT
        )
        ''')
        
        await db.execute('''
        CREATE TABLE children (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          root_id INTEGER,
          child_id INTEGER,
          FOREIGN KEY (root_id)  REFERENCES pages(id),
          FOREIGN KEY (child_id)  REFERENCES pages(id)
        )
        ''')
        
        await db.execute('''
        CREATE TABLE sites (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          url TEXT,
          site_name TEXT,
          root_id INTEGER,
          FOREIGN KEY (root_id)  REFERENCES pages(id)
        )
        ''')
        
        for page in pages:
            await db.execute('INSERT INTO pages VALUES(?, ?, ?, ?, ?)', page)
        for site in sites:
            await db.execute('INSERT INTO sites VALUES(?, ?, ?, ?)', site)
        for rel in children:
            await db.execute('INSERT INTO childern VALUES(?, ?, ?)', rel)
            
        await db.commit()
        

## **Get site data**

In [706]:
pages = []
children = []
sites = []

In [707]:
headers = {
    'Accept-Language': 'ru,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}

In [708]:
def make_links_absolute(html, url):
    absolutize = lambda m: ' src="' + urljoin(url, m.group(1)) + '"'
    html = re.sub(r' src="([^"]+)"', absolutize, html)
    absolutize2 = lambda m: ' href="' + urljoin(url, m.group(1)) + '"'
    html = re.sub(r' href="([^"]+)"', absolutize2, html)
    return html

In [709]:
async def get_plain_text(soup):
    for script in soup(["script", "style"]):
      script.extract()

    text = soup.get_text()
    
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

In [710]:
async def get_links(soup):
    for script in soup(["script", "style"]):
      script.extract()

    text = soup.get_text()
    
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

In [711]:
async def get_page(session: aiohttp.ClientSession, url: str, root = 0):
    try:
        await asyncio.sleep(0.1)
        async with session.get(url, headers=headers) as response:
            await asyncio.sleep(0.01)
            text = await response.text(errors='replace')
            text = make_links_absolute(text, url)
            text = text[:text.rfind('<!-- copyright (t2) -->')]
            
            soup = BeautifulSoup(text, features="html.parser")
            plain_text = await get_plain_text(soup)
            title = soup.title.string if soup.title else ""
            
            index = len(pages) + 1
            pages.append((index, url, str(title), text, plain_text))
            
            if root > 0:
                ch_i = len(children) + 1
                children.append((ch_i, root, index))
            
            site = url[url.find('://') + 3:]
            site = site[:site.find('/')]
            tasks = []
            for link in soup.findAll('a'):
                link = link.get('href')
                if link and site in link and link.startswith('http'):
                    task = asyncio.create_task(get_page(session, link))
                    tasks.append(task)
                    
            await asyncio.gather(*tasks)
            
            return index
    except aiohttp.ClientConnectorError:
        print(url)
        

In [712]:
async def get_site(session: aiohttp.ClientSession, url: str):
    root = await get_page(session, url)
    index = len(sites) + 1
    print(index)
    sites.append((index, url, pages[root - 1][2], root))

In [713]:
async def gather_sites(sites):
    async with aiohttp.ClientSession() as session:
        tasks = []
        
        for site_link in sites:
            task = asyncio.create_task(get_site(session, site_link))
            tasks.append(task)
        
        await asyncio.gather(*tasks)
        
        await fill_database()

In [714]:
async def main():
    sites = [i.strip() for i in open('Narod.txt')]
    await gather_sites(sites)

In [715]:
await main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
https://school-number-eight.narod.ru/
18
https://school-number-eight.narod.ru/
18


ServerDisconnectedError: Server disconnected