In [None]:
#!pipenv install requests

In [1]:
import requests
from bs4 import BeautifulSoup
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [None]:
# Perseus (perseus.tufts.edu) collection page for "Latin Texts"; it is downloaded
# below and scraped for the link to each work.
library = "http://www.perseus.tufts.edu/hopper/collection?collection=Perseus:corpus:perseus,Latin+Texts"

### Create a library

In [None]:
# Download and parse the collection page.
# timeout keeps the cell from blocking forever on a stalled connection (requests
# has no default timeout); raise_for_status() fails fast on an HTTP error instead
# of silently parsing an error page and producing an empty library.
response = requests.get(library, timeout=30)
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")

In [None]:
# Every anchor carrying the CSS class "aResultsHeader" on the parsed collection page.
library_list = soup.find_all("a", class_="aResultsHeader")

In [None]:
# Keep only the link target of each result anchor.
library_urls = [anchor["href"] for anchor in library_list]

In [None]:
# Display the scraped hrefs for a visual check.
library_urls

In [None]:
# Displays library_urls; an identical display cell also appears earlier in the notebook.
library_urls

### Get all pages

##### Test

In [None]:
# Preview the absolute URL of the first work: hopper base URL + first scraped href.
"http://www.perseus.tufts.edu/hopper/"+library_urls[0]

In [None]:
# Fetch and parse the first work's landing page as a sample before crawling every work.
# timeout bounds the wait on a stalled server; raise_for_status() surfaces HTTP
# errors here rather than letting an error page be parsed as a work page.
response = requests.get("http://www.perseus.tufts.edu/hopper/"+library_urls[0], timeout=30)
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")

In [None]:
# First div whose id is "toc"; indexing raises IndexError when the page has none.
toc = soup.find_all("div", id="toc")[0]

In [None]:
# Link targets of every anchor found inside the table-of-contents div.
[link["href"] for link in toc.find_all("a")]

##### Execution

In [None]:
#!pipenv install nest_asyncio

In [None]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [None]:
def get_all_pieces(session, url, all_pages_urls, timeout=30):
    """
    Fetch one work's landing page and collect the links in its table of contents.

    Parameters
    ----------
    session : requests.Session or None
        HTTP session used for the request. When None, the module-level
        ``requests`` API is used instead.
    url : str
        Work href, relative to "http://www.perseus.tufts.edu/hopper/".
    all_pages_urls : list
        Accumulator that receives, in place, the hrefs found inside the page's
        div with id "toc".
    timeout : float, optional
        Seconds to wait on the server before giving up (default 30).

    Returns
    -------
    None
        Results are delivered through ``all_pages_urls``. A failed request or a
        page without a table of contents is reported with a printed
        ``ERROR url = ...`` line and adds nothing to the accumulator.
    """
    url = "http://www.perseus.tufts.edu/hopper/"+url
    # Use the session that was handed in (the original ignored it), falling back
    # to plain requests so callers that pass None keep working.
    client = requests if session is None else session
    try:
        r = client.get(url, timeout=timeout).text
    except requests.RequestException as exc:
        # Log and skip: one unreachable page should not abort the whole crawl.
        print(f'ERROR url = {url} ({exc!r})')
        return

    soup = BeautifulSoup(r, "html.parser")
    toc_divs = soup.find_all("div", {"id": "toc"})
    if not toc_divs:
        print(f'ERROR url = {url}')
        return

    # href=True skips anchors that have no href attribute, so a single bare
    # anchor no longer discards every link of the page.
    links = [anchor["href"] for anchor in toc_divs[0].find_all("a", href=True)]
    # Build the list first, then add it to the shared accumulator in one call.
    all_pages_urls.extend(links)
    

In [None]:
async def get_index_data_asynchronous(all_pages_url, library_urls, max_workers=40):
    """
    Crawl every work in ``library_urls`` concurrently via ``get_all_pieces``.

    Each URL becomes one ``get_all_pieces`` call submitted to a thread pool
    through the event loop; this coroutine then waits for all of them.

    Parameters
    ----------
    all_pages_url : list
        Accumulator passed to every ``get_all_pieces`` call, which appends the
        hrefs it finds to it in place.
    library_urls : iterable of str
        Work hrefs to crawl, one task per entry.
    max_workers : int, optional
        Size of the thread pool (default 40, the value that used to be hard-coded).

    Steps
    -----
    1. Establish an executor and number of workers
    2. Establish the session
    3. Establish the event loop
    4. Create the tasks with a list comprehension
    5. Gather tasks.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        with requests.Session() as session:
            # A Session pools only 10 connections per host by default; size the
            # pool to the worker count so threads sharing the session do not
            # keep discarding and reopening connections.
            adapter = requests.adapters.HTTPAdapter(pool_maxsize=max_workers)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            loop = asyncio.get_event_loop()
            tasks = [
                loop.run_in_executor(executor, get_all_pieces, session, url, all_pages_url)
                for url in library_urls
            ]
            # The workers write into all_pages_url themselves; gather is awaited
            # only to wait for completion and to surface a worker exception.
            await asyncio.gather(*tasks)

In [None]:
def execute_async_index_event_loop(all_pages_url, library_urls):
    """
    Synchronous entry point for the index crawl.

    1. Build the crawl coroutine with ``get_index_data_asynchronous``
    2. Wrap it in a future scheduled on the current event loop
    3. Drive the loop until that future has completed

    Nothing is returned; both arguments are forwarded unchanged to
    ``get_index_data_asynchronous``.
    """
    pending_crawl = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url, library_urls)
    )
    asyncio.get_event_loop().run_until_complete(pending_crawl)

In [None]:
# nest_asyncio patches asyncio so that run_until_complete() may be called while an
# event loop is already running (presumably the notebook kernel's loop).
nest_asyncio.apply()
# Accumulator handed to the crawl; get_all_pieces appends table-of-contents hrefs to it.
all_pages_url = []
execute_async_index_event_loop(all_pages_url, library_urls)

In [22]:
# Drop every href that contains the substring "javascript".
all_pages_url = [href for href in all_pages_url if "javascript" not in href]

In [23]:
# De-duplicate the hrefs. Note that all_pages_url is a set (unordered) from here on.
all_pages_url = set(all_pages_url)

In [24]:
# Number of page hrefs currently held in all_pages_url.
len(all_pages_url)

109254

In [25]:
import pickle

# Persist the collected hrefs to library.pkl so later cells can reload them from
# disk. The with-block closes the file even if pickle.dump raises.
filename = "library.pkl"
with open(filename, 'wb') as outfile:
    pickle.dump(all_pages_url, outfile)

### Get all texts
##### Test

In [26]:
import pickle

# Reload the hrefs saved in library.pkl. The with-block closes the file even if
# loading fails.
# NOTE: pickle.load can execute arbitrary code; only load a library.pkl that was
# produced by this notebook.
filename = "library.pkl"
with open(filename, 'rb') as infile:
    all_pages_url = pickle.load(infile)

In [27]:
# The pickled value is a set, and the iteration order of a set of strings is not
# stable between interpreter sessions (string hashing is randomized). Sorting
# yields a list whose positional slices select the same pages on every run.
all_pages_url = sorted(all_pages_url)

In [None]:
# Try the text extraction on a single page before running it over the whole list.
url = all_pages_url[1]
url = "http://www.perseus.tufts.edu/hopper/text"+url
# timeout bounds the wait on a stalled server; raise_for_status() reports HTTP
# errors here rather than letting an error page be parsed as if it were a text.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")
text = soup.find_all("a", {"class": "text"})

In [None]:
# How many elements the sample query matched (one text fragment per match).
len(text)

In [None]:
# Display the full URL used for the sample request.
url

##### Execution

In [4]:
def get_all_texts(session, base_url, timeout=30):
    """
    Download one text page and write its extracted text to ``texts/<base_url>``.

    Parameters
    ----------
    session : requests.Session or None
        HTTP session used for the request. When None, the module-level
        ``requests`` API is used instead.
    base_url : str
        Page href appended to "http://www.perseus.tufts.edu/hopper/text"; it is
        also used verbatim as the output file name inside ``texts/``.
    timeout : float, optional
        Seconds to wait on the server before giving up (default 30).

    Returns
    -------
    None
        The text is written to disk. A failed request is reported with a
        printed ``ERROR url = ...`` line and no file is written.
    """
    url = "http://www.perseus.tufts.edu/hopper/text"+base_url
    # Use the session that was handed in (the original ignored it), falling back
    # to plain requests so callers that pass None keep working.
    client = requests if session is None else session
    try:
        r = client.get(url, timeout=timeout).text
    except requests.RequestException as exc:
        # Log and skip so one unreachable page does not abort the whole download.
        print(f'ERROR url = {url} ({exc!r})')
        return

    soup = BeautifulSoup(r, "html.parser")
    # NOTE(review): this selects anchors with class "text"; confirm that this is
    # the element holding the passage on the target pages.
    fragments = soup.find_all("a", {"class": "text"})
    text = " ".join(fragment.text for fragment in fragments)

    # Make sure the output directory exists; exist_ok tolerates concurrent callers.
    os.makedirs("texts", exist_ok=True)
    # Explicit UTF-8 avoids encode errors under non-UTF-8 platform defaults, and
    # the with-block closes the file even if the write fails.
    with open("texts/"+base_url, 'w', encoding="utf-8") as f:
        f.write(text)

In [8]:
async def get_index_data_asynchronous(all_pages_url, max_workers=40):
    """
    Download every page in ``all_pages_url`` concurrently via ``get_all_texts``.

    NOTE: this reuses the name of the index-crawling coroutine defined earlier in
    the notebook and replaces it once this cell has run.

    Parameters
    ----------
    all_pages_url : iterable of str
        Page hrefs to download, one ``get_all_texts`` task per entry.
    max_workers : int, optional
        Size of the thread pool (default 40, the value that used to be hard-coded).

    Steps
    -----
    1. Establish an executor and number of workers
    2. Establish the session
    3. Establish the event loop
    4. Create the tasks with a list comprehension
    5. Gather tasks.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        with requests.Session() as session:
            # A Session pools only 10 connections per host by default; size the
            # pool to the worker count so threads sharing the session do not
            # keep discarding and reopening connections.
            adapter = requests.adapters.HTTPAdapter(pool_maxsize=max_workers)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            loop = asyncio.get_event_loop()
            tasks = [
                loop.run_in_executor(executor, get_all_texts, session, url)
                for url in all_pages_url
            ]
            # get_all_texts writes its output to disk; gather is awaited only to
            # wait for completion and to surface a worker exception.
            await asyncio.gather(*tasks)

In [9]:
def execute_async_index_event_loop(all_pages_url):
    """
    Synchronous entry point for the text download.

    1. Build the download coroutine with ``get_index_data_asynchronous``
    2. Wrap it in a future scheduled on the current event loop
    3. Drive the loop until that future has completed

    Nothing is returned; ``all_pages_url`` is forwarded unchanged to
    ``get_index_data_asynchronous``.
    """
    pending_download = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url)
    )
    asyncio.get_event_loop().run_until_complete(pending_download)

In [12]:
%%time
# Allow run_until_complete() to be used while an event loop is already running.
nest_asyncio.apply()
# NOTE(review): all_texts is not read by any visible cell; get_all_texts writes to files instead.
all_texts = []
# Only positions 105..999 of the list are downloaded here; presumably a partial or
# resumed run. TODO confirm the intended range.
execute_async_index_event_loop(all_pages_url[105:1000])

KeyboardInterrupt: 