In [1]:
#!pip install requests asyncio bs4 nest_asyncio

In [1]:
import requests
from bs4 import BeautifulSoup
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

ModuleNotFoundError: No module named 'nest_asyncio'

In [None]:
# Collection index URL (Perseus corpus, Latin texts) that the scrape starts from.
library = "http://www.perseus.tufts.edu/hopper/collection?collection=Perseus:corpus:perseus,Latin+Texts"

### Create a library

In [None]:
# Download the collection index and parse it. raise_for_status() makes an HTTP
# error fail loudly here instead of an error page being parsed silently.
response = requests.get(library)
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")

In [None]:
# Collect every <a> tag that carries the CSS class "aResultsHeader".
library_list = soup.find_all("a", class_="aResultsHeader")

In [None]:
# Keep only the link target of each collected anchor.
library_urls = [anchor["href"] for anchor in library_list]

In [None]:
# Preview the first few links rather than dumping the whole list into the output.
library_urls[:10]

In [None]:
# NOTE(review): library_urls was already displayed in the cell above; this repeat can likely be removed.
library_urls

### Get all pages

##### Test

In [None]:
"http://www.perseus.tufts.edu/hopper/"+library_urls[0]

In [None]:
# Fetch and parse the page of the first work. raise_for_status() makes an HTTP
# error fail loudly here instead of an error page being parsed silently.
response = requests.get("http://www.perseus.tufts.edu/hopper/"+library_urls[0])
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")

In [None]:
# First <div id="toc"> on the page; raises IndexError when the page has none.
toc = soup.find_all("div", id="toc")[0]

In [None]:
# Link targets of every anchor inside the table of contents.
[link["href"] for link in toc.find_all("a")]

##### Execution

In [None]:
#!pipenv install nest_asyncio

In [None]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [None]:
def get_all_pieces(session, url, all_pages_urls):
    """Collect the table-of-contents links of one work page.

    Args:
        session: ``requests.Session`` used for the download so connections can
            be reused between calls. If ``None``, a one-off ``requests.get``
            is used instead.
        url: Location of the work, relative to the hopper base URL.
        all_pages_urls: List that is extended in place with every ``href``
            found inside the page's ``<div id="toc">``.

    Returns:
        None. Results are delivered by mutating ``all_pages_urls``. When the
        page has no table of contents, or an anchor lacks an ``href``, an
        error line is printed and the list is left untouched.
    """
    url = "http://www.perseus.tufts.edu/hopper/"+url
    # The session argument used to be ignored, opening a new connection per call.
    # NOTE(review): the session is shared by all worker threads; requests does
    # not formally document Session as thread-safe -- confirm this is acceptable.
    http = session if session is not None else requests
    r = http.get(url).text
    soup = BeautifulSoup(r,"html.parser")
    try:
        toc = soup.find_all("div",{"id":"toc"})[0]
        toc = [i["href"] for i in toc.find_all("a")]
    except (IndexError, KeyError):
        # IndexError: no <div id="toc">; KeyError: an anchor without href.
        print(f'ERROR url = {url}')
    else:
        all_pages_urls.extend(toc)
    

In [None]:
async def get_index_data_asynchronous(all_pages_url, library_urls):
    """
    Run ``get_all_pieces`` for every entry of ``library_urls`` on a thread pool.

    A pool of 40 worker threads and a single ``requests.Session`` are opened for
    the duration of the call. One executor job is scheduled per URL; each job
    receives the session, the URL and the shared ``all_pages_url`` list. The
    coroutine finishes once every job has completed and returns None.
    """
    with ThreadPoolExecutor(max_workers=40) as pool, requests.Session() as http_session:
        event_loop = asyncio.get_event_loop()
        pending = []
        for work_url in library_urls:
            pending.append(
                event_loop.run_in_executor(
                    pool, get_all_pieces, http_session, work_url, all_pages_url
                )
            )
        # The individual results are not used; only completion matters.
        await asyncio.gather(*pending)

In [None]:
def execute_async_index_event_loop(all_pages_url, library_urls):
    """
    Synchronous entry point for the index crawl.

    Wraps ``get_index_data_asynchronous(all_pages_url, library_urls)`` in a
    future on the current event loop and blocks until it has finished.
    Returns None.
    """
    loop = asyncio.get_event_loop()
    crawl = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url, library_urls), loop=loop
    )
    loop.run_until_complete(crawl)

In [None]:
# nest_asyncio patches asyncio to permit nested event-loop use, which is needed
# when run_until_complete() is called while the kernel's own loop is running.
nest_asyncio.apply()
# Accumulator list that is handed to the crawl and filled with page links.
all_pages_url = []
execute_async_index_event_loop(all_pages_url, library_urls)

In [22]:
# Drop every link whose text contains "javascript".
all_pages_url = [link for link in all_pages_url if "javascript" not in link]

In [23]:
# Deduplicate the links. Note that this rebinds the name from a list to a set.
all_pages_url = set(all_pages_url)

In [24]:
# Number of entries in all_pages_url.
len(all_pages_url)

109254

In [25]:
import pickle
filename = "library.pkl"
# The context manager closes the file even if pickle.dump() raises.
with open(filename, 'wb') as outfile:
    pickle.dump(all_pages_url, outfile)

### Get all texts
##### Test

In [5]:
import pickle
filename= "library.pkl"
# The context manager closes the file even if pickle.load() raises.
# NOTE(review): pickle.load executes arbitrary code from the file; only load a
# library.pkl that this notebook itself produced.
with open(filename, 'rb') as infile:
    all_pages_url = pickle.load(infile)

In [6]:
# Convert to a list in sorted order so positional indexing (e.g. all_pages_url[1])
# picks the same entry on every run; plain list(set) order is not reproducible.
all_pages_url = sorted(all_pages_url)

In [None]:
# Trial run on a single page: fetch it, parse it and collect the <a class="text"> tags.
url = "http://www.perseus.tufts.edu/hopper/text" + all_pages_url[1]
r = requests.get(url).text
soup = BeautifulSoup(r, "html.parser")
text = soup.find_all("a", class_="text")

In [None]:
# How many matching tags the trial page produced.
len(text)

In [None]:
# Show the URL used by the trial fetch.
url

##### Execution

In [7]:
def get_all_texts(session, base_url):
    """Download one text page and save its passage text under ``texts/``.

    Args:
        session: ``requests.Session`` used for the download so connections can
            be reused between calls. If ``None``, a one-off ``requests.get``
            is used instead.
        base_url: Suffix appended to the hopper ``text`` URL. It is also used
            verbatim as the output file name inside ``texts/``.

    Returns:
        None. The text of every ``<a class="text">`` element on the page is
        joined with single spaces and written to ``texts/<base_url>``.
    """
    import os  # local import: the notebook only imports os in a later cell

    url = "http://www.perseus.tufts.edu/hopper/text"+base_url
    # The session argument used to be ignored, opening a new connection per call.
    # NOTE(review): the session is shared by all worker threads; requests does
    # not formally document Session as thread-safe -- confirm this is acceptable.
    http = session if session is not None else requests
    r = http.get(url).text
    soup = BeautifulSoup(r,"html.parser")
    text = soup.find_all("a",{"class":"text"})
    text = " ".join([i.text for i in text])
    # Without the directory every single download would fail on open().
    os.makedirs("texts", exist_ok=True)
    # The context manager closes the file even if write() raises.
    with open("texts/"+base_url,'w') as f:
        f.write(text)

In [12]:
async def get_index_data_asynchronous(all_pages_url):
    """
    Run ``get_all_texts`` for every entry of ``all_pages_url`` on a thread pool.

    NOTE: this redefines the earlier two-argument function of the same name.

    A pool of 200 worker threads and a single ``requests.Session`` are opened
    for the duration of the call. One executor job is scheduled per URL; each
    job receives the session and the URL. The coroutine finishes once every job
    has completed and returns None.
    """
    with ThreadPoolExecutor(max_workers=200) as pool, requests.Session() as http_session:
        event_loop = asyncio.get_event_loop()
        pending = []
        for page_url in all_pages_url:
            pending.append(
                event_loop.run_in_executor(
                    pool, get_all_texts, http_session, page_url
                )
            )
        # The individual results are not used; only completion matters.
        await asyncio.gather(*pending)

In [13]:
def execute_async_index_event_loop(all_pages_url):
    """
    Synchronous entry point for the text download.

    NOTE: this redefines the earlier two-argument function of the same name.

    Wraps ``get_index_data_asynchronous(all_pages_url)`` in a future on the
    current event loop and blocks until it has finished. Returns None.
    """
    loop = asyncio.get_event_loop()
    download = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url), loop=loop
    )
    loop.run_until_complete(download)

In [None]:
%%time
# nest_asyncio patches asyncio to permit nested event-loop use, which is needed
# when run_until_complete() is called while the kernel's own loop is running.
nest_asyncio.apply()
# NOTE(review): all_texts is not referenced by any other visible cell; the
# download writes its results to files under texts/ instead.
all_texts = []
execute_async_index_event_loop(all_pages_url)

### Create Corpus

In [1]:
import os

In [2]:
# Number of entries in the texts/ directory.
len(os.listdir("texts/"))

108980

In [4]:
# Relative path of every entry found in texts/.
docs = ["texts/"+name for name in os.listdir("texts/")]

In [5]:
# Read the first document. The context manager closes the handle, which the
# earlier open()/read() pair never did.
with open(docs[0]) as first_doc:
    o = first_doc.read()

In [6]:
# Split on single spaces and trim newlines from both ends of each piece.
[token.strip('\n') for token in o.split(" ")]

['coangustatum',
 'est',
 'enim',
 'stratum',
 'ita',
 'ut',
 'alter',
 'decidat',
 'et',
 'pallium',
 'breve',
 'utrumque',
 'operire',
 'non',
 'potest']

In [7]:
def doc_stream(path):
    """Lazily yield one token list per file found directly in ``path``.

    Each file is read in full and split on single spaces, and every piece has
    leading and trailing newlines stripped. Consecutive spaces therefore yield
    empty-string tokens. Files are visited in ``os.listdir`` order.
    """
    for name in os.listdir(path):
        with open(os.path.join(path, name)) as handle:
            tokens = []
            for piece in handle.read().split(" "):
                tokens.append(piece.strip('\n'))
            yield tokens

In [8]:
# NOTE(review): unpinned install; consider pinning the gensim version for reproducibility.
!pip install gensim

You are using pip version 10.0.1, however version 19.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [9]:
from gensim import corpora
# NOTE(review): this empty Dictionary is not referenced by any later visible cell.
dictionary = corpora.dictionary.Dictionary()

In [10]:
# Generator over the token lists of every file in texts/; it can be consumed only once.
ds = doc_stream("texts/")

In [11]:
# Build the gensim Dictionary from the streamed token lists (this exhausts ds).
latin_dict = corpora.Dictionary(ds)

In [12]:
# Persist the dictionary to the file "latin_gensim_dict" in the working directory.
latin_dict.save("latin_gensim_dict")

In [13]:
# cfs maps token id -> count; look each id up in the dictionary to get its word.
keys = latin_dict.cfs.keys()
values = latin_dict.cfs.values()
word = [latin_dict[token_id] for token_id in keys]

In [14]:
import pandas as pd

In [15]:
# Empty frame; its columns are filled in by the following cells.
df = pd.DataFrame()

In [16]:
# One row per dictionary entry, in the iteration order of latin_dict.cfs.
df["word"] = word

In [17]:
# Counts from latin_dict.cfs, in the same iteration order as the "word" column.
df["frequency"] = values

In [18]:
# Most frequent words first. Rebinding instead of inplace=True keeps the cell
# chainable and avoids mutating a frame that earlier cells may have displayed.
df = df.sort_values("frequency", ascending=False)

In [19]:
# Share of all corpus positions taken by each word. Despite the column name this
# is a fraction between 0 and 1 (count / latin_dict.num_pos), not a percentage.
df["percent_of_corpus"] = df["frequency"] / latin_dict.num_pos

In [20]:
# Top rows of the frequency table.
df.head()

Unnamed: 0,word,frequency,percent_of_corpus
8,et,193439,0.032638
34,in,118367,0.019971
1,est,83951,0.014164
13,non,65061,0.010977
5,ut,55089,0.009295


In [21]:
from gensim.models.phrases import Phrases

In [22]:
# Fit a gensim Phrases model on a fresh pass over the token lists in texts/.
bigrams = Phrases(doc_stream("texts/")) 

In [23]:
# NOTE(review): this displays the entire vocab mapping, which is very large;
# consider showing only a small sample.
bigrams.vocab

defaultdict(int,
            {b'coangustatum': 1,
             b'est': 83951,
             b'coangustatum_est': 1,
             b'enim': 19135,
             b'est_enim': 1089,
             b'stratum': 36,
             b'enim_stratum': 1,
             b'ita': 9185,
             b'stratum_ita': 1,
             b'ut': 55089,
             b'ita_ut': 810,
             b'alter': 1094,
             b'ut_alter': 21,
             b'decidat': 16,
             b'alter_decidat': 1,
             b'et': 193439,
             b'decidat_et': 1,
             b'pallium': 83,
             b'et_pallium': 9,
             b'breve': 184,
             b'pallium_breve': 2,
             b'utrumque': 843,
             b'breve_utrumque': 1,
             b'operire': 32,
             b'utrumque_operire': 1,
             b'non': 65061,
             b'operire_non': 1,
             b'potest': 5519,
             b'non_potest': 1039,
             b'responditque': 26,
             b'Achan': 6,
             b'responditque_

In [24]:
# Fit a second Phrases model on the corpus after it has been passed through the bigram model.
trigrams = Phrases(bigrams[doc_stream("texts/")])

In [25]:
# Shorthand for the second model's vocab mapping (phrase -> count).
t = trigrams.vocab

In [26]:
# Views over the phrases and their counts. These rebind the keys/values names
# used earlier for the word-frequency table.
keys = t.keys()
values = t.values()

In [27]:
# Empty frame; its columns are filled in by the following cells.
df2 = pd.DataFrame()

In [28]:
# Turn each vocab key from bytes such as b"est_enim" into the string "est enim".
# NOTE(review): assumes the keys are bytes, as in the displayed vocab output.
keys = [" ".join(raw.decode("UTF-8").split("_")) for raw in keys]

In [30]:
# Phrase strings and their counts; both come from the same vocab mapping.
df2["phrases"] = keys
df2["values"] = values

In [31]:
# Highest counts first. Rebinding instead of inplace=True keeps the cell
# chainable and avoids mutating a frame that earlier cells may have used.
df2 = df2.sort_values("values", ascending=False)

In [35]:
# After sort_values() the index still holds the pre-sort row labels, and
# Series.plot() uses the index as x. Resetting it makes x run 0..n-1 in sorted
# order, so the curve shows counts by rank.
df2["values"].reset_index(drop=True).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7fbed677c898>

In [36]:
import pickle

In [38]:
filename = "df2.pkl"
# The context manager closes the file even if pickle.dump() raises.
with open(filename, 'wb') as outfile:
    pickle.dump(df2, outfile)