In [None]:
#!pip install nest_asyncio

In [1]:
import requests
from bs4 import BeautifulSoup
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [2]:
library = "http://www.perseus.tufts.edu/hopper/collection?collection=Perseus:corpus:perseus,Latin+Texts"

### Create a library

In [3]:
# Download the Perseus Latin collection index and parse it. Checking the HTTP
# status first keeps an error page from being parsed as if it were the index,
# which would otherwise just produce an empty list of works further down.
response = requests.get(library)
response.raise_for_status()
r = response.text
soup = BeautifulSoup(r, "html.parser")

In [4]:
library_list = soup.find_all("a", {"class":"aResultsHeader"})

In [5]:
library_urls = [i["href"] for i in library_list]

### Get all pages

##### Test

In [None]:
"http://www.perseus.tufts.edu/hopper/"+library_urls[0]

In [None]:
r = requests.get("http://www.perseus.tufts.edu/hopper/"+library_urls[0]).text
soup = BeautifulSoup(r,"html.parser")

In [None]:
toc = soup.find_all("div", {"id":"toc"})[0]

In [None]:
[ i["href"] for i in toc.find_all("a")]

##### Execution

In [None]:
#!pipenv install nest_asyncio

In [6]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [7]:
def get_all_pieces(url, all_pages_url):
    """Collect the table-of-contents links of one Perseus work.

    Downloads ``http://www.perseus.tufts.edu/hopper/`` + ``url``, looks up the
    ``<div id="toc">`` element and appends the ``href`` of every anchor inside
    it to ``all_pages_url``.

    Parameters
    ----------
    url : str
        Link relative to the hopper root, e.g.
        ``"text?doc=Perseus%3atext%3a2007.01.0081"``.
    all_pages_url : list
        Accumulator that is extended in place with the collected hrefs.

    Returns
    -------
    None
        A page without a ``toc`` div is reported with an ``ERROR url = ...``
        line on stdout and contributes nothing to ``all_pages_url``.
    """
    url = "http://www.perseus.tufts.edu/hopper/" + url
    r = requests.get(url).text
    soup = BeautifulSoup(r, "html.parser")

    toc = soup.find("div", {"id": "toc"})
    if toc is None:
        # Only the "no table of contents" case is treated as a soft error;
        # anything else (including KeyboardInterrupt) now propagates.
        print(f'ERROR url = {url}')
        return

    # href=True skips anchors without a target, so a single bare <a> no longer
    # discards the whole table of contents.
    hrefs = [a["href"] for a in toc.find_all("a", href=True)]
    # Build the list first and extend once, as the original `+=` did.
    all_pages_url.extend(hrefs)
    

In [8]:
all_pages_url = []

In [11]:
library_urls[0]

'text?doc=Perseus%3atext%3a2007.01.0081'

In [9]:
get_all_pieces(library_urls[0],all_pages_url)

In [10]:
all_pages_url

["javascript:toggleExpand('N65542');",
 "javascript:toggleExpand('N65542');",
 "javascript:toggleExpand('N65557');",
 "javascript:toggleExpand('N65557');",
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D1',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D2',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D3',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D4',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D5',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D6',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D7',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D8',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D9',
 '?doc=Perseus%3Atext%3A2007.01.0081%3Abook%3D14%3Achapter%3D1%3Asection%3D10',
 "javascript:toggleExpand('N65689');",
 "javascript:t

In [12]:
async def get_index_data_asynchronous(all_pages_url, library_urls):
    """Run ``get_all_pieces`` for every entry of ``library_urls`` on a thread pool.

    Steps:
    1. Look up the event loop that is running this coroutine.
    2. Create a thread pool executor with 40 workers.
    3. Schedule one ``get_all_pieces(url, all_pages_url)`` call per URL
       (list comprehension).
    4. Gather the tasks and wait until all of them have finished.

    Parameters
    ----------
    all_pages_url : list
        Accumulator handed unchanged to every ``get_all_pieces`` call.
    library_urls : iterable of str
        URLs to process, one worker call each.

    Returns
    -------
    None
    """
    # The executor futures must belong to the loop that awaits them. A loop
    # obtained from asyncio.new_event_loop() is never run, and awaiting its
    # futures from this coroutine raises
    # "RuntimeError: ... attached to a different loop".
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=40) as executor:
        tasks = [
            loop.run_in_executor(executor, get_all_pieces, url, all_pages_url)
            for url in library_urls
        ]
        await asyncio.gather(*tasks)

In [None]:
def execute_async_index_event_loop(all_pages_url, library_urls):
    """Drive ``get_index_data_asynchronous`` to completion on the current event loop.

    1. Fetch the current event loop.
    2. Wrap ``get_index_data_asynchronous(all_pages_url, library_urls)`` in a
       future scheduled on that loop.
    3. Block until the future is done.

    Nothing is returned.
    """
    event_loop = asyncio.get_event_loop()
    scrape_job = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url, library_urls),
        loop=event_loop,
    )
    event_loop.run_until_complete(scrape_job)

In [25]:
def async_web_scrape(iterable, individual_scrape_function, *args, max_workers=40):
    """Apply a blocking scrape function to every item of ``iterable`` on a thread pool.

    For each ``item`` the call ``individual_scrape_function(item, *args)`` is
    submitted to a ``ThreadPoolExecutor``; this function blocks until every
    call has finished.

    Parameters
    ----------
    iterable : iterable
        Items (e.g. URLs) passed one by one as the first positional argument.
    individual_scrape_function : callable
        Blocking function executed in the worker threads.
    *args
        Extra positional arguments forwarded unchanged to every call
        (e.g. a shared list used as accumulator).
    max_workers : int, keyword-only, default 40
        Size of the thread pool.

    Returns
    -------
    None
        Return values of the individual calls are discarded.
    """
    # Patch asyncio so run_until_complete may be called while a loop is
    # already running (the situation inside a Jupyter kernel).
    nest_asyncio.apply()

    async def create_scrape_loop():
        """Submit one executor job per item and wait for all of them."""
        loop = asyncio.get_event_loop()
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            tasks = [
                loop.run_in_executor(
                    executor, individual_scrape_function, item, *args
                )
                for item in iterable
            ]
            await asyncio.gather(*tasks)

    asyncio.get_event_loop().run_until_complete(create_scrape_loop())

In [26]:
all_pages_url = []
async_web_scrape(library_urls[0:10], get_all_pieces, all_pages_url)

ERROR url = http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a2008.01.0606


In [27]:
all_pages_url

["javascript:toggleExpand('N65542');",
 "javascript:toggleExpand('N65542');",
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D1%3Asection%3D1',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D1%3Asection%3D2',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D1%3Asection%3D3',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D1%3Asection%3D4',
 "javascript:toggleExpand('N65625');",
 "javascript:toggleExpand('N65625');",
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3Dpraef',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D1',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D2',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D3',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D4',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D5',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D6',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D7',
 '?doc=Perseus%3Atext%3A2008.01.0604%3Abook%3D2%3Asection%3D8',
 '?doc=P

In [None]:
nest_asyncio.apply()
all_pages_url = []
execute_async_index_event_loop(all_pages_url, library_urls)

In [None]:
all_pages_url = [i for i in all_pages_url if "javascript" not in i]

In [None]:
all_pages_url = set(all_pages_url)

In [None]:
len(all_pages_url)

In [None]:
import pickle
# Persist the collected page URLs so the index scrape does not have to be
# repeated. The context manager closes the file even if pickling fails.
filename = "library.pkl"
with open(filename, 'wb') as outfile:
    pickle.dump(all_pages_url, outfile)

### Get all texts
##### Test

In [None]:
import pickle
# Reload the page URLs written by the pickle dump earlier in this notebook.
# Only unpickle files you created yourself: pickle.load executes arbitrary code.
filename = "library.pkl"
with open(filename, 'rb') as infile:
    all_pages_url = pickle.load(infile)

In [None]:
all_pages_url = list(all_pages_url)

In [None]:
url = all_pages_url[1]
url = "http://www.perseus.tufts.edu/hopper/text"+url
r = requests.get(url).text
soup = BeautifulSoup(r,"html.parser")
text = soup.find_all("a",{"class":"text"})

In [None]:
len([i.text for i in text])

In [None]:
all_pages_url[1]

##### Execution

In [None]:
import os
os.listdir(os.getcwd())

In [None]:
def get_all_texts(session, base_url):
    """Download one Perseus text page and save its text under ``texts/``.

    The page ``http://www.perseus.tufts.edu/hopper/text`` + ``base_url`` is
    fetched, the text of every ``<a class="text">`` element is joined with
    single spaces, and the result is written to ``texts/<base_url>``.

    Parameters
    ----------
    session : requests.Session or any object
        Used for the request when it exposes a ``get`` method; otherwise the
        module-level ``requests.get`` is used, so existing callers that pass
        something else keep working.
    base_url : str
        Query-string part of the page URL (e.g. ``"?doc=Perseus%3Atext..."``);
        also used verbatim as the output file name.

    Returns
    -------
    None
    """
    import os  # local import: the cell does not rely on an earlier `import os`

    url = "http://www.perseus.tufts.edu/hopper/text" + base_url
    # Reuse the caller's session (connection pooling) instead of ignoring it.
    fetch = getattr(session, "get", requests.get)
    r = fetch(url).text
    soup = BeautifulSoup(r, "html.parser")
    text = " ".join([i.text for i in soup.find_all("a", {"class": "text"})])

    # Create the output directory on first use rather than failing on open().
    os.makedirs("texts", exist_ok=True)
    with open("texts/" + base_url, 'w') as f:
        f.write(text)

In [None]:
# `requests.session` is a factory function, not a session; create a real
# Session and close it again once the single test download is done.
with requests.Session() as session:
    get_all_texts(session, all_pages_url[1])

In [None]:
async def get_index_data_asynchronous(all_pages_url):
    """Run ``get_all_texts`` for every entry of ``all_pages_url`` on a thread pool.

    A 200-worker ``ThreadPoolExecutor`` and one ``requests.Session`` are
    opened for the duration of the call. Each URL is submitted as
    ``get_all_texts(session, url)`` through the current event loop, and the
    coroutine finishes once every submitted call has completed.
    """
    with ThreadPoolExecutor(max_workers=200) as pool, requests.Session() as shared_session:
        current_loop = asyncio.get_event_loop()
        pending = []
        for page_url in all_pages_url:
            pending.append(
                current_loop.run_in_executor(
                    pool, get_all_texts, shared_session, page_url
                )
            )
        # Results of the individual calls are not used; just wait for them.
        await asyncio.gather(*pending)

In [None]:
def execute_async_index_event_loop(all_pages_url):
    """Run ``get_index_data_asynchronous(all_pages_url)`` to completion.

    The coroutine is scheduled as a future on the loop returned by
    ``asyncio.get_event_loop()`` and the call blocks until that future has
    finished. Nothing is returned.
    """
    active_loop = asyncio.get_event_loop()
    download_job = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url), loop=active_loop
    )
    active_loop.run_until_complete(download_job)

In [None]:
%%time
nest_asyncio.apply()
all_texts = []
execute_async_index_event_loop(all_pages_url)

### Create Corpus

In [None]:
import os

In [None]:
len(os.listdir("texts/"))

In [None]:
docs = ["texts/"+i for i in os.listdir("texts/")]

In [None]:
# Read the first saved document; the context manager closes the file handle
# as soon as its text has been loaded into `o`.
with open(docs[0]) as first_doc:
    o = first_doc.read()

In [None]:
[ i.strip('\n') for i in o.split(" ")]

In [None]:
def doc_stream(path):
    """Yield every file in ``path`` as a list of whitespace-separated tokens.

    Parameters
    ----------
    path : str
        Directory whose entries are all opened as text files.

    Yields
    ------
    list of str
        Tokens of one file. Files are visited in sorted name order so that
        repeated passes over the corpus see the documents in the same order.
    """
    for f in sorted(os.listdir(path)):
        with open(os.path.join(path, f)) as t:
            text = t.read()
        # split() without an argument splits on any run of whitespace, so
        # double spaces no longer produce '' tokens and words separated only
        # by a newline are no longer glued into one token.
        yield text.split()

In [None]:
!pip install gensim

In [None]:
from gensim import corpora
dictionary = corpora.dictionary.Dictionary()

In [None]:
ds = doc_stream("texts/")

In [None]:
latin_dict = corpora.Dictionary(ds)

In [None]:
latin_dict.save("latin_gensim_dict")

In [None]:
keys = latin_dict.cfs.keys()
values = latin_dict.cfs.values()
word = [latin_dict[i] for i in keys]

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame()

In [None]:
df["word"] = word

In [None]:
df["frequency"] = values

In [None]:
df.sort_values("frequency",ascending=False,inplace=True)

In [None]:
df["percent_of_corpus"] = df["frequency"] / latin_dict.num_pos

In [None]:
df.head()

In [None]:
from gensim.models.phrases import Phrases

In [None]:
bigrams = Phrases(doc_stream("texts/")) 

In [None]:
bigrams.vocab

In [None]:
trigrams = Phrases(bigrams[doc_stream("texts/")])

In [None]:
t = trigrams.vocab

In [None]:
keys = t.keys()
values = t.values()

In [None]:
df2 = pd.DataFrame()

In [None]:
# Decode only keys that are actually bytes, so the cell also works when the
# vocab keys are already str and can be re-run on its own output.
# NOTE(review): bytes keys are presumably a gensim 3.x trait and str keys a
# gensim 4.x one; confirm against the installed version.
keys = [i.decode("UTF-8") if isinstance(i, bytes) else i for i in keys]
# Phrase parts are joined with "_"; show them as space-separated words.
keys = [" ".join(i.split("_")) for i in keys]

In [None]:
df2["phrases"] = keys
df2["values"] = values

In [None]:
df2.sort_values("values",ascending=False, inplace=True)

In [None]:
df2["values"].plot()

In [None]:
import pickle

In [None]:
# Save the phrase-frequency table; the context manager closes the file even
# if pickling fails part-way.
filename = "df2.pkl"
with open(filename, 'wb') as outfile:
    pickle.dump(df2, outfile)

### Data Transformation

In [None]:
#!pipenv install seaborn

In [None]:
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import seaborn as sns

In [None]:
import pickle
# Reload the phrase-frequency table saved as df2.pkl.
# Only unpickle files you created yourself: pickle.load executes arbitrary code.
filename = "df2.pkl"
with open(filename, 'rb') as infile:
    df2 = pickle.load(infile)

In [None]:
useful_values = df2[df2["values"]>500]

In [None]:
useful_values

In [None]:
sns.lineplot(data=useful_values["values"])

In [None]:
df2["len"]= df2["phrases"].apply(lambda x: len(x.split(" ")))

In [None]:
num_words = df2["values"].sum()

In [None]:
df2["percent"] = df2["values"] / num_words

In [None]:
df2["zipf"] = df2["percent"].cumsum()

### Unigram

In [None]:
# .copy() makes `words_df` an independent frame, so the column assignments in
# the next cell do not raise SettingWithCopyWarning or depend on whether the
# boolean selection returned a view of df2.
words_df = df2[df2["len"]==1].copy()

In [None]:
num_words = words_df["values"].sum()
words_df["percent"] = words_df["values"]/num_words
words_df["zipf"] = words_df["percent"].cumsum()

In [None]:
plt.plot(words_df["zipf"].values)

In [None]:
words_df[words_df["zipf"]<=.5].count()

In [None]:
words_df[words_df["zipf"]<=.5].tail(20)

In [None]:
words_df[words_df["zipf"]<=.6].count()

In [None]:
words_df[words_df["zipf"]<=.65].count()

In [None]:
words_df[words_df["zipf"]<=.8].count()

In [None]:
words_df[words_df["zipf"]<=.6]

### Bigram

In [None]:
# Two-word phrases with their share of all bigram counts and the running
# (cumulative) share. .copy() detaches the selection from df2 so the new
# columns are added without SettingWithCopyWarning.
bigram = df2[df2["len"]==2].copy()
num_words = bigram["values"].sum()
bigram["percent"] = bigram["values"]/num_words
bigram["zipf"] = bigram["percent"].cumsum()

In [None]:
plt.plot(bigram["zipf"].values)

In [None]:
top_bigrams = bigram[bigram["zipf"]<=.1]

In [None]:
top_bigrams[0:40]

### Trigrams

In [None]:
# Three-word phrases with their share of all trigram counts and the running
# (cumulative) share. .copy() detaches the selection from df2 so the new
# columns are added without SettingWithCopyWarning.
trigram = df2[df2["len"]==3].copy()
num_words = trigram["values"].sum()
trigram["percent"] = trigram["values"]/num_words
trigram["zipf"] = trigram["percent"].cumsum()

In [None]:
plt.plot(trigram["zipf"].values)

In [None]:
top_trigrams = trigram[trigram["zipf"]<=.1]

In [None]:
trigram[trigram["zipf"]<=.1].count()

In [None]:
top_trigrams[:40]