In [123]:
#!pip install nest_asyncio

In [124]:
import requests
from bs4 import BeautifulSoup
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [125]:
library = "http://www.perseus.tufts.edu/hopper/collection?collection=Perseus:corpus:perseus,Greek+Texts"

### Create a library

In [137]:
r = requests.get(library).text
soup = BeautifulSoup(r,"html.parser")
library_list = soup.find_all("a", {"class":"aResultsHeader"})
library_urls = [i["href"] for i in library_list]
text_urls = [(i,) for i in library_urls]

In [142]:
# with sqlite3.connect("greek.db") as conn:
#     conn.execute("DROP TABLE text_urls")

In [141]:
import sqlite3
conn = sqlite3.connect('greek.db')
c = conn.cursor()
c.execute("""CREATE TABLE text_urls (url text UNIQUE )""")
for i in text_urls:
    c.execute("""INSERT INTO text_urls VALUES(?)""",i)
conn.commit()

### Get all pages

##### Test

In [33]:
"http://www.perseus.tufts.edu/hopper/"+library_urls[1]

'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a2008.01.0592'

In [34]:
r = requests.get("http://www.perseus.tufts.edu/hopper/"+library_urls[0]).text
soup = BeautifulSoup(r,"html.parser")

In [35]:
toc = soup.find_all("div", {"id":"toc"})[0]

In [74]:
toc[2]

'?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3Dpraef'

##### Execution

In [40]:
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

In [151]:
from typing import Tuple
# with sqlite3.connect("greek.db") as conn:
#     conn.execute("DROP TABLE chapter_urls")
# with sqlite3.connect("greek.db") as conn:
#     conn.execute("""CREATE TABLE chapter_urls (chapter_urls text UNIQUE ,  text_url text)""")
def get_all_pieces(url:Tuple[str,int]):
    r = requests.get("http://www.perseus.tufts.edu/hopper/"+url[0]).text
    soup = BeautifulSoup(r,"html.parser")
    #try:
    toc = soup.find_all("div",{"id":"toc"})[0]
    toc = [(i["href"],url[1]) for i in toc.find_all("a") if "javascript" not in i["href"] ]
    with sqlite3.connect("greek.db") as conn:
        conn.executemany("""INSERT INTO chapter_urls VALUES(?,?)""", toc )
#     except:
#         print(f'ERROR url = {url}')
    

In [152]:
with sqlite3.connect("greek.db") as conn:
    c =conn.execute("SELECT *,ROWID FROM text_urls")
    text_urls=c.fetchall()

In [153]:
get_all_pieces(text_urls[0])

In [154]:
with sqlite3.connect("greek.db") as conn:
    c = conn.execute("SELECT * FROM chapter_urls")
c.fetchall()

[('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3Dpraef', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D1', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D2', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D3', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D4', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D5', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D6', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D7', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D8', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D9', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D10', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D11', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D12', '1'),
 ('?doc=Perseus%3Atext%3A2008.01.0590%3Abook%3D1%3Achapter%3D13', '1')

In [28]:
async def get_index_data_asynchronous(all_pages_url, library_urls):
    """
    1. Establish an executor and number of workers
    2. Establish the session
    3. Establish the event loop
    4. Create the task by list comprenhensions
    5. Gather tasks.
    """
    with ThreadPoolExecutor(max_workers=40) as executor:
        with requests.Session() as session:
            loop = asyncio.new_event_loop()
            tasks = [
                loop.run_in_executor(
                    executor, get_all_pieces, *(url, all_pages_url)
                )
                for url in library_urls
            ]
            for response in await asyncio.gather(*tasks):
                pass

In [18]:
def execute_async_index_event_loop(all_pages_url, library_urls):
    """
    This function does something analogous to compiling the get_data_asynchronously function,
    Then it executes loop.
    1. Call the get_data_function
    2. Get the event_loop
    3. Run the tasks (Much easier to understand in python 3.7, "ensure_future" was changed to "create_task")
    4. Edge_list and top_interactions will be passed to the next functions
    """
    
    future = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url, library_urls)
    )
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)

In [30]:
from tqdm import tqdm
def async_web_scrape(iterable, individual_scrape_function, *args):
    """
    This function does something analogous to compiling the get_data_asynchronously function,
    Then it executes loop.
    1. Call the get_data_function
    2. Get the event_loop
    3. Run the tasks (Much easier to understand in python 3.7, "ensure_future" was changed to "create_task")
    4. Edge_list and top_interactions will be passed to the next functions
    """
    nest_asyncio.apply()
    async def create_scrape_loop(iterable, individual_scrape_function, *args):
        """
        1. Establish an executor and number of workers
        2. Establish the session
        3. Establish the event loop
        4. Create the task by list comprenhensions
        5. Gather tasks.
        """
        with ThreadPoolExecutor(max_workers=40) as executor:
            with requests.Session() as session:
                loop = asyncio.get_event_loop()
                tasks = [
                    loop.run_in_executor(
                        executor, individual_scrape_function, *(url, *args)
                    )
                    for url in iterable
                ]
                for response in await asyncio.gather(*tasks):
                    pass
    
    future = asyncio.ensure_future(
        create_scrape_loop(iterable, individual_scrape_function,*args)
    )
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)

In [31]:
all_pages_url = []
async_web_scrape(library_urls[0:10], get_all_pieces, all_pages_url)

In [32]:
len(all_pages_url)

3004

In [33]:
nest_asyncio.apply()
all_pages_url = []
execute_async_index_event_loop(all_pages_url, library_urls)

ERROR url = http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0139%3aspeech%3d6


RuntimeError: Task <Task pending name='Task-5' coro=<get_index_data_asynchronous() running at <ipython-input-28-ef37061d2d89>:18>> got Future <_GatheringFuture pending> attached to a different loop

In [36]:
len(all_pages_url)

198743

In [37]:
all_pages_url = [i for i in all_pages_url if "javascript" not in i]

In [38]:
all_pages_url = set(all_pages_url)

In [39]:
len(all_pages_url)

129518

In [40]:
import pickle
filename = "library.pkl"
outfile = open(filename,'wb')
pickle.dump(all_pages_url, outfile)
outfile.close()

### Get all texts
##### Test

In [41]:
import pickle
filename= "library.pkl"
infile =open(filename,'rb')
all_pages_url = pickle.load(infile)
infile.close()

In [42]:
all_pages_url = list(all_pages_url)

In [43]:
url = all_pages_url[1]
url = "http://www.perseus.tufts.edu/hopper/text"+url
r = requests.get(url).text
soup = BeautifulSoup(r,"html.parser")
text = soup.find_all("a",{"class":"text"})

In [44]:
len([i.text for i in text])

54

In [45]:
all_pages_url[1]

'?doc=Perseus%3Atext%3A1999.01.0143%3Aspeech%3D8%3Asection%3D43'

In [12]:
class WordMixin:
     def __init__(self,greek,english):
        self.greek = greek
        self.english = english
        super().__init__()
        
class Noun(WordMixin):
    def __init__(self,number,gender,case):
        self.number = number
        self.gender = gender
        self.case = case

class Adjective(Noun):
    def __init__(self):
        super().__init__()

        
        
class Conjunction(WordMixin):
    def __init__(self,greek,english,part):
        self.part = part

class Article(WordMixin):
    def __init__(self,number,gender,case,declension):
        self.number = number
        self.gender = gender
        self.case = case
        self.declension = declension
        
        
class Verb(WordMixin):
    def __init__(self,person,number,class_,voice):
        self.person = person
        self.number = number
        self.class_ = class_
        self.voice = voice
        
        
        
        
        

##### Execution

In [58]:
# import os
# os.listdir(os.getcwd())

In [61]:
import sqlite3
conn = sqlite3.connect('greek.db')
conn.execute('''CREATE TABLE greek_db
(url TEXT, litho TEXT)''')
conn.execute("""CREATE_TABLE words
(word TEXT, defintion TEXT, )""")

<sqlite3.Cursor at 0x24a6c152030>

In [67]:
from urllib.parse import unquote
def get_all_texts(session, base_url):
    url = "http://www.perseus.tufts.edu/hopper/text"+base_url
    r = requests.get(url).text
    soup = BeautifulSoup(r,"html.parser")
    text = soup.find_all("a",{"class":"text"})
    text = " ".join([i.text for i in text])
    with sqlite3.connect("greek.db") as conn: 
        conn.execute("""INSERT into greek_db VALUES(?,?)""",(base_url,text))

In [68]:
get_all_texts(requests.session, all_pages_url[1])

In [1]:
async def get_index_data_asynchronous(all_pages_url):
    """
    1. Establish an executor and number of workers
    2. Establish the session
    3. Establish the event loop
    4. Create the task by list comprenhensions
    5. Gather tasks.
    """
    with ThreadPoolExecutor(max_workers=200) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            tasks = [
                loop.run_in_executor(
                    executor, get_all_texts, *(session, url)
                )
                for url in all_pages_url
            ]
            for response in await asyncio.gather(*tasks):
                pass

In [72]:
def execute_async_index_event_loop(all_pages_url):
    """
    This function does something analogous to compiling the get_data_asynchronously function,
    Then it executes loop.
    1. Call the get_data_function
    2. Get the event_loop
    3. Run the tasks (Much easier to understand in python 3.7, "ensure_future" was changed to "create_task")
    4. Edge_list and top_interactions will be passed to the next functions
    """
    future = asyncio.ensure_future(
        get_index_data_asynchronous(all_pages_url)
    )
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)

In [None]:
%%time
nest_asyncio.apply()
all_texts = []
execute_async_index_event_loop(all_pages_url)

 66%|█████████████████████████████████████████████████▋                         | 85824/129518 [30:13<15:23, 47.33it/s]


### Create Corpus

In [None]:
import os

In [None]:
len(os.listdir("texts/"))

In [None]:
docs = ["texts/"+i for i in os.listdir("texts/")]

In [None]:
o = open(docs[0])
o = o.read()

In [None]:
[ i.strip('\n') for i in o.split(" ")]

In [None]:
def doc_stream(path):
    for f in os.listdir(path):
        with open(os.path.join(path,f)) as t:
            text = t.read()
            text = [ i.strip('\n') for i in text.split(" ")]
            yield text

In [None]:
!pip install gensim

In [None]:
from gensim import corpora
dictionary = corpora.dictionary.Dictionary()

In [None]:
ds = doc_stream("texts/")

In [None]:
latin_dict = corpora.Dictionary(ds)

In [None]:
latin_dict.save("latin_gensim_dict")

In [None]:
keys = latin_dict.cfs.keys()
values = latin_dict.cfs.values()
word = [latin_dict[i] for i in keys]

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame()

In [None]:
df["word"] = word

In [None]:
df["frequency"] = values

In [None]:
df.sort_values("frequency",ascending=False,inplace=True)

In [None]:
df["percent_of_corpus"] = df["frequency"] / latin_dict.num_pos

In [None]:
df.head()

In [None]:
from gensim.models.phrases import Phrases

In [None]:
bigrams = Phrases(doc_stream("texts/")) 

In [None]:
bigrams.vocab

In [None]:
trigrams = Phrases(bigrams[doc_stream("texts/")])

In [None]:
t = trigrams.vocab

In [None]:
keys = t.keys()
values = t.values()

In [None]:
df2 = pd.DataFrame()

In [None]:
keys = [i.decode("UTF-8") for i in keys]
keys = [i.split("_") for i in keys]
keys = [" ".join(i) for i in keys]

In [None]:
df2["phrases"] = keys
df2["values"] = values

In [None]:
df2.sort_values("values",ascending=False, inplace=True)

In [None]:
df2["values"].plot()

In [None]:
import pickle

In [None]:
filename = "df2.pkl"
outfile = open(filename,'wb')
pickle.dump(df2, outfile)
outfile.close()

### Data Transformation

In [None]:
#!pipenv install seaborn

In [None]:
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import seaborn as sns

In [None]:
import pickle
filename= "df2.pkl"
infile =open(filename,'rb')
df2 = pickle.load(infile)
infile.close()

In [None]:
useful_values = df2[df2["values"]>500]

In [None]:
useful_values

In [None]:
sns.lineplot(data=useful_values["values"])

In [None]:
df2["len"]= df2["phrases"].apply(lambda x: len(x.split(" ")))

In [None]:
num_words = df2["values"].sum()

In [None]:
df2["percent"] = df2["values"] / num_words

In [None]:
df2["zipf"] = df2["percent"].cumsum()

### Unigram

In [None]:
words_df = df2[df2["len"]==1]

In [None]:
num_words = words_df["values"].sum()
words_df["percent"] = words_df["values"]/num_words
words_df["zipf"] = words_df["percent"].cumsum()

In [None]:
plt.plot(words_df["zipf"].values)

In [None]:
words_df[words_df["zipf"]<=.5].count()

In [None]:
words_df[words_df["zipf"]<=.5].tail(20)

In [None]:
words_df[words_df["zipf"]<=.6].count()

In [None]:
words_df[words_df["zipf"]<=.65].count()

In [None]:
words_df[words_df["zipf"]<=.8].count()

In [None]:
words_df[words_df["zipf"]<=.6]

### bigram

In [None]:
bigram = df2[df2["len"]==2]
num_words = bigram["values"].sum()
bigram["percent"] = bigram["values"]/num_words
bigram["zipf"] = bigram["percent"].cumsum()

In [None]:
plt.plot(bigram["zipf"].values)

In [None]:
top_bigrams = bigram[bigram["zipf"]<=.1]

In [None]:
top_bigrams[0:40]

### Trigrams

In [None]:
trigram = df2[df2["len"]==3]
num_words = trigram["values"].sum()
trigram["percent"] = trigram["values"]/num_words
trigram["zipf"] = trigram["percent"].cumsum()

In [None]:
plt.plot(trigram["zipf"].values)

In [None]:
top_trigrams = trigram[trigram["zipf"]<=.1]

In [None]:
trigram[trigram["zipf"]<=.1].count()

In [None]:
top_trigrams[:40]