<a href="https://colab.research.google.com/github/datamaunz/sophoclesApp/blob/main/scarpe_plays_with_lemmas_from_Perseus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from google.colab import files
from google.colab import drive
import re

drive.mount("/content/drive")

import random

Mounted at /content/drive


In [None]:
def getSoup(url, timeout=30):
  """Download a page and parse its body with BeautifulSoup.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. requests
      applies no timeout by default, so without it a stalled connection
      would block the whole scrape indefinitely.

  Returns:
    A BeautifulSoup object built from the raw response bytes.

  Raises:
    requests.exceptions.Timeout: if the server does not answer in time.
  """
  response = requests.get(url, timeout=timeout)
  # The parser is intentionally left unspecified (as before) so the parse
  # tree stays the same as in earlier runs of this notebook.
  soup = BeautifulSoup(response.content)
  return soup

def getLinksToNextPage(soup):
  """Return the absolute URL of the "next page" arrow, or None if absent.

  Args:
    soup: Parsed page; only `find_all('a', class_="arrow")` is used.

  Returns:
    The hopper base URL joined with the href of the first arrow whose
    image has alt text "next"; None when no such arrow exists.
  """
  for arrow in soup.find_all('a', class_="arrow"):
    img = arrow.find("img")
    # An arrow without an <img>, or an <img> without alt text, cannot be
    # the "next" arrow; skipping it avoids a TypeError/KeyError.
    if img is None or img.get("alt") != "next":
      continue
    return "http://www.perseus.tufts.edu/hopper/" + arrow["href"]
  return None
  
def create_text_per_page(soup):
  """Turn one text page into a DataFrame of speaker, speech and lemmas.

  The `div.text` element is serialised and cut at every "<br/>" and
  "<p></p>"; each resulting fragment is re-parsed on its own. A fragment
  containing a <b> tag is taken as a speaker label and sets the current
  name; every other fragment becomes one row attributed to that name.

  Args:
    soup: Parsed page as returned by getSoup.

  Returns:
    DataFrame with columns Name, Speech, Lemmas (one row per non-speaker
    fragment, indexed 0..n-1). Empty when the page has no `div.text` or
    no speech fragments.
  """
  columns = ["Name", "Speech", "Lemmas"]

  text = soup.find("div", class_="text")
  if text is None:
    # Without this guard str(None) would be parsed and produce a bogus
    # row whose Speech is the literal string "None".
    return pd.DataFrame(columns=columns)

  fragments = (
      piece
      for chunk in str(text).split("<br/>")
      for piece in chunk.split("<p></p>")
      if piece != "\n"
  )

  records = []
  name = None  # rows seen before the first speaker label keep Name=None
  for fragment in fragments:
    line = BeautifulSoup(fragment)
    if line.find("b") is not None:
      name = line.text.strip()
      continue

    # Every element carrying an href is treated as a link to a word page.
    links_to_lemmas = [f'http://www.perseus.tufts.edu/hopper/{word["href"]}' for word in line.find_all(href=True)]
    records.append({
        "Name": name,
        "Speech": line.text,
        "Lemmas": extract_all_lemmas_in_line(links_to_lemmas),
    })

  # Build the frame once; passing `columns` keeps the schema even when
  # there are no records (pd.concat on an empty list would raise).
  return pd.DataFrame(records, columns=columns)

def extract_lemma_from_linked_page(link_to_lemma):
  """Fetch a word page and return the lemma of its first analysis.

  Args:
    link_to_lemma: URL of the word page to download via getSoup.

  Returns:
    Text of the `h4.greek` heading inside the `div.lemma_header` of the
    first `div.analysis`, with newlines, tabs and the sequence "\\tsum"
    removed and surrounding whitespace stripped. None when the page has
    no analysis, or the analysis lacks the header or heading.
  """
  lemma_soup = getSoup(link_to_lemma)

  first_analysis = lemma_soup.find("div", class_="analysis")
  if first_analysis is None:
    return None

  # Guard each step: a missing header/heading previously raised
  # AttributeError and aborted the whole scrape.
  header = first_analysis.find("div", class_="lemma_header")
  heading = header.find("h4", class_="greek") if header is not None else None
  if heading is None:
    return None

  # NOTE(review): the purpose of removing "\tsum" is not visible here;
  # kept exactly as before - confirm against the page markup.
  return heading.text.replace("\n", "").replace("\tsum", "").replace("\t", "").strip()

def extract_all_lemmas_in_line(links_to_lemmas):
  """Resolve each word link to its lemma and join them with spaces.

  Args:
    links_to_lemmas: Iterable of word-page URLs, in line order.

  Returns:
    A single space-separated string of the lemmas that could be
    resolved; links for which the lookup returned None are left out.
  """
  resolved = (extract_lemma_from_linked_page(link) for link in links_to_lemmas)
  return " ".join(lemma for lemma in resolved if lemma is not None)

def extract_verse_numbers(frame):
  """Split digit runs out of the Speech column into verse_number.

  The frame is modified in place and also returned.

  Args:
    frame: DataFrame with a string column named Speech.

  Returns:
    The same frame, with a new verse_number column holding the first run
    of digits found in each speech (as a string, or None when there is
    none), and Speech stripped of all digit runs, newlines and
    surrounding whitespace.
  """
  digits = re.compile(r'\d+')

  def first_number(speech):
    found = digits.search(speech)
    return found.group(0) if found is not None else None

  def without_numbers(speech):
    return digits.sub('', speech).replace("\n", '').strip()

  # Read the number before cleaning, since cleaning removes the digits.
  frame["verse_number"] = frame.Speech.apply(first_number)
  frame["Speech"] = frame.Speech.apply(without_numbers)
  return frame

def prepare_and_save_df(frames, path, title, infer_verse_numbers_check=True):
  """Combine per-page frames, clean them and write `<path>/<title>.csv`.

  Args:
    frames: List of DataFrames, each with a Name column.
    path: Directory the CSV file is written into.
    title: File name (without extension); also printed as a heading.
    infer_verse_numbers_check: When equal to True, the combined frame is
      passed through infer_verse_numbers before saving.

  Returns:
    None. Writes the CSV and prints the title followed by the set of
    index values of the saved frame.
  """
  combined = pd.concat(frames)

  # Drop rows that were never attributed to a speaker.
  has_speaker = combined.Name.notna()
  combined = combined[has_speaker]

  if infer_verse_numbers_check == True:
    combined = infer_verse_numbers(combined)

  # The first column becomes the index of the saved file.
  first_column = combined.columns[0]
  indexed = combined.set_index(first_column)
  indexed.to_csv(f"{path}/{title}.csv")

  print(title, "\n")
  print(set(indexed.index), "\n")

def infer_verse_numbers(df):
  """Fill in a verse number for every row, counting on from known ones.

  Rows with a missing or empty Speech are dropped and the index is reset.
  The first remaining row is forced to verse number 1. Each later row
  uses its own verse_number when present, otherwise the previous row's
  inferred number plus one. Rows whose inferred number repeats an
  earlier one are then shifted by 0.5.

  Args:
    df: DataFrame with a Speech column and (normally) a verse_number
      column holding digit strings or missing values.

  Returns:
    A new DataFrame (the input is not modified) with verse_number
    updated for the first row and a float inferred_verse_number column.
    An empty frame is returned unchanged apart from the added columns.
  """
  df = df[df.Speech.notna() & (df.Speech != "")].reset_index(drop=True)

  if df.empty:
    # Writing to .loc[0, ...] on an empty frame would create a bogus row.
    if "verse_number" not in df.columns:
      df["verse_number"] = pd.Series(dtype=object)
    df["inferred_verse_number"] = pd.Series(dtype=float)
    return df

  if "verse_number" in df.columns:
    explicit = df["verse_number"].tolist()
  else:
    explicit = [None] * len(df)
  explicit[0] = 1  # the first row always anchors the count at 1

  inferred = []
  for value in explicit:
    # pd.isna covers None, NaN and pd.NA alike; a plain `== None` test
    # lets NaN through and int(NaN) then raises ValueError.
    if pd.isna(value):
      inferred.append(inferred[-1] + 1)
    else:
      inferred.append(int(value))

  df["verse_number"] = pd.Series(explicit, index=df.index, dtype=object)

  numbers = pd.Series(inferred, index=df.index, dtype=float)
  # Every repeat of an already-seen number (all but its first occurrence)
  # is moved half a step forward.
  repeats = numbers.duplicated()
  numbers[repeats] = numbers[repeats] + 0.5
  df["inferred_verse_number"] = numbers
  return df


In [None]:
# Starting page on the Perseus hopper for each play; the scraping loop
# begins at these URLs and follows the "next" links from there.
title_url_dict = {
    "Antigone": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0185",
    "Oedipus Colonus": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0189",
    "Ajax": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0183",
    "Electra": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0187",
    "Oedipus Tyrannus": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0191",
    "Philoctetes": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0193",
    "Trachiniae": "http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0195",
}

# Folder on the mounted Google Drive that receives one CSV per play.
path = "/content/drive/MyDrive/Colab Notebooks/workWithBen/greekTexts/sophocles"

In [None]:
# Echo the mapping so the cell output shows which plays and start URLs are configured.
title_url_dict

{'Antigone': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0185',
 'Oedipus Colonus': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0189',
 'Ajax': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0183',
 'Electra': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0187',
 'Oedipus Tyrannus': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0191',
 'Philoctetes': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0193',
 'Trachiniae': 'http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0195'}

In [None]:
# Scrape every configured play except the ones listed in the skip check,
# page by page, and save one CSV per play.
for title, url in title_url_dict.items():
  if title in ["Antigone"]:
    continue
  print(title)

  frames = []
  # getLinksToNextPage returns None on the last page, which ends the walk.
  while url is not None:
    print(url)

    soup = getSoup(url)
    frame = extract_verse_numbers(create_text_per_page(soup))
    frames.append(frame)
    url = getLinksToNextPage(soup)
    # NOTE: a randomised pause between pages is currently disabled:
    #time.sleep(random.randint(2,5))

  prepare_and_save_df(frames, path, title)

Oedipus Colonus
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3atext%3a1999.01.0189
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D33
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D75
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D118
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D149
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D176
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D192
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D207
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D254
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D296
http://www.perseus.tufts.edu/hopper/text?doc=Perseus%3Atext%3A1999.01.0189%3Acard%3D337
http://www.perseus.tufts.edu/ho