Scrape Romanized transliterations of Sumerian texts from _The Electronic Text Corpus of Sumerian Literature_ (ETCSL):
https://etcsl.orinst.ox.ac.uk/catalogue.htm

In [116]:
import re
import requests

from bs4 import BeautifulSoup as BS

In [79]:
DATA = '../../data/corpus'

HOME = 'https://etcsl.orinst.ox.ac.uk'
START_URL = f'{HOME}/catalogue.htm'

# Categories
ANCIENT_LIT = 0
NARRATIVE_MYTHOLOGICAL = 1
ROYAL_PRAISE = 2
LETTERS_LAW_CODES = 3
HYMNS = 4
SCRIBAL_TRAINING = 5
PROVERBS = 6

In [34]:
catalogues = {i: f'{HOME}/catalogue/catalogue{i}.htm' for i in range(7)}
catalogues

{0: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue0.htm',
 1: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue1.htm',
 2: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue2.htm',
 3: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue3.htm',
 4: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue4.htm',
 5: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue5.htm',
 6: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue6.htm'}

In [35]:
def get_category_urls():
    html = requests.get(START_URL).text
    soup = BS(html)
    category_links_list = soup.find_all('li')
    urls = {}
    for li in category_links_list:
        a = li.find('a')
        category = a.text
        val = int(category[0])
        title = category[4:]
        url = a.get('href')
        urls[val] = {'category': title, 'url': f'{HOME}/url'}
    return urls

In [112]:
def get_links_by_category(category_url, verbose=False): 
    if verbose:
        print('Getting links from', category_url)
    html = requests.get(category_url).text
    soup = BS(html)
    anchors = soup.find_all('a')
    urls = []
    for a in anchors:
        try:
            if a.text.startswith('composite'):
                url = a.get('href').replace('..', HOME)
                urls.append(url)
        except:
            continue
    return urls

In [113]:
source_urls = {i: get_links_by_category(catalogues[i], True) 
               for i in range(7)}

Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue0.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue1.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue2.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue3.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue4.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue5.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue6.htm


In [119]:
source_urls[0]

['https://etcsl.orinst.ox.ac.uk/section0/c011.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c012.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0201.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0202.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0203.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0204.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0205.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0206.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0207.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0208.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0211.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0212.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0213.htm']

In [124]:
def get_text_from_source(url):
    # TODO: Imperfect. Probably best to keep all lines starting with a 
    # superscript (<sup>), indicating line number.
    # Also: keep html, and not just text, to preserve superscripting of
    # semantic elements (d-, jic-, -ki, etc.)
    # pattern as '<sup>.*</?br>'
    # ALSO: proverbs formatting is different (all are embedded in table
    # elements), so code will need to differ for those.
    html = requests.get(url).text
    soup = BS(html)
    title = soup.find('h1').text
    out = {'title': title}
    pattern = r'<sup>[0-9]+.*</?br>'
    matches = re.findall(pattern, html)
    out['text'] = matches
    return out

In [125]:
sample = get_text_from_source(source_urls[0][0])
sample

{'title': 'Ur III catalogue from Nibru (N1): composite text',
 'text': ['<sup>1</sup>dub suj-ta<br>',
  '<sup>3</sup>an-zag-ce<sub>3</sub><br>',
  '<sup>4</sup>an-ji<sub>6</sub> zu ama tu<sub>6</sub> zu-ke<sub>4</sub><br>',
  '<sup>5</sup>jic-gi bul-e<br>',
  '<sup>7</sup>mac-mac erim<sub>2</sub> kur<sub>2</sub>-kur<sub>2</sub><br>',
  '<sup>9</sup>cag<sub>4</sub> LAGABxU 1-kam<br>',
  '<sup>10</sup>dub saj-ta<br>',
  '<sup>11</sup>X X kaskal-la 7 me-ce<sub>3</sub><br>',
  '<sup>12</sup><span class="dn"><sup>d</sup>li-li-a-ke<sub>4</sub></span><br>',
  '<sup>13</sup>igi X 7-na in-kur<sub>9</sub>-re-en<br>',
  '<sup>14</sup>cul a<sub>2</sub> he<sub>2</sub>-la<sub>2</sub><br>',
  '<sup>15</sup>gu DI sig<sub>1</sub><sub>0</sub>-/ga<sup>?</sup>\\ ca gal-kam<br>',
  '<sup>17</sup>cag<sub>4</sub> LAGABxU 1-kam<br>',
  '<sup>18</sup>jiri<sub>3</sub> IC lu<sub>2</sub> inim zid-da ga KAxX ba<br>',
  '<sup>20</sup>jiri<sub>3</sub>-jen-na<br>']}

In [104]:
def save_file(source_text):
    title = source_text['title'].split(':')[0].replace(' ', '_')
    with open(f'{DATA}/{title}.txt', 'w') as f:
        for line in source_text['text']:
            f.write(line + '\n')

In [105]:
for cat, url_list in source_urls.items():
    if cat == 6:
        print('Skipping category 6: proverbs, for now...')
        continue
    for url in url_list:
        print(f'Getting data from {url}...')
        try:
            data = get_text_from_source(url)
            save_file(data)
        except BaseException as e:
            print(f'Unexpected error for {url}\n{e}')

Getting data from https://etcsl.orinst.ox.ac.uk/section0/c011.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c012.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0201.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0202.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0203.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0204.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0205.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0206.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0207.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0208.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0211.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0212.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0213.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section1/c113.htm...
Getting data from https

Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2552.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2553.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2554.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2555.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2558.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2561.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2562.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2563.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2564.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2565.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2566.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2571.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2572.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c2581.htm...
Getting data from ht

Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40829.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40830.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40831.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40832.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40833.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c408a.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c4121.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c4122.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41301.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41302.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41303.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41304.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41305.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c41306.htm...
Getting d

Unexpected error for https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.14.html
'NoneType' object has no attribute 'text'
Getting data from https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.15.html...
Unexpected error for https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.15.html
'NoneType' object has no attribute 'text'
Getting data from https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.16.html...
Unexpected error for https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.16.html
'NoneType' object has no attribute 'text'
Getting data from https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.17.html...
Unexpected error for https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.17.html
'NoneType' object has no attribute 'text'
Getting data from https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.18.html...
Unexpected error for https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.18.html
'NoneType' object has no attribute 'text'
Getting data from https://etcsl.orinst.ox.ac.uk/proverbs/c.6.1.19.html...
Unexpected error for https://etcsl.orinst.ox.ac.uk

Successfully got all data _except for_ the proverbs data, which for some reason, they chose to embed in `<table>` elements instead of `<p>`s.  Will grab those separately, but for now, we have a reasonable corpus of data:

In [106]:
!ls $DATA

A_Hymn_to_Nanshe_(Nanshe_A).txt
A_balbale_(?)_to_Inana_and_Dumuzid_(Dumuzid-Inana_P).txt
A_balbale_of_Inana_(Inana_A).txt
A_balbale_to_Enki_for_Ishme-Dagan_(Ishme-Dagan_E).txt
A_balbale_to_Enlil_for_Ur-Namma_(Ur-Namma_G).txt
A_balbale_to_Inana_and_Dumuzid.txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_A).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_B).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_C).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_D).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_E1).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_F).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_G).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_O).txt
A_balbale_to_Inana_as_Nanaya_(Inana_H).txt
A_balbale_to_Nanna_(Nanna_A).txt
A_balbale_to_Nanna_(Nanna_B).txt
A_balbale_to_Nanna_(Nanna_C).txt
A_balbale_to_Nanna_(Nanna_D).txt
A_balbale_to_Nanshe_(Nanshe_B).txt
A_balbale_to_Ninazu_(Ninazu_A).txt
A_balbale_to_Ningishzida_(Ningishzida_A).txt
A_b

In [107]:
!ls $DATA | wc

     317     318   13734


In [111]:
!cat $DATA/"Gilgamesh,_Enkidu_and_the_nether_world.txt"

1ud re-a udsu3-ra2 re-a 2ji6 re-a ji6 ba9-ra2re-a 3mu re-a mu su3-ra2 re-a 4ud ul nij2-du7-e pa ed2-a-ba 5ud ul nij2-du7-e mi2 ziddug4-ga-a-ba 6ec3 kalam-ma-ka ninda cu2-a-ba 7imcu-rin-na kalam-ma-ka nij2-tab ak-a-ba 8an ki-ta ba-da-ba9-ra2-a-ba 9ki an-ta ba-da-sur-ra-a-ba 10mu nam-lu2-u18-luba-an-jar-ra-a-ba 11ud an-ne2 anba-an-de6-a-ba 12den-lil2-le kiba-an-de6-a-ba 13derec-ki-gal-la-ra kur-ra sajrig7-bi-ce3 im-ma-ab-rig7-a-ba 14ba-u5-a-ba ba-u5-a-ba 15a-a kur-ce3 ba-u5-a-ba 16den-ki kur-ce3ba-u5-a-ba 17lugal-ra tur-tur ba-an-da-ri 18den-ki-ra gal-galba-an-da-ri 19tur-tur-bi na4 cu-kam 20gal-gal-bi na4 gi gu4-ud-da-kam 21ur2 jicma2 tur-re den-ki-ka3-ke4 22nij2-bun2-na du7-am3mi-cu2-cu2 23lugal-ra a jicma2saj-ja2-ke4 24ur-bar-ra-gin7 tec2mu-na-gu7-e 25den-ki-ra ajicma2 ejer-ra-ke4 26ur-mah-gin7 saj jic im-ra-ra 27ud-bi-a jic1-am3 jicha-lu-ub2 1-am3 jic1-am3 28gu2 id2buranun-na kug-ga-kadu3-a-bi 29id2buranun-na ana8-na8-da-bi 30a2 u18-lu ur2-bamu-ni-in-bu pa-ba mu-ni-