Scrape Romanized transliterations of Sumerian texts from _The Electronic Text Corpus of Sumerian Literature_ (ETCSL):
https://etcsl.orinst.ox.ac.uk/catalogue.htm

In [1]:
import re
import requests

from bs4 import BeautifulSoup as BS

In [2]:
DATA = '../../../data/corpus'

HOME = 'https://etcsl.orinst.ox.ac.uk'
START_URL = f'{HOME}/catalogue.htm'

# Categories
ANCIENT_LIT = 0
NARRATIVE_MYTHOLOGICAL = 1
ROYAL_PRAISE = 2
LETTERS_LAW_CODES = 3
HYMNS = 4
SCRIBAL_TRAINING = 5
PROVERBS = 6  # DO SEPARATELY: HTML formatting differs from others

In [3]:
catalogues = {i: f'{HOME}/catalogue/catalogue{i}.htm' for i in range(7)}
catalogues

{0: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue0.htm',
 1: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue1.htm',
 2: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue2.htm',
 3: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue3.htm',
 4: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue4.htm',
 5: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue5.htm',
 6: 'https://etcsl.orinst.ox.ac.uk/catalogue/catalogue6.htm'}

In [4]:
def get_category_urls():
    html = requests.get(START_URL).text
    soup = BS(html)
    category_links_list = soup.find_all('li')
    urls = {}
    for li in category_links_list:
        a = li.find('a')
        category = a.text
        val = int(category[0])
        title = category[4:]
        url = a.get('href')
        urls[val] = {'category': title, 'url': f'{HOME}/url'}
    return urls

In [5]:
def get_links_by_category(category_url, verbose=False): 
    if verbose:
        print('Getting links from', category_url)
    html = requests.get(category_url).text
    soup = BS(html)
    anchors = soup.find_all('a')
    urls = []
    for a in anchors:
        try:
            if a.text.startswith('composite'):
                url = a.get('href').replace('..', HOME)
                urls.append(url)
        except:
            continue
    return urls

In [6]:
source_urls = {
    i: get_links_by_category(catalogues[i], True) for i in range(7)}

Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue0.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue1.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue2.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue3.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue4.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue5.htm
Getting links from https://etcsl.orinst.ox.ac.uk/catalogue/catalogue6.htm


In [7]:
source_urls[0]

['https://etcsl.orinst.ox.ac.uk/section0/c011.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c012.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0201.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0202.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0203.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0204.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0205.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0206.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0207.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0208.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0211.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0212.htm',
 'https://etcsl.orinst.ox.ac.uk/section0/c0213.htm']

In [8]:
def get_text_from_source(url):
    # TODO: Imperfect. Probably best to keep all lines starting with a 
    # superscript (<sup>), indicating line number.
    # Also: keep html, and not just text, to preserve superscripting of
    # semantic elements (d-, jic-, -ki, etc.)
    # pattern as '<sup>.*</?br>'
    # ALSO: proverbs formatting is different (all are embedded in table
    # elements), so code will need to differ for those.
    html = requests.get(url).text
    soup = BS(html)
    title = soup.find('h1').text
    out = {'title': title}
    pattern = r'<sup>[0-9]+.*</?br>'
    matches = re.findall(pattern, html)
    out['text'] = matches
    return out

In [11]:
sample = get_text_from_source(source_urls[0][0])
sample

{'title': 'Ur III catalogue from Nibru (N1): composite text',
 'text': ['<sup>1</sup>dub suj-ta<br>',
  '<sup>3</sup>an-zag-ce<sub>3</sub><br>',
  '<sup>4</sup>an-ji<sub>6</sub> zu ama tu<sub>6</sub> zu-ke<sub>4</sub><br>',
  '<sup>5</sup>jic-gi bul-e<br>',
  '<sup>7</sup>mac-mac erim<sub>2</sub> kur<sub>2</sub>-kur<sub>2</sub><br>',
  '<sup>9</sup>cag<sub>4</sub> LAGABxU 1-kam<br>',
  '<sup>10</sup>dub saj-ta<br>',
  '<sup>11</sup>X X kaskal-la 7 me-ce<sub>3</sub><br>',
  '<sup>12</sup><span class="dn"><sup>d</sup>li-li-a-ke<sub>4</sub></span><br>',
  '<sup>13</sup>igi X 7-na in-kur<sub>9</sub>-re-en<br>',
  '<sup>14</sup>cul a<sub>2</sub> he<sub>2</sub>-la<sub>2</sub><br>',
  '<sup>15</sup>gu DI sig<sub>1</sub><sub>0</sub>-/ga<sup>?</sup>\\ ca gal-kam<br>',
  '<sup>17</sup>cag<sub>4</sub> LAGABxU 1-kam<br>',
  '<sup>18</sup>jiri<sub>3</sub> IC lu<sub>2</sub> inim zid-da ga KAxX ba<br>',
  '<sup>20</sup>jiri<sub>3</sub>-jen-na<br>']}

In [12]:
def save_file(source_text):
    title = source_text['title'].split(':')[0].replace(' ', '_')
    with open(f'{DATA}/{title}.txt', 'w') as f:
        for txt in source_text['text']:
            f.write(repr(txt) + '\n')

In [13]:
for cat, url_list in source_urls.items():
    if cat == 6:
        print('Skipping category 6: proverbs, for now...')
        continue
    for url in url_list:
        print(f'Getting data from {url}...')
        try:
            data = get_text_from_source(url)
            save_file(data)
        except BaseException as e:
            print(f'Unexpected error for {url}\n{e}')

Getting data from https://etcsl.orinst.ox.ac.uk/section0/c011.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c012.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0201.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0202.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0203.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0204.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0205.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0206.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0207.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0208.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0211.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0212.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section0/c0213.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section1/c111.htm...
Getting data from https

Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25404.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25405.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25408.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25409.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25410.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25411.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25413.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25415.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25416.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25417.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25419.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25421.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25423.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section2/c25424.htm...
Gettin

Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40801.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40802.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40803.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40804.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40805.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40806.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40807.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40808.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40809.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40810.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40812.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40813.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40815.htm...
Getting data from https://etcsl.orinst.ox.ac.uk/section4/c40816.htm...
Gettin

Successfully got all data _except for_ the proverbs data, which for some reason, they chose to embed in `<table>` elements instead of `<p>`s.  Will grab those separately, but for now, we have a reasonable corpus of data:

In [14]:
!ls $DATA

A_Hymn_to_Nanshe_(Nanshe_A).txt
A_balbale_(?)_to_Inana_and_Dumuzid_(Dumuzid-Inana_P).txt
A_balbale_of_Inana_(Inana_A).txt
A_balbale_to_Enki_for_Ishme-Dagan_(Ishme-Dagan_E).txt
A_balbale_to_Enlil_for_Ur-Namma_(Ur-Namma_G).txt
A_balbale_to_Inana_and_Dumuzid.txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_A).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_B).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_C).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_D).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_E1).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_F).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_G).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_O).txt
A_balbale_to_Inana_as_Nanaya_(Inana_H).txt
A_balbale_to_Nanna_(Nanna_A).txt
A_balbale_to_Nanna_(Nanna_B).txt
A_balbale_to_Nanna_(Nanna_C).txt
A_balbale_to_Nanna_(Nanna_D).txt
A_balbale_to_Nanshe_(Nanshe_B).txt
A_balbale_to_Ninazu_(Ninazu_A).txt
A_balbale_to_Ningishzida_(Ningishzida_A).txt
A_b

In [15]:
!ls $DATA | wc

     347     348   14406


In [16]:
!cat $DATA/"The_Sumerian_king_list.txt"

'<sup>1</sup></a>[nam]-lugal an-ta\red<sub>3</sub>-de<sub>3</sub>-a-ba<br>\r<sup>2</sup><span class="sn">/eridug\\<sup>ki</sup></span> nam-lugal-la<br>\r<sup>3</sup><span class="sn">eridug<sup>ki</sup></span> <span class=\r"rn">a<sub>2</sub>-lu-lim</span> lugal<br>\r<sup>4</sup>mu 28800 i<sub>3</sub>-ak<br>\r<sup>5</sup><span class="rn">a<sub>2</sub>-lal<sub>3</sub>-jar</span> mu 36000\ri<sub>3</sub>-ak<br>\r<sup>6</sup>2 lugal<br>\r<sup>7</sup>mu-&lt;bi&gt; 64800 ib<sub>2</sub>-ak<br>\r<sup>8</sup><span class="sn">eridug<sup>ki</sup></span> ba-cub<br>\r<sup>9</sup>nam-lugal-bi <span class=\r"sn">bad<sub>3</sub>-tibira<sup>ki</sup>-ce<sub>3</sub></span><br>\r<sup>10</sup>ba-de<sub>6</sub><br>\r<sup>11</sup><span class="sn">bad<sub>3</sub>-tibira<sup>ki</sup></span> <span\rclass="rn">en-me-en-lu<sub>2</sub>-an-na</span><br>\r<sup>12</sup>mu 43200 i<sub>3</sub>-ak<br>\r<sup>13</sup><span class="rn">en-me-en-gal-an-na</span><br>\r<sup>14</sup>mu 28800 i<sub>3</sub>-ak<br>\r<sup>15</sup><s