In [1]:
import os
import re

import pandas as pd

In [2]:
DATA = '../../../data/corpus'

In [3]:
#!mkdir ../../../data/corpus_clean

In [4]:
!ls ../../../data/corpus

A_Hymn_to_Nanshe_(Nanshe_A).txt
A_balbale_(?)_to_Inana_and_Dumuzid_(Dumuzid-Inana_P).txt
A_balbale_of_Inana_(Inana_A).txt
A_balbale_to_Enki_for_Ishme-Dagan_(Ishme-Dagan_E).txt
A_balbale_to_Enlil_for_Ur-Namma_(Ur-Namma_G).txt
A_balbale_to_Inana_and_Dumuzid.txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_A).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_B).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_C).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_D).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_E1).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_F).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_G).txt
A_balbale_to_Inana_and_Dumuzid_(Dumuzid-Inana_O).txt
A_balbale_to_Inana_as_Nanaya_(Inana_H).txt
A_balbale_to_Nanna_(Nanna_A).txt
A_balbale_to_Nanna_(Nanna_B).txt
A_balbale_to_Nanna_(Nanna_C).txt
A_balbale_to_Nanna_(Nanna_D).txt
A_balbale_to_Nanshe_(Nanshe_B).txt
A_balbale_to_Ninazu_(Ninazu_A).txt
A_balbale_to_Ningishzida_(Ningishzida_A).txt
A_b

In [5]:
input_files = [f for f in os.listdir(DATA) if f.endswith('.txt')]
print(len(input_files))
input_files[:10]

347


['An_excerpt_from_a_hymn_to_Nanna.txt',
 'Letter_from_Aradngu_to_Shulgi_about_attentive_citizens.txt',
 'A_hymn_to_Ur-Namma_(Ur-Namma_I).txt',
 'An_adab_to_Utu_for_Shulgi_(Shulgi_Q).txt',
 "The_farmer's_instructions.txt",
 'The_shumunda_grass.txt',
 'A_hymn_to_Nanna_(Nanna_G).txt',
 'The_exploits_of_Ninurta.txt',
 'A_hymn_to_Numushda_for_Sîn-iqisham_(Sîn-iqisham_A).txt',
 'A_balbale_to_Nanna_(Nanna_D).txt']

In [6]:
def remove_sup_sub_tags(txt):
    clean = re.sub(r'</?sub>', '', txt)
    clean = re.sub(r'<sup>', '', clean)
    clean = re.sub(r'</sup>', '-', clean)
    return clean

In [7]:
def remove_html_tags(txt):
    clean = re.sub(r'<.*?>', ' ', txt)
    return clean

In [8]:
def replace_html_entities(txt):
    txt = txt.replace('&lt;', '<').replace('&gt;', '>')
    return txt

In [9]:
def clean_text(txt):
    clean = txt.replace(r'\r', '\n')
    lines = clean.split('\n')
    out = ''
    for line in lines:
        line = line.strip("'")
        # remove leading numbers/spaces/dashes
        line = re.sub(r'^ *\d+\- *', '', line).strip()
        # remove question marks and exclamation marks
        line = re.sub(r'[\?!]+', '', line)
        if 'top  |' in line.lower():
            break
        line = replace_html_entities(line)
        out += line + '\n'
    return re.sub(r'\n{2,}', r'\n\n', out)

In [13]:
for f in input_files:
    print(f'Cleaning {f}..')
    with open(f'{DATA}/{f}', 'r') as inf:
        txt = inf.readlines()[0]
    txt = remove_sup_sub_tags(txt)
    txt = remove_html_tags(txt)
    txt = clean_text(txt)
    with open(f'{DATA}_clean/{f}', 'w') as outf:
        outf.write(txt)

Cleaning An_excerpt_from_a_hymn_to_Nanna.txt..
Cleaning Letter_from_Aradngu_to_Shulgi_about_attentive_citizens.txt..
Cleaning A_hymn_to_Ur-Namma_(Ur-Namma_I).txt..
Cleaning An_adab_to_Utu_for_Shulgi_(Shulgi_Q).txt..
Cleaning The_farmer's_instructions.txt..
Cleaning The_shumunda_grass.txt..
Cleaning A_hymn_to_Nanna_(Nanna_G).txt..
Cleaning The_exploits_of_Ninurta.txt..
Cleaning A_hymn_to_Numushda_for_Sîn-iqisham_(Sîn-iqisham_A).txt..
Cleaning A_balbale_to_Nanna_(Nanna_D).txt..
Cleaning The_message_of_Lu-dingira_to_his_mother.txt..
Cleaning A_song_of_Shulgi.txt..
Cleaning Letter_to_Shulgi_about_bandits_and_brigands.txt..
Cleaning A_tigi_to_Inana_(Inana_E).txt..
Cleaning The_return_of_Ninurta_to_Nibru.txt..
Cleaning Inana_and_Ebih.txt..
Cleaning The_lament_for_Eridug.txt..
Cleaning The_home_of_the_fish.txt..
Cleaning An_adab_to_Nuska_for_Ishme-Dagan_(Ishme-Dagan_Q).txt..
Cleaning The_exaltation_of_Inana_(Inana_B).txt..
Cleaning Letter_from_Sharrum-bani_to_Shu-Suen_about_keeping_the_Martu_

Cleaning A_hymn_to_Nergal_for_Shulgi_(Shulgi_U).txt..
Cleaning A_hymn_to_Inana_for_Hammu-rabi_(Hammu-rabi_F).txt..
Cleaning OB_Catalogue_from_Urim_(U2).txt..
Cleaning Self-praise_of_Shulgi_(Shulgi_D).txt..
Cleaning A_hymn_to_Nanna_(Nanna_N).txt..
Cleaning A_love_song_of_Shu-Suen_(Shu-Suen_B).txt..
Cleaning The_song_of_the_lettuce.txt..
Cleaning Inana_and_Ishme-Dagan_(Ishme-Dagan_K).txt..
Cleaning A_balbale_to_Inana_and_Dumuzid.txt..
Cleaning OB_catalogue_from_Nibru_(N3).txt..
Cleaning A_song_of_Inana_and_Dumuzid_(Dumuzid-Inana_T).txt..
Cleaning A_praise_poem_of_Shulgi_(Shulgi_W).txt..
Cleaning A_hymn_to_Haia_for_Rim-Sin_(Rim-Sin_B).txt..
Cleaning An_adab_(?)_to_Suen_for_Shu-Suen_(Shu-Suen_F).txt..
Cleaning A_love_song_of_Shulgi_(Shulgi_Z).txt..
Cleaning A_praise_poem_of_Shulgi_(Shulgi_B).txt..
Cleaning The_herds_of_Nanna_(Nanna_F).txt..
Cleaning Ur-Namma_the_canal-digger_(Ur-Namma_D).txt..
Cleaning A_tigi_to_Bau_for_Gudea.txt..
Cleaning A_praise_poem_of_Abi-Eshuh_(Abi-Eshuh_B).txt..
Cl

In [15]:
#!ls ../../../data/corpus_clean