See notes <a href="https://etcsl.orinst.ox.ac.uk/edition2/encodingdisplay.php#critical">here</a> about transliteration conventions.

In [1]:
from collections import Counter
import json
import os
import re

import pandas as pd

In [2]:
DATA = '../../../data/corpus_clean'

In [3]:
infiles = [f for f in os.listdir(DATA) if f.endswith('.txt')]
print(len(infiles))
infiles[:3]

347


['An_excerpt_from_a_hymn_to_Nanna.txt',
 'Letter_from_Aradngu_to_Shulgi_about_attentive_citizens.txt',
 'A_hymn_to_Ur-Namma_(Ur-Namma_I).txt']

In [4]:
word_counts = Counter()
testc = Counter()

In [5]:
def remove_digit_sequences(text):
    # Use regex to find sequences of digits surrounded by non-alphanumeric characters
    clean_text = re.sub(r'(?<!\w)\d+(?!\w)', '', text)
    return clean_text.strip('-')

In [6]:
remove_digit_sequences('2-0-26')

''

In [7]:
def get_ngrams(word, mx=5):
    word = remove_digit_sequences(word)
    grams = {}
    parts = word.split('-')
    if not parts or len(parts) == 1 and not parts[0]: 
        return grams
    try:
        while parts[0][0] in '0123456789':
            parts = parts[1:]
        if not parts or len(parts) == 1 and not parts[0]: 
            return grams
    except IndexError:
        return grams
    except:
        print(parts)
        raise
    for start in range(len(parts)):
        for end in range(start + 1, min(len(parts), mx) + 1):
            gram = '-'.join(parts[start:end]).strip('-')
            gram = re.sub(r'\-+', '-', gram)
            grams[gram] = grams.get(gram, 0) + 1
    return grams

In [8]:
get_ngrams('107A-try-me')

{'try': 1, 'try-me': 1, 'me': 1}

In [9]:
line = '521A- d-a-nun-na'
for word in line.split('-'):
    print(word)
    print(get_ngrams(word))
    print()

521A
{}

 d
{' d': 1}

a
{'a': 1}

nun
{'nun': 1}

na
{'na': 1}



In [10]:
test_grams = get_ngrams('-a-li-ba-ba-zoom')
testc.update(test_grams)
testc

Counter({'ba': 2,
         'a': 1,
         'a-li': 1,
         'a-li-ba': 1,
         'a-li-ba-ba': 1,
         'a-li-ba-ba-zoom': 1,
         'li': 1,
         'li-ba': 1,
         'li-ba-ba': 1,
         'li-ba-ba-zoom': 1,
         'ba-ba': 1,
         'ba-ba-zoom': 1,
         'ba-zoom': 1,
         'zoom': 1})

In [11]:
testc.update(test_grams)
testc

Counter({'ba': 4,
         'a': 2,
         'a-li': 2,
         'a-li-ba': 2,
         'a-li-ba-ba': 2,
         'a-li-ba-ba-zoom': 2,
         'li': 2,
         'li-ba': 2,
         'li-ba-ba': 2,
         'li-ba-ba-zoom': 2,
         'ba-ba': 2,
         'ba-ba-zoom': 2,
         'ba-zoom': 2,
         'zoom': 2})

In [12]:
for f in infiles:
    print('Parsing file:', f)
    with open(f'{DATA}/{f}', 'r') as inf:
        for line in inf:
            line = line.replace('[...]', '').strip()
            line = re.sub(r'\.', '-', line)
            line = re.sub(r'[^a-zA-Z0-9\- ]', '', line).strip()
            line = line.replace('X', '').strip()
            line = re.sub(r' +', ' ', line)
            line = re.sub(r'\-+', '-', line)
            if (line.lower().startswith('unknown') 
                    or line.lower().startswith('segment')
                    or not line):
                continue
            try:
                for word in line.split():
                    ngrams = get_ngrams(word)
                    word_counts.update(ngrams)
            except:
                print('ERROR FOR:', line)
    print()

Parsing file: An_excerpt_from_a_hymn_to_Nanna.txt

Parsing file: Letter_from_Aradngu_to_Shulgi_about_attentive_citizens.txt

Parsing file: A_hymn_to_Ur-Namma_(Ur-Namma_I).txt

Parsing file: An_adab_to_Utu_for_Shulgi_(Shulgi_Q).txt

Parsing file: The_farmer's_instructions.txt

Parsing file: The_shumunda_grass.txt

Parsing file: A_hymn_to_Nanna_(Nanna_G).txt

Parsing file: The_exploits_of_Ninurta.txt

Parsing file: A_hymn_to_Numushda_for_Sîn-iqisham_(Sîn-iqisham_A).txt

Parsing file: A_balbale_to_Nanna_(Nanna_D).txt

Parsing file: The_message_of_Lu-dingira_to_his_mother.txt

Parsing file: A_song_of_Shulgi.txt

Parsing file: Letter_to_Shulgi_about_bandits_and_brigands.txt

Parsing file: A_tigi_to_Inana_(Inana_E).txt

Parsing file: The_return_of_Ninurta_to_Nibru.txt

Parsing file: Inana_and_Ebih.txt

Parsing file: The_lament_for_Eridug.txt

Parsing file: The_home_of_the_fish.txt

Parsing file: An_adab_to_Nuska_for_Ishme-Dagan_(Ishme-Dagan_Q).txt

Parsing file: The_exaltation_of_Inana_(Inan


Parsing file: A_hymn_to_Suen_for_Ibbi-Suen.txt

Parsing file: Shulgi_and_Ninlil's_Barge_(Shulgi_R).txt

Parsing file: A_balbale_of_Inana_(Inana_A).txt

Parsing file: Enki_and_Ninhursanga.txt

Parsing file: A_praise_poem_of_Hammu-rabi_(Hammu-rabi_C).txt

Parsing file: A_hymn_to_Asarluhi_(Asarluhi_XA).txt

Parsing file: Ishbi-Erra_and_Kindattu_(Ishbi-Erra_B).txt

medical_treatment.txtfrom_Nanna-manshum_to_the_goddess_Ninisina_about_his_unsuccessful

Parsing file: A_praise_poem_of_Shulgi_(Shulgi_C).txt

Parsing file: The_heron_and_the_turtle.txt

Parsing file: Enki's_journey_to_Nibru.txt

Parsing file: A_shir-namshub_to_Nininsina_(Nininsina_B).txt

Parsing file: A_shir-namgala_to_Nanna_(Nanna_L).txt

Parsing file: A_praise_poem_of_Shulgi_(Shulgi_V).txt

Parsing file: An_adab_to_Enlil_for_Bur-Suen_(Bur-Suen_B).txt

Parsing file: OB_catalogue_at_Yale_(Y2).txt

Parsing file: Gilgamesh_and_Huwawa,_Version_B.txt

Parsing file: A_shir-namshub_to_Nanna_for_Ur-Namma_(Ur-Namma_F).txt

Parsing fil


Parsing file: Ningishzida's_journey_to_the_nether_world.txt

Parsing file: Ninurta's_journey_to_Eridug.txt

Parsing file: A_tigi_to_Ninurta_(Ninurta_D).txt

Parsing file: Letter_from_Kug-Nanna_to_Ninshubur.txt

Parsing file: Fragment_of_royal_praise_poetry.txt

Parsing file: The_death_of_Gilgamesh.txt

Parsing file: A_praise_poem_of_Ishme-Dagan_(Ishme-Dagan_AC).txt

Parsing file: Dumuzid_and_Geshtin-ana.txt

Parsing file: An_adab_to_Ninlil_(Ninlil_A).txt

Parsing file: An_adab_to_Nergal_for_Shu-ilishu_(Shu-ilishu_A).txt

Parsing file: He_is_a_good_seed_of_a_dog_(Diatribe_C).txt

Parsing file: An_adab_of_An_for_Lipit-Eshtar_(Lipit-Eshtar_C).txt

Parsing file: The_death_of_Ur-Namma_(Ur-Namma_A).txt

Parsing file: A_balbale_to_Ninurta_(Ninurta_F).txt

Parsing file: A_mythic_narrative_about_Inana.txt

Parsing file: An_adab_to_Suen_for_Ibbi-Suen_(Ibbi-Suen_C).txt

Parsing file: A_hymn_to_Enki_for_Ishme-Dagan_(Ishme-Dagan_X).txt

Parsing file: The_Flood_story.txt

Parsing file: A_hymn_to_Ni

In [13]:
word_counts.most_common(20)

[('a', 11107),
 ('mu', 8124),
 ('d', 7687),
 ('na', 7161),
 ('e', 7123),
 ('ra', 6835),
 ('ba', 6763),
 ('ni', 5773),
 ('an', 5756),
 ('en', 5472),
 ('ga', 5257),
 ('da', 5130),
 ('bi', 5030),
 ('ma', 4575),
 ('ki', 4295),
 ('zu', 3886),
 ('me', 3786),
 ('in', 3499),
 ('nam', 3271),
 ('la', 3011)]

In [14]:
with open(f'../../../data/freq/freq.json', 'w') as f:
    json.dump(dict(word_counts), f)

In [15]:
freq_df = pd.DataFrame(
    data=dict(word_counts).items(), columns=('ngram', 'freq')
).sort_values('ngram', ignore_index=True)
freq_df.head(20)

Unnamed: 0,ngram,freq
0,,18
1,171en187,1
2,60ce3,1
3,60ci62,1
4,60ci62-re7,1
5,60ci62-re7-ec,1
6,60ci62-re7-ec-am3,1
7,60x7fenx7f62,1
8,9GANA2,1
9,9GANA2-UH,1


In [19]:
freq_df.tail()

Unnamed: 0,ngram,freq
67481,zur-zur-ra-ke4,1
67482,zur-zur-re,5
67483,zur-zur-re-ec2,1
67484,zur4,1
67485,zur4-gin7,1


In [23]:
freq_df = freq_df.iloc[17:].reset_index(drop=True)
freq_df

Unnamed: 0,ngram,freq
0,A,278
1,A-A,1
2,A-AN,4
3,A-AN-gin7,1
4,A-BA,1
...,...,...
67464,zur-zur-ra-ke4,1
67465,zur-zur-re,5
67466,zur-zur-re-ec2,1
67467,zur4,1


In [24]:
freq_df.to_csv('../../../data/freq/freq.csv', index=False)

In [26]:
!head -5 ../../../data/freq/freq.csv

ngram,freq
A,278
A-A,1
A-AN,4
A-AN-gin7,1
