In [88]:
import xml.etree.ElementTree as ET
import json

# Parse the XML file; the loops below expect BIBLEBOOK > CHAPTER > VERS > gr nesting
tree = ET.parse('data/a_LXX.xml')  # replace with your file name
root = tree.getroot()

# One dict per verse, collected in document order
output = []

for biblebook in root.findall('BIBLEBOOK'):
    book_number = biblebook.get('bnumber')  # attribute values are strings (or None if absent)

    for chapter in biblebook.findall('CHAPTER'):
        chapter_number = chapter.get('cnumber')

        for verse in chapter.findall('VERS'):
            verse_number = verse.get('vnumber')

            # The verse text is the space-joined text of its direct <gr> children;
            # children without text are skipped
            verse_text = ' '.join(gr.text for gr in verse.findall('gr') if gr.text)

            # Each record carries book/chapter/verse as separate fields
            # (no combined identifier is stored)
            verse_entry = {
                "book_num": book_number,
                "chapter": chapter_number,
                "verse": verse_number,
                "text": verse_text
            }
            output.append(verse_entry)

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII text unescaped
with open('data/LXX.jsonl', 'w', encoding='utf-8') as f:
    for entry in output:
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')

print("Finished processing the XML file.")


Finished processing the XML file.


In [53]:
import xml.etree.ElementTree as ET
import json

# Parse the XML file; the loops below expect BIBLEBOOK > CHAPTER > VERS nesting
tree = ET.parse('data/septuaginta.xml')  # replace with your file name
root = tree.getroot()

# One dict per verse, collected in document order
output = []

for biblebook in root.findall('BIBLEBOOK'):
    book_name = biblebook.get('bname')  # attribute values are strings (or None if absent)

    for chapter in biblebook.findall('CHAPTER'):
        chapter_number = chapter.get('cnumber')

        for verse in chapter.findall('VERS'):
            verse_number = verse.get('vnumber')
            # verse.text is only the text directly inside <VERS> before any child
            # element; it is None when there is none (written out as JSON null)
            verse_text = verse.text

            # Each record carries book/chapter/verse as separate fields.
            # (A combined identifier used to be built here with "{[book_name]}",
            # which formatted a one-element list; it was never used and is removed.)
            verse_entry = {
                "book": book_name,
                "chapter": chapter_number,
                "verse": verse_number,
                "text": verse_text
            }
            output.append(verse_entry)

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII text unescaped
with open('data/septuaginta.jsonl', 'w', encoding='utf-8') as f:
    for entry in output:
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')

print("Finished processing the XML file.")


Finished processing the XML file.


In [5]:
import pandas as pd
# Load the verse table from a JSON Lines file (one JSON record per line).
# NOTE(review): the XML export cell in this notebook writes 'data/LXX.jsonl', not this
# path -- confirm that this file holds the same data.
df = pd.read_json('../data/LXX/LXX.json', lines=True)
df.head()

Unnamed: 0,book_num,chapter,verse,text
0,1,1,1,εν αρχη εποιησεν ο θεος τον ουρανον και...
1,1,1,2,η δε γη ην αορατος και ακατασκευαστος κ...
2,1,1,3,και ειπεν ο θεος γενηθητω φως και εγενε...
3,1,1,4,και ειδεν ο θεος το φως οτι καλον και ...
4,1,1,5,και εκαλεσεν ο θεος το φως ημεραν και ...


In [7]:
# 1-based position of each entry in NAMES -> its abbreviation.
# NAMES must already exist in the kernel; its defining cell appears further down.
num2name = dict(enumerate(NAMES.values(), start=1))
num2name

{1: 'Gen',
 2: 'Exo',
 3: 'Lev',
 4: 'Num',
 5: 'Deu',
 6: 'Jos',
 7: 'Jdg',
 8: 'Rut',
 9: '1Sa',
 10: '2Sa',
 11: '1Ki',
 12: '2Ki',
 13: '1Ch',
 14: '2Ch',
 15: 'Ezr',
 16: 'Neh',
 17: 'Est',
 18: 'Job',
 19: 'Psa',
 20: 'Pro',
 21: 'Ecc',
 22: 'Sng',
 23: 'Isa',
 24: 'Jer',
 25: 'Lam',
 26: 'Ezk',
 27: 'Dan',
 28: 'Hos',
 29: 'Jol',
 30: 'Amo',
 31: 'Oba',
 32: 'Jon',
 33: 'Mic',
 34: 'Nam',
 35: 'Hab',
 36: 'Zep',
 37: 'Hag',
 38: 'Zec',
 39: 'Mal'}

In [9]:
# Translate numeric book ids to abbreviations; an id missing from num2name raises KeyError
df['book'] = df['book_num'].map(lambda num: num2name[num])

# Build "<book>.<chapter>.<verse>" reference strings
chapter_str = df['chapter'].astype(str)
verse_str = df['verse'].astype(str)
df['eng_ref'] = df['book'] + "." + chapter_str + "." + verse_str
df.head()

Unnamed: 0,book_num,chapter,verse,text,book,heb_ref,eng_ref
0,1,1,1,εν αρχη εποιησεν ο θεος τον ουρανον και...,Gen,Gen.1.1,Gen.1.1
1,1,1,2,η δε γη ην αορατος και ακατασκευαστος κ...,Gen,Gen.1.2,Gen.1.2
2,1,1,3,και ειπεν ο θεος γενηθητω φως και εγενε...,Gen,Gen.1.3,Gen.1.3
3,1,1,4,και ειδεν ο θεος το φως οτι καλον και ...,Gen,Gen.1.4,Gen.1.4
4,1,1,5,και εκαλεσεν ο θεος το φως ημεραν και ...,Gen,Gen.1.5,Gen.1.5


In [12]:
# Export only the reference and text columns as JSON Lines (one record per line);
# force_ascii=False writes non-ASCII characters as-is instead of \u escapes.
df[['eng_ref', 'text']].to_json('../data/LXX/LXX_aligned.json', lines=True, orient="records", force_ascii=False)

In [61]:
# Abbreviation for each full book name; names that are not keys of NAMES become None
df['sbook'] = df['book'].map(NAMES.get)
# Keep only the rows whose book was found in NAMES
df = df.dropna(subset=['sbook'])
df['book'].unique()

array(['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
       'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
       '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
       'Esther', 'Job', 'Psalm', 'Proverbs', 'Ecclesiastes',
       'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel',
       'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah',
       'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi'],
      dtype=object)

In [62]:
# Build "<abbreviation>.<chapter>.<verse>" reference strings
chapter_str = df['chapter'].astype(str)
verse_str = df['verse'].astype(str)
df['heb_ref'] = df['sbook'] + "." + chapter_str + "." + verse_str
df.head()

Unnamed: 0,book,chapter,verse,text,sbook,heb_ref
0,Genesis,1,1,εν αρχη εποιησεν ο θεος τον ουρανον και την γην,Gen,Gen.1.1
1,Genesis,1,2,η δε γη ην αορατος και ακατασκευαστος και σκοτ...,Gen,Gen.1.2
2,Genesis,1,3,και ειπεν ο θεος γενηθητω φως και εγενετο φως,Gen,Gen.1.3
3,Genesis,1,4,και ειδεν ο θεος το φως οτι καλον και διεχωρισ...,Gen,Gen.1.4
4,Genesis,1,5,και εκαλεσεν ο θεος το φως ημεραν και το σκοτο...,Gen,Gen.1.5


In [17]:
# Load the aligned OT table (JSON Lines) that later cells compare references against.
heb = pd.read_json('../data/STEP/OT_aligned_sep.json', lines=True)
heb.head()

Unnamed: 0,heb_ref,eng_ref,line,half_a,half_b,translation,trans_a,trans_b
0,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃,in/ beginning he created God <obj.> the/ heave...,in/ beginning he created God <obj.>,the/ heavens and/ <obj.> the/ earth
1,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...,and/ the/ earth <it> was formlessness and/ emp...,and/ the/ earth <it> was formlessness and/ emp...,over [the] surface of [the] deep and/ [the] sp...
2,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃,and/ he said God let it be light and/ there wa...,and/ he said God let it,be light and/ there was light
3,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...,and/ he saw God <obj.> the/ light that [it was...,and/ he saw God <obj.> the/ light that [it,was] good and/ he separated God between the/ l...
4,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...,and/ he called God <to> the/ light day and/ <t...,and/ he called God <to> the/ light day and/ <t...,darkness he called night and/ there was evenin...


In [6]:
# Full book name -> three-character abbreviation, in canonical order.
# Insertion order matters: other cells derive book numbers from it.
NAMES = {
    'Genesis': 'Gen',
    'Exodus': 'Exo',
    'Leviticus': 'Lev',
    'Numbers': 'Num',
    'Deuteronomy': 'Deu',
    'Joshua': 'Jos',
    'Judges': 'Jdg',
    'Ruth': 'Rut',
    '1 Samuel': '1Sa',
    '2 Samuel': '2Sa',
    '1 Kings': '1Ki',
    '2 Kings': '2Ki',
    '1 Chronicles': '1Ch',
    '2 Chronicles': '2Ch',
    'Ezra': 'Ezr',
    'Nehemiah': 'Neh',
    'Esther': 'Est',
    'Job': 'Job',
    'Psalm': 'Psa',
    'Proverbs': 'Pro',
    'Ecclesiastes': 'Ecc',
    'Song of Solomon': 'Sng',
    'Isaiah': 'Isa',
    'Jeremiah': 'Jer',
    'Lamentations': 'Lam',
    'Ezekiel': 'Ezk',
    'Daniel': 'Dan',
    'Hosea': 'Hos',
    'Joel': 'Jol',
    'Amos': 'Amo',
    'Obadiah': 'Oba',
    'Jonah': 'Jon',
    'Micah': 'Mic',
    'Nahum': 'Nam',
    'Habakkuk': 'Hab',
    'Zephaniah': 'Zep',
    'Haggai': 'Hag',
    'Zechariah': 'Zec',
    'Malachi': 'Mal',
}

In [39]:
# Goal: load grb.tsv and check that every verse reference matches the Hebrew
# references in the STEP Bible OT; that would be a good verification.
df['heb_ref']

0          Gen.1.1
2          Gen.1.2
3          Gen.1.3
4          Gen.1.4
5          Gen.1.5
           ...    
30786     Dan.12.9
30787    Dan.12.10
30788    Dan.12.11
30789    Dan.12.12
30790    Dan.12.13
Name: heb_ref, Length: 23097, dtype: object

In [100]:
# Reference sets from each table: s1 from df['heb_ref'], s2 from heb['eng_ref']
s1, s2 = (set(column.values) for column in (df['heb_ref'], heb['eng_ref']))
# Number of references the two tables have in common
len(s1.intersection(s2))

23140

In [110]:
# Number of distinct values in heb['eng_ref'].
len(heb['eng_ref'].drop_duplicates())

23140

In [106]:
# The lengths differ because some Hebrew verses get split up into multiple English
# verses; there are only 23140 unique English verses.
len(df), len(heb)

(23145, 23213)

In [108]:
# References in s2 (from heb['eng_ref']) that are absent from s1 (from df['heb_ref']).
s2 - s1

set()

In [102]:
# References that appear in exactly one of the two sets.
diff = s1.symmetric_difference(s2)

In [105]:
# Show the mismatched references.
diff

{'1Ki.18.34', '1Ki.20.3', 'Isa.64.1', 'Neh.7.68', 'Psa.13.6'}

In [None]:
# Okay, there are only 5 verses that differ, and that's because of odd things going on with duplication.
# I can't be bothered to continue with this right now; I need to switch paths.
# This is really good enough. What now?
# So now I can translate them and be fairly well assured that they match up, which is good. Gosh darn, I also need the NT.

In [104]:
import numpy as np
# Book abbreviation of each mismatched reference ("Book.chapter.verse" -> "Book").
# split must actually be called: a bare `x.split` yields bound-method objects, not strings.
[x.split('.')[0] for x in s1.symmetric_difference(s2)]

['Isa', '1Ki', '1Ki', 'Neh', 'Psa']

In [72]:
# Join the two tables on their shared 'heb_ref' column (DataFrame.merge defaults to an inner join).
df.merge(heb, on='heb_ref')

Unnamed: 0,book_x,chapter,verse,text,sbook,heb_ref,eng_ref,line,half_a,half_b,translation,trans_a,trans_b,book_y
0,Genesis,1,1,εν αρχη εποιησεν ο θεος τον ουρανον και την γην,Gen,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃,in/ beginning he created God <obj.> the/ heave...,in/ beginning he created God <obj.>,the/ heavens and/ <obj.> the/ earth,Gen
1,Genesis,1,2,η δε γη ην αορατος και ακατασκευαστος και σκοτ...,Gen,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...,and/ the/ earth <it> was formlessness and/ emp...,and/ the/ earth <it> was formlessness and/ emp...,over [the] surface of [the] deep and/ [the] sp...,Gen
2,Genesis,1,3,και ειπεν ο θεος γενηθητω φως και εγενετο φως,Gen,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃,and/ he said God let it be light and/ there wa...,and/ he said God let it,be light and/ there was light,Gen
3,Genesis,1,4,και ειδεν ο θεος το φως οτι καλον και διεχωρισ...,Gen,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...,and/ he saw God <obj.> the/ light that [it was...,and/ he saw God <obj.> the/ light that [it,was] good and/ he separated God between the/ l...,Gen
4,Genesis,1,5,και εκαλεσεν ο θεος το φως ημεραν και το σκοτο...,Gen,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...,and/ he called God <to> the/ light day and/ <t...,and/ he called God <to> the/ light day and/ <t...,darkness he called night and/ there was evenin...,Gen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21909,Malachi,3,20,και ανατελει υμιν τοις φοβουμενοις το ονομα μο...,Mal,Mal.3.20,Mal.4.2,וְ זָרְחָ֨ה לָ כֶ֜ם יִרְאֵ֤י שְׁמִ י֙ שֶׁ֣מֶשׁ...,וְ זָרְחָ֨ה לָ כֶ֜ם יִרְאֵ֤י שְׁמִ י֙ שֶׁ֣מֶשׁ...,הָ וִֽ יצָאתֶ֥ם וּ פִשְׁתֶּ֖ם כְּ עֶגְלֵ֥י מַר...,and/ it will rise for/ you [those] fearing <of...,and/ it will rise for/ you [those] fearing <of...,and/ healing [will be] in/ wings/ its and/ you...,Mal
21910,Malachi,3,21,και καταπατησετε ανομους διοτι εσονται σποδος ...,Mal,Mal.3.21,Mal.4.3,וְ עַסּוֹתֶ֣ם רְשָׁעִ֔ים כִּֽי ־ יִהְי֣וּ אֵ֔פ...,וְ עַסּוֹתֶ֣ם רְשָׁעִ֔ים כִּֽי ־ יִהְי֣וּ אֵ֔פ...,בַּ יּוֹם֙ אֲשֶׁ֣ר אֲנִ֣י עֹשֶׂ֔ה אָמַ֖ר יְהוָ...,and/ you will tread down wicked [people] for t...,and/ you will tread down wicked [people] for t...,ash[es] under [the] soles of feet/ your on the...,Mal
21911,Malachi,3,22,και ιδου εγω αποστελλω υμιν ηλιαν τον θεσβιτην...,Mal,Mal.3.22,Mal.4.4,זִכְר֕וּ תּוֹרַ֖ת מֹשֶׁ֣ה עַבְדִּ֑ י אֲשֶׁר֩ צ...,זִכְר֕וּ תּוֹרַ֖ת מֹשֶׁ֣ה עַבְדִּ֑,י אֲשֶׁר֩ צִוִּ֨יתִי אוֹת֤ וֹ בְ חֹרֵב֙ עַל ־ ...,remember [the] instruction of Moses servant/ m...,remember [the] instruction of Moses servant/ m...,I commanded <obj.>/ him at/ Horeb on all,Mal
21912,Malachi,3,23,ος αποκαταστησει καρδιαν πατρος προς υιον και ...,Mal,Mal.3.23,Mal.4.5,הִנֵּ֤ה אָֽנֹכִי֙ שֹׁלֵ֣חַ לָ כֶ֔ם אֵ֖ת אֵלִיּ...,הִנֵּ֤ה אָֽנֹכִי֙ שֹׁלֵ֣חַ לָ כֶ֔ם אֵ֖ת אֵלִיּ...,לִ פְנֵ֗י בּ֚וֹא י֣וֹם יְהוָ֔ה הַ גָּד֖וֹל וְ ...,here! I [am] about to send to/ you <obj.> Elij...,here! I [am] about to send to/ you <obj.>,Elijah the/ prophet <to>/ before comes [the] d...,Mal


In [74]:
# Inspect the reference column.
df['heb_ref']

0         Gen.1.1
1         Gen.1.2
2         Gen.1.3
3         Gen.1.4
4         Gen.1.5
           ...   
23595    Mal.3.20
23596    Mal.3.21
23597    Mal.3.22
23598    Mal.3.23
23599    Mal.3.24
Name: heb_ref, Length: 23600, dtype: object

In [132]:
def parse_eng_data_file(file_path):
    """Parse a tab-separated verse file into a list of per-verse records.

    Every line starting with "# " is treated as a verse line. The content after
    that marker is split on tabs: the first field is the reference, and if the
    second field contains "[", that field is skipped as well. The remaining
    fields are joined with single spaces to form the text. The line immediately
    after a verse line must contain "#_Translation"; whatever follows that
    marker (tabs replaced by spaces, surrounding whitespace stripped) is the
    translation.

    When the reference field holds several whitespace-separated tokens, only
    the first token is kept as the reference.

    NOTE(review): the sample output in this notebook shows verse text ending
    mid-sentence (e.g. Mat.1.2). Only the "# " line itself is read here, so if
    a verse continues on later lines that content is not captured -- confirm
    against the data files.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with the keys 'eng_ref', 'text' and 'translation', in
        file order.

    Raises:
        IndexError: If a verse line has no tab-separated second field, is the
            last line of the file, or is not followed by a line containing
            "#_Translation".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    parsed_data = []
    for i, line in enumerate(lines):
        if not line.startswith("# "):
            continue

        # Drop only the leading marker. Splitting the whole line on "# " would
        # silently cut the verse off at any later "# " inside the text.
        parts = line[2:].split('\t')
        ref = parts[0]
        if '[' in parts[1]:
            # Shift by one so the bracketed field is excluded from the text below
            parts = parts[1:]
        text = " ".join(parts[1:]).strip()

        # Split once, for the same reason as above
        translation = lines[i + 1].split("#_Translation", 1)[1].strip().replace('\t', ' ')

        # "Mat.1.2 (1.3)"-style references keep only their first token
        ref_tokens = ref.split()
        eng_ref = ref_tokens[0] if len(ref_tokens) > 1 else ref

        parsed_data.append({'eng_ref': eng_ref,
                            'text': text,
                            'translation': translation,
                            })

    return parsed_data

In [135]:
# Parse both NT source files and concatenate their records in order
nt_paths = ['../data/STEP/Mat_Jhn.txt', '../data/STEP/Act_Rev.txt']
data = []
for p in nt_paths:
    data += parse_eng_data_file(p)

# Preview the first few records
data[:5]

[{'eng_ref': 'Mat.1.1',
  'text': 'Βίβλος  γενέσεως  Ἰησοῦ  Χριστοῦ  υἱοῦ  Δαυὶδ  υἱοῦ  Ἀβραάμ.',
  'translation': '[The] book of [the] genealogy of Jesus Christ son of David son of Abraham.'},
 {'eng_ref': 'Mat.1.2',
  'text': 'Ἀβραὰμ  ἐγέννησεν  τὸν  Ἰσαάκ·  Ἰσαὰκ  δὲ  ἐγέννησεν  τὸν  Ἰακώβ·  Ἰακὼβ',
  'translation': 'Abraham begat <the> Isaac; Isaac then begat <the> Jacob; Jacob'},
 {'eng_ref': 'Mat.1.3',
  'text': 'Ἰούδας  δὲ  ἐγέννησεν  τὸν  Φάρες  καὶ  τὸν  Ζάρα  ἐκ  τῆς',
  'translation': 'Judah then begat <the> Perez and <the> Zerah out of <the>'},
 {'eng_ref': 'Mat.1.4',
  'text': 'Ἀρὰμ  δὲ  ἐγέννησεν  τὸν  Ἀμιναδάβ·  Ἀμιναδὰβ  δὲ  ἐγέννησεν  τὸν  Ναασσών·',
  'translation': 'Ram then begat <the> Amminadab; Amminadab then begat <the> Nahshon;'},
 {'eng_ref': 'Mat.1.5',
  'text': 'Σαλμὼν  δὲ  ἐγέννησεν  τὸν  Βόες  ἐκ  τῆς  Ῥαχάβ·  Βόες  δὲ',
  'translation': 'Salmon then begat <the> Boaz out of <the> Rahab; Boaz then'}]

In [136]:
# Write the parsed records as JSON Lines: one JSON object per line, non-ASCII text unescaped
with open('../data/STEP/NT_aligned.json', 'w', encoding='utf-8') as out_f:
    out_f.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in data)