In [1]:
%%bash
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Gen-Deu%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt >> ../data/STEP/Gen_Deu.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Jos-Est%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt >> ../data/STEP/Jos_Est.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Job-Sng%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt >> ../data/STEP/Job_Sng.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Isa-Mal%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt >> ../data/STEP/Isa_Mal.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8489k  100 8489k    0     0  3469k      0  0:00:02  0:00:02 --:--:-- 3469k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.4M  100 11.4M    0     0  5488k      0  0:00:02  0:00:02 --:--:-- 5489k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4181k  100 4181k    0     0  6475k      0 --:--:-- --:--:-- --:--:-- 6473k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8435k  100 8435k    0     0  7655k      0  0:00:01  0:00:01 --:--:-- 7661k


In [2]:
prefix = '../data/STEP/'
paths = ['Gen_Deu.txt', 'Jos_Est.txt', 'Job_Sng.txt', 'Isa_Mal.txt']
paths = [prefix + p for p in paths]
import json
ATNACH = '\u0591'

In [3]:
# Function to parse the input data and produce the desired output
def get_half_lines(line):
    words = line.split()
    split = None
    for i, w in enumerate(words):
        if ATNACH in w:
            split = i
    if split != None:
        return ' '.join(words[:split+1]), ' '.join(words[split+1:])
    else:
        return line, ''


def parse_data_file(input_file, keep_sep=False, priority='Hebrew'):
    parsed_data = []
    current_ref = None
    current_text = []
    # Open the input file for reading
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

        # Iterate through each line in the file, skipping headers
        for line in lines[45:]:
            columns = line.strip().split('\t')
            if len(columns) < 6:
                continue  # Skip lines that do not have enough data

            heb_ref, eng_ref, pointed, accented, morphology, extended_strongs = columns

            # Extract the Hebrew and English references, ignoring the word number
            heb_verse = heb_ref.split('-')[0]
            eng_verse = eng_ref.split('-')[0]

            if not keep_sep:
                accented = accented.replace('/', '')
            else:
                accented = accented.replace('/', ' ')

            # # If we're still processing the same reference, keep appending words
            # as some verses break different places, this tells us whether we want Hebrew or English verse breaks to have priority
            if priority == "Hebrew":
                comp = heb_verse
            elif priority == "English":
                comp = eng_verse

            if current_ref == comp:
                current_text.append(accented)
            else:
                # If we encounter a new reference, store the previous one (if it exists)
                if current_ref is not None:
                    line = ' '.join(current_text)
                    half_a, half_b = get_half_lines(line)
                    parsed_data.append({
                        'heb_ref': current_ref,
                        'eng_ref': current_eng_ref,
                        'line': line,
                        'half_a': half_a,
                        'half_b': half_b
                    })
                # Start collecting data for the new reference
                current_ref = heb_verse
                current_eng_ref = eng_verse
                current_text = [accented]

        # Append the last collected reference
        if current_ref:
            line = ' '.join(current_text)
            half_a, half_b = get_half_lines(line)
            parsed_data.append({
                'heb_ref': heb_verse,
                'eng_ref': eng_verse,
                'line': line,
                'half_a': half_a,
                'half_b': half_b
            })
    return parsed_data

In [4]:
data = []
for p in paths:
    data.extend(parse_data_file(p, keep_sep=True, priority='Hebrew'))


In [5]:
data[:5]

[{'heb_ref': 'Gen.1.1',
  'eng_ref': 'Gen.1.1',
  'line': 'בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃',
  'half_a': 'בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים',
  'half_b': 'אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃'},
 {'heb_ref': 'Gen.1.2',
  'eng_ref': 'Gen.1.2',
  'line': 'וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖שֶׁךְ עַל ־ פְּנֵ֣י תְה֑וֹם וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י הַ מָּֽיִם ׃',
  'half_a': 'וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖שֶׁךְ עַל ־ פְּנֵ֣י תְה֑וֹם',
  'half_b': 'וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י הַ מָּֽיִם ׃'},
 {'heb_ref': 'Gen.1.3',
  'eng_ref': 'Gen.1.3',
  'line': 'וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־ אֽוֹר ׃',
  'half_a': 'וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר',
  'half_b': 'וַֽ יְהִי ־ אֽוֹר ׃'},
 {'heb_ref': 'Gen.1.4',
  'eng_ref': 'Gen.1.4',
  'line': 'וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥ין הַ חֹֽשֶׁךְ ׃',
  'half_a': 'וַ יַּ֧רְא אֱלֹהִ֛ים א

### Now adding the English translation

In [2]:
%%bash
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Gen-Deu%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Gen_Deu_eng.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Jos-Est%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Jos_Est_eng.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Job-Sng%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Job_Sng_eng.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Isa-Mal%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Isa_Mal_eng.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17.3M  100 17.3M    0     0  2150k      0  0:00:08  0:00:08 --:--:-- 3263k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 23.3M  100 23.3M    0     0  3526k      0  0:00:06  0:00:06 --:--:-- 4594k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9316k  100 9316k    0     0  3564k      0  0:00:02  0:00:02 --:--:-- 3565k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17.1M  100 17.1M    0     0  5470k      0  0:00:03  0:00:03 --:--:-- 5469k


In [6]:
import pandas as pd

def parse_eng_data_file(file_path):
    parsed_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        curr_ref = None
        for i, line in enumerate(lines):
            if line.startswith("# "):
                ref = line.split("\t")[0].split("# ")[1].strip().rstrip()
                translation = lines[i+1].split("#_Translation")[1].strip().rstrip().replace('\t', ' ')
                if len(ref.split()) >1:
                    # formatted as Gen.31.55 (Heb: 32.1)
                    eng_ref = ref.split()[0]
                    heb_ref = eng_ref.split('.')[0] + '.'+ ref.split('Heb: ')[1].replace(')', '')
                    #heb_ref = eng_ref.split('.')[0] + '.' + ref.split()[-1].replace('(', '').replace(")", '')
                    
                else:
                    eng_ref = ref
                    heb_ref = eng_ref
                

                curr_ref = eng_ref
                parsed_data.append({'eng_ref': eng_ref,
                                    'heb_ref': heb_ref,
                                    'translation': translation,
                                    })         
            elif line.startswith(f"#_{curr_ref}"):
                # if there is a subsequent line with the rest of the info
                extra_translation  = lines[i+1].split("#_Translation")[1].strip().rstrip().replace('\t', ' ')
                # grab the last entry in parsed_data and add in the translation
                parsed_data[-1]['translation'] += " " + extra_translation  

    return parsed_data

eng_data = []
prefix = '../data/STEP/'
paths = ['Gen_Deu_eng.txt','Jos_Est_eng.txt', 'Job_Sng_eng.txt', 'Isa_Mal_eng.txt']
for p in [prefix+pth for pth in paths]:
    eng_data.extend(parse_eng_data_file(p))

eng_data[:5]



[{'eng_ref': 'Gen.1.1',
  'heb_ref': 'Gen.1.1',
  'translation': 'in/ beginning he created God <obj.> the/ heavens and/ <obj.> the/ earth'},
 {'eng_ref': 'Gen.1.2',
  'heb_ref': 'Gen.1.2',
  'translation': 'and/ the/ earth <it> was formlessness and/ emptiness and/ darkness [was] over [the] surface of [the] deep and/ [the] spirit of God [was] hovering over [the] surface of the/ waters'},
 {'eng_ref': 'Gen.1.3',
  'heb_ref': 'Gen.1.3',
  'translation': 'and/ he said God let it be light and/ there was light'},
 {'eng_ref': 'Gen.1.4',
  'heb_ref': 'Gen.1.4',
  'translation': 'and/ he saw God <obj.> the/ light that [it was] good and/ he separated God between the/ light and/ between the/ darkness'},
 {'eng_ref': 'Gen.1.5',
  'heb_ref': 'Gen.1.5',
  'translation': 'and/ he called God <to> the/ light day and/ <to> the/ darkness he called night and/ there was evening and/ there was morning day one'}]

In [7]:
# let's make it into a dataframe, merge on eng_ref, and then hopefully we have a perfectly aligned dataframe
import pandas as pd

heb = pd.DataFrame.from_records(data)
heb.head()

Unnamed: 0,heb_ref,eng_ref,line,half_a,half_b
0,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃
1,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...
2,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃
3,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...
4,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...


In [8]:
eng = pd.DataFrame.from_records(eng_data)
eng.head()

Unnamed: 0,eng_ref,heb_ref,translation
0,Gen.1.1,Gen.1.1,in/ beginning he created God <obj.> the/ heave...
1,Gen.1.2,Gen.1.2,and/ the/ earth <it> was formlessness and/ emp...
2,Gen.1.3,Gen.1.3,and/ he said God let it be light and/ there wa...
3,Gen.1.4,Gen.1.4,and/ he saw God <obj.> the/ light that [it was...
4,Gen.1.5,Gen.1.5,and/ he called God <to> the/ light day and/ <t...


In [9]:
all_OT = pd.merge(heb, eng, on='eng_ref')
# this creates heb_ref_x and heb_ref_y. we want to make certain that these are the same. If so
all_OT.head()

Unnamed: 0,heb_ref_x,eng_ref,line,half_a,half_b,heb_ref_y,translation
0,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃,Gen.1.1,in/ beginning he created God <obj.> the/ heave...
1,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...,Gen.1.2,and/ the/ earth <it> was formlessness and/ emp...
2,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃,Gen.1.3,and/ he said God let it be light and/ there wa...
3,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...,Gen.1.4,and/ he saw God <obj.> the/ light that [it was...
4,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...,Gen.1.5,and/ he called God <to> the/ light day and/ <t...


In [10]:
all_OT[all_OT['eng_ref']==None]

Unnamed: 0,heb_ref_x,eng_ref,line,half_a,half_b,heb_ref_y,translation


In [11]:
all_OT[all_OT['heb_ref_x'] != all_OT['heb_ref_y']].to_csv('non_matching.csv')
# I see, so there are some cases the verse in English divides the verse differently. We do have exactly the words that belong to each verse in the non-Eng files, but I don't think we need it.
# I think we should, however, keep the more detailed column that includes ranges

In [12]:
all_OT = all_OT.rename(columns = {'heb_ref_x': 'heb_ref'})
all_OT = all_OT.drop(columns =['heb_ref_y'])
all_OT= all_OT[['eng_ref', 'heb_ref', 'line', 'translation', 'half_a', 'half_b']]
all_OT.head()

Unnamed: 0,eng_ref,heb_ref,line,translation,half_a,half_b
0,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,in/ beginning he created God <obj.> the/ heave...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃
1,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,and/ the/ earth <it> was formlessness and/ emp...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...
2,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,and/ he said God let it be light and/ there wa...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃
3,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,and/ he saw God <obj.> the/ light that [it was...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...
4,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,and/ he called God <to> the/ light day and/ <t...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...


In [13]:
half_a = []
half_b = []
for i, row in all_OT.iterrows():
    #print(row)
    if row['half_b'] != "":
        words = row['translation'].split()
        half_a.append(" ".join(words[:len(words)//2]))
        half_b.append(" ".join(words[len(words)//2:]))
    else:
        half_a.append(row['translation'])
        half_b.append("")

all_OT['trans_a'] = half_a
all_OT['trans_b'] = half_b

all_OT.head()


Unnamed: 0,eng_ref,heb_ref,line,translation,half_a,half_b,trans_a,trans_b
0,Gen.1.1,Gen.1.1,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ...,in/ beginning he created God <obj.> the/ heave...,בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים,אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃,in/ beginning he created God <obj.>,the/ heavens and/ <obj.> the/ earth
1,Gen.1.2,Gen.1.2,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,and/ the/ earth <it> was formlessness and/ emp...,וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖...,וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י ה...,and/ the/ earth <it> was formlessness and/ emp...,[the] deep and/ [the] spirit of God [was] hove...
2,Gen.1.3,Gen.1.3,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽ יְהִי ־...,and/ he said God let it be light and/ there wa...,וַ יֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר,וַֽ יְהִי ־ אֽוֹר ׃,and/ he said God let it,be light and/ there was light
3,Gen.1.4,Gen.1.4,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑ו...,and/ he saw God <obj.> the/ light that [it was...,וַ יַּ֧רְא אֱלֹהִ֛ים אֶת ־ הָ א֖וֹר כִּי ־ ט֑וֹב,וַ יַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָ א֖וֹר וּ בֵ֥...,and/ he saw God <obj.> the/ light that [it was...,and/ he separated God between the/ light and/ ...
4,Gen.1.5,Gen.1.5,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,and/ he called God <to> the/ light day and/ <t...,וַ יִּקְרָ֨א אֱלֹהִ֤ים ׀ לָ אוֹר֙ י֔וֹם וְ לַ ...,וַֽ יְהִי ־ עֶ֥רֶב וַֽ יְהִי ־ בֹ֖קֶר י֥וֹם אֶ...,and/ he called God <to> the/ light day and/ <t...,he called night and/ there was evening and/ th...


In [14]:
all_data = all_OT.to_dict('records')


In [15]:
# Write the output as a JSON file
with open('../data/STEP/OT_aligned_sep.json', 'w', encoding='utf-8') as out_f:
    for line in all_data:
        json.dump(line, out_f, ensure_ascii=False)
        out_f.write('\n')


In [42]:
import json
with open('../data/STEP/OT_aligned_sep.json', 'rt', encoding='utf-8') as ifd:
    data = []
    for line in ifd:
        data.append(json.loads(line))

data[:5]


[{'eng_ref': 'Gen.1.1',
  'heb_ref': 'Gen.1.1',
  'line': 'בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃',
  'translation': 'in/ beginning he created God <obj.> the/ heavens and/ <obj.> the/ earth',
  'half_a': 'בְּ רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים',
  'half_b': 'אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ ׃',
  'trans_a': 'in/ beginning he created God <obj.>',
  'trans_b': 'the/ heavens and/ <obj.> the/ earth'},
 {'eng_ref': 'Gen.1.2',
  'heb_ref': 'Gen.1.2',
  'line': 'וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖שֶׁךְ עַל ־ פְּנֵ֣י תְה֑וֹם וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י הַ מָּֽיִם ׃',
  'translation': 'and/ the/ earth <it> was formlessness and/ emptiness and/ darkness [was] over [the] surface of [the] deep and/ [the] spirit of God [was] hovering over [the] surface of the/ waters',
  'half_a': 'וְ הָ אָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָ בֹ֔הוּ וְ חֹ֖שֶׁךְ עַל ־ פְּנֵ֣י תְה֑וֹם',
  'half_b': 'וְ ר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל ־ פְּנֵ֥י הַ מָּֽיִם ׃',
  'trans_a': 'a

okay what do I want to do? I want to re-run two methods of chiasm scoring, both feature-based and neural embedding space
as well as line-level and half-line level. We want to do this with books as separate trials. Also, we should probably base our baseline
on n continuous lines within that book for a fair trial 
* half-line level
* line level
* paragraph level
#### let's start with embedding it with E5.
First want to check that how embedding models deals with Hebrew accents. check the embedding is the same with and without diacritics




In [None]:
! pip install sentence_transformers -qqq

In [44]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/hopespeirs/Documents/projects/intertextuality_rewrite/literary-translation/.venv/lib/python3.10/site-pack

ImportError: initialization failed

In [8]:
model = SentenceTransformer('intfloat/multilingual-e5-small')

In [159]:

# okay, let's sort data into books, what structure to use? Dict. As always
books = {}
for line in all_data:
    book = line['heb_ref'].split('.')[0]
    if book not in books:
        books[book] = [line]
    else:
        books[book].append(line)


In [102]:
import re
def remove_cantillation_marks(text):
    # Define the regex pattern
    # U+05C3 is a sof pasuq, we want to keep that.
    # pattern = r'[\u0591-\u05AF\u05C0-\u05C2\u05C4-\u05C7\u05F3\u05F4]'
    # actually remove sof pasuq
    pattern = r'[\u0591-\u05AF\u05C0-\u05C7\u05F3\u05F4]'

    # Use the sub() function to replace matches with an empty string
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text
def remove_nikkud(text):
    # Define the regex pattern
    return re.sub(r'[\u0591-\u05C7]', '', text)


In [104]:

line = all_data[0]['line']
cleaned_line = remove_cantillation_marks(line)
modern_cleaned_line = remove_nikkud(line)
print(line)
print(cleaned_line)
print(modern_cleaned_line)

enc1 = model.encode(line)
enc2 = model.encode(modern_cleaned_line)

# encoding changes based on nikkud
import numpy as np
np.dot(enc1, enc2) # if they were the same embedding, dot product would be 1

בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃
בְּרֵאשִית בָּרָא אֱלֹהִים אֵת הַשָּמַיִם וְאֵת הָאָֽרֶץ
בראשית ברא אלהים את השמים ואת הארץ


0.8897082

In [106]:
enc1[:10], enc2[:10]

(array([ 0.07546929, -0.00655045, -0.06118112, -0.02635499,  0.08451455,
         0.02568321,  0.00330316,  0.0366122 ,  0.00541422,  0.03100647],
       dtype=float32),
 array([ 0.04332992,  0.03696194, -0.02795921, -0.04710515,  0.11382207,
         0.04877399,  0.00981134,  0.03429966,  0.01273974,  0.01509028],
       dtype=float32))

In [126]:
books.keys()

dict_keys(['Gen', 'Exo', 'Lev', 'Num', 'Deu', 'Jos', 'Jdg', 'Rut', '1Sa', '2Sa', '1Ki', '2Ki', '1Ch', '2Ch', 'Ezr', 'Neh', 'Est', 'Job', 'Psa', 'Pro', 'Ecc', 'Sng', 'Isa', 'Jer', 'Lam', 'Ezk', 'Dan', 'Hos', 'Jol', 'Amo', 'Oba', 'Jon', 'Mic', 'Nam', 'Hab', 'Zep', 'Hag', 'Zec', 'Mal'])

In [127]:
verses = books['Psa']
import pandas as pd

df = pd.DataFrame.from_records(verses)
df['line'] = df['line'].apply(lambda x: remove_nikkud(x))
df.head()

Unnamed: 0,heb_ref,eng_ref,line,half_a,half_b,translation
0,Psa.1.1,Psa.1.1,אשרי האיש אשר לא הלך בעצת רשעים ובדרך חטאים לא...,אַ֥שְֽׁרֵי־ הָאִ֗ישׁ אֲשֶׁ֤ר׀ לֹ֥א הָלַךְ֮ בַּ...,וּבְמוֹשַׁ֥ב לֵ֝צִ֗ים לֹ֣א יָשָֽׁב׃,how blessed! [is] the/ person who not he walks...
1,Psa.1.2,Psa.1.2,כי אם בתורת יהוה חפצו ובתורתו יהגה יומם ולילה,כִּ֤י אִ֥ם בְּתוֹרַ֥ת יְהוָ֗ה חֶ֫פְצ֥וֹ וּֽבְת...,,that except [is] in/ [the] law of Yahweh delig...
2,Psa.1.3,Psa.1.3,והיה כעץ שתול על פלגי מים אשר פריו יתן בעתו וע...,וְֽהָיָ֗ה כְּעֵץ֮ שָׁת֪וּל עַֽל־ פַּלְגֵ֫י מָ֥...,וְכֹ֖ל אֲשֶׁר־ יַעֲשֶׂ֣ה יַצְלִֽיחַ׃,and/ he is like/ a tree planted at streams of ...
3,Psa.1.4,Psa.1.4,לא כן הרשעים כי אם כמץ אשר תדפנו רוח,לֹא־ כֵ֥ן הָרְשָׁעִ֑ים,כִּ֥י אִם־ כַּ֝מֹּ֗ץ אֲֽשֶׁר־ תִּדְּפֶ֥נּוּ רֽ...,not [are] so the/ wicked [people] that except ...
4,Psa.1.5,Psa.1.5,על כן לא יקמו רשעים במשפט וחטאים בעדת צדיקים,עַל־ כֵּ֤ן׀ לֹא־ יָקֻ֣מוּ רְ֭שָׁעִים בַּמִּשְׁ...,וְ֝חַטָּאִ֗ים בַּעֲדַ֥ת צַדִּיקִֽים׃,there- -fore not they will stand wicked [peopl...


In [128]:
encs = model.encode(df['line'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches: 100%|██████████| 79/79 [01:30<00:00,  1.14s/it]


In [129]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(encs,encs)

In [135]:
import numpy as np

def get_chiasm_score(cos_sim, i, n):

    # the basic chiasm score is the sum of the reversed diagonal elements of the cosine similarity matrix
    # then we add a penalty for high similarity scores between different levels. 
    chiasm = cos_sim[i:i+n, i:i+n]
    # now reverse the diagonal
    chiasm = np.fliplr(chiasm)
    score = chiasm.trace()
    # if it's odd, subtract the middle value -- it's a self-similarity score, always 1
    if n % 2 == 1:
        score -= chiasm[n//2, n//2]
    # should normalize the score to the number of lines in the chiasm. 
    # if the chiasm is even, we divide by n, if it is odd, we divide by n-1, this is to avoid penalizing odd chiasmi
    # this is the average of lines that should be similar
    if n%2 == 0:
        div = n
    else:
        div = n-1
    
    neg_score = np.sum(chiasm[0, 1:-1]) + np.sum(chiasm[-1, 1:-1])
    # need to normalize this to n
    score = score/div - neg_score/div
    return score

In [136]:
# just as a quick check, get the chiasm score and then sort by that and look at what's up
# ugh we also want to get the corresponding english translation easily. hold on. Done
scores = {}
for i in range(len(encs)): 
    scores[i] = get_chiasm_score(cos_sim, i=i, n=4)
scores = pd.Series(scores)
scores.sort_values(ascending=False).head(15)
 

2525    0.447967
2526    0.250000
2524    0.237559
190     0.062713
2317    0.049571
1058    0.043664
2277    0.043040
1512    0.042806
2000    0.040585
1476    0.036888
1334    0.036461
1826    0.036142
1593    0.035952
1618    0.034992
980     0.034703
dtype: float64

In [137]:
for idx in scores.sort_values(ascending=False).head().index:
    print(verses[idx])
    text = [v['translation'] for v in verses[idx:idx+4]]
    for i, line in enumerate(text):
        space = "\t"*i
        print(space + line)
    print("\n")

{'heb_ref': 'Psa.150.5', 'eng_ref': 'Psa.150.5', 'line': 'הַֽלְל֥וּהוּ בְצִלְצְלֵי־ שָׁ֑מַע הַֽ֝לְל֗וּהוּ בְּֽצִלְצְלֵ֥י תְרוּעָֽה׃', 'half_a': 'הַֽלְל֥וּהוּ בְצִלְצְלֵי־ שָׁ֑מַע', 'half_b': 'הַֽ֝לְל֗וּהוּ בְּֽצִלְצְלֵ֥י תְרוּעָֽה׃', 'translation': 'praise/ him with/ cymbals of sound praise/ him with/ cymbals of shouting'}
praise/ him with/ cymbals of sound praise/ him with/ cymbals of shouting
	every <the>/ breathing thing let it praise Yahweh praise Yahweh


{'heb_ref': 'Psa.150.6', 'eng_ref': 'Psa.150.6', 'line': 'כֹּ֣ל הַ֭נְּשָׁמָה תְּהַלֵּ֥ל יָ֗הּ הַֽלְלוּ־ יָֽהּ׃', 'half_a': 'כֹּ֣ל הַ֭נְּשָׁמָה תְּהַלֵּ֥ל יָ֗הּ הַֽלְלוּ־ יָֽהּ׃', 'half_b': '', 'translation': 'every <the>/ breathing thing let it praise Yahweh praise Yahweh'}
every <the>/ breathing thing let it praise Yahweh praise Yahweh


{'heb_ref': 'Psa.150.4', 'eng_ref': 'Psa.150.4', 'line': 'הַֽ֭לְלוּהוּ בְתֹ֣ף וּמָח֑וֹל הַֽ֝לְל֗וּהוּ בְּמִנִּ֥ים וְעוּגָֽב׃', 'half_a': 'הַֽ֭לְלוּהוּ בְתֹ֣ף וּמָח֑וֹל', 'half_b': 'הַֽ֝לְל֗וּהוּ

In [234]:
from tqdm import tqdm
n_list = [3,4,5,6,7]

np.random.seed(42)
random_starts = np.random.randint(0, cos_sim.shape[0]-max(n_list)+1, 1000)

p_values = {}
candidate_scores = {}
for n in n_list:
    p_values[n] = []
    candidate_scores[n] = []
    scores = []
    for i in random_starts:
        scores.append(get_chiasm_score(cos_sim, i, n))

    for i in tqdm(range(0, cos_sim.shape[0]-n+1)):
        # get the score of that chiasmus of size n
        candidate= get_chiasm_score(cos_sim, i, n)  
        candidate_scores[n].append(candidate)

        # calculate p-value
        # adding 1 to the numerator and denominator to avoid division by zero (laplace smoothing)
        p = (sum([1 for s in scores if s >= candidate]) + 1) / (len(scores)+1)
  
        p_values[n].append(p)

100%|██████████| 1068/1068 [00:00<00:00, 8818.87it/s]
100%|██████████| 1067/1067 [00:00<00:00, 9352.68it/s]
100%|██████████| 1066/1066 [00:00<00:00, 9449.43it/s]
100%|██████████| 1065/1065 [00:00<00:00, 9549.69it/s]
100%|██████████| 1064/1064 [00:00<00:00, 9353.02it/s]


[0.4955044955044955, 0.36663336663336665, 0.1068931068931069]

Here's the good news: you've found this before it's been published. It does need reworking. Let's try a feature-based approach.
How would I do that? Just a word n-gram based approach and then look at the overlap of sets in the opposing lines?
The good news also is that once you have feature vectors, the same process will hold. Can use similarity functions, but maybe jaccard score instead of

In [216]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=lambda x: x.split() + ["".join(y) for y in zip(x,x[1:])] + ["".join(y) for y in zip(x,x[1:],x[2:])], ngram_range=(1, 3))
word_feats = vectorizer.fit_transform(df['line'])

In [217]:
pd.DataFrame(word_feats.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,אאמין,אאמין כי,אאמין כי יאזין,אאמצכם,אאמצכם במו,אאמצכם במו פי,אאריך,אאריך נפשי,אב,אב או,...,תתענג,תתענג ותשא,תתענג ותשא אל,תתפלא,תתפלא בי,תתקפהו,תתקפהו כמלך,תתקפהו כמלך עתיד,תתקפהו לנצח,תתקפהו לנצח ויהלך
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1068,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [228]:
scores = {}
for i in range(len(encs)): 
    scores[i] = get_chiasm_score(cosine_similarity(word_feats, word_feats), i=i, n=4)
scores = pd.Series(scores)
scores.sort_values(ascending=False).head(15)
for idx in scores.sort_values(ascending=False).head(15).index:
    print(verses[idx])

{'heb_ref': 'Job.42.15', 'eng_ref': 'Job.42.15', 'line': 'וְלֹ֨א נִמְצָ֜א נָשִׁ֥ים יָפ֛וֹת כִּבְנ֥וֹת אִיּ֖וֹב בְּכָל־ הָאָ֑רֶץ וַיִּתֵּ֨ן לָהֶ֧ם אֲבִיהֶ֛ם נַחֲלָ֖ה בְּת֥וֹךְ אֲחֵיהֶֽם׃ס', 'half_a': 'וְלֹ֨א נִמְצָ֜א נָשִׁ֥ים יָפ֛וֹת כִּבְנ֥וֹת אִיּ֖וֹב בְּכָל־ הָאָ֑רֶץ', 'half_b': 'וַיִּתֵּ֨ן לָהֶ֧ם אֲבִיהֶ֛ם נַחֲלָ֖ה בְּת֥וֹךְ אֲחֵיהֶֽם׃ס'}
{'heb_ref': 'Job.42.17', 'eng_ref': 'Job.42.17', 'line': 'וַיָּ֣מָת אִיּ֔וֹב זָקֵ֖ן וּשְׂבַ֥ע יָמִֽים׃', 'half_a': 'וַיָּ֣מָת אִיּ֔וֹב זָקֵ֖ן וּשְׂבַ֥ע יָמִֽים׃', 'half_b': ''}
{'heb_ref': 'Job.1.16', 'eng_ref': 'Job.1.16', 'line': 'ע֣וֹד׀ זֶ֣ה מְדַבֵּ֗ר וְזֶה֮ בָּ֣א וַיֹּאמַר֒ אֵ֣שׁ אֱלֹהִ֗ים נָֽפְלָה֙ מִן־ הַשָּׁמַ֔יִם וַתִּבְעַ֥ר בַּצֹּ֛אן וּבַנְּעָרִ֖ים וַתֹּאכְלֵ֑ם וָאִמָּ֨לְטָ֧ה רַק־ אֲנִ֛י לְבַדִּ֖י לְהַגִּ֥יד לָֽךְ׃', 'half_a': 'ע֣וֹד׀ זֶ֣ה מְדַבֵּ֗ר וְזֶה֮ בָּ֣א וַיֹּאמַר֒ אֵ֣שׁ אֱלֹהִ֗ים נָֽפְלָה֙ מִן־ הַשָּׁמַ֔יִם וַתִּבְעַ֥ר בַּצֹּ֛אן וּבַנְּעָרִ֖ים וַתֹּאכְלֵ֑ם', 'half_b': 'וָאִמָּ֨לְטָ֧ה רַק־ אֲנִ֛י לְבַדִּ֖י לְהַגִּ֥יד לָֽךְ׃'}
{'heb

In [None]:
# so, okay, let's think about this. 
# previously, we've thought about semantic similarity as a proxy for chiasm. That might not capture wordplay and things like that
# which lexical and phonetic features would.
# we want, over the course of a set of half-lines, line, paragraphs, or maybe chapters 
# I suppose a completely different way of going about it is to build a model that maximises n-gram counts
# e.g. finding the divisions rather assuming divisions are the set lines or half-lines. 