In [54]:
from tf.app import use
# Load the 'etcbc/bhsa' app and keep short handles to the feature (F) and
# locality (L) parts of its API for the cells below.
A = use('etcbc/bhsa')
F, L = A.TF.api.F, A.TF.api.L

This is Text-Fabric 9.3.2
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

122 features found and 0 ignored


# Generating vocab lists for Bible passages

I often want to generate vocabulary lists for specific passages. Text-Fabric should be good at this sort of thing but I haven't done it before. So that's what this notebook is about.

## Lexemes in Exodus (Python)

In [55]:
# Collect the distinct `lex` values of every word in the book of Exodus.
# Thanks to Martijn Naaijer for this solution.
for book_node in F.otype.s('book'):
    if F.book.v(book_node) != 'Exodus':
        continue
    lexemes = set(F.lex.v(word) for word in L.d(book_node, 'word'))

print(len(lexemes))

1425


## Lexemes in Exodus 1 (Python)
### Collect unique values of feature `lex_utf8`

In [56]:
# Variant restricted to a single chapter: list the `lex_utf8` values of the
# words in Exodus 1, each value once.
for chapter_node in F.otype.s('chapter'):
    if F.book.v(chapter_node) != 'Exodus':
        continue
    if F.chapter.v(chapter_node) != 1:
        continue
    # A list (rather than a set) keeps the order of appearance, which
    # sometimes makes life easier when preparing material for class.
    lexemes = [F.lex_utf8.v(word) for word in L.d(chapter_node, 'word')]

# Drop repeats; dict keys keep the position of the first occurrence.
lexemes = list(dict.fromkeys(lexemes))

print(len(lexemes))
print(lexemes)

119
['ו', 'אלה', 'שׁם', 'בן', 'ישׂראל', 'ה', 'בוא', 'מצרים', 'את', 'יעקב', 'אישׁ', 'בית', 'ראובן', 'שׁמעון', 'לוי', 'יהודה', 'ישׂשׂכר', 'זבולן', 'בנימן', 'דן', 'נפתלי', 'גד', 'אשׁר', 'היה', 'כל', 'נפשׁ', 'יצא', 'ירך', 'שׁבע', 'יוסף', 'ב', 'מות', 'אח', 'דור', 'הוא', 'פרה', 'שׁרץ', 'רבה', 'עצם', 'מאד', 'מלא', 'ארץ', 'קום', 'מלך', 'חדשׁ', 'על', 'לא', 'ידע', 'אמר', 'אל', 'עם', 'הנה', 'רב', 'עצום', 'מן', 'יהב', 'חכם', 'ל', 'פן', 'כי', 'קרא', 'מלחמה', 'יסף', 'גם', 'שׂנא', 'לחם', 'עלה', 'שׂים', 'שׂר', 'מס', 'למען', 'ענה', 'סבלות', 'בנה', 'עיר', 'מסכנות', 'פרעה', 'פתם', 'רעמסס', 'כ', 'כן', 'פרץ', 'קוץ', 'פנה', 'עבד', 'פרך', 'מרר', 'חיים', 'עבדה', 'קשׁה', 'חמר', 'לבנה', 'שׂדה', 'ילד', 'עברי', 'אחד', 'שׁפרה', 'שׁני', 'פועה', 'ראה', 'אבן', 'אם', 'בת', 'היא', 'חיה', 'ירא', 'אלהים', 'עשׂה', 'דבר', 'מדוע', 'זה', 'אשׁה', 'מצרי', 'טרם', 'יטב', 'צוה', 'ילוד', 'יאר', 'שׁלך']


**Note:** Actually, this is not quite right. It returns 119 results, whereas there are 126 lexemes according to the TF Query below. The difference is probably due to different lexemes having identical `lex_utf8` features.

### Collect unique values of feature `lex`
Removing duplicates based on `lex` rather than `lex_utf8` resolves the issue:

In [57]:
# List the `lex` values of the words in Exodus 1, each value once and in
# order of first appearance.
for chapter_node in F.otype.s('chapter'):
    if F.book.v(chapter_node) != 'Exodus' or F.chapter.v(chapter_node) != 1:
        continue
    # Build a list, not a set, so the order of the words is preserved
    # (handy when preparing material for class).
    lexemes = [F.lex.v(word) for word in L.d(chapter_node, 'word')]

# De-duplicate while keeping insertion order.
lexemes = list(dict.fromkeys(lexemes))

print(len(lexemes))
print(lexemes)

126
['W', '>LH', 'CM/', 'BN/', 'JFR>L/', 'H', 'BW>[', 'MYRJM/', '>T==', 'J<QB/', '>JC/', 'BJT/', 'R>WBN/', 'CM<WN/', 'LWJ=/', 'JHWDH/', 'JFFKR/', 'ZBWLN/', 'BNJMN/', 'DN/', 'NPTLJ/', 'GD==/', '>CR==/', 'HJH[', 'KL/', 'NPC/', 'JY>[', 'JRK/', 'CB</', 'JWSP/', 'B', 'MWT[', '>X/', 'DWR/', 'HW>', 'PRH[', 'CRY[', 'RBH[', '<YM[', 'M>D/', 'ML>[', '>RY/', '>T', 'QWM[', 'MLK/', 'XDC/', '<L', '>CR', 'L>', 'JD<[', '>MR[', '>L', '<M/', 'HNH', 'RB/', '<YWM/', 'MN', 'JHB[', 'XKM[', 'L', 'PN', 'KJ', 'QR>=[', 'MLXMH/', 'JSP[', 'GM', 'FN>[', 'LXM[', '<LH[', 'FJM[', 'FR/', 'MS/', 'LM<N', '<NH=[', 'SBLWT/', 'BNH[', '<JR/', 'MSKNWT/', 'PR<H/', 'PTM/', 'R<MSS/', 'K', 'KN', 'PRY[', 'QWY[', 'PNH/', '<BD[', 'PRK/', 'MRR[', 'XJJM/', '<BDH/', 'QCH/', 'XMR/', 'LBNH/', 'FDH/', 'JLD[', '<BRJ/', '>XD/', 'CPRH=/', 'CNJ/', 'PW<H/', 'R>H[', '>BN=/', '>M', 'BT/', 'HJ>', 'XJH[', 'JR>[', '>LHJM/', '<FH[', 'DBR[', 'JLD/', 'QR>[', 'MDW<', 'DBR/', 'ZH', '>CH/', 'MYRJ/', 'XJH=/', 'HNH=', 'VRM/', 'JVB[', 'YWH[', 'JLWD/', 'J>R=

## Lexemes in Exodus (TF Query)

The following code blocks search for the same lexemes using TF queries instead of Python code.

In [58]:
# Thanks to Dirk Roorda for this solution.
# Search for `lex` nodes that satisfy the `/with/ ... /-/` quantifier: the book
# Exodus must contain a word `w` that stands in the `]]` relation to the lex
# node (`..`). NOTE(review): `]]` is presumably TF's "embedded in" relation --
# confirm against the search template docs.
# Only the first 10 results are passed to `A.table`.
A.table(A.search('''
lex
/with/
book book=Exodus
  w:word
  % Note that .. refers to the parent node of the qunatifier, which is the lex node.
  w ]] ..
/-/
''')[0:10])

  0.54s 1425 results


n,p,lex
1,בְּ,
2,רֵאשִׁית,
3,ברא,
4,אֱלֹהִים,
5,אֵת,
6,הַ,
7,שָׁמַיִם,
8,וְ,
9,אֶרֶץ,
10,היה,


In [59]:
# Thanks to Dirk Roorda for this solution.
# Alternative formulation of the Exodus lexeme search: the quantifier first
# names a word `w` (indented below the lex node) and then requires that same
# `w` to occur inside the book Exodus.
# NOTE(review): the leading indentation of `w:word` presumably places the word
# inside the parent lex node -- confirm against the TF quantifier docs.
# Only the first 10 results are passed to `A.table`.
A.table(A.search('''
lex
/with/
  w:word
book book=Exodus
  w
/-/
''')[0:10])

  0.69s 1425 results


n,p,lex
1,בְּ,
2,רֵאשִׁית,
3,ברא,
4,אֱלֹהִים,
5,אֵת,
6,הַ,
7,שָׁמַיִם,
8,וְ,
9,אֶרֶץ,
10,היה,


## Lexemes in Exodus 1 (TF Query)
Limiting the above query to Exodus 1.

In [60]:
# Quantified lexeme search narrowed to chapter 1 of Exodus: the word `w` must
# occur inside a chapter with chapter=1 inside the book Exodus.
# Only the first 20 result tuples are kept; they are shown as a table and also
# printed raw.
nodes = A.search('''
lex
/with/
  w:word
book book=Exodus
  chapter chapter=1
    w
/-/
''')[0:20]
A.table(nodes)
print(nodes)

  0.49s 126 results


n,p,lex
1,בְּ,
2,אֱלֹהִים,
3,אֵת,
4,הַ,
5,וְ,
6,אֶרֶץ,
7,היה,
8,עַל,
9,פָּנֶה,
10,אמר,


[(1437602,), (1437605,), (1437606,), (1437607,), (1437609,), (1437610,), (1437611,), (1437615,), (1437616,), (1437621,), (1437623,), (1437624,), (1437628,), (1437629,), (1437634,), (1437637,), (1437638,), (1437639,), (1437641,), (1437642,)]


## Lexemes in Exodus 1 (hybrid approach)

The above query does not return lexemes in the order in which they appear in Exodus 1. The following code uses a TF query in such a way that the results can be sorted in canonical order using the `sort` method.

### In order of appearance (canonical)

In [61]:
# Fetch (lexeme node, word node) pairs for the words of Exodus 1
tuples = A.search('''
l:lex
    w:word
    /with/
    book book=Exodus
        chapter chapter=1
            w
    /-/
''')

# Order the pairs by their word node (word nodes are in canonical order)
tuples.sort(key=lambda pair: pair[1])

# Keep only the lexeme nodes, each once, at the position of its first pair
lexemes = list(dict.fromkeys(lex_node for (lex_node, _word) in tuples))

A.table([(lex_node,) for lex_node in lexemes][:10])

  0.53s 349 results


n,p,lex
1,וְ,
2,אֵלֶּה,
3,שֵׁם,
4,בֵּן,
5,יִשְׂרָאֵל,
6,הַ,
7,בוא,
8,מִצְרַיִם,
9,אֵת,
10,יַעֲקֹב,


### In alphabetical order

The following code sorts in alphabetical order instead.

In [62]:
# Fetch (lexeme node, word node) pairs for the words of Exodus 1
tuples = A.search('''
l:lex
    w:word
    /with/
    book book=Exodus
        chapter chapter=1
            w
    /-/
''')

# Order the pairs by the `lex_utf8` feature of their word node
tuples.sort(key=lambda pair: F.lex_utf8.v(pair[1]))

# Keep only the lexeme nodes, each once, at the position of its first pair
lexemes = list(dict.fromkeys(lex_node for (lex_node, _word) in tuples))

A.table([(lex_node,) for lex_node in lexemes][:10])

  0.51s 349 results


n,p,lex
1,אֹבֶן,
2,אָח,
3,אֶחָד,
4,אִישׁ,
5,אֶל,
6,אֵלֶּה,
7,אֱלֹהִים,
8,אִם,
9,אמר,
10,אֶרֶץ,


# Clipboard integration

To facilitate exporting the vocab list to another app, we'll simply copy it to the clipboard. The following solution relies on the Python module `pyperclip`.

Windows: `pip install pyperclip`

Linux/macOS: `pip3 install pyperclip`

In [63]:
import pyperclip

def copy_list(list, vertical=True):
    """Join the items of `list` into one string and copy it to the clipboard.

    Items are separated by newlines when `vertical` is true and by tabs
    otherwise; the joined text is handed to `pyperclip.copy`.
    """
    if vertical:
        delimiter = "\n"
    else:
        delimiter = "\t"

    pyperclip.copy(delimiter.join(list))

# Quick check: put three tab-separated words on the clipboard.
copy_list(["בראשׁית", "ברא", "אלהים"], vertical=False)

# Putting it all together

In my day-to-day work, the most common use of Scripture-based vocabulary lists entails generating a list for a book or chapter and processing it in Excel. The following functions would be useful for that task.

Of course, there are other scenarios where you might want to grab only rare words (below a certain frequency) or to exclude certain words (e.g., only those in Genesis 2 that did not already appear in Genesis 1). But these functions are good enough for now, and it should not be too difficult to make adjustments when needed.

## Functions
Besides the functions below, the above function `copy_list` is also required (so run code blocks in sequence).

In [64]:
def get_lexeme_nodes(book, chapter=None, alphabetical=False):
    """Return the distinct lexeme nodes of a book, or of one of its chapters.

    Runs a Text-Fabric search for (lexeme node, word node) pairs whose word
    lies in the requested passage, orders the pairs, and keeps every lexeme
    node once, at the position of its first pair. A one-line summary is
    printed as a side effect.

    Parameters
    ----------
    book : str
        Value of the `book` feature to search for, e.g. "Genesis".
    chapter : int, optional
        Chapter number. When omitted (or otherwise falsy) the chapter is left
        unconstrained, i.e. the whole book is searched.
    alphabetical : bool, default False
        If true, order by the `lex_utf8` feature of the words; otherwise
        order by word node (canonical order).

    Returns
    -------
    list
        Lexeme nodes without duplicates, in the requested order.
    """
    ch_spec = "chapter=%d" % chapter if chapter else ""
    # Find lexeme-word pairs
    tuples = A.search('''
l:lex
    w:word
    /with/
    book book=%s
        chapter %s
            w
    /-/
    ''' % (book, ch_spec))

    # `sorted` leaves the search result untouched and accepts any iterable.
    if alphabetical:
        # Sort by word feature `lex_utf8`
        ordered = sorted(tuples, key=lambda pair: F.lex_utf8.v(pair[1]))
    else:
        # Sort by word nodes (which are in canonical order)
        ordered = sorted(tuples, key=lambda pair: pair[1])

    # Select lexeme nodes only and remove duplicates (first occurrence wins)
    lexemes = list(dict.fromkeys(lex_node for (lex_node, _word) in ordered))

    # Only mention the chapter when one was given, so whole-book summaries do
    # not end in a stray space.
    passage = "%s %s" % (book, chapter) if chapter else book
    print("%d lexemes found in %s" % (len(lexemes), passage))
    return lexemes

def table_lexemes(list, limit=None):
    """Display lexeme nodes with `A.table`, one node per row.

    Parameters
    ----------
    list : sequence
        Lexeme nodes, e.g. the return value of `get_lexeme_nodes`.
    limit : int, optional
        Maximum number of rows to show. `None` (or any other falsy value)
        shows every row.
    """
    # Each node is wrapped in a 1-tuple before being handed to `A.table`.
    rows = [(node,) for node in list]
    # `limit or None` keeps "falsy means no limit" without the former
    # hard-coded cap of 100000 rows, which silently truncated longer lists.
    A.table(rows[:limit or None])

def copy_lexemes(list, vertical=True):
    """Copy the `voc_lex_utf8` value of each node in `list` to the clipboard.

    The values are collected in order and passed on to `copy_list`, together
    with the `vertical` flag that selects its separator.
    """
    words = []
    for node in list:
        words.append(F.voc_lex_utf8.v(node))
    copy_list(words, vertical)

## Examples
### Genesis 1

In [65]:
# Genesis 1, lexemes in canonical order
gen1 = get_lexeme_nodes("Genesis", chapter=1)
copy_lexemes(gen1)
table_lexemes(gen1, limit=10)

  0.54s 673 results
104 lexemes found in Genesis 1


n,p,lex
1,בְּ,
2,רֵאשִׁית,
3,ברא,
4,אֱלֹהִים,
5,אֵת,
6,הַ,
7,שָׁמַיִם,
8,וְ,
9,אֶרֶץ,
10,היה,


In [66]:
# Genesis 1, lexemes in alphabetical order
gen1 = get_lexeme_nodes("Genesis", chapter=1, alphabetical=True)
copy_lexemes(gen1)
table_lexemes(gen1, limit=10)

  0.51s 673 results
104 lexemes found in Genesis 1


n,p,lex
1,אָדָם,
2,אֲדָמָה,
3,אֹור,
4,אור,
5,אֹות,
6,אֶחָד,
7,אָכְלָה,
8,אֶל,
9,אֱלֹהִים,
10,אמר,


### Deuteronomy 6

In [67]:
# canonical order
# (the result gets its own name instead of reusing `gen1`, which was
# misleading for a Deuteronomy passage)
deut6 = get_lexeme_nodes("Deuteronomium", 6)
copy_lexemes(deut6)
table_lexemes(deut6, 10)

  0.18s 455 results
126 lexemes found in Deuteronomium 6


n,p,lex
1,וְ,
2,זֹאת,
3,הַ,
4,מִצְוָה,
5,חֹק,
6,מִשְׁפָּט,
7,אֲשֶׁר,
8,צוה,
9,יְהוָה,
10,אֱלֹהִים,


In [68]:
# alphabetical order
# (the result gets its own name instead of reusing `gen1`, which was
# misleading for a Deuteronomy passage)
deut6 = get_lexeme_nodes("Deuteronomium", 6, alphabetical=True)
copy_lexemes(deut6)
table_lexemes(deut6, 10)

  0.53s 455 results
126 lexemes found in Deuteronomium 6


n,p,lex
1,אָב,
2,אַבְרָהָם,
3,אֲדָמָה,
4,אהב,
5,אֹות,
6,אֶחָד,
7,אַחֵר,
8,אַחַר,
9,איב,
10,אכל,


### Ruth

In [69]:
# canonical order, whole book (no chapter given)
# (the result gets its own name instead of reusing `gen1`)
ruth = get_lexeme_nodes("Ruth")
copy_lexemes(ruth)
table_lexemes(ruth, 10)

  0.65s 1802 results
319 lexemes found in Ruth 


n,p,lex
1,וְ,
2,היה,
3,בְּ,
4,יֹום,
5,שׁפט,
6,הַ,
7,רָעָב,
8,אֶרֶץ,
9,הלך,
10,אִישׁ,


In [70]:
# alphabetical order, whole book (no chapter given)
# (the result gets its own name instead of reusing `gen1`)
ruth = get_lexeme_nodes("Ruth", alphabetical=True)
copy_lexemes(ruth)
table_lexemes(ruth, 10)

  0.63s 1802 results
319 lexemes found in Ruth 


n,p,lex
1,אָב,
2,אָדֹון,
3,אהב,
4,אָז,
5,אֹזֶן,
6,אָח,
7,אֶחָד,
8,אחז,
9,אַחֵר,
10,אַחַר,


## Notebook header
For future reference, here is the code block that would need to be included at the top of a new notebook to use these functions.

In [71]:
import pyperclip
from tf.app import use
# Load the 'etcbc/bhsa' app and keep short handles to the feature (F) and
# locality (L) parts of its API for the functions below.
A = use('etcbc/bhsa')
F, L = A.TF.api.F, A.TF.api.L

def get_lexeme_nodes(book, chapter=None, alphabetical=False):
    """Return the distinct lexeme nodes of a book, or of one of its chapters.

    Runs a Text-Fabric search for (lexeme node, word node) pairs whose word
    lies in the requested passage, orders the pairs, and keeps every lexeme
    node once, at the position of its first pair. A one-line summary is
    printed as a side effect.

    Parameters
    ----------
    book : str
        Value of the `book` feature to search for, e.g. "Genesis".
    chapter : int, optional
        Chapter number. When omitted (or otherwise falsy) the chapter is left
        unconstrained, i.e. the whole book is searched.
    alphabetical : bool, default False
        If true, order by the `lex_utf8` feature of the words; otherwise
        order by word node (canonical order).

    Returns
    -------
    list
        Lexeme nodes without duplicates, in the requested order.
    """
    ch_spec = "chapter=%d" % chapter if chapter else ""
    # Find lexeme-word pairs
    tuples = A.search('''
l:lex
    w:word
    /with/
    book book=%s
        chapter %s
            w
    /-/
    ''' % (book, ch_spec))

    # `sorted` leaves the search result untouched and accepts any iterable.
    if alphabetical:
        # Sort by word feature `lex_utf8`
        ordered = sorted(tuples, key=lambda pair: F.lex_utf8.v(pair[1]))
    else:
        # Sort by word nodes (which are in canonical order)
        ordered = sorted(tuples, key=lambda pair: pair[1])

    # Select lexeme nodes only and remove duplicates (first occurrence wins)
    lexemes = list(dict.fromkeys(lex_node for (lex_node, _word) in ordered))

    # Only mention the chapter when one was given, so whole-book summaries do
    # not end in a stray space.
    passage = "%s %s" % (book, chapter) if chapter else book
    print("%d lexemes found in %s" % (len(lexemes), passage))
    return lexemes

def table_lexemes(list, limit=None):
    """Display lexeme nodes with `A.table`, one node per row.

    Parameters
    ----------
    list : sequence
        Lexeme nodes, e.g. the return value of `get_lexeme_nodes`.
    limit : int, optional
        Maximum number of rows to show. `None` (or any other falsy value)
        shows every row.
    """
    # Each node is wrapped in a 1-tuple before being handed to `A.table`.
    rows = [(node,) for node in list]
    # `limit or None` keeps "falsy means no limit" without the former
    # hard-coded cap of 100000 rows, which silently truncated longer lists.
    A.table(rows[:limit or None])

def copy_lexemes(list, vertical=True):
    """Copy the `voc_lex_utf8` value of each node in `list` to the clipboard.

    The values are collected in order and passed on to `copy_list`, together
    with the `vertical` flag that selects its separator.
    """
    words = []
    for node in list:
        words.append(F.voc_lex_utf8.v(node))
    copy_list(words, vertical)

def copy_list(list, vertical=True):
    """Join the items of `list` into one string and copy it to the clipboard.

    Items are separated by newlines when `vertical` is true and by tabs
    otherwise; the joined text is handed to `pyperclip.copy`.
    """
    if vertical:
        delimiter = "\n"
    else:
        delimiter = "\t"

    pyperclip.copy(delimiter.join(list))

This is Text-Fabric 9.3.2
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

122 features found and 0 ignored


# Appendix: More complex queries

Here are some examples of queries that are a bit more complex than just finding lexemes in one book or chapter.

## Lexemes in Exodus that are not in Genesis

In [73]:
# Thanks to Dirk Roorda for this example.
# Two quantifiers on the same `lex` node: the `/with/` part requires a word of
# the lexeme inside the book Exodus, the `/without/` part rejects lexemes that
# also have a word inside the book Genesis.
# Only the first 10 results are passed to `A.table`.
A.table(A.search('''
lex
/with/
  w:word
book book=Exodus
  w
/-/
/without/
  v:word
book book=Genesis
  v
/-/
''')[0:10])

  1.12s 633 results


n,p,lex
1,חָדָשׁ,
2,חכם,
3,לחם,
4,סִבְלֹות,
5,מִסְכְּנֹות,
6,פִּתֹם,
7,פֶּרֶךְ,
8,שִׁפְרָה,
9,פּוּעָה,
10,אֹבֶן,


## Lexemes occurring only in Exodus

In [74]:
# Thanks to Dirk Roorda for this example.
# Like the "in Exodus but not in Genesis" search, except that the `/without/`
# part uses `book#Exodus`, so a lexeme is rejected as soon as it has a word in
# any book other than Exodus.
# NOTE(review): `#` is presumably TF's "feature value differs" operator --
# confirm against the search template docs.
# Only the first 10 results are passed to `A.table`.
A.table(A.search('''
lex
/with/
  w:word
book book=Exodus
  w
/-/
/without/
  v:word
book book#Exodus
  v
/-/
''')[0:10])

  2.35s 92 results


n,p,lex
1,סִבְלֹות,
2,פִּתֹם,
3,שִׁפְרָה,
4,פּוּעָה,
5,חָיֶה,
6,תֵּבָה,
7,צִפֹּרָה,
8,יִתְרֹו,
9,לַבָּה,
10,פִּקֵּחַ,
