In [16]:
from IPython.display import display, HTML

# Definite proper names

How often is the definite article used with proper names?<br>

Inspired by John Meade's very interesting tweet thread.

In [18]:
# Embed John Meade's tweet; the adjacent string pieces below are joined by
# Python into one HTML snippet (blockquote markup followed by the widget script).
tweet_embed_html = (
    '<blockquote class="twitter-tweet"><p lang="en" dir="ltr">'
    '1/ &quot;Satan&quot;: Thread<br>'
    'Why do we translate Job 1:6 et al &quot;Satan&quot; as a personal name/noun (Aleppo Codex below: hasatan; השׂטן)? '
    'Hebrew does not articulate proper nouns, since they are already considered definite. '
    'The Hebrew is better rendered &quot;the Accuser/Opponent.&quot; '
    '<a href="https://t.co/I1Clh6h1JM">pic.twitter.com/I1Clh6h1JM</a></p>'
    '&mdash; John Meade (@drjohnmeade) '
    '<a href="https://twitter.com/drjohnmeade/status/1263985320099364864?ref_src=twsrc%5Etfw">May 23, 2020</a></blockquote> '
    '<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
)
display(HTML(tweet_embed_html))

We'll use the ETCBC's BHSA syntax data to run a query and produce a dataset.

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from tf.app import use
from IPython.display import display, HTML

# Load the BHSA corpus through its Text-Fabric app.
bhsa = use('bhsa')

# Pull the F, T and L handles off the app's API so later cells can use the short names.
F, T, L = (getattr(bhsa.api, handle) for handle in ('F', 'T', 'L'))

Using TF-app in /Users/cody/text-fabric-data/annotation/app-bhsa/code:
	rv2.0.0=#7b3b9ffba7ee6dbc76a52b8d76475d17babf0daf offline under ~/text-fabric-data (local release)
Using data in /Users/cody/text-fabric-data/etcbc/bhsa/tf/c:
	rv1.6 offline under ~/text-fabric-data (local release)
Using data in /Users/cody/text-fabric-data/etcbc/phono/tf/c:
	r1.2 offline under ~/text-fabric-data (local release)
Using data in /Users/cody/text-fabric-data/etcbc/parallels/tf/c:
	r1.2 offline under ~/text-fabric-data (local release)
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API


We run a query and gather the results. **The query searches for all cases where<br>
a definite article immediately precedes a proper name.**

In [34]:
# Search for the article (lex=H) immediately followed (`<:`) by a word tagged
# pdp=nmpr, both inside the same phrase atom; each hit also carries its
# enclosing book and verse nodes.
query = bhsa.search("""

book
    verse
        phrase_atom
            word lex=H
            <: word pdp=nmpr

""")

# One record per hit. The article node is part of every result tuple but is
# not needed for the table, so it is bound to a throwaway name.
data = [
    dict(
        book=F.book.v(book_node),
        ref='{} {}:{}'.format(*T.sectionFromNode(verse_node)),
        phrase=T.text(atom_node),
        proper_noun=T.text(name_node),
        noun_lex=F.voc_lex_utf8.v(name_node),
    )
    for book_node, verse_node, atom_node, _article_node, name_node in query
]

  1.26s 796 results


In [35]:
# build data into a table
data_df = pd.DataFrame(data)
data_df.head() # first 5

Unnamed: 0,book,ref,phrase,proper_noun,noun_lex
0,Genesis,Genesis 2:11,אֵ֚ת כָּל־אֶ֣רֶץ הַֽחֲוִילָ֔ה,חֲוִילָ֔ה,חֲוִילָה
1,Genesis,Genesis 12:8,הָעַ֣י,עַ֣י,עַי
2,Genesis,Genesis 13:3,בֵּ֥ין בֵּֽית־אֵ֖ל וּבֵ֥ין הָעָֽי׃,עָֽי׃,עַי
3,Genesis,Genesis 13:10,אֶת־כָּל־כִּכַּ֣ר הַיַּרְדֵּ֔ן,יַּרְדֵּ֔ן,יַרְדֵּן
4,Genesis,Genesis 13:11,אֵ֚ת כָּל־כִּכַּ֣ר הַיַּרְדֵּ֔ן,יַּרְדֵּ֔ן,יַרְדֵּן


These are the first 5 rows of the dataset. Now let's construct a count of<br>
the various noun lexemes.

In [36]:
# Let pandas print up to 200 rows so the frequency table is not truncated.
pd.options.display.max_rows = 200

### Below are the lexemes that co-occur with the definite article, with their respective frequency counts.

In [37]:
# Frequency of each proper-noun lexeme among the hits, shown as a one-column table.
data_df['noun_lex'].value_counts().to_frame()

Unnamed: 0,noun_lex
יַרְדֵּן,167
גִּלְעָד,52
בָּשָׁן,48
לְבָנֹון,48
מִצְפָּה,39
גִּלְגָּל,38
עַי,37
יְאֹר,35
רָמָה,33
כַּשְׂדִּים,33


### Below, all found cases are shown in context

In [38]:
# Render the raw search results as a table: one row per hit, with the
# reference, book, phrase atom, article and proper-noun word.
bhsa.table(query)

n,p,book,verse,phrase_atom,word,word.1
1,Genesis 2:11,Genesis,,אֵ֚ת כָּל־אֶ֣רֶץ הַֽחֲוִילָ֔ה,הַֽ,חֲוִילָ֔ה
2,Genesis 12:8,Genesis,,הָעַ֣י,הָ,עַ֣י
3,Genesis 13:3,Genesis,,בֵּ֥ין בֵּֽית־אֵ֖ל וּבֵ֥ין הָעָֽי׃,הָ,עָֽי׃
4,Genesis 13:10,Genesis,,אֶת־כָּל־כִּכַּ֣ר הַיַּרְדֵּ֔ן,הַ,יַּרְדֵּ֔ן
5,Genesis 13:11,Genesis,,אֵ֚ת כָּל־כִּכַּ֣ר הַיַּרְדֵּ֔ן,הַ,יַּרְדֵּ֔ן
6,Genesis 14:3,Genesis,,אֶל־עֵ֖מֶק הַשִּׂדִּ֑ים,הַ,שִּׂדִּ֑ים
7,Genesis 14:5,Genesis,,אֶת־הַזּוּזִ֖ים,הַ,זּוּזִ֖ים
8,Genesis 14:5,Genesis,,אֵת֙ הָֽאֵימִ֔ים,הָֽ,אֵימִ֔ים
9,Genesis 14:8,Genesis,,בְּעֵ֖מֶק הַשִּׂדִּֽים׃,הַ,שִּׂדִּֽים׃
10,Genesis 14:10,Genesis,,עֵ֣מֶק הַשִׂדִּ֗ים,הַ,שִׂדִּ֗ים
