# Export TF Data from BHSAc for R Processing

I will export specifically data on time phrase markers. Aramaic is excluded.

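The dataset contains the following columns:

book&nbsp;|&nbsp;chapter&nbsp;|&nbsp;verse&nbsp;|&nbsp;clause.n&nbsp;|&nbsp;time.n&nbsp;|&nbsp;domain&nbsp;|&nbsp;num.time.phrases&nbsp;|&nbsp;verb.tense&nbsp;|&nbsp;num.preps&nbsp;|&nbsp;preps&nbsp;|&nbsp;num.subs&nbsp;|&nbsp;subs&nbsp;|&nbsp;num.quants&nbsp;|&nbsp;quants&nbsp;|&nbsp;pdp.tags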

In [1]:
from tf.fabric import Fabric
from os import sys, path
import csv

In [2]:
# Put the parent directory on the import path so the custom function modules
# imported further down can be found (presumably `my_functions` lives there).
# Only done when this code runs as the top-level script outside of a package.
if __name__ == '__main__':
    if __package__ is None:
        sys.path.append('../')

In [3]:
# Two Text-Fabric instances over the same local BHSA directory: module `c`
# and module `2016`. The second one is created with silent=True.
TF = Fabric(
    modules='c',
    locations='~/github/etcbc/bhsa/tf',
)
TF2016 = Fabric(
    modules='2016',
    locations='~/github/etcbc/bhsa/tf',
    silent=True,
)

This is Text-Fabric 3.0.9
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

113 features found and 0 ignored


In [4]:
# features requested from the c data; the string is handed to TF.load unchanged
c_features = '''
                 book chapter verse
                 function kind
                 pdp typ rela ls
                 domain vt lex
              '''

# load c data and make the API members available as notebook globals
# (the cells below use F, L, T and S as bare names)
api = TF.load(c_features)
api.makeAvailableIn(globals())

# load 2016 data; it stays behind its own handle and is reached as e4c.F / e4c.L
e4c = TF2016.load('function kind')

  0.00s loading features ...
   |     0.02s B book                 from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.02s B chapter              from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.09s B function             from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.03s B kind                 from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.16s B pdp                  from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.27s B typ                  from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.27s B rela                 from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.15s B ls                   from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.02s B domain               from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.15s B vt                   from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.17s B lex                  from /Users/Cody/github/etcbc/bhsa/tf/c
   |     0.00s Feature overview: 10

In [5]:
# custom function to id weqetal verbs and for subject isolation
from my_functions.verbs import is_weqt 
from my_functions.substantives import is_subs

In [6]:
# versions check
# The difference between c and 2016 was unclear from Dirk Roorda's chart:
# https://github.com/ETCBC/bhsa/blob/master/programs/versionPhrases.ipynb
# The counts printed below compare Time-function phrases in clauses
# without predication ("WP") across the two versions.

def _wp_time_phrases(features, locality):
    """Collect Time-function phrases that sit in a clause of kind WP.

    features -- a feature API (F) offering `function` and `kind`
    locality -- the matching locality API (L), used to go up to the clause

    Only the first clause returned by `locality.u` is inspected per phrase.
    Returns the matching phrase nodes as a list, in iteration order.
    """
    matches = []
    for phrase in features.function.s('Time'):
        clause = locality.u(phrase, otype='clause')[0]
        if features.kind.v(clause) == 'WP':
            matches.append(phrase)
    return matches

# time phrases in c WP clauses ("without predication")
time_phrases_c = _wp_time_phrases(F, L)

# time phrases in 2016 WP clauses
time_phrases_4c = _wp_time_phrases(e4c.F, e4c.L)

print('c', len(time_phrases_c))
print('2016', len(time_phrases_4c))
print('difference:', len(time_phrases_c) - len(time_phrases_4c))

c 366
2016 160
difference: 206


This shows that between the 2016 version and the current `c` version (Oct 2017), the number of time phrases within clauses without predication (`WP`) grew from 160 to 366. That is a net addition of 206 time phrases to the database.

In [7]:
# Search template: a clause of kind VC that contains a phrase with function
# Time and a phrase with one of the listed predicate functions, the latter
# holding a word whose pdp is verb.
# Each result is a tuple (clause, time phrase, predicate phrase, verb word).
time_phrase_data = '''

clause kind=VC
    phrase function=Time
    
    phrase function=Pred|PreO|PreS|PtcO
        word pdp=verb
'''

S.study(time_phrase_data, silent=True)

# drop duplicate result tuples, then put them in sorted order
results = list(set(S.fetch()))
results.sort()

print(len(results), 'results ready...')

3245 results ready...


In [8]:
# Note: a clause node can head more than one result, because a single clause
# may contain several time phrases.

already_encountered = set()   # clause nodes seen so far
double_results = []           # results whose clause node was seen before

for result in results:
    clause_node = result[0]

    # first sighting of this clause: remember it and move on
    if clause_node not in already_encountered:
        already_encountered.add(clause_node)
        continue

    # clause already seen: this result repeats it with other phrase matches
    double_results.append(result)

# first few repeated results: same clause, different phrase matches
double_results[:5]

[(428158, 653363, 653366, 3253),
 (428158, 653364, 653366, 3253),
 (428158, 653365, 653366, 3253),
 (428202, 653519, 653516, 3612),
 (428204, 653527, 653528, 3650)]

In [9]:
# csv rows are collected here
rows = []

# One row per search result, i.e. per time phrase; a clause holding several
# time phrases therefore contributes several rows.
# pred_n is part of every result tuple but is not used for the row.
for clause_n, time_n, pred_n, verb_n in results:

    # (1) section data
    book, chapter, verse = T.sectionFromNode(clause_n)

    # (2) clause-level data
    domain = F.domain.v(clause_n)
    num_time_phrases = sum(
        1 for phrase in L.d(clause_n, otype='phrase')
        if F.function.v(phrase) == 'Time'
    )

    # (3) word-level data
    # tense, with weqetal patched in: is_weqt takes precedence over the vt value
    verb_tense = 'weqt' if is_weqt(verb_n) else F.vt.v(verb_n)

    # walk the words of the time phrase once, collecting the
    # phrase-dependent part of speech of every word and the lexemes of
    # substantives, prepositions and quantities (ls == 'card', cardinal number)
    pdps, subs, preps, quants = [], [], [], []
    for word in L.d(time_n, otype='word'):
        lexeme = F.lex.v(word)
        pdp = F.pdp.v(word)
        pdps.append(pdp)

        if is_subs(word):
            subs.append(lexeme)
        if pdp == 'prep':
            preps.append(lexeme)
        if F.ls.v(word) == 'card':
            quants.append(lexeme)

    # package the row and append it; empty lexeme lists are written as 'NA'
    rows.append([
        book, chapter, verse, clause_n, time_n, domain, num_time_phrases, verb_tense,
        len(preps), '|'.join(preps) or 'NA',
        len(subs), '|'.join(subs) or 'NA',
        len(quants), '|'.join(quants) or 'NA',
        '-'.join(pdps),
    ])

print(len(rows), 'rows ready for export...')
print('sample: ', rows[100])

3245 rows ready for export...
sample:  ['Genesis', 18, 14, 429009, 656022, 'Q', 2, 'impf', 1, 'K', 1, '<T/', 0, 'NA', 'prep-art-subs-adjv']


In [10]:
# Export the .csv

# column names, in the same order as the values inside each entry of `rows`
header = ['book', 'chapter', 'verse', 'clause.n', 'time.n', 'domain', 'num.time.phrases',
          'verb.tense', 'num.preps', 'preps', 'num.subs', 'subs', 'num.quants', 'quants', 'pdp.tags']

# newline='' is what the csv module asks for on files given to csv.writer:
# the writer emits its own row terminators, and without it platforms that
# translate '\n' on write end up with an extra '\r' in every row ending.
# The encoding is spelled out so the file does not depend on the locale default.
with open('time_phrases.csv', 'w', newline='', encoding='utf-8') as outfile:

    writer = csv.writer(outfile)

    writer.writerow(header)    # header line first
    writer.writerows(rows)     # then one line per collected row

In [11]:
# spot check: show the plain text of the words contained in node 651923
spot_check_words = L.d(651923, otype='word')
T.text(spot_check_words)

'טֶ֣רֶם '