# Markers with Prepositions

Do time markers that correlate with particular tenses contain more or fewer of a given type of preposition?

For instance, a cursory analysis of tense-dominant time markers (`2_markers_with_tenses.ipynb`) appears to show that the particle עד is more common in yiqtol-dominant time markers, whereas the particle ב appears more often in wayyiqtol-dominant markers. Are there statistically identifiable patterns?

In [1]:
import pickle, collections
import pandas as pd
from pprint import pprint
from tf.fabric import Fabric
from IPython.display import display, HTML

# Point Text-Fabric at the ETCBC4c Hebrew module.
TF = Fabric(modules='hebrew/etcbc4c', silent=True)
# Load only the features named here: book/chapter/verse, pdp, vt, domain and lex.
api = TF.load('''book chapter verse
                 pdp vt domain lex
              ''')

# Publish the API's names into the notebook's global namespace.
# NOTE(review): later cells use `F` and `L` without defining them, so they
# presumably originate from this call -- confirm against the Text-Fabric docs.
api.makeAvailableIn(globals())

  0.00s loading features ...
   |     0.01s B book                 from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.01s B chapter              from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.01s B verse                from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.12s B pdp                  from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.12s B vt                   from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.02s B domain               from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.13s B lex                  from /Users/Cody/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s Feature overview: 103 for nodes; 5 for edges; 1 configs; 7 computed
  5.23s All features loaded/computed - for details use loadLog()


In [2]:
# import custom function for weqatal detection
from functions.verbs import is_weqt

In [3]:
# path of the pickled time-marker data
tm_data_file = 'data/time_markers.pickle'

# Deserialize the stored dictionary.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted files.
with open(tm_data_file, 'rb') as pickled:
    tm_data = pickle.load(pickled)

# list the top-level keys so the reader can see what was loaded
available_keys = ', '.join(tm_data.keys())
print('data available: ', available_keys)

data available:  markers, top_markers, stats_rows


In [4]:
# unpack the three stored tables into their own names
markers, top_markers, stats_rows = (
    tm_data[key] for key in ('markers', 'top_markers', 'stats_rows')
)

In [5]:
# show which fields are stored for a single marker, using 'H JWM' as the sample entry
markers['H JWM'].keys()

dict_keys(['count', 'clauses', 'tense_cl_lists', 'tense_counts', 'tense_percents', 'example_phrase'])

## Make Initial Counts

Find the time phrases whose first word is a preposition. Count them per preposition and tense, then display the data.

In [6]:
# clause nodes grouped as preposition -> tense -> [clause, ...]
preposition_cl_lists = collections.defaultdict(lambda: collections.defaultdict(list))
# clause tallies grouped as preposition -> tense, plus a running 'total' per preposition
preposition_counts = collections.defaultdict(collections.Counter)

# basic bookkeeping: how many markers exist, and how many open with a preposition
total_time_markers = len(markers)
markers_with_preps = 0

for marker, marker_data in markers.items():

    # look up the first word of the marker's example phrase and its part of speech
    phrase_words = L.d(marker_data['example_phrase'], otype='word')
    first_word = phrase_words[0]

    # only markers that open with a preposition are tallied
    if F.pdp.v(first_word) == 'prep':

        markers_with_preps += 1

        # the lex feature of the word's lexeme node names the preposition
        prep_lex = L.u(first_word, otype='lex')[0]
        prep_text = F.lex.v(prep_lex)

        # file this marker's clauses, tense by tense, under the preposition
        for tense, clause_list in marker_data['tense_cl_lists'].items():
            n_clauses = len(clause_list)

            preposition_cl_lists[prep_text][tense].extend(clause_list)

            preposition_counts[prep_text][tense] += n_clauses
            preposition_counts[prep_text]['total'] += n_clauses


# share of all markers that begin with a preposition, as a percentage
percent_with_prep = round((markers_with_preps/total_time_markers)*100, 1)

print(f'{markers_with_preps}/{total_time_markers} time markers begin with a preposition')
print(f'or {percent_with_prep}%')

718/985 time markers begin with a preposition
or 72.9%


In [7]:
# Build a table with one row per preposition: its total clause count followed
# by "<percent>% (<count>)" for every tense.

# Rank prepositions by total, largest first. The sort key is limited to
# (total, preposition) so the Counter objects themselves are never compared.
prep_count_table_order = sorted(
    ((tense_totals['total'], marker, tense_totals)
     for marker, tense_totals in preposition_counts.items()),
    key=lambda entry: entry[:2],
    reverse=True,
)


# Header: take the union of tenses over ALL prepositions. Reading the keys of
# only the top row would silently drop any tense that never occurs with the
# most frequent preposition, and would fail outright on an empty result.
all_tenses = sorted({tense
                     for _, _, tense_totals in prep_count_table_order
                     for tense in tense_totals
                     if tense != 'total'})
table_header = ['marker', 'total'] + all_tenses


# make table rows
prep_count_table_rows = []

for total, marker, tense_totals in prep_count_table_order:

    tense_counts = []
    for tense in all_tenses:
        # a Counter returns 0 for a tense this preposition never occurs with
        count = tense_totals[tense]
        # guard against a preposition whose markers carry no clauses at all
        percent = round((count/total)*100, 1) if total else 0.0
        tense_counts.append('{}% ({})'.format(percent, count))

    row = [marker, total] + tense_counts

    prep_count_table_rows.append(row)


# display table
prep_count_table = pd.DataFrame(prep_count_table_rows, columns=table_header)
prep_count_table

Unnamed: 0,marker,total,impf,impv,infa,infc,perf,ptca,ptcp,wayq,weqt
0,B,1095,27.3% (299),2.2% (24),0.4% (4),3.7% (40),25.0% (274),4.7% (52),0.3% (3),26.4% (289),10.0% (110)
1,<D,277,34.7% (96),2.5% (7),0.0% (0),4.0% (11),16.6% (46),5.4% (15),0.4% (1),22.7% (63),13.7% (38)
2,L,266,48.9% (130),4.1% (11),0.4% (1),5.3% (14),18.0% (48),4.5% (12),1.5% (4),10.9% (29),6.4% (17)
3,MN,170,17.1% (29),3.5% (6),0.0% (0),0.6% (1),30.0% (51),7.6% (13),3.5% (6),35.3% (60),2.4% (4)
4,>XR/,120,27.5% (33),0.0% (0),0.0% (0),0.0% (0),35.8% (43),1.7% (2),0.0% (0),35.0% (42),0.0% (0)
5,K,50,24.0% (12),16.0% (8),0.0% (0),8.0% (4),24.0% (12),8.0% (4),2.0% (1),18.0% (9),0.0% (0)
6,BJN/,7,71.4% (5),0.0% (0),0.0% (0),14.3% (1),0.0% (0),0.0% (0),0.0% (0),0.0% (0),14.3% (1)
7,>T,2,50.0% (1),0.0% (0),0.0% (0),0.0% (0),0.0% (0),0.0% (0),0.0% (0),50.0% (1),0.0% (0)
8,BLT/,1,0.0% (0),0.0% (0),0.0% (0),0.0% (0),100.0% (1),0.0% (0),0.0% (0),0.0% (0),0.0% (0)
9,>L,1,0.0% (0),0.0% (0),0.0% (0),0.0% (0),0.0% (0),0.0% (0),0.0% (0),100.0% (1),0.0% (0)


In [8]:
# export data with prepositions

# Store the preposition -> tense -> clause-list mapping next to the existing data.
# dict() replaces the outer defaultdict, whose lambda default factory cannot be
# pickled; the inner per-tense values remain defaultdict(list) objects.
tm_data['preposition_cl_lists'] = dict(preposition_cl_lists)

In [9]:
# confirm that the new 'preposition_cl_lists' entry sits alongside the original keys
tm_data.keys()

dict_keys(['markers', 'top_markers', 'stats_rows', 'preposition_cl_lists'])

In [10]:
# Serialise fully in memory BEFORE opening the file. tm_data_file is also this
# notebook's input, and opening it in 'wb' mode truncates it immediately, so a
# pickling error raised part-way through would otherwise leave the data file
# empty or incomplete.
pickled_tm_data = pickle.dumps(tm_data)

with open(tm_data_file, 'wb') as outfile:
    outfile.write(pickled_tm_data)

# Export a Full Spreadsheet with All Time Marker and Tense Data

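This step is unrelated to the preposition analysis above; it only exports the stored time-marker and tense statistics (`stats_rows`) to a CSV file.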

In [30]:
import csv

# the header row is stored under its own key; every other entry is a data row
stats_table = tm_data['stats_rows']
csv_header = stats_table['header']

# Pair each data row with its second column so the pairs sort on that column
# first, then order them from largest to smallest.
# NOTE(review): column 1 is presumably the marker's total count -- confirm against the header.
csv_row_order = [(cells[1], cells)
                 for key, cells in stats_table.items()
                 if key != 'header']
csv_row_order.sort(reverse=True)

# drop the sort key again, keeping only the rows themselves
csv_rows = [cells for _, cells in csv_row_order]

len(csv_rows)

985

In [31]:
# Write the header followed by all data rows to a CSV file in the working directory.
# newline='' hands line-ending control to the csv writer, as the csv module
# requires; without it every row is followed by a blank line on Windows.
# The explicit encoding keeps the output identical across platforms.
with open('all_time_markers.csv', 'w', newline='', encoding='utf-8') as outfile:

    writer = csv.writer(outfile)

    writer.writerow(csv_header)
    writer.writerows(csv_rows)