In [81]:
# imports
import json
from nltk.corpus import wordnet as wn
import pandas as pd

# Populate unique tokens of PPs to json

In [82]:
with open('pp_lexicon/unique_tokens_copy.json', 'r') as f:
    unique_tokens_copy = json.load(f)


In [83]:
p_lexicon = {}

# Define a template for each lexical entry
template = {
    "isAtomicMorph": "",
    "class": "",
    "spellOutHEAD": [""],
    "path_p_morphology": "",
    "measure_allowed": ""
}

for token in unique_tokens_copy:
    p_lexicon[token] = {
        "isAtomicMorph": template["isAtomicMorph"],
        "class": template["class"],
        "spellOutHEAD": list(template["spellOutHEAD"]),
        "path_p_morphology": template["path_p_morphology"],
        "measure_allowed": template["measure_allowed"]
    }

In [84]:
# # write the p_lexicon to a JSON file
# with open('pp_lexicon/p_lexicon.json', 'w') as f:
#     json.dump(p_lexicon, f, indent=4, ensure_ascii=False)

In [85]:
with open('pp_lexicon/p_lexicon.json', 'r') as f:
    annotated_p_lex = json.load(f)


In [86]:
# check if annotated_p_lex value is empty string and calculate how many entries have empty values
# also print the empty token
empty_count = 0
empty_tokens = []
for token, entry in annotated_p_lex.items():
    if entry["isAtomicMorph"] == "" and entry["class"] == "" and entry["path_p_morphology"] == "" and entry["measure_allowed"] == "" and entry["spellOutHEAD"] == [""]:
        empty_count += 1
        empty_tokens.append(token)
print(f"Empty tokens: {empty_tokens[8:18]}")
print(f"Number of entries with empty values: {empty_count}")


Empty tokens: []
Number of entries with empty values: 0


In [87]:
list_pps = list(annotated_p_lex.keys())

In [88]:
with open('pp_lexicon/unique_tokens_to_decompose.json', 'r') as f:
    ut_decompose = json.load(f)

to_decompose = ut_decompose[0]['to_decompose'] 
atomic = ut_decompose[1]['atomic'] 

In [89]:
atomic

['a',
 'top',
 'by',
 'to',
 'but',
 'end',
 'for',
 'high',
 'via',
 'side',
 'back',
 'rear',
 'part',
 'back',
 'center',
 'with',
 'prior',
 'front',
 'opposite',
 'from',
 'beneath',
 'rear',
 'of',
 'far',
 'amid',
 'board',
 'next',
 'ward',
 'wards',
 'under',
 'neath',
 'skin',
 'flank',
 'plus',
 'middle',
 'rim',
 'face',
 'close',
 'amidst',
 'virtue']

In [90]:
decomposer_pps = list_pps.copy()
# drop values in decomposer_pps = ['astride', 'edge', 'on', 'to']
decomposer_pps = [pp for pp in decomposer_pps if pp not in ['astride', 'edge', 'on', 'to']]



In [91]:
def is_english_word(w):
    w = w.lower()
    return bool(wn.synsets(w))

def decompose_preposition(preposition, atomic_tokens, require_atomic_remainder=True):
   
    result = {}
    p = preposition.lower()
    atomic_lower = {tok.lower() for tok in atomic_tokens}

    for token in atomic_tokens:
        t = token.lower()
        # Skip trivial cases and tokens that don't even appear inside p
        if t == p:
            continue
        if t not in p:
            continue

        # Count how many times ‘t’ shows up in ‘p’
        count = p.count(t)  
        # Remove exactly one occurrence of t from p
        remainder = p.replace(t, "", 1)

        # Check remainder:
        if require_atomic_remainder:
            # We only accept remainders that are themselves in atomic_tokens:
            if remainder in atomic_lower:
                result[token] = {
                    'decomposition': [token, remainder],
                    'occurrence': count
                }
        else:
            # We accept any English‐word remainder
            if is_english_word(remainder):
                result[token] = {
                    'decomposition': [token, remainder],
                    'occurrence': count
                }

    return result

In [92]:
atomic

['a',
 'top',
 'by',
 'to',
 'but',
 'end',
 'for',
 'high',
 'via',
 'side',
 'back',
 'rear',
 'part',
 'back',
 'center',
 'with',
 'prior',
 'front',
 'opposite',
 'from',
 'beneath',
 'rear',
 'of',
 'far',
 'amid',
 'board',
 'next',
 'ward',
 'wards',
 'under',
 'neath',
 'skin',
 'flank',
 'plus',
 'middle',
 'rim',
 'face',
 'close',
 'amidst',
 'virtue']

In [93]:
to_decompose

['astride',
 'atop',
 'corner',
 'by',
 'edge',
 'onto',
 'after',
 'but',
 'end',
 'into',
 'upside',
 'for',
 'base',
 'higher',
 'bottom',
 'as',
 'via',
 'aside',
 'back',
 'nearest',
 'betwixt',
 'at',
 'ahead',
 'astern',
 'following',
 'center',
 'within',
 'prior',
 'without',
 'front',
 'before',
 'opposite',
 'from',
 'beneath',
 'means',
 'apart',
 'towards',
 'except',
 'heart',
 'afore',
 'tween',
 'aboard',
 'throughout',
 'foot',
 'nearer',
 'nigh',
 'alongside',
 'toward',
 'amongst',
 'adjacent',
 'underneath',
 'subsequent',
 'addition',
 'cross',
 'surface',
 'amidst',
 'underside']

In [94]:
decomposer_pps = list_pps.copy()


result_decompose = {}
for pp in to_decompose:
    comps = decompose_preposition(pp, atomic, require_atomic_remainder=False)
    if comps:
        result_decompose[pp] = comps

# turn it into a flat table
rows = []
for preposition, comps in result_decompose.items():
    for token, details in comps.items():
        rows.append({
            'preposition': preposition,
            'token': token,
            'decomposition': details['decomposition'],
            'occurrence': details['occurrence']
        })



In [95]:
# print result_decompose dict as pandas
df_decompose = pd.DataFrame(rows)
df_decompose = df_decompose.sort_values(by=['preposition', 'occurrence'], ascending=[True, False])


In [96]:
df_decompose
# drop df_decompose row by index
df_decompose = df_decompose.drop(index=[8,12, 6, 17, 19, 9])

In [97]:
df_decompose

Unnamed: 0,preposition,token,decomposition,occurrence
24,aboard,a,"[a, board]",2
25,aboard,board,"[board, a]",1
23,afore,a,"[a, fore]",1
13,ahead,a,"[a, head]",2
26,alongside,side,"[side, along]",1
30,amidst,a,"[a, midst]",1
20,apart,a,"[a, part]",2
21,apart,part,"[part, a]",1
10,aside,a,"[a, side]",1
11,aside,side,"[side, a]",1


In [98]:
decomposed = set(df_decompose['preposition'])
decomposed

{'aboard',
 'afore',
 'ahead',
 'alongside',
 'amidst',
 'apart',
 'aside',
 'astern',
 'astride',
 'atop',
 'beneath',
 'higher',
 'into',
 'onto',
 'surface',
 'toward',
 'towards',
 'underneath',
 'underside',
 'upside',
 'within',
 'without'}

In [99]:
with open('pp_lexicon/atomic_p.json', 'r') as f:
    atomic_p = json.load(f)
atomic_p.keys()

dict_keys(['behind', 'above', 'below', 'beyond', 'in front of', 'inside', 'outside', 'left', 'right', 'against', 'among', 'beside', 'between', 'near', 'next to', 'upon', 'across', 'along', 'around', 'over', 'past', 'through', 'under', 'up', 'down', 'on', 'off', 'in', 'out', 'away'])

In [100]:
# extract keys into a separate set so atomic_p (the dict) isn’t overwritten
atomic_p_keys = set(atomic_p.keys())
atomic_p_keys

{'above',
 'across',
 'against',
 'along',
 'among',
 'around',
 'away',
 'behind',
 'below',
 'beside',
 'between',
 'beyond',
 'down',
 'in',
 'in front of',
 'inside',
 'left',
 'near',
 'next to',
 'off',
 'on',
 'out',
 'outside',
 'over',
 'past',
 'right',
 'through',
 'under',
 'up',
 'upon'}

In [101]:
# match ut_decompose[1]['atomic'] with annotated_p_lex that is not empty entry
atomic_set = set(atomic)
annotated_atomic = set()
for token, entry in annotated_p_lex.items():
    if entry["isAtomicMorph"] != "" or entry["class"] != "" or entry["path_p_morphology"] != "" or entry["measure_allowed"] != "" or entry["spellOutHEAD"] != [""]:
        annotated_atomic.add(token)
annotated_atomic = annotated_atomic.intersection(atomic_set)
# annotated_atomic

# not annotated atomic tokens
not_annotated_atomic = atomic_set - annotated_atomic
not_annotated_atomic



{'neath', 'plus', 'under', 'virtue', 'ward', 'wards'}

In [102]:
annotated_atomic

{'a',
 'amid',
 'amidst',
 'back',
 'beneath',
 'board',
 'but',
 'by',
 'center',
 'close',
 'end',
 'face',
 'far',
 'flank',
 'for',
 'from',
 'front',
 'high',
 'middle',
 'next',
 'of',
 'opposite',
 'part',
 'prior',
 'rear',
 'rim',
 'side',
 'skin',
 'to',
 'top',
 'via',
 'with'}

In [103]:
ut_decompose[1]['atomic']

['a',
 'top',
 'by',
 'to',
 'but',
 'end',
 'for',
 'high',
 'via',
 'side',
 'back',
 'rear',
 'part',
 'back',
 'center',
 'with',
 'prior',
 'front',
 'opposite',
 'from',
 'beneath',
 'rear',
 'of',
 'far',
 'amid',
 'board',
 'next',
 'ward',
 'wards',
 'under',
 'neath',
 'skin',
 'flank',
 'plus',
 'middle',
 'rim',
 'face',
 'close',
 'amidst',
 'virtue']

In [104]:
not_annotated_atomic = not_annotated_atomic - atomic_p_keys
not_annotated_atomic

{'neath', 'plus', 'virtue', 'ward', 'wards'}

list of morphemes that are not annotated: {'neath', 'plus', 'virtue', 'ward', 'wards'}

# Decompose spatial prepositional phrases

In [105]:

df_pp_wordnet = pd.read_csv('dictionaries/pp_wordnet_dict_wiki_pop_fix.csv', sep=',')
spatial_df_pp_wordnet = df_pp_wordnet[df_pp_wordnet['is_spatial'] == True]
spatial_df_pp_wordnet.columns

Index(['Unnamed: 0', 'preposition', 'is_atomic', 'is_spatial', 'class',
       'transitivity', 'synonyms', 'antonyms', 'hypernym', 'hyponym',
       'meronym', 'holonym', 'supersense'],
      dtype='object')

In [106]:
spatial_df_pp_wordnet.drop(columns=['Unnamed: 0'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spatial_df_pp_wordnet.drop(columns=['Unnamed: 0'], inplace=True)


In [107]:
list_full_pp = list(spatial_df_pp_wordnet['preposition'])
len(list_full_pp)

133

In [108]:
intransitive_p = [
    "abroad",
    "adrift",
    "aft",
    "afterward",
    "afterwards",
    "ahead",
    "apart",
    "ashore",
    "aside",
    "away",
    "back",
    "backward",
    "backwards",
    "beforehand",
    "downhill",
    "downstage",
    "downstairs",
    "downstream",
    "downward",
    "downwards",
    "downwind",
    "east",
    "eastward(s)",
    "forth",
    "forward(s)",
    "heavenward",
    "hence",
    "henceforth",
    "here",
    "hereby",
    "herein",
    "hereof",
    "hereto",
    "herewith",
    "home",
    "homeward(s)",
    "indoors",
    "inward(s)",
    "leftward(s)",
    "north",
    "northeast",
    "northward(s)",
    "northwest",
    "now",
    "onward(s)",
    "outdoors",
    "outward(s)",
    "overboard",
    "overhead",
    "overland",
    "overseas",
    "rightward(s)",
    "seaward(s)",
    "skyward(s)",
    "south",
    "southeast",
    "southward(s)",
    "southwest",
    "then",
    "thence",
    "thenceforth",
    "there",
    "thereby",
    "therein",
    "thereof",
    "thereto",
    "therewith",
    "together",
    "underfoot",
    "underground",
    "uphill",
    "upstage",
    "upstairs",
    "upstream",
    "upward(s)",
    "upwind",
    "west",
    "westward(s)",
    "when",
    "whence",
    "where",
    "whereby",
    "wherein",
    "whereto",
    "wherewith"
]

processed_list = []

for item in intransitive_p:
    if "(s)" in item:
        processed_list.append(item.replace("(s)", ""))
        processed_list.append(item.replace("(s)", "s"))
    else:
        processed_list.append(item)


intransitive_p = processed_list

intransitive_p

['abroad',
 'adrift',
 'aft',
 'afterward',
 'afterwards',
 'ahead',
 'apart',
 'ashore',
 'aside',
 'away',
 'back',
 'backward',
 'backwards',
 'beforehand',
 'downhill',
 'downstage',
 'downstairs',
 'downstream',
 'downward',
 'downwards',
 'downwind',
 'east',
 'eastward',
 'eastwards',
 'forth',
 'forward',
 'forwards',
 'heavenward',
 'hence',
 'henceforth',
 'here',
 'hereby',
 'herein',
 'hereof',
 'hereto',
 'herewith',
 'home',
 'homeward',
 'homewards',
 'indoors',
 'inward',
 'inwards',
 'leftward',
 'leftwards',
 'north',
 'northeast',
 'northward',
 'northwards',
 'northwest',
 'now',
 'onward',
 'onwards',
 'outdoors',
 'outward',
 'outwards',
 'overboard',
 'overhead',
 'overland',
 'overseas',
 'rightward',
 'rightwards',
 'seaward',
 'seawards',
 'skyward',
 'skywards',
 'south',
 'southeast',
 'southward',
 'southwards',
 'southwest',
 'then',
 'thence',
 'thenceforth',
 'there',
 'thereby',
 'therein',
 'thereof',
 'thereto',
 'therewith',
 'together',
 'underfoot'

In [109]:
# add intransitive_p to spatial_df_pp_wordnet
spatial_df_pp_wordnet[[
    'preposition', 'is_spatial', 'is_atomic', 'transitivity',
]].head()

Unnamed: 0,preposition,is_spatial,is_atomic,transitivity
0,a,True,True,
1,aboard,True,True,both
3,above,True,True,both
6,across,True,True,both
7,adjacent,True,True,


In [110]:
# add entries to spatial_df_pp_wordnet
for pp in intransitive_p:
    if pp not in spatial_df_pp_wordnet['preposition'].values:
        new_row = pd.Series({
            'preposition': pp,
            'is_spatial': True,
            'is_atomic': False,
            'transitivity': 'intransitive',
            'is_conjunctive': None
        })
        spatial_df_pp_wordnet = pd.concat(
            [spatial_df_pp_wordnet, new_row.to_frame().T],
            ignore_index=True
        )

In [111]:
spatial_df_pp_wordnet[spatial_df_pp_wordnet['preposition'].isin(intransitive_p)]

Unnamed: 0,preposition,is_atomic,is_spatial,class,transitivity,synonyms,antonyms,hypernym,hyponym,meronym,holonym,supersense,is_conjunctive
9,ahead,TRUE,True,,intransitive,,,,,,,,
17,apart,TRUE,True,,intransitive,,,,,,,,
21,aside,TRUE,True,,,,,,,,,,
30,back,TRUE,True,,,,,,,,,,
132,away,TRUE,True,PARTICLE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,where,False,True,,intransitive,,,,,,,,
223,whereby,False,True,,intransitive,,,,,,,,
224,wherein,False,True,,intransitive,,,,,,,,
225,whereto,False,True,,intransitive,,,,,,,,


In [112]:
len(spatial_df_pp_wordnet)

227

In [113]:
# df_pp_wordnet['transitivity'] == 'intransitive'
spatial_df_pp_wordnet['transitivity'].value_counts()

# save the spatial_df_pp_wordnet to a csv file
# spatial_df_pp_wordnet.to_csv('dictionaries/pp_wordnet_wiki_pop_fix.csv', index=False)

transitivity
intransitive    97
both            25
transitive      16
Name: count, dtype: int64

In [114]:
conjunctive_p = [
    "after",
    "although",
    "as",
    "at",
    "because",
    "before",
    "beside",
    "besides",
    "between",
    "by",
    "considering",
    "despite",
    "except",
    "for",
    "from",
    "given",
    "granted",
    "if",
    "into",
    "lest",
    "like",
    "notwithstanding",
    "now",
    "of",
    "on",
    "once",
    "provided",
    "providing",
    "save",
    "seeing",
    "since",
    "so",
    "supposing",
    "than",
    "though",
    "till",
    "to",
    "unless",
    "until",
    "upon",
    "when",
    "whenever",
    "where",
    "whereas",
    "wherever",
    "while",
    "whilst",
    "with",
    "without"
]


In [115]:
spatial_df_pp_wordnet.head()

Unnamed: 0,preposition,is_atomic,is_spatial,class,transitivity,synonyms,antonyms,hypernym,hyponym,meronym,holonym,supersense,is_conjunctive
0,a,True,True,,,,,,,,,,
1,aboard,True,True,,both,"alongside, on base, on board",,,,,,,
2,above,True,True,PROJECTIVE,both,"higher up, in a higher place, supra, to a high...",below,"section, subdivision",,,,,
3,across,True,True,EXTENDED,both,"crossways, crosswise",,,,,,,
4,adjacent,True,True,,,,,,,,,,


In [116]:
# add entries to spatial_df_pp_wordnet
for pp in conjunctive_p:
    if pp not in spatial_df_pp_wordnet['preposition'].values:
        new_row = pd.Series({
            'preposition': pp,
            'is_spatial': True,
            'is_atomic': False,
            'transitivity': 'transitive',
            'is_conjunctive': True
        })
        spatial_df_pp_wordnet = pd.concat(
            [spatial_df_pp_wordnet, new_row.to_frame().T],
            ignore_index=True
        )

# mark all others as not conjunctive
# mask = spatial_df_pp_wordnet['preposition'].isin(conjunctive_p)
# spatial_df_pp_wordnet.loc[~mask, 'is_conjunctive'] = False

In [117]:
spatial_df_pp_wordnet.head()

Unnamed: 0,preposition,is_atomic,is_spatial,class,transitivity,synonyms,antonyms,hypernym,hyponym,meronym,holonym,supersense,is_conjunctive
0,a,True,True,,,,,,,,,,
1,aboard,True,True,,both,"alongside, on base, on board",,,,,,,
2,above,True,True,PROJECTIVE,both,"higher up, in a higher place, supra, to a high...",below,"section, subdivision",,,,,
3,across,True,True,EXTENDED,both,"crossways, crosswise",,,,,,,
4,adjacent,True,True,,,,,,,,,,


In [118]:
spatial_df_pp_wordnet['is_conjunctive'].value_counts()

is_conjunctive
True    30
Name: count, dtype: int64

In [125]:
spatial_df_pp_wordnet['is_spatial'].value_counts()

is_spatial
True    257
Name: count, dtype: int64

In [132]:
spatial_df_pp_wordnet.loc[
	spatial_df_pp_wordnet['preposition'] == 'according to',
	'preposition'
]

Series([], Name: preposition, dtype: object)

In [120]:
p_plus_p = [
    "across from",
    "ahead of",
    "along with",
    "apart from",
    "as for",
    "as from",
    "as of",
    "as per",
    "as regards",
    "as to",
    "aside from",
    "away from",
    "back to",
    "counter to",
    "in between",
    "near to",
    "next to",
    "opposite of",
    "out from",
    "out of",
    "outside of",
    "round about",
    "up against",
    "up to",
    "close to",
    "due to",
    "far from",
    "prior to",
    "pursuant to",
    "rather than",
    "subsequent to",
    
]


In [121]:
p_article_noun_p = [
    "by way of",
    "for lack of",
    "from want of",
    "for want of",
    "in contact with",
    "in line with",
    "in place of",
    "in point of",
    "in relation to",
    "with regard to",
    "in regard to",
    "with respect to",
    "in respect to",
    "in touch with",
    "on grounds of",
    "on the part of",
    "on top of",
    "with a view to",
    
]


In [122]:
# spatial_df_pp_wordnet['is_conjunctive'] turn to false if not true

In [134]:
spatial_df_pp_wordnet['is_spatial'].value_counts()

is_spatial
True    257
Name: count, dtype: int64

In [124]:
# # make conjunctive_p p_plus_p p_article_noun_p to json file named complex_pp.json
# complex_pp = {
#     "conjunctive_p": conjunctive_p,
#     "p_plus_p": p_plus_p,
#     "p_article_noun_p": p_article_noun_p
# }
# with open('pp_lexicon/complex_pp_repop.json', 'w') as f:
#     json.dump(complex_pp, f, indent=4, ensure_ascii=False)