In [110]:
import pandas
import json
import csv
from nltk.corpus import wordnet as wn


In [None]:
with open('atomic_p.json', 'r') as f:
    pp = json.load(f)
    
# print(pp['atomic_p'].keys())
# sort pp['atomic_p'].keys() alphabetically
keys_sorted = sorted(pp['atomic_p'].keys())
print(keys_sorted)


['above', 'across', 'against', 'along', 'among', 'around', 'away', 'behind', 'below', 'beside', 'between', 'beyond', 'down', 'in', 'in front of', 'inside', 'left', 'near', 'next to', 'off', 'on', 'out', 'outside', 'over', 'past', 'right', 'through', 'under', 'up', 'upon']


In [132]:
len(keys_sorted)

30

In [143]:
synonyms_atomic_p = {}
for key in keys_sorted:
    # collect all lemma names in a flat list
    all_lemmas = []
    for syn in wn.synsets(key):
        all_lemmas.extend(syn.lemma_names())
    # remove duplicates while preserving order
    synonyms_atomic_p[key] = list(dict.fromkeys(all_lemmas))

# inspect result
synonyms_atomic_p


{'above': ['above',
  'supra',
  'higher_up',
  'in_a_higher_place',
  'to_a_higher_place'],
 'across': ['across', 'crosswise', 'crossways'],
 'against': [],
 'along': ['along', 'on'],
 'among': [],
 'around': ['about',
  'around',
  'approximately',
  'close_to',
  'just_about',
  'some',
  'roughly',
  'more_or_less',
  'or_so',
  'round'],
 'away': ['away', 'outside', 'off', 'forth', 'out', 'aside', 'by'],
 'behind': ['buttocks',
  'nates',
  'arse',
  'butt',
  'backside',
  'bum',
  'buns',
  'can',
  'fundament',
  'hindquarters',
  'hind_end',
  'keister',
  'posterior',
  'prat',
  'rear',
  'rear_end',
  'rump',
  'stern',
  'seat',
  'tail',
  'tail_end',
  'tooshie',
  'tush',
  'bottom',
  'behind',
  'derriere',
  'fanny',
  'ass',
  'slow',
  'behindhand',
  'in_arrears'],
 'below': ['below',
  'at_a_lower_place',
  'to_a_lower_place',
  'beneath',
  'infra',
  'downstairs',
  'down_the_stairs',
  'on_a_lower_floor',
  'under'],
 'beside': [],
 'between': ['between', 'bet

In [142]:
# # synonyms atomic_p to json
# with open('synonyms_atomic_p.json', 'w') as f:
#     json.dump(synonyms_atomic_p, f, indent=4)

In [10]:
def get_atomic_p_prop(prop='', counter=5):
# pp is a dict, access preposition as key
    try:
        if prop is not None and prop in ['isAtomicMorph', 'class', 'spellOutHEAD', 'path_p_morphology', 'measure_allowed']:
            for key, value in pp['atomic_p'].items():
                # print(f"key: {key}")
                for el in value:
                    if el == prop:
                        print(f"{key}: {pp['atomic_p'][key][el]} ")
                        counter += 1
                        if counter == 5:
                            break
    except KeyError as e:
        print(f"KeyError: {e} not found in atomic_p")

In [6]:
pp_wordnet_wiki_pop = []
with open('dictionaries/pp_wordnet_wiki_pop.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, quotechar='|', dialect='excel')
    for row in reader:
        if row['preposition'] == '':
            continue
        pp_wordnet_wiki_pop.append({
            'preposition': row['preposition'],
            'isAtomic': row.get('is_atomic'),
            'isSpatial': row.get('is_spatial')
        })

In [None]:
len(pp_wordnet_wiki_pop)

[{'preposition': 'a', 'isAtomic': 'TRUE', 'isSpatial': 'TRUE'},
 {'preposition': 'aboard', 'isAtomic': 'TRUE', 'isSpatial': 'TRUE'},
 {'preposition': 'about', 'isAtomic': 'FALSE', 'isSpatial': 'FALSE'},
 {'preposition': 'above', 'isAtomic': 'TRUE', 'isSpatial': 'TRUE'},
 {'preposition': 'absent', 'isAtomic': 'FALSE', 'isSpatial': 'FALSE'}]

In [133]:
unique_tokens = set() # unique token is defined as set of unique words in preposition
def tokenize_preposition(preposition):
    return preposition.split(' ')

# Example usage
for pp in pp_wordnet_wiki_pop:
    tokens = tokenize_preposition(pp['preposition'])
    # print(f"Tokens for '{pp['preposition']}': {tokens}")    
    unique_tokens.update(tokens)

print(f"length of unique_tokens before: {len(unique_tokens)}")

c = 0
unique_tokens_copy = unique_tokens.copy()
for k in keys_sorted:
    if k in unique_tokens_copy:
        unique_tokens_copy.remove(k)
        c += 1
        # print(f"{k} is in unique tokens")
    else:
        # print(f"{k} is NOT in unique tokens")
        continue
print(f"numbers of non overlapping unique_tokens: {len(unique_tokens_copy)}")

for k in unique_tokens_copy:
    print(k)

length of unique_tokens before: 153
numbers of non overlapping unique_tokens: 125
back
base
opposed
well
astride
adjacent
center
with
respect
higher
top
addition
front
except
heart
behest
underside
atop
concerning
of
nigh
opposite
point
regardless
means
ahead
from
most
corner
into
within
behalf
virtue
circa
following
besides
minus
regard
view
account
face
aboard
save
to
via
aside
middle
versus
nearest
bottom
end
afore
flank
apart
soon
subsequent
least
instead
notwithstanding
far
foot
worth
cross
such
tween
onto
per
edge
regards
throughout
plus
before
prior
thanks
underneath
case
pursuant
rather
for
rear
betwixt
sake
during
despite
including
spite
according
side
a
after
owing
rim
next
skin
towards
at
until
but
upside
nearer
amongst
by
without
because
since
due
as
beneath
less
like
surface
astern
absent
close
about
the
core
toward
amid
amidst
alongside
place
accordance
lieu
than


In [111]:
def is_english_word(w):
    w = w.lower()
    return bool(wn.synsets(w))

In [None]:
def decompose_preposition(preposition, unique_tokens, method='substring'):
    
    result = {}
    p = preposition.lower()

    if method == 'substring':
        for token in unique_tokens:
            t = token.lower()
            if p not in t:
                continue

            count = t.count(p)

            remainder = t.replace(p, "", 1)

            # if remainder == '':
            #     continue
            if is_english_word(p) and is_english_word(remainder):
                result[token] = {
                    'decomposition': [p, remainder],
                    'occurrence': count
            }

        return result
    

    return result

In [56]:
atomic = list(pp['atomic_p'].keys())

In [107]:
# collect all decompositions
result_decompose = {}
for pp in atomic:
    if pp in unique_tokens:
        comps = decompose_preposition(pp, unique_tokens_copy, method='substring')
        if comps:
            result_decompose[pp] = comps

# turn it into a flat table
rows = []
for preposition, comps in result_decompose.items():
    for token, details in comps.items():
        rows.append({
            'preposition': preposition,
            'token': token,
            'decomposition': details['decomposition'],
            'occurrence': details['occurrence']
        })

# dataFrame of all decompositions
df_decompose = pandas.DataFrame(rows)
df_decompose


Unnamed: 0,preposition,token,decomposition,occurrence
0,near,nearest,"[near, est]",1
1,near,nearer,"[near, er]",1
2,along,alongside,"[along, side]",1
3,through,throughout,"[through, out]",1
4,under,underside,"[under, side]",1
5,up,upside,"[up, side]",1
6,on,soon,"[on, so]",1
7,in,point,"[in, pot]",1
8,in,minus,"[in, mus]",1
9,in,instead,"[in, stead]",1


In [122]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# Create instances of each stemmer
porter   = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [129]:
stemmer = [porter, lancaster, snowball]
# build mapping token -> stem for each stemmer
def build_stem_mapping(tokens, stemmer):
    stem_mapping = {}
    for token in tokens:
        stem = stemmer.stem(token)
        if stem not in stem_mapping:
            stem_mapping[stem] = []
        stem_mapping[stem].append(token)
    return stem_mapping

df_stem_porter = build_stem_mapping(unique_tokens, porter)
df_stem_lancaster = build_stem_mapping(unique_tokens, lancaster)
df_stem_snowball = build_stem_mapping(unique_tokens, snowball)
# Convert the stem mappings to DataFrames
df_stem_porter = pandas.DataFrame.from_dict(df_stem_porter, orient='index').reset_index()

print(df_stem_porter.to_string())


            index                0           1
0            back             back        None
1            base             base        None
2           oppos          opposed        None
3          behind           behind        None
4            past             past        None
5            well             well        None
6          astrid          astride        None
7           adjac         adjacent        None
8          center           center        None
9            with             with        None
10        respect          respect        None
11          along            along        None
12            out              out        None
13         higher           higher        None
14            top              top        None
15          addit         addition        None
16          front            front        None
17         except           except        None
18          heart            heart        None
19         behest           behest        None
20       unde

In [130]:
df_stem_lancaster

{'back': ['back'],
 'bas': ['base'],
 'oppos': ['opposed'],
 'behind': ['behind'],
 'past': ['past'],
 'wel': ['well'],
 'astrid': ['astride'],
 'adjac': ['adjacent'],
 'cent': ['center'],
 'with': ['with'],
 'respect': ['respect'],
 'along': ['along'],
 'out': ['out'],
 'high': ['higher'],
 'top': ['top'],
 'addit': ['addition'],
 'front': ['front'],
 'exceiv': ['except'],
 'heart': ['heart'],
 'behest': ['behest'],
 'undersid': ['underside'],
 'besid': ['beside', 'besides'],
 'atop': ['atop'],
 'concern': ['concerning'],
 'of': ['of'],
 'nigh': ['nigh'],
 'opposit': ['opposite'],
 'point': ['point'],
 'regardless': ['regardless'],
 'mean': ['means'],
 'ahead': ['ahead'],
 'from': ['from'],
 'most': ['most'],
 'corn': ['corner'],
 'into': ['into'],
 'within': ['within'],
 'behalf': ['behalf'],
 'virtu': ['virtue'],
 'circ': ['circa'],
 'follow': ['following'],
 'min': ['minus'],
 'ov': ['over'],
 'regard': ['regard', 'regards'],
 'down': ['down'],
 'near': ['near', 'nearer'],
 'view':

In [131]:
df_stem_snowball

{'back': ['back'],
 'base': ['base'],
 'oppos': ['opposed'],
 'behind': ['behind'],
 'past': ['past'],
 'well': ['well'],
 'astrid': ['astride'],
 'adjac': ['adjacent'],
 'center': ['center'],
 'with': ['with'],
 'respect': ['respect'],
 'along': ['along'],
 'out': ['out'],
 'higher': ['higher'],
 'top': ['top'],
 'addit': ['addition'],
 'front': ['front'],
 'except': ['except'],
 'heart': ['heart'],
 'behest': ['behest'],
 'undersid': ['underside'],
 'besid': ['beside', 'besides'],
 'atop': ['atop'],
 'concern': ['concerning'],
 'of': ['of'],
 'nigh': ['nigh'],
 'opposit': ['opposite'],
 'point': ['point'],
 'regardless': ['regardless'],
 'mean': ['means'],
 'ahead': ['ahead'],
 'from': ['from'],
 'most': ['most'],
 'corner': ['corner'],
 'into': ['into'],
 'within': ['within'],
 'behalf': ['behalf'],
 'virtu': ['virtue'],
 'circa': ['circa'],
 'follow': ['following'],
 'minus': ['minus'],
 'over': ['over'],
 'regard': ['regard', 'regards'],
 'down': ['down'],
 'near': ['near'],
 'vie