In [4]:
import pandas
import json
import csv
from nltk.corpus import wordnet as wn
from nltk import ngrams
from collections import Counter

In [7]:
with open('pp_lexicon/atomic_p.json', 'r') as f:
    pp = json.load(f)
    
# sort pp['atomic_p'].keys() alphabetically
keys_sorted = sorted(pp['atomic_p'].keys())
print(keys_sorted)


['above', 'across', 'against', 'along', 'among', 'around', 'away', 'behind', 'below', 'beside', 'between', 'beyond', 'down', 'in', 'in front of', 'inside', 'left', 'near', 'next to', 'off', 'on', 'out', 'outside', 'over', 'past', 'right', 'through', 'under', 'up', 'upon']


In [9]:
len(keys_sorted)

30

In [10]:
synonyms_atomic_p = {}
for key in keys_sorted:
    # collect all lemma names in a flat list
    all_lemmas = []
    for syn in wn.synsets(key):
        all_lemmas.extend(syn.lemma_names())
    # remove duplicates while preserving order
    synonyms_atomic_p[key] = list(dict.fromkeys(all_lemmas))

# inspect result
synonyms_atomic_p


{'above': ['above',
  'supra',
  'higher_up',
  'in_a_higher_place',
  'to_a_higher_place'],
 'across': ['across', 'crosswise', 'crossways'],
 'against': [],
 'along': ['along', 'on'],
 'among': [],
 'around': ['about',
  'around',
  'approximately',
  'close_to',
  'just_about',
  'some',
  'roughly',
  'more_or_less',
  'or_so',
  'round'],
 'away': ['away', 'outside', 'off', 'forth', 'out', 'aside', 'by'],
 'behind': ['buttocks',
  'nates',
  'arse',
  'butt',
  'backside',
  'bum',
  'buns',
  'can',
  'fundament',
  'hindquarters',
  'hind_end',
  'keister',
  'posterior',
  'prat',
  'rear',
  'rear_end',
  'rump',
  'stern',
  'seat',
  'tail',
  'tail_end',
  'tooshie',
  'tush',
  'bottom',
  'behind',
  'derriere',
  'fanny',
  'ass',
  'slow',
  'behindhand',
  'in_arrears'],
 'below': ['below',
  'at_a_lower_place',
  'to_a_lower_place',
  'beneath',
  'infra',
  'downstairs',
  'down_the_stairs',
  'on_a_lower_floor',
  'under'],
 'beside': [],
 'between': ['between', 'bet

In [None]:
# # synonyms atomic_p to json
# with open('synonyms_atomic_p.json', 'w') as f:
#     json.dump(synonyms_atomic_p, f, indent=4)

In [11]:
def get_atomic_p_prop(prop='', counter=5):
# pp is a dict, access preposition as key
    try:
        if prop is not None and prop in ['isAtomicMorph', 'class', 'spellOutHEAD', 'path_p_morphology', 'measure_allowed']:
            for key, value in pp['atomic_p'].items():
                # print(f"key: {key}")
                for el in value:
                    if el == prop:
                        print(f"{key}: {pp['atomic_p'][key][el]} ")
                        counter += 1
                        if counter == 5:
                            break
    except KeyError as e:
        print(f"KeyError: {e} not found in atomic_p")

In [16]:
pp_wordnet_wiki_pop = []
with open('dictionaries/pp_wordnet_wiki_pop.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, quotechar='|', dialect='excel')
    for row in reader:
        if row['preposition'] == '':
            continue
        pp_wordnet_wiki_pop.append({
            'preposition': row['preposition'],
            'isAtomic': row.get('is_atomic'),
            'isSpatial': row.get('is_spatial')
        })

In [17]:
len(pp_wordnet_wiki_pop)

186

In [18]:
pp_wordnet_wiki_pop_spatial = [pp for pp in pp_wordnet_wiki_pop if pp['isSpatial'] == 'TRUE']
len(pp_wordnet_wiki_pop_spatial)


135

## Get atomic morphemes from unique tokens

Unique tokens from the list of wikipedia, wordnet, and dictionaries preposition phrase individual words (as token).

In [19]:
unique_tokens = set() # unique token is defined as set of unique words in preposition
def tokenize_preposition(preposition):
    return preposition.split(' ')

# Example usage
for pp in pp_wordnet_wiki_pop_spatial:
    tokens = tokenize_preposition(pp['preposition'])
    # print(f"Tokens for '{pp['preposition']}': {tokens}")    
    unique_tokens.update(tokens)

print(f"length of unique_tokens before: {len(unique_tokens)}")

c = 0
unique_tokens_copy = unique_tokens.copy()
for k in keys_sorted:
    if k in unique_tokens_copy:
        unique_tokens_copy.remove(k)
        c += 1
        # print(f"{k} is in unique tokens")
    else:
        # print(f"{k} is NOT in unique tokens")
        continue
print(f"numbers of non overlapping unique_tokens: {len(unique_tokens_copy)}")

for k in unique_tokens_copy:
    print(k)

length of unique_tokens before: 106
numbers of non overlapping unique_tokens: 78
top
astride
atop
corner
by
edge
onto
after
but
end
into
upside
for
place
base
higher
bottom
as
via
aside
back
nearest
betwixt
at
ahead
astern
following
center
within
prior
without
front
before
opposite
from
beneath
with
rear
means
apart
towards
except
heart
afore
tween
aboard
throughout
foot
nearer
nigh
alongside
virtue
next
toward
amongst
side
the
of
far
amid
adjacent
underneath
skin
flank
plus
to
subsequent
addition
cross
surface
a
amidst
underside
middle
core
rim
face
close


In [None]:
# export unique_tokens_copy as json
# with open('pp_lexicon/unique_tokens_copy.json', 'w') as f:
#     json.dump(list(unique_tokens_copy), f, indent=4)

## Decompose p

In [23]:
def is_english_word(w):
    w = w.lower()
    return bool(wn.synsets(w))

In [24]:
def decompose_preposition(preposition, unique_tokens, method='substring'):
    
    result = {}
    p = preposition.lower()

    if method == 'substring':
        for token in unique_tokens:
            t = token.lower()
            if p not in t:
                continue

            count = t.count(p)

            remainder = t.replace(p, "", 1)

            # if remainder == '':
            #     continue
            if is_english_word(p) and is_english_word(remainder):
                result[token] = {
                    'decomposition': [p, remainder],
                    'occurrence': count
            }

        return result
    
        # if method == 'find_bigram':
    

    return result

In [46]:
# atomic = list(keys_sorted['atomic_p'].keys())
atomic = keys_sorted
atomic

['above',
 'across',
 'against',
 'along',
 'among',
 'around',
 'away',
 'behind',
 'below',
 'beside',
 'between',
 'beyond',
 'down',
 'in',
 'in front of',
 'inside',
 'left',
 'near',
 'next to',
 'off',
 'on',
 'out',
 'outside',
 'over',
 'past',
 'right',
 'through',
 'under',
 'up',
 'upon']

In [47]:
unique_tokens_copy

{'a',
 'aboard',
 'addition',
 'adjacent',
 'afore',
 'after',
 'ahead',
 'alongside',
 'amid',
 'amidst',
 'amongst',
 'apart',
 'as',
 'aside',
 'astern',
 'astride',
 'at',
 'atop',
 'back',
 'base',
 'before',
 'beneath',
 'betwixt',
 'bottom',
 'but',
 'by',
 'center',
 'close',
 'core',
 'corner',
 'cross',
 'edge',
 'end',
 'except',
 'face',
 'far',
 'flank',
 'following',
 'foot',
 'for',
 'from',
 'front',
 'heart',
 'higher',
 'into',
 'means',
 'middle',
 'nearer',
 'nearest',
 'next',
 'nigh',
 'of',
 'onto',
 'opposite',
 'place',
 'plus',
 'prior',
 'rear',
 'rim',
 'side',
 'skin',
 'subsequent',
 'surface',
 'the',
 'throughout',
 'to',
 'top',
 'toward',
 'towards',
 'tween',
 'underneath',
 'underside',
 'upside',
 'via',
 'virtue',
 'with',
 'within',
 'without'}

In [26]:
# collect all decompositions
result_decompose = {}
for pp in atomic:
    if pp in unique_tokens:
        comps = decompose_preposition(pp, unique_tokens_copy, method='substring')
        if comps:
            result_decompose[pp] = comps

# turn it into a flat table
rows = []
for preposition, comps in result_decompose.items():
    for token, details in comps.items():
        rows.append({
            'preposition': preposition,
            'token': token,
            'decomposition': details['decomposition'],
            'occurrence': details['occurrence']
        })

# dataFrame of all decompositions
df_decompose = pandas.DataFrame(rows)
df_decompose


Unnamed: 0,preposition,token,decomposition,occurrence
0,along,alongside,"[along, side]",1
1,near,nearest,"[near, est]",1
2,near,nearer,"[near, er]",1
3,out,throughout,"[out, through]",1
4,through,throughout,"[through, out]",1
5,under,underside,"[under, side]",1
6,up,upside,"[up, side]",1


In [382]:
# remainder_decomp = [el[1] for el in df_decompose['decomposition']]
# remainder_decomp = set(remainder_decomp)
# to_remove = {'so', 'pot', 'mus', 'pot', 'mus', 'ab'} #particles and morphemes that isnt valid
# remainder_decomp = remainder_decomp - to_remove
# remainder_decomp

{'er', 'est', 'out', 'side', 'through'}

In [42]:
# get all tokens in df_decompose
tokens_decompose = set()
for token in df_decompose['token']:
    tokens_decompose.add(token)

unique_tokens_not_decomposed = unique_tokens_copy - tokens_decompose
print(f"length of unique_tokens_not_decomposed: {len(unique_tokens_not_decomposed)}")
unique_tokens_not_decomposed

length of unique_tokens_not_decomposed: 72


{'a',
 'aboard',
 'addition',
 'adjacent',
 'afore',
 'after',
 'ahead',
 'amid',
 'amidst',
 'amongst',
 'apart',
 'as',
 'aside',
 'astern',
 'astride',
 'at',
 'atop',
 'back',
 'base',
 'before',
 'beneath',
 'betwixt',
 'bottom',
 'but',
 'by',
 'center',
 'close',
 'core',
 'corner',
 'cross',
 'edge',
 'end',
 'except',
 'face',
 'far',
 'flank',
 'following',
 'foot',
 'for',
 'from',
 'front',
 'heart',
 'higher',
 'into',
 'means',
 'middle',
 'next',
 'nigh',
 'of',
 'onto',
 'opposite',
 'place',
 'plus',
 'prior',
 'rear',
 'rim',
 'side',
 'skin',
 'subsequent',
 'surface',
 'the',
 'to',
 'top',
 'toward',
 'towards',
 'tween',
 'underneath',
 'via',
 'virtue',
 'with',
 'within',
 'without'}

There are 114 unique tokens of prepositional phrase that are not decomposed by atomic_ps.

## Stemming for checking atomic elements

In [28]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# Create instances of each stemmer
porter   = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [29]:
stemmer = [porter, lancaster, snowball]
# build mapping token -> stem for each stemmer
def build_stem_mapping(tokens, stemmer):
    stem_mapping = {}
    for token in tokens:
        stem = stemmer.stem(token)
        if stem not in stem_mapping:
            stem_mapping[stem] = []
        stem_mapping[stem].append(token)
    return stem_mapping

df_stem_porter = build_stem_mapping(unique_tokens, porter)
df_stem_lancaster = build_stem_mapping(unique_tokens, lancaster)
df_stem_snowball = build_stem_mapping(unique_tokens, snowball)
# Convert the stem mappings to DataFrames
df_stem_porter = pandas.DataFrame.from_dict(df_stem_porter, orient='index').reset_index()

print(df_stem_porter.to_string())


          index           0        1
0          away        away     None
1           top         top     None
2            by          by     None
3           edg        edge     None
4         after       after     None
5           but         but     None
6           end         end     None
7        outsid     outside     None
8         among       among     None
9          into        into     None
10        along       along     None
11        upsid      upside     None
12          for         for     None
13        place       place     None
14       bottom      bottom     None
15         asid       aside     None
16      betwixt     betwixt     None
17           at          at     None
18        ahead       ahead     None
19       astern      astern     None
20       center      center     None
21      without     without     None
22        front       front     None
23         near        near     None
24         with        with     None
25         rear        rear     None
2

In [30]:
df_stem_lancaster

{'away': ['away'],
 'top': ['top'],
 'by': ['by'],
 'edg': ['edge'],
 'aft': ['after'],
 'but': ['but'],
 'end': ['end'],
 'outsid': ['outside'],
 'among': ['among'],
 'into': ['into'],
 'along': ['along'],
 'upsid': ['upside'],
 'for': ['for'],
 'plac': ['place'],
 'bottom': ['bottom'],
 'asid': ['aside'],
 'betwixt': ['betwixt'],
 'at': ['at'],
 'ahead': ['ahead'],
 'astern': ['astern'],
 'cent': ['center'],
 'without': ['without'],
 'front': ['front'],
 'near': ['near', 'nearer'],
 'with': ['with'],
 'rear': ['rear'],
 'apart': ['apart'],
 'exceiv': ['except'],
 'past': ['past'],
 'off': ['off'],
 'tween': ['tween'],
 'throughout': ['throughout'],
 'alongsid': ['alongside'],
 'toward': ['toward', 'towards'],
 'amongst': ['amongst'],
 'sid': ['side'],
 'left': ['left'],
 'through': ['through'],
 'between': ['between'],
 'plu': ['plus'],
 'to': ['to'],
 'subsequ': ['subsequent'],
 'addit': ['addition'],
 'cross': ['cross'],
 'a': ['a'],
 'up': ['up'],
 'amidst': ['amidst'],
 'undersid

In [31]:
df_stem_snowball

{'away': ['away'],
 'top': ['top'],
 'by': ['by'],
 'edg': ['edge'],
 'after': ['after'],
 'but': ['but'],
 'end': ['end'],
 'outsid': ['outside'],
 'among': ['among'],
 'into': ['into'],
 'along': ['along'],
 'upsid': ['upside'],
 'for': ['for'],
 'place': ['place'],
 'bottom': ['bottom'],
 'asid': ['aside'],
 'betwixt': ['betwixt'],
 'at': ['at'],
 'ahead': ['ahead'],
 'astern': ['astern'],
 'center': ['center'],
 'without': ['without'],
 'front': ['front'],
 'near': ['near'],
 'with': ['with'],
 'rear': ['rear'],
 'apart': ['apart'],
 'except': ['except'],
 'past': ['past'],
 'off': ['off'],
 'tween': ['tween'],
 'throughout': ['throughout'],
 'alongsid': ['alongside'],
 'toward': ['toward', 'towards'],
 'amongst': ['amongst'],
 'side': ['side'],
 'left': ['left'],
 'through': ['through'],
 'between': ['between'],
 'plus': ['plus'],
 'to': ['to'],
 'subsequ': ['subsequent'],
 'addit': ['addition'],
 'cross': ['cross'],
 'a': ['a'],
 'up': ['up'],
 'amidst': ['amidst'],
 'undersid': 

## Get atomic morphemes with ngram

In [44]:
c = 0
print("Unique tokens not decomposed:")
for i in unique_tokens_not_decomposed:
    print(i)
    c += 1
    if c == 10:
        break

Unique tokens not decomposed:
top
atop
corner
by
edge
onto
after
but
end
into


In [33]:
def get_char_ngrams(tokens, n):

    ngrams_list = []
    ngram_map = {}
    
    for token in tokens:
        for gram in ngrams(list(token), n):
            # check if gram is valid suffix or prefix in english with wordnet
            
            
            ngrams_list.append(''.join(gram))
            
            g = ''.join(gram)
            ngram_map.setdefault(g, []).append(token)
    return ngrams_list, ngram_map

In [34]:
# get bigrams of letters from each token
bigrams_list, bigram_map = get_char_ngrams(unique_tokens, 2)
bigram_counts = Counter(bigrams_list)
most_common_bigrams = bigram_counts.most_common(20)

for bigram, count in most_common_bigrams:
    mapping = bigram_map.get(bigram, [])
    print(f"(count: {count}) | {bigram}: {mapping} ")

(count: 12) | id: ['outside', 'upside', 'aside', 'alongside', 'side', 'amidst', 'underside', 'beside', 'inside', 'amid', 'middle', 'astride'] 
(count: 12) | de: ['outside', 'upside', 'aside', 'alongside', 'side', 'underside', 'underside', 'under', 'beside', 'inside', 'underneath', 'astride'] 
(count: 11) | ar: ['near', 'rear', 'apart', 'toward', 'nearest', 'towards', 'heart', 'aboard', 'nearer', 'around', 'far'] 
(count: 10) | er: ['after', 'astern', 'center', 'underside', 'over', 'corner', 'higher', 'under', 'nearer', 'underneath'] 
(count: 10) | on: ['among', 'along', 'front', 'alongside', 'amongst', 'addition', 'beyond', 'onto', 'on', 'upon'] 
(count: 9) | si: ['outside', 'upside', 'aside', 'alongside', 'side', 'underside', 'opposite', 'beside', 'inside'] 
(count: 9) | ea: ['ahead', 'near', 'rear', 'nearest', 'beneath', 'means', 'heart', 'nearer', 'underneath'] 
(count: 8) | to: ['top', 'into', 'bottom', 'toward', 'to', 'atop', 'onto', 'towards'] 
(count: 8) | in: ['into', 'against'

In [35]:
trigrams_list, trigram_map = get_char_ngrams(unique_tokens, 3)
trigram_counts = Counter(trigrams_list)
most_common_trigrams = trigram_counts.most_common(20)

for trigram, count in most_common_trigrams:
    mapping = trigram_map.get(trigram, [])
    print(f"{trigram}: {mapping} (count: {count})")

ide: ['outside', 'upside', 'aside', 'alongside', 'side', 'underside', 'beside', 'inside', 'astride'] (count: 9)
sid: ['outside', 'upside', 'aside', 'alongside', 'side', 'underside', 'beside', 'inside'] (count: 8)
nea: ['near', 'nearest', 'beneath', 'nearer', 'underneath'] (count: 5)
ear: ['near', 'rear', 'nearest', 'heart', 'nearer'] (count: 5)
out: ['outside', 'without', 'throughout', 'out'] (count: 4)
ong: ['among', 'along', 'alongside', 'amongst'] (count: 4)
ace: ['place', 'adjacent', 'surface', 'face'] (count: 4)
und: ['underside', 'under', 'around', 'underneath'] (count: 4)
ter: ['after', 'astern', 'center'] (count: 3)
for: ['for', 'before', 'afore'] (count: 3)
ast: ['astern', 'past', 'astride'] (count: 3)
ent: ['center', 'subsequent', 'adjacent'] (count: 3)
wit: ['without', 'with', 'within'] (count: 3)
ith: ['without', 'with', 'within'] (count: 3)
rou: ['throughout', 'through', 'around'] (count: 3)
ard: ['toward', 'towards', 'aboard'] (count: 3)
mid: ['amidst', 'amid', 'middle'] 

In [36]:
fourgrams_list, fourgram_map = get_char_ngrams(unique_tokens, 4)
fourgram_counts = Counter(fourgrams_list)
most_common_fourgrams = fourgram_counts.most_common(20)

for fourgram, count in most_common_fourgrams:
    mapping = fourgram_map.get(fourgram, [])
    print(f"{fourgram}: {mapping} (count: {count})")

side: ['outside', 'upside', 'aside', 'alongside', 'side', 'underside', 'beside', 'inside'] (count: 8)
with: ['without', 'with', 'within'] (count: 3)
near: ['near', 'nearest', 'nearer'] (count: 3)
unde: ['underside', 'under', 'underneath'] (count: 3)
nder: ['underside', 'under', 'underneath'] (count: 3)
amon: ['among', 'amongst'] (count: 2)
mong: ['among', 'amongst'] (count: 2)
alon: ['along', 'alongside'] (count: 2)
long: ['along', 'alongside'] (count: 2)
betw: ['betwixt', 'between'] (count: 2)
cent: ['center', 'adjacent'] (count: 2)
hout: ['without', 'throughout'] (count: 2)
twee: ['tween', 'between'] (count: 2)
ween: ['tween', 'between'] (count: 2)
thro: ['throughout', 'through'] (count: 2)
hrou: ['throughout', 'through'] (count: 2)
roug: ['throughout', 'through'] (count: 2)
ough: ['throughout', 'through'] (count: 2)
ongs: ['alongside', 'amongst'] (count: 2)
towa: ['toward', 'towards'] (count: 2)


In [37]:
# -------------------------------------------------------------
# Code snippet to check whether an n‐letter string acts 
# as a prefix/suffix in WordNet’s English lexicon
# -------------------------------------------------------------

import nltk
from nltk.corpus import wordnet as wn

# Make sure wordnet is downloaded:
# nltk.download("wordnet")

lemmas = set(wn.all_lemma_names())

def find_affix_pairs(gram):
    g = gram.lower()
    suffix = []
    prefix = []
    for w in lemmas:
        if w.endswith(g) and len(w) > len(g):
            base = w[:-len(g)]
            if base in lemmas:
                suffix.append((base, w))
        if w.startswith(g) and len(w) > len(g):
            base = w[len(g):]
            if base in lemmas:
                prefix.append((w, base))
    return (prefix, suffix) if (prefix or suffix) else None

def get_common_affix(ngram_list, n):
    for bg in ngram_list:
        found = find_affix_pairs(bg)
        if not found:
            print(f"{n}‐gram '{bg}' does NOT appear as a productive affix.\n")
            continue

        prefix, suffix = found

        if suffix:
            print(f"{n}‐gram '{bg}' as SUFFIX:")
            for base, suffixed in suffix[:10]:
                print(f"  • {base} → {suffixed}")
            print(f"  ({len(suffix)} total)\n")

        if prefix:
            print(f"{n}‐gram '{bg}' as PREFIX:")
            for prefixed, base in prefix[:10]:
                print(f"  • {prefixed} → {base}")
            print(f"  ({len(prefix)} total)\n")


In [38]:
bigram = []
for i in most_common_bigrams:
    bigram.append(i[0])
    
get_common_affix(bigram, 2)

2‐gram 'id' as SUFFIX:
  • pa → paid
  • nsa → nsaid
  • fet → fetid
  • sol → solid
  • ar → arid
  • inla → inlaid
  • ov → ovid
  • re → reid
  • p → pid
  • ma → maid
  (62 total)

2‐gram 'id' as PREFIX:
  • ido → o
  • ides → es
  • idf → f
  • idle → le
  • idea → ea
  • identity → entity
  • idp → p
  • iddm → dm
  • idling → ling
  • idun → un
  (12 total)

2‐gram 'de' as SUFFIX:
  • ce → cede
  • ri → ride
  • ba → bade
  • abo → abode
  • man → mande
  • ai → aide
  • phyllo → phyllode
  • chi → chide
  • fa → fade
  • gui → guide
  (42 total)

2‐gram 'de' as PREFIX:
  • devisor → visor
  • deal → al
  • denomination → nomination
  • delegation → legation
  • degauss → gauss
  • decay → cay
  • debut → but
  • decoder → coder
  • deconstructivism → constructivism
  • depopulate → populate
  (461 total)

2‐gram 'ar' as SUFFIX:
  • son → sonar
  • chad → chadar
  • column → columnar
  • ge → gear
  • huss → hussar
  • inst → instar
  • line → linear
  • gu → guar
  • tart → tar

In [39]:
trigram = []
for i in most_common_trigrams:
    trigram.append(i[0])
    
# 2. Test a few common trigrams
get_common_affix(trigram, 3)


3‐gram 'ide' as SUFFIX:
  • rings → ringside
  • r → ride
  • a → aide
  • res → reside
  • ways → wayside
  • burns → burnside
  • fluor → fluoride
  • ore → oreide
  • az → azide
  • gu → guide
  (39 total)

3‐gram 'ide' as PREFIX:
  • ides → s
  • idea → a
  • ideal → al
  • ideology → ology
  • ideate → ate
  • ideally → ally
  (6 total)

3‐gram 'sid' as SUFFIX:
  • ha → hasid
  • cap → capsid
  • re → resid
  (3 total)

3‐gram 'sid' as PREFIX:
  • sidle → le
  • sidney → ney
  • sids → s
  • sidalcea → alcea
  • sida → a
  • side → e
  • sidon → on
  (7 total)

3‐gram 'nea' as SUFFIX:
  • gui → guinea
  • genus_ara → genus_aranea
  • us → usnea
  • ti → tinea
  • ara → aranea
  (5 total)

3‐gram 'nea' as PREFIX:
  • neat → t
  • neap → p
  • neaten → ten
  • near → r
  • nearest → rest
  (5 total)

3‐gram 'ear' as SUFFIX:
  • g → gear
  • goody → goodyear
  • wheat → wheatear
  • lin → linear
  • s → sear
  • h → hear
  • w → wear
  • cl → clear
  • d → dear
  • end → endear
  (20

In [40]:
fourgram = []
for i in most_common_fourgrams:
    fourgram.append(i[0])

# 3. Test a few common fourgrams
get_common_affix(fourgram, 4)

4‐gram 'side' as SUFFIX:
  • up → upside
  • ring → ringside
  • re → reside
  • mountain → mountainside
  • way → wayside
  • over → overside
  • river → riverside
  • burn → burnside
  • dock → dockside
  • lake → lakeside
  (41 total)

4‐gram 'side' as PREFIX:
  • sidelight → light
  • sidereal → real
  • sidearm → arm
  • sideburn → burn
  • sidestroke → stroke
  • sidesaddle → saddle
  • sideslip → slip
  • sideline → line
  • sideward → ward
  • siderite → rite
  (28 total)

4‐gram 'with' as SUFFIX:
  • here → herewith
  • there → therewith
  • forth → forthwith
  (3 total)

4‐gram 'with' as PREFIX:
  • withdrawn → drawn
  • within → in
  • withal → al
  • withdraw → draw
  • withhold → hold
  • withholder → holder
  • withstand → stand
  • withe → e
  • withdrawing_room → drawing_room
  • withy → y
  (14 total)

4‐gram 'near' as SUFFIX:
  • li → linear
  (1 total)

4‐gram 'near' as PREFIX:
  • nearsighted → sighted
  • nearer → er
  • nearsightedness → sightedness
  • nearside →

## Checking atomic morph with wordfreq

In [41]:
# -----------------------------------------------
# Example: using wordfreq to test “xy” as a suffix or prefix
# -----------------------------------------------

from wordfreq import top_n_list

def find_suffix_pairs_wordfreq(ngram, top_n=50000):
    ngram = ngram.lower()
    words = top_n_list("en", n=top_n)
    wordset = set(w.lower() for w in words)
    pairs = []
    for w in wordset:
        if w.endswith(ngram) and len(w) > len(ngram):
            base = w[:-len(ngram)]
            if base in wordset:
                pairs.append((base, w))
    # return both the matching pairs and the total count
    return pairs, 

def find_prefix_pairs_wordfreq(ngram, top_n=50000):
    ngram = ngram.lower()
    words = top_n_list("en", n=top_n)
    wordset = set(w.lower() for w in words)
    pairs = []
    count_w = 0
    for w in wordset:
        if w.startswith(ngram) and len(w) > len(ngram):
            base = w[len(ngram):]
            if base in wordset:
                pairs.append((w, base))
    return pairs

def is_valid_suffix_wordfreq(ngram, top_n=50000):
    return bool(find_suffix_pairs_wordfreq(ngram, top_n=top_n))

def is_valid_prefix_wordfreq(ngram, top_n=50000):
    return bool(find_prefix_pairs_wordfreq(ngram, top_n=top_n))

def affix_validity_wordfreq(ngram, top_n=50000):
    """
    Returns a dict with booleans for suffix and prefix productivity.
    """
    return {
        "suffix": is_valid_suffix_wordfreq(ngram, top_n),
        "prefix": is_valid_prefix_wordfreq(ngram, top_n)
    }

# 3. Test a few common bigrams:
for bg in bigram:
    val = affix_validity_wordfreq(bg, top_n=50000)
    print(f"{bg}: suffix? {val['suffix']}, prefix? {val['prefix']}")
    if val['suffix']:
        print(f"  examples suffix → {find_suffix_pairs_wordfreq(bg)[:1]} \n {len(find_suffix_pairs_wordfreq(bg)[0])} times")
    if val['prefix']:
        print(f"  examples prefix → {find_prefix_pairs_wordfreq(bg)[:5]} \n {len(find_prefix_pairs_wordfreq(bg))} times")
    print()


id: suffix? True, prefix? True
  examples suffix → ([('pa', 'paid'), ('b', 'bid'), ('ac', 'acid'), ('s', 'sid'), ('rash', 'rashid'), ('v', 'vid'), ('maj', 'majid'), ('sol', 'solid'), ('d', 'did'), ('wal', 'walid'), ('r', 'rid'), ('sa', 'said'), ('usa', 'usaid'), ('l', 'lid'), ('ar', 'arid'), ('ham', 'hamid'), ('had', 'hadid'), ('liv', 'livid'), ('metro', 'metroid'), ('qua', 'quaid'), ('ov', 'ovid'), ('k', 'kid'), ('re', 'reid'), ('ra', 'raid'), ('lip', 'lipid'), ('a', 'aid'), ('rab', 'rabid'), ('shah', 'shahid'), ('w', 'wid'), ('p', 'pid'), ('ma', 'maid'), ('flu', 'fluid'), ('pla', 'plaid'), ('m', 'mid'), ('sl', 'slid'), ('cov', 'covid'), ('la', 'laid'), ('av', 'avid'), ('e', 'eid'), ('qu', 'quid'), ('val', 'valid'), ('devo', 'devoid'), ('luc', 'lucid'), ('sk', 'skid'), ('medica', 'medicaid'), ('tim', 'timid'), ('bra', 'braid'), ('h', 'hid'), ('viv', 'vivid'), ('gr', 'grid'), ('rf', 'rfid'), ('dav', 'david'), ('hum', 'humid'), ('en', 'enid'), ('rap', 'rapid'), ('sta', 'staid'), ('far',