In [110]:
import pandas
import json
import csv
from nltk.corpus import wordnet as wn
from nltk import ngrams
from collections import Counter

In [111]:
with open('pp_lexicon/atomic_p.json', 'r') as f:
    pp = json.load(f)
    
# sort pp['atomic_p'].keys() alphabetically
keys_sorted = sorted(pp['atomic_p'].keys())
print(keys_sorted)


['above', 'across', 'against', 'along', 'among', 'around', 'away', 'behind', 'below', 'beside', 'between', 'beyond', 'down', 'in', 'in front of', 'inside', 'left', 'near', 'next to', 'off', 'on', 'out', 'outside', 'over', 'past', 'right', 'through', 'under', 'up', 'upon']


In [112]:
len(keys_sorted)

30

In [113]:
synonyms_atomic_p = {}
for key in keys_sorted:
    # collect all lemma names in a flat list
    all_lemmas = []
    for syn in wn.synsets(key):
        all_lemmas.extend(syn.lemma_names())
    # remove duplicates while preserving order
    synonyms_atomic_p[key] = list(dict.fromkeys(all_lemmas))

# inspect result
synonyms_atomic_p


{'above': ['above',
  'supra',
  'higher_up',
  'in_a_higher_place',
  'to_a_higher_place'],
 'across': ['across', 'crosswise', 'crossways'],
 'against': [],
 'along': ['along', 'on'],
 'among': [],
 'around': ['about',
  'around',
  'approximately',
  'close_to',
  'just_about',
  'some',
  'roughly',
  'more_or_less',
  'or_so',
  'round'],
 'away': ['away', 'outside', 'off', 'forth', 'out', 'aside', 'by'],
 'behind': ['buttocks',
  'nates',
  'arse',
  'butt',
  'backside',
  'bum',
  'buns',
  'can',
  'fundament',
  'hindquarters',
  'hind_end',
  'keister',
  'posterior',
  'prat',
  'rear',
  'rear_end',
  'rump',
  'stern',
  'seat',
  'tail',
  'tail_end',
  'tooshie',
  'tush',
  'bottom',
  'behind',
  'derriere',
  'fanny',
  'ass',
  'slow',
  'behindhand',
  'in_arrears'],
 'below': ['below',
  'at_a_lower_place',
  'to_a_lower_place',
  'beneath',
  'infra',
  'downstairs',
  'down_the_stairs',
  'on_a_lower_floor',
  'under'],
 'beside': [],
 'between': ['between', 'bet

In [114]:
# # synonyms atomic_p to json
# with open('synonyms_atomic_p.json', 'w') as f:
#     json.dump(synonyms_atomic_p, f, indent=4)

In [115]:
def get_atomic_p_prop(prop='', counter=5):
# pp is a dict, access preposition as key
    try:
        if prop is not None and prop in ['isAtomicMorph', 'class', 'spellOutHEAD', 'path_p_morphology', 'measure_allowed']:
            for key, value in pp['atomic_p'].items():
                # print(f"key: {key}")
                for el in value:
                    if el == prop:
                        print(f"{key}: {pp['atomic_p'][key][el]} ")
                        counter += 1
                        if counter == 5:
                            break
    except KeyError as e:
        print(f"KeyError: {e} not found in atomic_p")

In [116]:
pp_wordnet_wiki_pop = []
with open('dictionaries/pp_wordnet_wiki_pop_2.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, quotechar='|', dialect='excel')
    for row in reader:
        if row['preposition'] == '':
            continue
        pp_wordnet_wiki_pop.append({
            'preposition': row['preposition'],
            'isAtomic': row.get('is_atomic'),
            'isSpatial': row.get('is_spatial')
        })

In [117]:
len(pp_wordnet_wiki_pop)

135

In [130]:
pp_wordnet_wiki_pop_spatial = [pp for pp in pp_wordnet_wiki_pop if pp['isSpatial'] == 'True']
len(pp_wordnet_wiki_pop_spatial)


135

## Get atomic morphemes from unique tokens

Unique tokens from the list of wikipedia, wordnet, and dictionaries preposition phrase individual words (as token).

In [131]:
unique_tokens = set() # unique token is defined as set of unique words in preposition
def tokenize_preposition(preposition):
    return preposition.split(' ')

# Example usage
for pp in pp_wordnet_wiki_pop_spatial:
    tokens = tokenize_preposition(pp['preposition'])
    # print(f"Tokens for '{pp['preposition']}': {tokens}")    
    unique_tokens.update(tokens)

print(f"length of unique_tokens before: {len(unique_tokens)}")

c = 0
unique_tokens_copy = unique_tokens.copy()
for k in keys_sorted:
    if k in unique_tokens_copy:
        unique_tokens_copy.remove(k)
        c += 1
        # print(f"{k} is in unique tokens")
    else:
        # print(f"{k} is NOT in unique tokens")
        continue
print(f"numbers of non overlapping unique_tokens: {len(unique_tokens_copy)}")

for k in unique_tokens_copy:
    print(k)

length of unique_tokens before: 106
numbers of non overlapping unique_tokens: 78
rim
next
a
within
astride
betwixt
nearest
surface
addition
amidst
astern
apart
amid
into
underneath
from
heart
at
face
top
cross
prior
nigh
side
of
beneath
amongst
ahead
for
back
rear
higher
toward
place
core
to
alongside
aboard
except
opposite
onto
means
center
after
underside
virtue
flank
corner
end
but
adjacent
upside
edge
afore
foot
aside
throughout
tween
front
skin
far
by
base
as
bottom
middle
with
without
atop
following
close
subsequent
before
via
the
nearer
plus
towards


In [132]:
# export unique_tokens_copy as json
# with open('pp_lexicon/unique_tokens_copy.json', 'w') as f:
#     json.dump(list(unique_tokens_copy), f, indent=4)

## Decompose p

In [133]:
def is_english_word(w):
    w = w.lower()
    return bool(wn.synsets(w))

In [134]:
def decompose_preposition(preposition, unique_tokens, method='substring'):
    
    result = {}
    p = preposition.lower()

    if method == 'substring':
        for token in unique_tokens:
            t = token.lower()
            if p not in t: #check whether preposition is a substring of token
                continue

            count = t.count(p)

            remainder = t.replace(p, "", 1)

            # if remainder == '':
            #     continue
            if is_english_word(p) and is_english_word(remainder):
                result[token] = {
                    'decomposition': [p, remainder],
                    'occurrence': count
            }

        return result
    
        # if method == 'find_bigram':
    

    return result

In [135]:
# atomic = list(keys_sorted['atomic_p'].keys())
atomic = keys_sorted
atomic

['above',
 'across',
 'against',
 'along',
 'among',
 'around',
 'away',
 'behind',
 'below',
 'beside',
 'between',
 'beyond',
 'down',
 'in',
 'in front of',
 'inside',
 'left',
 'near',
 'next to',
 'off',
 'on',
 'out',
 'outside',
 'over',
 'past',
 'right',
 'through',
 'under',
 'up',
 'upon']

In [136]:
unique_tokens_copy

{'a',
 'aboard',
 'addition',
 'adjacent',
 'afore',
 'after',
 'ahead',
 'alongside',
 'amid',
 'amidst',
 'amongst',
 'apart',
 'as',
 'aside',
 'astern',
 'astride',
 'at',
 'atop',
 'back',
 'base',
 'before',
 'beneath',
 'betwixt',
 'bottom',
 'but',
 'by',
 'center',
 'close',
 'core',
 'corner',
 'cross',
 'edge',
 'end',
 'except',
 'face',
 'far',
 'flank',
 'following',
 'foot',
 'for',
 'from',
 'front',
 'heart',
 'higher',
 'into',
 'means',
 'middle',
 'nearer',
 'nearest',
 'next',
 'nigh',
 'of',
 'onto',
 'opposite',
 'place',
 'plus',
 'prior',
 'rear',
 'rim',
 'side',
 'skin',
 'subsequent',
 'surface',
 'the',
 'throughout',
 'to',
 'top',
 'toward',
 'towards',
 'tween',
 'underneath',
 'underside',
 'upside',
 'via',
 'virtue',
 'with',
 'within',
 'without'}

In [137]:
# collect all decompositions
result_decompose = {}
for pp in atomic:
    if pp in unique_tokens:
        comps = decompose_preposition(pp, unique_tokens_copy, method='substring')
        if comps:
            result_decompose[pp] = comps

# turn it into a flat table
rows = []
for preposition, comps in result_decompose.items():
    for token, details in comps.items():
        rows.append({
            'preposition': preposition,
            'token': token,
            'decomposition': details['decomposition'],
            'occurrence': details['occurrence']
        })

# dataFrame of all decompositions
df_decompose = pandas.DataFrame(rows)
df_decompose


Unnamed: 0,preposition,token,decomposition,occurrence
0,along,alongside,"[along, side]",1
1,near,nearest,"[near, est]",1
2,near,nearer,"[near, er]",1
3,out,throughout,"[out, through]",1
4,through,throughout,"[through, out]",1
5,under,underside,"[under, side]",1
6,up,upside,"[up, side]",1


In [139]:
# remainder_decomp = [el[1] for el in df_decompose['decomposition']]
# remainder_decomp = set(remainder_decomp)
# to_remove = {'so', 'pot', 'mus', 'pot', 'mus', 'ab'} #particles and morphemes that isnt valid
# remainder_decomp = remainder_decomp - to_remove
# remainder_decomp

In [140]:
# get all tokens in df_decompose
tokens_decompose = set()
for token in df_decompose['token']:
    tokens_decompose.add(token)

unique_tokens_not_decomposed = unique_tokens_copy - tokens_decompose
print(f"length of unique_tokens_not_decomposed: {len(unique_tokens_not_decomposed)}")
unique_tokens_not_decomposed

length of unique_tokens_not_decomposed: 72


{'a',
 'aboard',
 'addition',
 'adjacent',
 'afore',
 'after',
 'ahead',
 'amid',
 'amidst',
 'amongst',
 'apart',
 'as',
 'aside',
 'astern',
 'astride',
 'at',
 'atop',
 'back',
 'base',
 'before',
 'beneath',
 'betwixt',
 'bottom',
 'but',
 'by',
 'center',
 'close',
 'core',
 'corner',
 'cross',
 'edge',
 'end',
 'except',
 'face',
 'far',
 'flank',
 'following',
 'foot',
 'for',
 'from',
 'front',
 'heart',
 'higher',
 'into',
 'means',
 'middle',
 'next',
 'nigh',
 'of',
 'onto',
 'opposite',
 'place',
 'plus',
 'prior',
 'rear',
 'rim',
 'side',
 'skin',
 'subsequent',
 'surface',
 'the',
 'to',
 'top',
 'toward',
 'towards',
 'tween',
 'underneath',
 'via',
 'virtue',
 'with',
 'within',
 'without'}

There are 72 unique tokens of prepositional phrase that are not decomposed by atomic_ps.

## Stemming for checking atomic elements

In [141]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# Create instances of each stemmer
porter   = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [142]:
stemmer = [porter, lancaster, snowball]
# build mapping token -> stem for each stemmer
def build_stem_mapping(tokens, stemmer):
    stem_mapping = {}
    for token in tokens:
        stem = stemmer.stem(token)
        if stem not in stem_mapping:
            stem_mapping[stem] = []
        stem_mapping[stem].append(token)
    return stem_mapping

df_stem_porter = build_stem_mapping(unique_tokens, porter)
df_stem_lancaster = build_stem_mapping(unique_tokens, lancaster)
df_stem_snowball = build_stem_mapping(unique_tokens, snowball)
# Convert the stem mappings to DataFrames
df_stem_porter = pandas.DataFrame.from_dict(df_stem_porter, orient='index').reset_index()

print(df_stem_porter.to_string())


          index           0        1
0           rim         rim     None
1             a           a     None
2           off         off     None
3       betwixt     betwixt     None
4       nearest     nearest     None
5          amid        amid     None
6          into        into     None
7    underneath  underneath     None
8          past        past     None
9          face        face     None
10        cross       cross     None
11      through     through     None
12      between     between     None
13        prior       prior     None
14         nigh        nigh     None
15         side        side     None
16      amongst     amongst     None
17        ahead       ahead     None
18          for         for     None
19         back        back     None
20         rear        rear     None
21        right       right     None
22         core        core     None
23           to          to     None
24       beyond      beyond     None
25       except      except     None
2

In [143]:
df_stem_lancaster

{'rim': ['rim'],
 'a': ['a'],
 'off': ['off'],
 'betwixt': ['betwixt'],
 'nearest': ['nearest'],
 'amid': ['amid'],
 'into': ['into'],
 'undernea': ['underneath'],
 'past': ['past'],
 'fac': ['face'],
 'cross': ['cross'],
 'through': ['through'],
 'between': ['between'],
 'pri': ['prior'],
 'nigh': ['nigh'],
 'sid': ['side'],
 'amongst': ['amongst'],
 'ahead': ['ahead'],
 'for': ['for'],
 'back': ['back'],
 'rear': ['rear'],
 'right': ['right'],
 'cor': ['core'],
 'to': ['to'],
 'beyond': ['beyond'],
 'exceiv': ['except'],
 'left': ['left'],
 'onto': ['onto'],
 'mean': ['means'],
 'insid': ['inside'],
 'aft': ['after'],
 'undersid': ['underside'],
 'virtu': ['virtue'],
 'against': ['against'],
 'up': ['up'],
 'end': ['end'],
 'behind': ['behind'],
 'edg': ['edge'],
 'foot': ['foot'],
 'asid': ['aside'],
 'among': ['among'],
 'far': ['far'],
 'by': ['by'],
 'out': ['out'],
 'down': ['down'],
 'with': ['with'],
 'atop': ['atop'],
 'clos': ['close'],
 'away': ['away'],
 'across': ['across

In [144]:
df_stem_snowball

{'rim': ['rim'],
 'a': ['a'],
 'off': ['off'],
 'betwixt': ['betwixt'],
 'nearest': ['nearest'],
 'amid': ['amid'],
 'into': ['into'],
 'underneath': ['underneath'],
 'past': ['past'],
 'face': ['face'],
 'cross': ['cross'],
 'through': ['through'],
 'between': ['between'],
 'prior': ['prior'],
 'nigh': ['nigh'],
 'side': ['side'],
 'amongst': ['amongst'],
 'ahead': ['ahead'],
 'for': ['for'],
 'back': ['back'],
 'rear': ['rear'],
 'right': ['right'],
 'core': ['core'],
 'to': ['to'],
 'beyond': ['beyond'],
 'except': ['except'],
 'left': ['left'],
 'onto': ['onto'],
 'mean': ['means'],
 'insid': ['inside'],
 'after': ['after'],
 'undersid': ['underside'],
 'virtu': ['virtue'],
 'against': ['against'],
 'up': ['up'],
 'end': ['end'],
 'behind': ['behind'],
 'edg': ['edge'],
 'foot': ['foot'],
 'asid': ['aside'],
 'among': ['among'],
 'far': ['far'],
 'by': ['by'],
 'out': ['out'],
 'down': ['down'],
 'with': ['with'],
 'atop': ['atop'],
 'close': ['close'],
 'away': ['away'],
 'across'

## Get atomic morphemes with ngram

In [145]:
c = 0
print("Unique tokens not decomposed:")
for i in unique_tokens_not_decomposed:
    print(i)
    c += 1
    if c == 10:
        break

Unique tokens not decomposed:
rim
next
a
within
astride
betwixt
surface
addition
amidst
astern


In [146]:
def get_char_ngrams(tokens, n):

    ngrams_list = []
    ngram_map = {}
    
    for token in tokens:
        for gram in ngrams(list(token), n):
            # check if gram is valid suffix or prefix in english with wordnet
            
            
            ngrams_list.append(''.join(gram))
            
            g = ''.join(gram)
            ngram_map.setdefault(g, []).append(token)
    return ngrams_list, ngram_map

In [147]:
# get bigrams of letters from each token
bigrams_list, bigram_map = get_char_ngrams(unique_tokens, 2)
bigram_counts = Counter(bigrams_list)
most_common_bigrams = bigram_counts.most_common(20)

for bigram, count in most_common_bigrams:
    mapping = bigram_map.get(bigram, [])
    print(f"(count: {count}) | {bigram}: {mapping} ")

(count: 12) | id: ['amid', 'side', 'inside', 'underside', 'aside', 'astride', 'beside', 'amidst', 'alongside', 'outside', 'upside', 'middle'] 
(count: 12) | de: ['underneath', 'side', 'inside', 'underside', 'underside', 'aside', 'astride', 'beside', 'alongside', 'outside', 'upside', 'under'] 
(count: 11) | ar: ['nearest', 'rear', 'far', 'apart', 'toward', 'aboard', 'near', 'around', 'towards', 'nearer', 'heart'] 
(count: 10) | er: ['underneath', 'after', 'underside', 'astern', 'higher', 'over', 'center', 'corner', 'under', 'nearer'] 
(count: 10) | on: ['amongst', 'beyond', 'onto', 'among', 'addition', 'on', 'alongside', 'upon', 'along', 'front'] 
(count: 9) | ea: ['nearest', 'underneath', 'ahead', 'rear', 'means', 'beneath', 'near', 'nearer', 'heart'] 
(count: 9) | si: ['side', 'inside', 'underside', 'aside', 'beside', 'alongside', 'opposite', 'outside', 'upside'] 
(count: 8) | be: ['betwixt', 'between', 'beyond', 'behind', 'beside', 'beneath', 'before', 'below'] 
(count: 8) | in: ['in

In [148]:
trigrams_list, trigram_map = get_char_ngrams(unique_tokens, 3)
trigram_counts = Counter(trigrams_list)
most_common_trigrams = trigram_counts.most_common(20)

for trigram, count in most_common_trigrams:
    mapping = trigram_map.get(trigram, [])
    print(f"{trigram}: {mapping} (count: {count})")

ide: ['side', 'inside', 'underside', 'aside', 'astride', 'beside', 'alongside', 'outside', 'upside'] (count: 9)
sid: ['side', 'inside', 'underside', 'aside', 'beside', 'alongside', 'outside', 'upside'] (count: 8)
nea: ['nearest', 'underneath', 'beneath', 'near', 'nearer'] (count: 5)
ear: ['nearest', 'rear', 'near', 'nearer', 'heart'] (count: 5)
und: ['underneath', 'underside', 'around', 'under'] (count: 4)
ace: ['face', 'surface', 'place', 'adjacent'] (count: 4)
ong: ['amongst', 'among', 'alongside', 'along'] (count: 4)
out: ['out', 'outside', 'throughout', 'without'] (count: 4)
mid: ['amid', 'amidst', 'middle'] (count: 3)
nde: ['underneath', 'underside', 'under'] (count: 3)
der: ['underneath', 'underside', 'under'] (count: 3)
ast: ['past', 'astride', 'astern'] (count: 3)
rou: ['through', 'around', 'throughout'] (count: 3)
igh: ['nigh', 'right', 'higher'] (count: 3)
for: ['for', 'afore', 'before'] (count: 3)
ore: ['core', 'afore', 'before'] (count: 3)
ter: ['after', 'astern', 'center']

In [149]:
fourgrams_list, fourgram_map = get_char_ngrams(unique_tokens, 4)
fourgram_counts = Counter(fourgrams_list)
most_common_fourgrams = fourgram_counts.most_common(20)

for fourgram, count in most_common_fourgrams:
    mapping = fourgram_map.get(fourgram, [])
    print(f"{fourgram}: {mapping} (count: {count})")

side: ['side', 'inside', 'underside', 'aside', 'beside', 'alongside', 'outside', 'upside'] (count: 8)
near: ['nearest', 'near', 'nearer'] (count: 3)
unde: ['underneath', 'underside', 'under'] (count: 3)
nder: ['underneath', 'underside', 'under'] (count: 3)
with: ['with', 'within', 'without'] (count: 3)
betw: ['betwixt', 'between'] (count: 2)
eare: ['nearest', 'nearer'] (count: 2)
amid: ['amid', 'amidst'] (count: 2)
neat: ['underneath', 'beneath'] (count: 2)
eath: ['underneath', 'beneath'] (count: 2)
face: ['face', 'surface'] (count: 2)
cros: ['cross', 'across'] (count: 2)
ross: ['cross', 'across'] (count: 2)
thro: ['through', 'throughout'] (count: 2)
hrou: ['through', 'throughout'] (count: 2)
roug: ['through', 'throughout'] (count: 2)
ough: ['through', 'throughout'] (count: 2)
twee: ['between', 'tween'] (count: 2)
ween: ['between', 'tween'] (count: 2)
amon: ['amongst', 'among'] (count: 2)


In [150]:
lemmas = set(wn.all_lemma_names())

def find_affix_pairs(gram):
    g = gram.lower()
    suffix = []
    prefix = []
    for w in lemmas:
        if w.endswith(g) and len(w) > len(g):
            base = w[:-len(g)]
            if base in lemmas:
                suffix.append((base, w))
        if w.startswith(g) and len(w) > len(g):
            base = w[len(g):]
            if base in lemmas:
                prefix.append((w, base))
    return (prefix, suffix) if (prefix or suffix) else None

def get_common_affix(ngram_list, n):
    for bg in ngram_list:
        found = find_affix_pairs(bg)
        if not found:
            print(f"{n}‐gram '{bg}' does NOT appear as a productive affix.\n")
            continue

        prefix, suffix = found

        if suffix:
            print(f"{n}‐gram '{bg}' as SUFFIX:")
            for base, suffixed in suffix[:10]:
                print(f"  • {base} → {suffixed}")
            print(f"  ({len(suffix)} total)\n")

        if prefix:
            print(f"{n}‐gram '{bg}' as PREFIX:")
            for prefixed, base in prefix[:10]:
                print(f"  • {prefixed} → {base}")
            print(f"  ({len(prefix)} total)\n")


In [151]:
bigram = []
for i in most_common_bigrams:
    bigram.append(i[0])
    
get_common_affix(bigram, 2)

2‐gram 'id' as SUFFIX:
  • sa → said
  • can → canid
  • sap → sapid
  • gel → gelid
  • re → reid
  • dana → danaid
  • thrip → thripid
  • avo → avoid
  • y → yid
  • ma → maid
  (62 total)

2‐gram 'id' as PREFIX:
  • idf → f
  • idun → un
  • identity → entity
  • idler → ler
  • ides → es
  • idle → le
  • idea → ea
  • iddm → dm
  • idp → p
  • ido → o
  (12 total)

2‐gram 'de' as SUFFIX:
  • spa → spade
  • no → node
  • arca → arcade
  • ti → tide
  • mo → mode
  • wi → wide
  • hi → hide
  • abo → abode
  • nu → nude
  • sa → sade
  (42 total)

2‐gram 'de' as PREFIX:
  • detransitivize → transitivize
  • dec → c
  • dehumanisation → humanisation
  • demythologisation → mythologisation
  • desalinate → salinate
  • degeneration → generation
  • devitalization → vitalization
  • demodulation → modulation
  • decree → cree
  • detractor → tractor
  (461 total)

2‐gram 'ar' as SUFFIX:
  • annul → annular
  • t → tar
  • qat → qatar
  • cell → cellar
  • deb → debar
  • dew → dewar


In [152]:
trigram = []
for i in most_common_trigrams:
    trigram.append(i[0])
    
# 2. Test a few common trigrams
get_common_affix(trigram, 3)


3‐gram 'ide' as SUFFIX:
  • rings → ringside
  • am → amide
  • tops → topside
  • ab → abide
  • sn → snide
  • t → tide
  • w → wide
  • h → hide
  • ox → oxide
  • matric → matricide
  (39 total)

3‐gram 'ide' as PREFIX:
  • ideate → ate
  • ideology → ology
  • ides → s
  • ideally → ally
  • idea → a
  • ideal → al
  (6 total)

3‐gram 'sid' as SUFFIX:
  • re → resid
  • ha → hasid
  • cap → capsid
  (3 total)

3‐gram 'sid' as PREFIX:
  • sidon → on
  • sidle → le
  • sidney → ney
  • side → e
  • sidalcea → alcea
  • sids → s
  • sida → a
  (7 total)

3‐gram 'nea' as SUFFIX:
  • gui → guinea
  • ti → tinea
  • ara → aranea
  • genus_ara → genus_aranea
  • us → usnea
  (5 total)

3‐gram 'nea' as PREFIX:
  • nearest → rest
  • near → r
  • neat → t
  • neap → p
  • neaten → ten
  (5 total)

3‐gram 'ear' as SUFFIX:
  • y → year
  • sm → smear
  • d → dear
  • wheat → wheatear
  • g → gear
  • t → tear
  • cl → clear
  • n → near
  • lin → linear
  • h → hear
  (20 total)

3‐gram 'ear

In [153]:
fourgram = []
for i in most_common_fourgrams:
    fourgram.append(i[0])

# 3. Test a few common fourgrams
get_common_affix(fourgram, 4)

4‐gram 'side' as SUFFIX:
  • ring → ringside
  • dock → dockside
  • top → topside
  • canyon → canyonside
  • ship → shipside
  • under → underside
  • west → westside
  • along → alongside
  • green → greenside
  • over → overside
  (41 total)

4‐gram 'side' as PREFIX:
  • sideshow → show
  • sideburn → burn
  • sidesplitting → splitting
  • sideline → line
  • sidereal_time → real_time
  • sideway → way
  • sidestep → step
  • sideways → ways
  • sidelong → long
  • sideswipe → swipe
  (28 total)

4‐gram 'near' as SUFFIX:
  • li → linear
  (1 total)

4‐gram 'near' as PREFIX:
  • nearest → est
  • nearside → side
  • nearby → by
  • nearsighted → sighted
  • nearer → er
  • nearness → ness
  • nearsightedness → sightedness
  (7 total)

4‐gram 'unde' as PREFIX:
  • undesigned → signed
  • undecomposed → composed
  • undeserving → serving
  • under → r
  • underevaluation → revaluation
  • underage → rage
  • undefinable → finable
  • undeterminable → terminable
  (8 total)

4‐gram 'nd

## Checking atomic morph with wordfreq

In [154]:
# -----------------------------------------------
# Example: using wordfreq to test “xy” as a suffix or prefix
# -----------------------------------------------

from wordfreq import top_n_list

def find_suffix_pairs_wordfreq(ngram, top_n=50000):
    ngram = ngram.lower()
    words = top_n_list("en", n=top_n)
    wordset = set(w.lower() for w in words)
    pairs = []
    for w in wordset:
        if w.endswith(ngram) and len(w) > len(ngram):
            base = w[:-len(ngram)]
            if base in wordset:
                pairs.append((base, w))
    # return both the matching pairs and the total count
    return pairs, 

def find_prefix_pairs_wordfreq(ngram, top_n=50000):
    ngram = ngram.lower()
    words = top_n_list("en", n=top_n)
    wordset = set(w.lower() for w in words)
    pairs = []
    count_w = 0
    for w in wordset:
        if w.startswith(ngram) and len(w) > len(ngram):
            base = w[len(ngram):]
            if base in wordset:
                pairs.append((w, base))
    return pairs

def is_valid_suffix_wordfreq(ngram, top_n=50000):
    return bool(find_suffix_pairs_wordfreq(ngram, top_n=top_n))

def is_valid_prefix_wordfreq(ngram, top_n=50000):
    return bool(find_prefix_pairs_wordfreq(ngram, top_n=top_n))

def affix_validity_wordfreq(ngram, top_n=50000):
    """
    Returns a dict with booleans for suffix and prefix productivity.
    """
    return {
        "suffix": is_valid_suffix_wordfreq(ngram, top_n),
        "prefix": is_valid_prefix_wordfreq(ngram, top_n)
    }

# 3. Test a few common bigrams:
for bg in bigram:
    val = affix_validity_wordfreq(bg, top_n=50000)
    print(f"{bg}: suffix? {val['suffix']}, prefix? {val['prefix']}")
    if val['suffix']:
        print(f"  examples suffix → {find_suffix_pairs_wordfreq(bg)[:2]} \n {len(find_suffix_pairs_wordfreq(bg)[0])} times")
    if val['prefix']:
        print(f"  examples prefix → {find_prefix_pairs_wordfreq(bg)[5:]} \n {len(find_prefix_pairs_wordfreq(bg))} times")
    print()


id: suffix? True, prefix? True
  examples suffix → ([('c', 'cid'), ('sa', 'said'), ('qu', 'quid'), ('ov', 'ovid'), ('e', 'eid'), ('rf', 'rfid'), ('re', 'reid'), ('usa', 'usaid'), ('k', 'kid'), ('lip', 'lipid'), ('p', 'pid'), ('vo', 'void'), ('tim', 'timid'), ('ma', 'maid'), ('medica', 'medicaid'), ('d', 'did'), ('l', 'lid'), ('val', 'valid'), ('v', 'vid'), ('leon', 'leonid'), ('shah', 'shahid'), ('ham', 'hamid'), ('la', 'laid'), ('had', 'hadid'), ('pa', 'paid'), ('maj', 'majid'), ('dav', 'david'), ('liv', 'livid'), ('av', 'avid'), ('rap', 'rapid'), ('wal', 'walid'), ('hum', 'humid'), ('bra', 'braid'), ('sol', 'solid'), ('gr', 'grid'), ('devo', 'devoid'), ('ar', 'arid'), ('en', 'enid'), ('sk', 'skid'), ('cov', 'covid'), ('qua', 'quaid'), ('flu', 'fluid'), ('sl', 'slid'), ('r', 'rid'), ('sta', 'staid'), ('h', 'hid'), ('dru', 'druid'), ('w', 'wid'), ('b', 'bid'), ('m', 'mid'), ('rab', 'rabid'), ('pla', 'plaid'), ('am', 'amid'), ('s', 'sid'), ('ib', 'ibid'), ('rig', 'rigid'), ('luc', 'luci

In [155]:
for tr in trigram:
    val = affix_validity_wordfreq(tr, top_n=50000)
    print(f"{tr}: suffix? {val['suffix']}, prefix? {val['prefix']}")
    if val['suffix']:
        print(f"  examples suffix → {find_suffix_pairs_wordfreq(tr)[:2]} \n {len(find_suffix_pairs_wordfreq(tr)[0])} times")
    if val['prefix']:
        print(f"  examples prefix → {find_prefix_pairs_wordfreq(tr)[:2]} \n {len(find_prefix_pairs_wordfreq(tr))} times")
    print()

ide: suffix? True, prefix? True
  examples suffix → ([('fires', 'fireside'), ('s', 'side'), ('rings', 'ringside'), ('b', 'bide'), ('ons', 'onside'), ('str', 'stride'), ('mountains', 'mountainside'), ('gu', 'guide'), ('waters', 'waterside'), ('res', 'reside'), ('flips', 'flipside'), ('gl', 'glide'), ('roads', 'roadside'), ('oceans', 'oceanside'), ('mornings', 'morningside'), ('ab', 'abide'), ('sn', 'snide'), ('carb', 'carbide'), ('t', 'tide'), ('brom', 'bromide'), ('prov', 'provide'), ('seas', 'seaside'), ('pr', 'pride'), ('broads', 'broadside'), ('woods', 'woodside'), ('rivers', 'riverside'), ('w', 'wide'), ('as', 'aside'), ('h', 'hide'), ('ox', 'oxide'), ('downs', 'downside'), ('conf', 'confide'), ('wests', 'westside'), ('v', 'vide'), ('cyan', 'cyanide'), ('hills', 'hillside'), ('burns', 'burnside'), ('sl', 'slide'), ('div', 'divide'), ('ways', 'wayside'), ('whites', 'whiteside'), ('pres', 'preside'), ('pools', 'poolside'), ('r', 'ride'), ('a', 'aide'), ('subs', 'subside'), ('lakes', 

In [156]:
for fr in fourgram:
    val = affix_validity_wordfreq(fr, top_n=50000)
    print(f"{fr}: suffix? {val['suffix']}, prefix? {val['prefix']}")
    if val['suffix']:
        print(f"  examples suffix → {find_suffix_pairs_wordfreq(fr)[:10]} \n {len(find_suffix_pairs_wordfreq(fr)[0])} times")
    if val['prefix']:
        print(f"  examples prefix → {find_prefix_pairs_wordfreq(fr)[:10]} \n {len(find_prefix_pairs_wordfreq(fr))} times")
    print()

side: suffix? True, prefix? True
  examples suffix → ([('fire', 'fireside'), ('ring', 'ringside'), ('on', 'onside'), ('mountain', 'mountainside'), ('country', 'countryside'), ('water', 'waterside'), ('re', 'reside'), ('flip', 'flipside'), ('road', 'roadside'), ('ocean', 'oceanside'), ('morning', 'morningside'), ('south', 'southside'), ('sea', 'seaside'), ('broad', 'broadside'), ('wood', 'woodside'), ('sunny', 'sunnyside'), ('river', 'riverside'), ('a', 'aside'), ('north', 'northside'), ('down', 'downside'), ('under', 'underside'), ('west', 'westside'), ('along', 'alongside'), ('hill', 'hillside'), ('burn', 'burnside'), ('way', 'wayside'), ('white', 'whiteside'), ('pre', 'preside'), ('mersey', 'merseyside'), ('east', 'eastside'), ('pool', 'poolside'), ('sub', 'subside'), ('lake', 'lakeside'), ('out', 'outside'), ('back', 'backside'), ('state', 'stateside'), ('up', 'upside'), ('be', 'beside'), ('in', 'inside'), ('bed', 'bedside'), ('bay', 'bayside'), ('tyne', 'tyneside'), ('off', 'offsid