This notebook contains the code for reproducing the intrinsic evaluation in the paper

In [4]:
# Requirements

import sentencepiece as spm
import os, urllib.request, zipfile, re, string
import pandas as pd
from collections import defaultdict, Counter
from itertools import chain

In [5]:
# Import the sentencepiece models 

bpe_default = spm.SentencePieceProcessor(os.path.join('..', 'models', 'sentencepiece', 'bpe_default', 'wikipedia_spm3_bpe_16000.model'))
bpe_prime = spm.SentencePieceProcessor(os.path.join('..', 'models', 'sentencepiece', 'bpe_prime', 'wikipedia_spm1_bpe_16000.model'))
unigram_default = spm.SentencePieceProcessor(os.path.join('..', 'models', 'sentencepiece', 'unigram_default', 'wikipedia_spm3_unigram_16000.model'))
unigram_prime = spm.SentencePieceProcessor(os.path.join('..', 'models', 'sentencepiece', 'unigram_prime', 'wikipedia_spm1_unigram_16000.model'))

In [18]:
# Download the morphological datasets
#LADEC
urllib.request.urlretrieve("https://era.library.ualberta.ca/items/dc3b9033-14d0-48d7-b6fa-6398a30e61e4/download/830937da-a00b-4735-8cf2-3c67d5cc6d50", "LADECv1-2019.csv")

#MorphoLex
urllib.request.urlretrieve("https://raw.githubusercontent.com/hugomailhot/MorphoLex-en/master/MorphoLEX_en.xlsx", "MorphoLEX_en.xlsx")

#MorphyNet
urllib.request.urlretrieve("https://github.com/kbatsuren/MorphyNet/blob/main/eng/eng.derivational.v1.tsv?raw=true", "eng.derivational.v1.tsv")

#DagoBERT
urllib.request.urlretrieve("http://cistern.cis.lmu.de/dagobert/dagobert_data.zip", "dagobert_data.zip")
with zipfile.ZipFile("dagobert_data.zip", 'r') as zip_ref:
    zip_ref.extractall("dagobert_data")

In [31]:
# Function to get_boundaries from a tokenisation
def get_boundaries(y):
    boundaries = [len(''.join(y[:i])) for i in range(1, len(y))]
    return boundaries

In [33]:
# An example
get_boundaries(["un", "help", "ful", "ness"])

[2, 6, 9]

In [36]:
tokenisers = {'bpe_default': bpe_default, 'bpe_prime': bpe_prime, 'unigram_default': unigram_default, 'unigram_prime': unigram_prime}

### Quantitative Evaluation of Morphological Correctness (Tables 1 and 2)

#### LADEC

In [25]:
# Read LADEC dataset
ladec_df = pd.read_csv("LADECv1-2019.csv")

In [37]:
# Remove duplicate entries (we only want those with a unique morphological parse)
ladec_df = ladec_df[~ladec_df.duplicated('stim', keep=False)]

In [38]:
# Number of entries in the dataset with a unique morphological parse 
len(ladec_df)

7804

In [43]:
# Compute metrics for each of the tokenisers on the LADEC dataset
# This generates the results in the LADEC column of Table 1 in the paper

x = ladec_df[['c1', 'c2', 'stim']]

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)

for _, row in x.iterrows():
    # Iterate over the tokenisers 
    for key, val in tokenisers.items():
        # Gold standard morphological segmentation from the dataset
        gstandard = [row['c1'], row['c2']]
        spm = val
        
        # Tokenise the compound with the given tokeniser, stripping space symbols
        y = [x.lstrip('▁') for x in spm.encode(row['stim'], out_type=str) if x != "▁" ]
        
        # Get the boundaries for the gold standard and the tokeniser
        gstandard_boundaries = get_boundaries(gstandard)
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                # True positives are those appearing in both generated and reference
                tps[key] += 1
            else:
                # False positives are those appearing in the generated but not the reference
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                # False negatives are those appearing in the reference but not the generated
                fn += 1
                
        tns[key] += (len(row['stim']) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
                


print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(x), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

length
2.981547924141466 bpe_default
2.5991799077396207 bpe_prime
2.80228088159918 unigram_default
2.6652998462327013 unigram_prime

precision
41.211846870150026 bpe_default
53.79807692307692 bpe_prime
51.93032349804479 unigram_default
56.717451523545705 unigram_prime

recall
81.66324961558175 bpe_default
86.03280369041518 bpe_prime
93.59302921578677 unigram_default
94.45156330087134 unigram_prime

f1
54.77909575382499 bpe_default
66.19996056004733 bpe_prime
66.79775024006585 unigram_default
70.875 unigram_prime



#### MorphoLex

In [82]:
# Read data

dfs = []
for i in range(1, 33):
    dfs += [pd.read_excel('MorphoLex_en.xlsx', sheet_name=i)]

morpho_db = pd.concat(dfs)

  warn(msg)


In [117]:
# Only keep revelant columns
x = morpho_db[['Nmorph', 'Word', 'MorphoLexSegm', 'PRS_signature']]
# Remove NaN entries
x = x[x['Nmorph'] > 1]
# Keep only the entries with a concatenative parse 
x = x[x['MorphoLexSegm'].apply(lambda x: ''.join(re.split('\<|\>|\}|\{|\(|\)', x))) == x['Word']]

In [118]:
# Perform evaluation on the whole of the dataset first
# This generates the results in the MorphoLex column of Table 1 in the paper

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)

def get_boundaries(y):
    boundaries = [len(''.join(y[:i])) for i in range(1, len(y))]
    return boundaries
    
for _, row in x.iterrows():
    seg = row['MorphoLexSegm']
    gstandard = [y for y in re.split('\<|\>|\}|\{|\(|\)', seg) if y != '']
    gstandard_boundaries = get_boundaries(gstandard)

    for key, val in tokenisers.items():
        
        spm = val
        
        y = [x.lstrip('▁') for x in spm.encode(row['Word'], out_type=str) if x != "▁" ]
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                tps[key] += 1
            else:
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                fn += 1
                
        tns[key] += (len(row['Word']) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
                
print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(x), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

length
2.668856002660459 bpe_default
2.467741935483871 bpe_prime
2.5575324243431994 unigram_default
2.651064183571666 unigram_prime

precision
43.391620584865244 bpe_default
50.77602809561572 bpe_prime
58.081562933703424 unigram_default
53.859710962284105 unigram_prime

recall
57.59820129612485 bpe_default
59.277873297182914 bpe_prime
71.95476788784552 unigram_default
70.73138473746859 unigram_prime

f1
49.49566699815315 bpe_default
54.69855992189407 bpe_prime
64.27811909262759 unigram_default
61.1531974500443 unigram_prime



In [85]:
# Perform evaluation on the prefix only entries

morpholex_only_prefix = x[x['PRS_signature'].isin(['1,1,0', '1,2,0', '1,3,0', '2,1,0'])]

In [96]:
# This generates results in "Only Prefixes" column of Table 2 in the paper

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)

def get_boundaries(y):
    boundaries = [len(''.join(y[:i])) for i in range(1, len(y))]
    return boundaries
    
for _, row in morpholex_only_prefix.iterrows():
    seg = row['MorphoLexSegm']
    gstandard = [y for y in re.split('\<|\>|\}|\{|\(|\)', seg) if y != '']
    gstandard_boundaries = get_boundaries(gstandard)

    for key, val in tokenisers.items():
        
        spm = val
        
        y = [x.lstrip('▁') for x in spm.encode(row['Word'], out_type=str) if x != "▁" ]
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                tps[key] += 1
            else:
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                fn += 1
                
        tns[key] += (len(row['Word']) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
                
print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(morpholex_only_prefix), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

length
2.5437382001258655 bpe_default
2.2580239144115795 bpe_prime
2.509754562617999 unigram_default
2.478288231592196 unigram_prime

precision
33.50998777007746 bpe_default
50.3751875937969 bpe_prime
53.438932888703626 unigram_default
57.17326521924223 unigram_prime

recall
50.27522935779817 bpe_default
61.59021406727829 bpe_prime
78.40978593272172 unigram_default
82.14067278287462 unigram_prime

f1
40.21526418786693 bpe_default
55.4210236653825 bpe_prime
63.559742191373324 unigram_default
67.41967871485943 unigram_prime



In [89]:
# Perform evaluation on the suffix only entries
morpholex_only_suffix = x[x['PRS_signature'].isin(['0,1,1', '0,1,2', '0,1,3',  '0,2,1', '0,2,2', '0,2,3', '0,3,1'])]

In [97]:
# This generates results in "Only Suffixes" column of Table 2 in the paper

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)

def get_boundaries(y):
    boundaries = [len(''.join(y[:i])) for i in range(1, len(y))]
    return boundaries
    
for _, row in morpholex_only_suffix.iterrows():
    seg = row['MorphoLexSegm']
    gstandard = [y for y in re.split('\<|\>|\}|\{|\(|\)', seg) if y != '']
    gstandard_boundaries = get_boundaries(gstandard)

    for key, val in tokenisers.items():
        
        spm = val
        
        y = [x.lstrip('▁') for x in spm.encode(row['Word'], out_type=str) if x != "▁" ]
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                tps[key] += 1
            else:
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                fn += 1
                
        tns[key] += (len(row['Word']) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
                
print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(morpholex_only_suffix), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

length
2.3336697909156454 bpe_default
2.1720980533525593 bpe_prime
2.217808219178082 unigram_default
2.3860129776496035 unigram_prime

precision
11.990485457887338 bpe_default
14.449160361690348 bpe_prime
15.860517435320585 unigram_default
14.117769454848107 unigram_prime

recall
74.00734067400734 bpe_default
78.37837837837837 bpe_prime
89.3893893893894 unigram_default
90.55722389055722 unigram_prime

f1
20.637357525005815 bpe_default
24.400124649423496 bpe_prime
26.940868865647627 unigram_default
24.427343503892715 unigram_prime



In [108]:
# Number of entries containing prefixes
len(x[x['PRS_signature'].isin(['1,1,0', '1,1,1', '1,1,2', '1,1,3','1,2,0', '1,2,1', '1,2,2', '1,2,3', '1,3,0', '2,1,0', '2,1,1',
       '2,1,2'])])

2692

In [109]:
# Number of entries containing suffixes
len(x[x['PRS_signature'].isin(['0,1,1', '0,1,2', '0,1,3','0,2,1', '0,2,2', '0,2,3', '0,3,1', '1,1,1', '1,1,2', '1,1,3', '1,2,1', '1,2,2', '1,2,3', '2,1,1', '2,1,2'])])

7422

####  MorphyNet

In [110]:
morphynet_der = pd.read_csv('eng.derivational.v1.tsv', sep='\t', header=None)

In [119]:
# This generates the results in the MorphyNet column of Table 1 in the paper

x = morphynet_der[[0, 1]]

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)


def get_boundaries(y):
    boundaries = [len(''.join(y[:i])) for i in range(1, len(y))]
    return boundaries

length = 0 

for _, row in x.iterrows():
    d = row[0]
    try:
        gstandard = list(filter(None, re.split(f"({d})", row[1])))
    except:
        continue
    # We don't want entries consisting of a single morpheme
    if len(gstandard) == 1 or ''.join(gstandard) != row[1]:
        continue
    gstandard_boundaries = get_boundaries(gstandard)   
    
    for key, val in tokenisers.items():
        
        spm = val
        y = [x.lstrip('▁') for x in spm.encode(row[1], out_type=str) if x != "▁" ]
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                tps[key] += 1
            else:
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                fn += 1
                
        tns[key] += (len(row[1]) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
    
    length += 1

print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(x), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

  gstandard = list(filter(None, re.split(f"({d})", row[1])))
  gstandard = list(filter(None, re.split(f"({d})", row[1])))


length
3.170682846875819 bpe_default
2.9342738227965053 bpe_prime
3.0859766091742142 unigram_default
3.032545495733595 unigram_prime

precision
19.89155064496397 bpe_default
24.620755125350637 bpe_prime
32.298464069775804 unigram_default
33.59828141783029 unigram_prime

recall
53.27525540037196 bpe_default
59.19054954226277 bpe_prime
83.33135847754073 unigram_default
84.60282215479091 unigram_prime

f1
28.96743750884127 bpe_default
34.77612075828064 bpe_prime
46.55329963117579 unigram_default
48.09615716917418 unigram_prime



#### DagoBERT

In [121]:
dfs = []
for i in range(1, 8):
    dfs += [pd.read_csv(os.path.join('dagobert_data', f'bin{i}.txt'), sep='\|\|\|', header=None)]

dagobert_db = pd.concat(dfs)

  dfs += [pd.read_csv(os.path.join('dagobert_data', f'bin{i}.txt'), sep='\|\|\|', header=None)]


In [122]:
# This generates the results in the DagoBERT column of Table 1 in the paper

x = dagobert_db[~dagobert_db.duplicated(1)][[1, 2]]

tps = defaultdict(int)
fps = defaultdict(int)
fns = defaultdict(int)
tns = defaultdict(int)
lengths = defaultdict(int)


for _, row in x.iterrows():
    d = row[2]
    try:
        gstandard = list(filter(None, re.split(f"({d})", row[1])))
    except:
        continue
    if len(gstandard) == 1:
        continue
    gstandard_boundaries = get_boundaries(gstandard)
    
        
    for key, val in tokenisers.items():
        
        spm = val
        y = [x.lstrip('▁') for x in spm.encode(row[1], out_type=str) if x != "▁" ]
        y_boundaries = get_boundaries(y)
        fn = 0
        for i in y_boundaries:
            if i in gstandard_boundaries:
                tps[key] += 1
            else:
                fps[key] += 1
        for i in gstandard_boundaries:
            if i not in y_boundaries:
                fn += 1
                
        tns[key] += (len(row[1]) - 1 - len(y_boundaries) - fn)
        fns[key] += fn
                
        lengths[key] += len(y)
    length += 1
                

print('length')
for k in tokenisers.keys():
    print(lengths[k] / len(x), k)
print()

print('precision')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fps[k]), k)
print()

print('recall')
for k in tokenisers.keys():
    print(100 * tps[k] / (tps[k] + fns[k]), k)
print()

print('f1')
for k in tokenisers.keys():
    print(100 * (tps[k] / (tps[k] + 0.5*(fps[k] + fns[k]))), k)
print()

length
3.2155287482599313 bpe_default
2.8623941197310363 bpe_prime
3.160619518112817 unigram_default
2.8098538879127406 unigram_prime

precision
28.396429455720554 bpe_default
37.421687245818774 bpe_prime
45.33219456645518 unigram_default
54.52958401799176 unigram_prime

recall
60.08691797200355 bpe_default
66.78208710604798 bpe_prime
93.58694484000108 unigram_default
94.62371369461835 unigram_prime

f1
38.56666767263493 bpe_default
47.96560187669986 bpe_prime
61.07871975731326 unigram_default
69.18776621281309 unigram_prime



### Qualitative Evaluation of Morphological Correctness (Table 3) 

In [18]:
# This generates the data in Table 3 of the paper

inputs = ["directional", "unidirectional", "electroneutral", "neurotransmitter", "responsiveness", "hyporesponsiveness",
         "hyperresponsiveness", "saturated", "unsaturated", "equal", "unequal", "multiplayer", "nonmultiplayer", "overpriced",
         "accessible", "unaccessible", "unicycle"]
table_3 = pd.DataFrame(columns=["input", "bpe_default", "bpe_prime", "unigram_default", "unigram_prime"])

for string in inputs:
    row = {
            'input': string, 
            'bpe_default': ', '.join(bpe_default.encode(string, out_type=str)),
            'bpe_prime': ', '.join(bpe_prime.encode(string, out_type=str)),
            'unigram_default': ', '.join(unigram_default.encode(string, out_type=str)),
            'unigram_prime': ', '.join(unigram_prime.encode(string, out_type=str))
          }
    table_3 = table_3.append(row, ignore_index=True)

table_3

Unnamed: 0,input,bpe_default,bpe_prime,unigram_default,unigram_prime
0,directional,"▁direction, al","direction, al","▁direction, al","direction, al"
1,unidirectional,"▁un, id, ire, ction, al","un, id, ire, ction, al","▁un, i, direct, ional","uni, direction, al"
2,electroneutral,"▁elect, r, one, ut, ral","electr, one, utr, al","▁electron, eu, tral","electro, neutral"
3,neurotransmitter,"▁neuro, trans, mit, ter","neuro, transmitter","▁neuro, trans, mitt, er","neuro, transmitter"
4,responsiveness,"▁respons, iveness","respons, iveness","▁re, s, pon, s, ive, ness","r, e, sp, on, s, ive, ness"
5,hyporesponsiveness,"▁hyp, ores, p, ons, iveness","hypo, respons, iveness","▁hypo, res, pon, s, ive, ness","hypo, r, e, sp, on, s, ive, ness"
6,hyperresponsiveness,"▁hyper, resp, ons, iveness","hyper, respons, iveness","▁hyper, res, pon, s, ive, ness","hyper, r, e, sp, on, s, ive, ness"
7,saturated,"▁sat, urated","sat, urated","▁sat, ur, ated",saturated
8,unsaturated,"▁uns, atur, ated","un, sat, urated","▁un, sa, tur, ated","un, saturated"
9,equal,▁equal,equal,▁equal,equal


In [20]:
# Also define a function to generate tokenisations for each of the four tokenisers given inputs

def generate_tokenisations(strs):
    for x in strs:
        print((', '.join(bpe_default.encode(x, out_type=str))))
        print((', '.join(bpe_prime.encode(x, out_type=str))))
        print((', '.join(unigram_default.encode(x, out_type=str))))
        print((', '.join(unigram_prime.encode(x, out_type=str))))

In [21]:
# Example segmentation from the paper
generate_tokenisations(["thisisasentencethatneedstobesegmented"])

▁this, is, as, ent, enc, eth, at, ne, ed, st, ob, ese, g, mented
this, is, as, ent, ence, that, need, sto, b, ese, g, mented
▁this, isa, s, ent, ence, that, ne, ed, s, to, be, s, eg, ment, ed
this, is, a, sentence, that, needs, to, be, segment, e, d


### Vocabulary Comparison (Table 4)

In [24]:
# Get the vocabularies of the models

unigram_default_vocab = [unigram_default.id_to_piece(id) for id in range(unigram_default.get_piece_size())]
unigram_prime_vocab = [unigram_prime.id_to_piece(id) for id in range(unigram_prime.get_piece_size())]
bpe_default_vocab = [bpe_default.id_to_piece(id) for id in range(bpe_default.get_piece_size())]
bpe_prime_vocab = [bpe_prime.id_to_piece(id) for id in range(bpe_prime.get_piece_size())]

In [34]:
# Remove duplicate items from default Unigram and BPE vocabularies
unigram_default_vocab_no_duplicates = set([x.lstrip('▁') if x != "▁" else x for x in unigram_default_vocab])
bpe_default_vocab_no_duplicates = set([x.lstrip('▁') if x != "▁" else x for x in bpe_default_vocab])

In [35]:
# Percentage of duplicate items for default BPE
100*(1 - len(bpe_default_vocab_no_duplicates) / len(bpe_default_vocab))

8.650000000000002

In [36]:
# Percentage of duplicate items for default Unigram
100*(1 - len(unigram_default_vocab_no_duplicates) / len(unigram_default_vocab))

9.081249999999997

In [None]:
# List of prefixes taken from Wikipedia
prefixes = ["a-",
"after-",
"anti-",
"back-",
"be-",
"by-",
"co-",
"de-",
"dis-",
"dis-",
"down-",
"en-,",
"em",
"ex-",
"fore-",
"hind-",
"mid-",
"midi-",
"mini-",
"mis-",
"off-",
"on-",
"out-",
"over-",
"post-",
"pre-",
"pro-",
"re-",
"self-",
"step-",
"twi-",
"un-",
"un-",
"under-",
"up-",
"with-",
"a-",
"Afro-",
"ambi-",
"amphi-",
"an-",
"a",
"ana-,",
"Anglo-",
"ante-",
"anti-",
"apo-,",
"ap",
"arch-",
"astro-",
"auto-",
"bi-",
"bio-",
"circum-",
"cis-",
"con-,",
"co",
"com",
"col", 
"cor",
"contra-,",
"contro",
"counter-",
"cryo-",
"crypto-",
"de-",
"demi-",
"demo-",
"deuter-",
"di-",
"dia-",
"dis-",
"di",
"dif",
"du-",
"duo-",
"eco-",
"electro-",
"en-,",
"el",
"em",
"epi-,",
"ep",
"Euro-",
"ex-",
"extra-",
"Franco-",
"geo-",
"gyro-",
"hetero-",
"hemi-",
"homo-",
"hydro-",
"hyper-",
"hypo-",
"ideo-",
"idio-",
"in-",
"Indo-",
"in-,",
"il",
"im",
"ir",
"infra-",
"inter-",
"intra-",
"iso-",
"macro-",
"mal-",
"maxi-",
"mega-,",
"megalo",
"meta-",
"micro-",
"mono-,",
"mon",
"multi-,",
"mult",
"neo-",
"non-",
"ob-",
"omni-",
"ortho-",
"paleo-",
"pan-",
"para-",
"ped-",
"pen-",
"per-",
"peri-",
"photo-",
"pleo-",
"pod-",
"poly-",
"post-",
"pre-",
"preter-",
"pro-",
"pro-",
"pros-",
"proto-",
"pseudo-",
"pyro-",
"quadri-",
"quasi-",
"retro-",
"semi-",
"socio-",
"sub-,",
"sup",
"super-",
"supra-",
"sur-",
"syn-,",
"sy",
"syl",
"sym",
"tele-",
"trans-",
"tri-",
"ultra-",
"uni-",
"vice-",]
prefixes = [a.rstrip(',').rstrip('-') for a in prefixes]

In [None]:
# List of suffixes taken from Wikipedia
suffixes = ["-a",
"-ability",
"-able",
"-ably",
"-ac",
"-acean",
"-aceous",
"-ad",
"-ade",
"-aemia",
"-age",
"-agog",
"-aholic",
"-al",
"-algia",
"-all",
"-amine",
"-an",
"-ana",
"-ance",
"-ancy",
"-androus",
"-andry",
"-ane",
"-ant",
"-ar",
"-arch",
"-archy",
"-ard",
"-arian",
"-arium",
"-art",
"-ary",
"-ase",
"-ate",
"-athon",
"-ation",
"-ative",
"-ator",
"-atory",
"B",
"-biont",
"-biosis",
"-blast",
"-bot",
"C",
"-cade",
"-caine",
"-carp",
"-carpic",
"-carpous",
"-cele",
"-cene",
"-centric",
"-cephalic",
"-cephalous",
"-cephaly",
"-chore",
"-chory",
"-chrome",
"-cide",
"-clast",
"-clinal",
"-cline",
"-clinic",
"-coccus",
"-coel",
"-coele",
"-colous",
"-cracy",
"-crat",
"-cratic,",
"-cy",
"-cyte",
"D",
"-dale",
"-derm",
"-derma",
"-dermatous",
"-dom",
"-drome",
"-dromous",
"E",
"-ean",
"-eaux",
"-ectomy",
"-ed",
"-ee",
"-eer",
"-ein",
"-eme",
"-emia",
"-en",
"-ence",
"-enchyma",
"-ency",
"-ene",
"-ent",
"-eous",
"-er",
"-ergic",
"-ergy",
"-es",
"-escence",
"-escent",
"-ese",
"-esque",
"-ess",
"-est",
"-et",
"-eth",
"-etic",
"-ette",
"-ey",
"F",
"-facient",
"-faction",
"-fer",
"-ferous",
"-fic",
"-fication",
"-fid",
"-florous",
"-fold",
"-foliate",
"-foliolate",
"-form",
"-fuge",
"-ful",
"-fy",
"G",
"-gamous",
"-gamy",
"-gate",
"-gen,",
"-genesis",
"-genetic",
"-genic",
"-genous",
"-geny",
"-gnathous",
"-gon",
"-gony",
"-gram",
"-graph",
"-grapher",
"-graphy",
"-gyne",
"-gynous",
"-gyny",
"H",
"-hood",
"I",
"-ia",
"-ial",
"-ian",
"-iana",
"-iasis",
"-iatric",
"-iatrics",
"-iatry",
"-ibility",
"-ible",
"-ic",
"-icide",
"-ician",
"-ick",
"-ics",
"-id",
"-ide",
"-ie",
"-ify",
"-ile",
"-in",
"-ine",
"-ing",
"-ion",
"-ious",
"-isation",
"-ise",
"-ish",
"-ism",
"-ist",
"-istic",
"-istical",
"-istically",
"-ite",
"-itious",
"-itis",
"-ity",
"-ium",
"-ive",
"-ix",
"-ization",
"-ize",
"-i-",
"J",
"K",
"-kin",
"-kinesis",
"-kins",
"L",
"-land",
"-latry",
"-le",
"-lepry",
"-less",
"-let",
"-like",
"-ling",
"-lite",
"-lith",
"-lithic",
"-log",
"-logic",
"-logical",
"-logist",
"-logy",
"-ly",
"-lyse",
"-lysis",
"-lyte",
"-lytic",
"-lyze",
"M",
"-mancy",
"-mania",
"-meister",
"-ment",
"-mer",
"-mere",
"-merous",
"-meter",
"-metric",
"-metrics",
"-metry",
"-mire",
"-mo",
"-morph",
"-morphic",
"-morphism",
"-morphous",
"-most",
"-mycete",
"-mycin",
"N",
"-n't",
"-nasty",
"-ness",
"-nik",
"-nomy",
"-nomics",
"O",
"-o",
"-ode",
"-odon",
"-odont",
"-odontia",
"-oholic",
"-oic",
"-oid",
"-ol",
"-ole",
"-oma",
"-ome",
"-omics",
"-on",
"-one",
"-ont",
"-onym",
"-onymy",
"-opia",
"-opsis",
"-opsy",
"-or",
"-orama",
"-ory",
"-ose",
"-osis",
"-otic",
"-otomy",
"-ous",
"-o-",
"P",
"-para",
"-parous",
"-path",
"-pathy",
"-ped",
"-pede",
"-penia",
"-petal",
"-phage",
"-phagia",
"-phagous",
"-phagy",
"-phane",
"-phasia",
"-phil",
"-phile",
"-philia",
"-philiac",
"-philic",
"-philous",
"-phobe",
"-phobia",
"-phobic",
"-phone",
"-phony",
"-phore",
"-phoresis",
"-phorous",
"-phrenia",
"-phyll",
"-phyllous",
"-plasia",
"-plasm",
"-plast",
"-plastic",
"-plasty",
"-plegia",
"-plex",
"-ploid",
"-pod",
"-pode",
"-podous",
"-poieses",
"-poietic",
"-pter",
"-punk",
"Q",
"R",
"-rrhagia",
"-rrhea",
"-ric",
"-ry",
"S",
"-'s",
"-s",
"-scape",
"-scope",
"-scopy",
"-script",
"-sect",
"-sepalous",
"-ship",
"-some",
"-speak",
"-sperm",
"-sphere",
"-sporous",
"-st",
"-stasis",
"-stat",
"-ster",
"-stome",
"-stomy",
"T",
"-taxis",
"-taxy",
"-tend",
"-th",
"-therm",
"-thermal,",
"-thermy",
"-thon",
"-thymia",
"-tion",
"-tome",
"-tomy",
"-tonia",
"-trichous",
"-trix",
"-tron",
"-trophic",
"-trophy",
"-tropic",
"-tropism",
"-tropous",
"-tropy",
"-tude",
"-ture",
"-ty",
"U",
"-ular",
"-ule",
"-ure",
"-urgy",
"-uria",
"-uronic",
"-urous",
"V",
"-valent",
"-virile",
"-vorous",
"W",
"-ward",
"-wards",
"-ware",
"-ways",
"-wear",
"-wide",
"-wise",
"-worthy",
"X",
"-xor",
"Y",
"-y",
"-yl",
"-yne",
"Z",
"-zilla",
"-zoic",
"-zoon",
"-zygous",
"-zyme",]
suffixes = [a.lstrip('-') for a in suffixes if a[0] == '-']