# Morpheme Finder
[TOC]


## Import & Define Env Variables

In [35]:
from collections import defaultdict
from tqdm import tqdm
from requests import request, ConnectionError
from json import loads
from random import sample
from math import ceil
import pycrfsuite
import re

word_dict_morpholex = defaultdict(None)
word_dict_celex = defaultdict(None)
label_func = defaultdict(None)
known_prefixes = set()
known_suffixes = set()

EVQR_AFFIX = '<evqr.affix>'
PREFIX_AND_SUFFIX = '<prefix.and.suffix>'
VOWEL = '<vowel>'
CELEX_WORD_ROOT = '<celex.word.root>'
morpholex = '<morphoLEX>'

CROSS_VALIDATION_FOLD = 5

In [36]:
try:
    with open('.env.json') as f:
        ENV_VARIABLES = loads(f.read())
        f.close()
except FileNotFoundError:
    ENV_VARIABLES = {'DATA_DIR': 'C:\\'}
DATA_DIR = ENV_VARIABLES['DATA_DIR']
FTP_DIR = 'http://m106.nthu.edu.tw/~s106062341/morpheme_finder_data/'

### Class Word

In [37]:
class Word:

    @staticmethod
    def create_synonym_postfix(word, delete=None, append=None):
        return f'{word}{f"--{delete}--" if delete is not None else ""}{f"++{append}++" if append is not None else ""}'

    @staticmethod
    def create_synonym_prefix(word, delete=None, append=None):
        return f'{f"--{delete}--" if delete is not None else ""}{f"++{append}++" if append is not None else ""}{word}'

    @staticmethod
    def letter_cmp(a, b):
        divider = 0
        for i, (letter_a, letter_b) in enumerate(zip(a, b)):
            if letter_a != letter_b:
                divider = i
        return min(divider, len(a), len(b))

    def __init__(self, text, affix_list):
        self.text = text
        self.affix_list = affix_list
        self.synonym = defaultdict(None)
        self.label = defaultdict(None)

    @property
    def count(self):
        return sum([c for c in self.synonym.values()])

    def create_label(self, label_name, *args):
        if label_name not in label_func:
            return False
        self.label[label_name] = label_func[label_name](self, *args)
        return True

## Data Accessing
### first provide a method to access files either in local storage or in FTP

In [38]:
def get_file(filename: str, callback: classmethod) -> bool:
    try:
        with open(f'asd{DATA_DIR}{filename}', 'r') as f:
            callback(f.read())
            f.close()
            return True
    except FileNotFoundError:
        try:
            res = request('GET', f'{FTP_DIR}{filename}')
            res.encoding = 'Big5'
            callback(res.text)
            return True
        except ConnectionError:
            print('HTTP connection failed')
            return False
        except Exception as e:
            print(f'Load failed: {e}')
            return False

### Load Data
includes:
1. *EVQR.word.and.affix.txt'*
2. *prefixes.txt*
3. *suffixes.txt*

In [39]:
# def evqr_word_and_suffix_callback(content):
#     for line in content.split('\n')[1:-1]:
#         word, *affix_list = line.replace('-', '').split(' ')[:-1]
#         word_dict[word] = (Word(word, affix_list))
# if get_file('EVQR.word.and.affix.txt', evqr_word_and_suffix_callback):
#     print('Load done')

In [40]:
# def celex_word_and_root_callback(content):
#     for line in content.split('\r\n'):
#         word, *affix_list = line.split(' ')
#         word_dict[word] = (Word(word, affix_list))
# if get_file('CELEX.word.and.root.txt', celex_word_and_root_callback):
#     print('Load done')

In [41]:
bad_celex = []
def celex_word_and_root_callback(content):
    for line in content.split('\r\n'):
        word, *affix_list = line.split(' ')
        if word == ''.join(affix_list):
            word_dict_celex[word] = Word(word, affix_list)
        else:
            bad_celex.append(line)
if get_file('CELEX.word.and.root.txt', celex_word_and_root_callback):
    print(f'Load CELEX.word.and.root.txt done [{len(word_dict_celex.keys())} / {len(bad_celex)}]')

Load CELEX.word.and.root.txt done [11770 / 8296]


In [42]:
bad_morpholex = []
def morpholex_callback(content):
    for line in content.split('\r\n'):
        word, *affix_list = line.split('\t')
        try:
            affix_list = affix_list[0].split()
        except:
            print(affix_list)
        if word == ''.join(affix_list):
            word_dict_morpholex[word] = Word(word, affix_list)
        else:
            bad_morpholex.append(line)
if get_file('morphoLEX.txt', morpholex_callback):
    print(f'Load morphoLEX.txt done [{len(word_dict_morpholex.keys())} / {len(bad_morpholex)}]')

[]
Load morphoLEX.txt done [27025 / 41589]


In [43]:
def prefix_callback(content):
    for line in content.split('\n')[1:-1]:
        known_prefixes.update(filter(lambda x: len(x) > 0, line[:-1].strip().replace('-', '').split(', ')))

def suffix_callback(content):
    for line in content.split('\n'):
        known_suffixes.update(filter(lambda x: len(x) > 0, line[:-1].strip().replace('-', '').split(', ')))

if get_file('prefixes_1.txt', prefix_callback) and get_file('suffixes.txt', suffix_callback) and get_file('prefixes.txt', prefix_callback) and get_file('all_suffixes.txt', suffix_callback):
    print('Load prefixes & suffixes done')

Load prefixes & suffixes done


In [44]:
def word_roots_callback(content):
    for line in content.split('\n'):
        prefixes = line.split('\t')[0].split(', ')
        prefixes = [re.sub(r'[^A-z]', '', prefix) for prefix in prefixes]
        if '' not in prefixes:
            known_prefixes.update(prefixes)
if get_file('word_roots.txt', word_roots_callback):
    print('Load word_roots.txt done')

Load word_roots.txt done


In [45]:
# def prefix_callback(content):
#     for line in content.split('\n')[1:-1]:
#         known_prefixes.update(filter(lambda x: len(x) > 0, line[:-1].strip().replace('-', '').split(', ')))
        
# if get_file('prefixes_1.txt', prefix_callback) and get_file('prefixes.txt', prefix_callback):
#     print("DONE")

## Labelize Word
### Mapping Label Function
because different label has its label function respectively

In [46]:
def evqr_affix(word):
    text = word.text
    label = [0] * len(text)
    pos = 0
    for affix in word.affix_list:
        if affix.lower() in text:
            label[text.find(affix, pos)] = 1 if pos != 0 else 0
            pos = text.find(affix, pos) + len(affix)
        else:
            k = Word.letter_cmp(text[pos:], affix)
            if k > 1:
                label[pos] = 1 if pos != 0 else 0
                pos += 1

    return [t for t in zip(text, label)]

def vowel(word):
    vowels = {"a", "e", "i", "o", "u"}
    return [(letter, int(letter in vowels)) for letter in word.text]

def prefix_and_suffix(word):
    word_len = len(word.text)
    label = [0] * word_len

    for i in range(word_len):
        pattern = word.text[:word_len - 1 - i]
        if pattern in known_prefixes:
            label[len(pattern)] = 1

    for i in range(word_len):
        pattern = word.text[i + 1:]
        if pattern in known_suffixes:
            label[i] = 2 if label[i] == 0 else 3

    return [t for t in zip(word.text, label)]

def celex_word_root(word):
    text = word.text
    label = [0] * len(text)
    pos = 0
    for affix in word.affix_list:
        prev_pos = text.find(affix, pos)
        label[prev_pos] = 1 if pos != 0 else 0
        pos = prev_pos + len(affix)

    return [t for t in zip(text, label)]

label_func[EVQR_AFFIX] = evqr_affix
label_func[VOWEL] = vowel
label_func[PREFIX_AND_SUFFIX] = prefix_and_suffix
label_func[CELEX_WORD_ROOT] = celex_word_root
label_func[morpholex] = celex_word_root
print('Mapping done')

Mapping done


### Create Label for each Word

In [47]:
for word in tqdm(word_dict_morpholex.values()):
    if not word.create_label(morpholex):
        print('Failed at combining labels')
        
for word in tqdm(word_dict_celex.values()):
    if not word.create_label(morpholex):
        print('Failed at combining labels')
print('Label done')

100%|█████████████████████████████████| 27025/27025 [00:00<00:00, 445408.29it/s]
100%|█████████████████████████████████| 11770/11770 [00:00<00:00, 411024.82it/s]

Label done





In [48]:
# print(f'labeled by EVQR.word.and.suffix: ignoble -> {word_dict["ignoble"].label[EVQR_AFFIX]}')
# print(f'labeled by prefix & suffix     : demagog -> {word_dict["demagog"].label[PREFIX_AND_SUFFIX]}')
# print(f'labeled by position of vowels  : amphibology -> {word_dict["amphibology"].label[VOWEL]}')

In [49]:
# word_dict = {**word_dict_morpholex, **word_dict_celex}
word_dict = word_dict_celex
# word_dict = word_dict_morpholex

In [50]:
prepared_word = []
for word in tqdm(word_dict.values()):
    prepared_word.append(word.label[morpholex])

100%|████████████████████████████████| 11770/11770 [00:00<00:00, 1582426.45it/s]


## Training
### features creator
based on
1. prev & after letter

In [51]:
def create_char_features(word, i):
    
    text = ''
    length = len(word)
    for j in range(length):
        text += word[j][0]
    pre = text[:i]
    suf = text[i:]
    features = [
        'bias',
        'char=' + word[i][0],
        'vowel=1' if word[i][0] in ['a', 'e', 'i', 'o', 'u'] else 'vowel=0', 
        
        'prefix=1' if pre in known_prefixes else 'prefix=0', 
        'suffix=1' if suf in known_suffixes else 'suffix=0' 
    ]
    
#     not from the beginning nor til the end
    if i > 0:
        for j in range(i+1, length):
            if text[i:j] in known_prefixes or text[i:j] in known_suffixes:
                features.extend([
                    'potential_morpheme'
                ])
                break
    

    if i >= 1:
        features.extend([
            'char-1=' + word[i-1][0],
            'char-1:0=' + word[i-1][0] + word[i][0],
        ])
    else:
        features.append("BOS")

    if i >= 2:
        features.extend([
            'char-2=' + word[i-2][0],
            'char-2:0=' + word[i-2][0] + word[i-1][0] + word[i][0],
            'char-2:-1=' + word[i-2][0] + word[i-1][0],
        ])
        
        
        
    if i + 1 < len(word):
        features.extend([
            'char+1=' + word[i+1][0],
            'char:+1=' + word[i][0] + word[i+1][0],
        ])
    else:
        features.append("EOS")
        
        
    if i + 2 < len(word):
        features.extend([
            'char+2=' + word[i+2][0],
            'char:+2=' + word[i][0] + word[i+1][0] + word[i+2][0],
            'char+1:+2=' + word[i+1][0] + word[i+2][0],
        ])


    if i + 3 < len(word):
        features.extend([
            'char+3=' + word[i+3][0], 
            'char:+3=' + word[i][0] + word[i+1][0] + word[i+2][0] + word[i+3][0],
            'char+1:+3=' + word[i+1][0] + word[i+2][0] + word[i+3][0], 
            'char+2:+3=' + word[i+2][0] + word[i+3][0]
        ])
        
    if i >= 3:
        features.extend([
            'char-3=' + word[i-3][0], 
            'char-3:0=' + word[i-3][0] + word[i-2][0] + word[i-1][0] + word[i][0],
            'char-3:-1=' + word[i-3][0] + word[i-2][0] + word[i-1][0],
            'char-3:-2=' + word[i-3][0] + word[i-2][0]
        ])
        
    if i + 4 < len(word):
        features.extend([
            'char+4=' + word[i+4][0], 
            'char:+4=' + word[i][0] + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0],
            'char+1:+4=' + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0], 
            'char+2:+4=' + word[i+2][0] + word[i+3][0] + word[i+4][0], 
            'char+3:+4=' + word[i+3][0] + word[i+4][0]
        ])
        
    if i >= 4:
        features.extend([
            'char-4=' + word[i-4][0], 
            'char-4:0=' + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0] + word[i][0],
            'char-4:-1=' + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0],
            'char-4:-2=' + word[i-4][0] + word[i-3][0] + word[i-2][0], 
            'char-4:-3=' + word[i-4][0] + word[i-3][0]
        ])
        
    if i + 5 < len(word):
        features.extend([
            'char+5=' + word[i+5][0], 
            'char:+5=' + word[i][0] + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0],
            'char+1:+5=' + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0], 
            'char+2:+5=' + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0], 
            'char+3:+5=' + word[i+3][0] + word[i+4][0] + word[i+5][0], 
            'char+4:+5=' + word[i+4][0] + word[i+5][0],
        ])
        
    if i >= 5:
        features.extend([
            'char-5=' + word[i-5][0], 
            'char-5:0=' + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0] + word[i][0],
            'char-5:-1=' + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0],
            'char-5:-2=' + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0], 
            'char-5:-3=' + word[i-5][0] + word[i-4][0] + word[i-3][0], 
            'char-5:-4=' + word[i-5][0] + word[i-4][0],
        ])
        
        
    if i + 6 < len(word):
        features.extend([
            'char+6=' + word[i+6][0], 
            'char:+6=' + word[i][0] + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0] + word[i+6][0],
            'char+1:+6=' + word[i+1][0] + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0] + word[i+6][0], 
            'char+2:+6=' + word[i+2][0] + word[i+3][0] + word[i+4][0] + word[i+5][0] + word[i+6][0], 
            'char+3:+6=' + word[i+3][0] + word[i+4][0] + word[i+5][0] + word[i+6][0], 
            'char+4:+6=' + word[i+4][0] + word[i+5][0] + word[i+6][0],
            'char+5:+6=' + word[i+5][0] + word[i+6][0],
        ])
        
    if i >= 6:
        features.extend([
            'char-6=' + word[i-6][0], 
            'char-6:0=' + word[i-6][0] + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0] + word[i][0],
            'char-6:-1=' + word[i-6][0] + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0] + word[i-1][0],
            'char-6:-2=' + word[i-6][0] + word[i-5][0] + word[i-4][0] + word[i-3][0] + word[i-2][0], 
            'char-6:-3=' + word[i-6][0] + word[i-5][0] + word[i-4][0] + word[i-3][0], 
            'char-6:-4=' + word[i-6][0] + word[i-5][0] + word[i-4][0],
            'char-6:-5=' + word[i-6][0] + word[i-5][0],
        ])
        
    return features


def create_word_features(prepared_word):
    return [create_char_features(prepared_word, i) for i in range(len(prepared_word))]


def create_word_labels(prepared_word):
    return [str(part[1]) for part in prepared_word]

### create k-fold cross validation
we split all data into 5 folds here

In [52]:
sample_range = set(range(len(word_dict.values())))
sample_set_size = ceil(len(sample_range) / CROSS_VALIDATION_FOLD)
sample_list = []
selected_samples = set()
for i in range(CROSS_VALIDATION_FOLD - 1):
    samples = set(sample(sample_range, sample_set_size))
    sample_list.append(samples)
    sample_range.difference_update(samples)
sample_list.append(set(sample_range))

In [53]:
# for l in sample_list:
#     print(' ')
#     print(list(sorted(l)))

### interface of using pycrfsuite

In [54]:
def train(folds):
    trainer = pycrfsuite.Trainer(verbose=False)
    for fold in tqdm(folds):
        for idx in fold:
            trainer.append(create_word_features(prepared_word[idx]),
                           create_word_labels(prepared_word[idx]))

    trainer.set_params({
        'c1': 0.1,
        'c2': 1e-3,
        'max_iterations': 100,
        'feature.possible_transitions': True
    })
    trainer.train('word-segmentation.crfsuite')


def test(fold):
    tagger = pycrfsuite.Tagger()
    tagger.open('word-segmentation.crfsuite')
    score = 0
    for word in fold:
        w = word.replace(" ", "")
        prediction = tagger.tag(create_word_features(w))
        complete = ""
        for i, p in enumerate(prediction):
            if int(p) >= 1:
                complete += " " + w[i]
            else:
                complete += w[i]
        if complete == ' '.join(word_dict[word].affix_list):
            score += 1
        else:
            continue
#             print(f'{word} -> {complete}, {word_dict[word].affix_list}')
    return score / len(fold)



## Implement
run 5 times of train & test here

In [55]:
word_list = list(word_dict.values())
scores = []
for test_set_idx in range(len(sample_list)):
    test_fold = [word_list[idx].text for idx in sample_list[test_set_idx]]
    train_folds = sample_list[:test_set_idx] + sample_list[(test_set_idx+1):]
    train(train_folds)
    scores.append(test(test_fold))
print(scores)
print(sum(scores) / 5)
    
    

100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.24it/s]
100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.15it/s]
100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.19it/s]
100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.24it/s]
100%|█████████████████████████████████████████████| 4/4 [00:03<00:00,  1.23it/s]


[0.8725573491928632, 0.8598130841121495, 0.8666100254885302, 0.8776550552251486, 0.8619371282922684]
0.867714528462192


In [56]:
tagger = pycrfsuite.Tagger()
tagger.open('word-segmentation.crfsuite')

def segment_word(word):
    w = word.replace(" ", "")
    prediction = tagger.tag(create_word_features(w))
    complete = ""
    for i, p in enumerate(prediction):
        if int(p) >= 1:
            complete += " " + w[i]
        else:
            complete += w[i]
    return complete

In [57]:
segment_word('segmentation')

'segment ation'

In [58]:
segment_word('segment')

'seg ment'

In [59]:
segment_word('intersect')

'inter sect'

In [60]:
segment_word('preview')

'pre view'

Other tests

In [76]:
word_dict_for_test = word_dict_celex
word_dict_for_train = word_dict_morpholex
model_name = 'morpholex'

# word_dict_for_test = word_dict_morpholex
# word_dict_for_train = word_dict_celex
# model_name = 'celex'

In [77]:
# pure word
word_for_test = []
for word in tqdm(word_dict_for_test.values()):
    word_for_test.append(word.text)

100%|████████████████████████████████| 11770/11770 [00:00<00:00, 2327312.75it/s]


In [78]:
word_for_test[0]

'aback'

In [79]:
word_for_train = []
for word in tqdm(word_dict_for_train.values()):
    word_for_train.append(word.label[morpholex])

100%|████████████████████████████████| 27025/27025 [00:00<00:00, 1879941.38it/s]


In [80]:
word_for_train[0]

[('a', 0), ('l', 0), ('f', 0)]

In [81]:

trainer = pycrfsuite.Trainer(verbose=False)
for idx in range(len(word_for_train)):
    trainer.append(create_word_features(word_for_train[idx]),
                   create_word_labels(word_for_train[idx]))

trainer.set_params({
    'c1': 0.1,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_transitions': True
})
trainer.train('word-segmentation_' + model_name + '.crfsuite')

In [82]:
tagger = pycrfsuite.Tagger()
tagger.open('word-segmentation_' + model_name + '.crfsuite')
score = 0
for word in word_for_test:
    w = word.replace(" ", "")
    prediction = tagger.tag(create_word_features(w))
    complete = ""
    for i, p in enumerate(prediction):
        if int(p) >= 1:
            complete += " " + w[i]
        else:
            complete += w[i]
    if complete == ' '.join(word_dict_for_test[word].affix_list):
        score += 1
    else:
        continue
#         print(f'{word} -> {complete}, {word_dict_for_test[word].affix_list}')
print(score / len(word_for_test))

0.7553101104502974


In [83]:
tagger = pycrfsuite.Tagger()
tagger.open('word-segmentation_' + model_name + '.crfsuite')

def segment_word_(word):
    w = word.replace(" ", "")
    prediction = tagger.tag(create_word_features(w))
    complete = ""
    for i, p in enumerate(prediction):
        if int(p) >= 1:
            complete += " " + w[i]
        else:
            complete += w[i]
    return complete

In [84]:
segment_word_('segmentation')

'segmentation'

In [85]:
import pandas as pd
from pandas import DataFrame

In [86]:
d = DataFrame()

In [87]:
ff = [0.86, 0.86, 0.83]
ev = [0.70, 0.75, '_']

In [88]:
d['5-fold cross validation'] = ff
d['acc on the other dataset'] = ev

In [89]:
d

Unnamed: 0,5-fold cross validation,acc on the other dataset
0,0.86,0.7
1,0.86,0.75
2,0.83,_
