## Toxic Comment Classification Challenge

Based on:
- https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams
- https://medium.com/@martinpella/logistic-regression-from-scratch-in-python-124c5636b8ac

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import scipy
from scipy import sparse

In [None]:
# data from https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1aBA7lUyAB4YsfTAFHJDaFSOMZ3dYlmt4' -O test.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1_X4qVCka54NIoTNGfgtA8lClnLaMEEo-' -O train.csv

In [34]:
!ls

 sample_data   test.csv   train2.csv  'view?usp=sharing'
 test2.csv     tets.csv   train.csv


In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [3]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=1000, dtype=np.float32) # 1000 and float32 for smaller size of the data

word_vectorizer.fit(all_text)

train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     stop_words='english',
#     ngram_range=(2, 6),
#     max_features=5) #50000
# char_vectorizer.fit(all_text)
# train_char_features = char_vectorizer.transform(train_text)
# test_char_features = char_vectorizer.transform(test_text)

# train_features = hstack([train_char_features, train_word_features])
# test_features = hstack([test_char_features, test_word_features])

In [4]:
train_features = train_word_features[:50000, :]
test_features = test_word_features 

In [5]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name][:50000]
    classifier = LogisticRegression(C=0.1, solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

# submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.9293452157933823
CV score for class severe_toxic is 0.9703661637757519
CV score for class obscene is 0.9672287752755669
CV score for class threat is 0.9586053365497146
CV score for class insult is 0.9481660654309895
CV score for class identity_hate is 0.9526833173986237
Total CV score is 0.9543991457040047


Hand-made log-reg

In [6]:
class LogisticRegression2:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
        # intercept = sparse.csr_matrix(np.ones((X.shape[0], 1)))
        # return scipy.sparse.hstack([intercept, train_features])
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            loss = self.__loss(h, y)
                
            if(self.verbose == True and i % 5000 == 0):
                print(f'loss: {loss} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X):
        return self.predict_prob(X).round()

### Toy example - iris

In [37]:
from sklearn import datasets

In [38]:
iris = sklearn.datasets.load_iris()
X = iris.data[:, :2]
y = (iris.target != 0) * 1

In [39]:
X.shape, y.shape

((150, 2), (150,))

In [49]:
model = LogisticRegression2(lr=0.6, num_iter=5000)
model.fit(X, y)
preds = model.predict(X)
(preds == y).mean()

1.0

In [7]:
for class_name in class_names:
    print(class_name)

toxic
severe_toxic
obscene
threat
insult
identity_hate


### Result

In [8]:
class_name = 'toxic'
train_data = train_features.toarray().astype(np.float16)
train_target = train[class_name][:50000]

model = LogisticRegression2(lr=0.6, num_iter=5000)
model.fit(train_data, train_target)

preds = model.predict(train_data)

In [9]:
roc_auc_score(train_target, preds)

0.6621700909875433

### CuPy based

In [10]:
import cupy as cp 

class LogisticRegression2_cp:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = cp.ones((X.shape[0], 1))
        return cp.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + cp.exp(-z))

    def __loss(self, h, y):
        return (-y * cp.log(h) - (1 - y) * cp.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = cp.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = cp.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = cp.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            z = cp.dot(X, self.theta)
            h = self.__sigmoid(z)
            loss = self.__loss(h, y)
                
            if(self.verbose ==True and i % 5000 == 0):
                print(f'loss: {loss} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(cp.dot(X, self.theta))
    
    def predict(self, X):
        return self.predict_prob(X).round()

In [15]:
class_name = 'toxic'

train_data = cp.array(train_features.toarray().astype(np.float16))
train_target = cp.array(train[class_name][:50000])

model = LogisticRegression2_cp(lr=0.7, num_iter=50000, verbose=True)
model.fit(train_data, train_target)
preds = model.predict(train_data)

loss: 0.5872211299226802 	
loss: 0.18030092109521675 	
loss: 0.16020582964766272 	
loss: 0.1514020123394156 	
loss: 0.14637800241508434 	
loss: 0.14312807241846429 	
loss: 0.14086036314354874 	
loss: 0.13919394249997266 	
loss: 0.13792176769776543 	
loss: 0.13692149817221402 	


In [16]:
roc_auc_score(cp.asnumpy(train_target), cp.asnumpy(preds))

0.7905050414102754

In [35]:
class_name = 'toxic'

train_data = cp.array(train_features.toarray().astype(np.float16))
train_target = cp.array(train[class_name][:50000])

model = LogisticRegression2_cp(lr=0.2, num_iter=300000, verbose=True)
model.fit(train_data, train_target)
preds = model.predict(train_data)

loss: 0.6607707205136976 	
loss: 0.23159990911221284 	
loss: 0.20128802724835448 	
loss: 0.1856774080671144 	
loss: 0.175922517035376 	
loss: 0.16914512731159656 	
loss: 0.16411420768034918 	
loss: 0.16020870623413344 	
loss: 0.1570777766599001 	
loss: 0.15450626935612238 	
loss: 0.15235386045031074 	
loss: 0.150524566075961 	
loss: 0.1489502088996186 	
loss: 0.14758086030215523 	
loss: 0.1463790205413086 	
loss: 0.14531591999324736 	
loss: 0.14436908284789787 	
loss: 0.1435206730250885 	
loss: 0.14275634160199174 	
loss: 0.14206440535666093 	
loss: 0.14143524957345738 	
loss: 0.14086088617962228 	
loss: 0.14033462162884958 	
loss: 0.13985080371691885 	
loss: 0.1394046260851262 	
loss: 0.13899197550369977 	
loss: 0.1386093113042481 	
loss: 0.13825356926789498 	
loss: 0.1379220843259253 	
loss: 0.13761252788170877 	
loss: 0.13732285660495333 	
loss: 0.13705127030700348 	
loss: 0.1367961770631323 	
loss: 0.13655616416209254 	
loss: 0.1363299737744023 	
loss: 0.13611648246684074 	
loss: 0

In [36]:
roc_auc_score(cp.asnumpy(train_target), cp.asnumpy(preds))

0.7994795323861603

### Timing

In [57]:
#gpu
model = LogisticRegression2_cp(lr=0.6, num_iter=5000)
train_data = cp.array(train_features.toarray().astype(np.float16))
train_target = cp.array(train[class_name][:50000])
%timeit -r 1 model.fit(train_data, train_target)

1 loop, best of 1: 28.2 s per loop


In [59]:
#cpu
model = LogisticRegression2(lr=0.6, num_iter=5000)
train_data = train_features.toarray().astype(np.float16)
train_target = train[class_name][:50000]
%timeit -r 1 model.fit(train_data, train_target)

1 loop, best of 1: 8min 29s per loop


In [60]:
# CuPy version 18 times faster - збс

## BPE from article

In [None]:
# https://leimao.github.io/blog/Byte-Pair-Encoding/

In [22]:
import re, collections # https://docs.python.org/3/library/re.html

def get_vocab(filename):
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1): # for all pairs of two
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair)) # какая-то история с добавлением \ - пробел что ли
    #print(bigram)
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)') # какая-то история с добавлением \ - пробел что ли, 
    # https://docs.python.org/3/library/re.html#re.compile
    #print(p)
    for word in v_in:
        w_out = p.sub(''.join(pair), word) # https://docs.python.org/3/library/re.html#re.sub
        #print(w_out)
        v_out[w_out] = v_in[word]
    return v_out

def get_tokens(vocab):
    tokens = collections.defaultdict(int)
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
    return tokens

def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization

### Experients

In [23]:
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print('Tokens: {}'.format(tokens))
print('Number of tokens: {}'.format(len(tokens)))
print('==========')

Tokens Before BPE
Tokens: defaultdict(<class 'int'>, {'l': 7, 'o': 7, 'w': 16, '</w>': 16, 'e': 17, 'r': 2, 'n': 6, 's': 9, 't': 9, 'i': 3, 'd': 3})
Number of tokens: 11


In [24]:
vocab

{'l o w </w>': 5,
 'l o w e r </w>': 2,
 'n e w e s t </w>': 6,
 'w i d e s t </w>': 3}

In [25]:
tokens

defaultdict(int,
            {'</w>': 16,
             'd': 3,
             'e': 17,
             'i': 3,
             'l': 7,
             'n': 6,
             'o': 7,
             'r': 2,
             's': 9,
             't': 9,
             'w': 16})

In [26]:
pairs = get_stats(vocab)
pairs

defaultdict(int,
            {('d', 'e'): 3,
             ('e', 'r'): 2,
             ('e', 's'): 9,
             ('e', 'w'): 6,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('w', 'i'): 3})

In [27]:
best = max(pairs, key=pairs.get)
best

('e', 's')

In [30]:
num_merges = 10
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens = get_tokens(vocab)
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')

Iter: 0
Best pair: ('l', 'o')
Tokens: defaultdict(<class 'int'>, {'lo': 7, 'w': 16, '</w>': 7, 'e': 8, 'r': 2, 'n': 6, 'est</w>': 9, 'i': 3, 'd': 3})
Number of tokens: 9
Iter: 1
Best pair: ('lo', 'w')
Tokens: defaultdict(<class 'int'>, {'low': 7, '</w>': 7, 'e': 8, 'r': 2, 'n': 6, 'w': 9, 'est</w>': 9, 'i': 3, 'd': 3})
Number of tokens: 9
Iter: 2
Best pair: ('n', 'e')
Tokens: defaultdict(<class 'int'>, {'low': 7, '</w>': 7, 'e': 2, 'r': 2, 'ne': 6, 'w': 9, 'est</w>': 9, 'i': 3, 'd': 3})
Number of tokens: 9
Iter: 3
Best pair: ('ne', 'w')
Tokens: defaultdict(<class 'int'>, {'low': 7, '</w>': 7, 'e': 2, 'r': 2, 'new': 6, 'est</w>': 9, 'w': 3, 'i': 3, 'd': 3})
Number of tokens: 9
Iter: 4
Best pair: ('new', 'est</w>')
Tokens: defaultdict(<class 'int'>, {'low': 7, '</w>': 7, 'e': 2, 'r': 2, 'newest</w>': 6, 'w': 3, 'i': 3, 'd': 3, 'est</w>': 3})
Number of tokens: 9
Iter: 5
Best pair: ('low', '</w>')
Tokens: defaultdict(<class 'int'>, {'low</w>': 5, 'low': 2, 'e': 2, 'r': 2, '</w>': 2, 'newes

In [31]:
vocab

{'low</w>': 5, 'lowe r </w>': 2, 'newest</w>': 6, 'widest</w>': 3}

In [21]:
num_merges = 3
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens = get_tokens(vocab)
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')

e\ s
re.compile('(?<!\\S)e\\ s(?!\\S)')
l o w </w>
l o w e r </w>
n e w es t </w>
w i d es t </w>
Iter: 0
Best pair: ('e', 's')
Tokens: defaultdict(<class 'int'>, {'l': 7, 'o': 7, 'w': 16, '</w>': 16, 'e': 8, 'r': 2, 'n': 6, 'es': 9, 't': 9, 'i': 3, 'd': 3})
Number of tokens: 11
es\ t
re.compile('(?<!\\S)es\\ t(?!\\S)')
l o w </w>
l o w e r </w>
n e w est </w>
w i d est </w>
Iter: 1
Best pair: ('es', 't')
Tokens: defaultdict(<class 'int'>, {'l': 7, 'o': 7, 'w': 16, '</w>': 16, 'e': 8, 'r': 2, 'n': 6, 'est': 9, 'i': 3, 'd': 3})
Number of tokens: 10
est\ \<\/w\>
re.compile('(?<!\\S)est\\ \\<\\/w\\>(?!\\S)')
l o w </w>
l o w e r </w>
n e w est</w>
w i d est</w>
Iter: 2
Best pair: ('est', '</w>')
Tokens: defaultdict(<class 'int'>, {'l': 7, 'o': 7, 'w': 16, '</w>': 7, 'e': 8, 'r': 2, 'n': 6, 'est</w>': 9, 'i': 3, 'd': 3})
Number of tokens: 10


In [None]:
# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

# Get free book from Gutenberg
# wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt
vocab = get_vocab('pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print('Tokens: {}'.format(tokens))
print('Number of tokens: {}'.format(len(tokens)))
print('==========')

num_merges = 1000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens = get_tokens(vocab)
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')

### Encoding + Decoding

In [87]:
import re, collections

def get_vocab(filename):
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1

    return vocab

def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization

def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    print('TTT', sorted_tokens)
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        #print('matched_positions', matched_positions)
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            #print(substring)
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            #print(token)
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    return string_tokens


In [71]:
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

Tokens Before BPE
All tokens: dict_keys(['l', 'o', 'w', '</w>', 'e', 'r', 'n', 's', 't', 'i', 'd'])
Number of tokens: 11


In [72]:
tokens_frequencies, vocab_tokenization

(defaultdict(int,
             {'</w>': 16,
              'd': 3,
              'e': 17,
              'i': 3,
              'l': 7,
              'n': 6,
              'o': 7,
              'r': 2,
              's': 9,
              't': 9,
              'w': 16}),
 {'low</w>': ['l', 'o', 'w', '</w>'],
  'lower</w>': ['l', 'o', 'w', 'e', 'r', '</w>'],
  'newest</w>': ['n', 'e', 'w', 'e', 's', 't', '</w>'],
  'widest</w>': ['w', 'i', 'd', 'e', 's', 't', '</w>']})

In [73]:
print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

num_merges = 3 # 10000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')


Tokens Before BPE
All tokens: dict_keys(['l', 'o', 'w', '</w>', 'e', 'r', 'n', 's', 't', 'i', 'd'])
Number of tokens: 11
Iter: 0
Best pair: ('e', 's')
All tokens: dict_keys(['l', 'o', 'w', '</w>', 'e', 'r', 'n', 'es', 't', 'i', 'd'])
Number of tokens: 11
Iter: 1
Best pair: ('es', 't')
All tokens: dict_keys(['l', 'o', 'w', '</w>', 'e', 'r', 'n', 'est', 'i', 'd'])
Number of tokens: 10
Iter: 2
Best pair: ('est', '</w>')
All tokens: dict_keys(['l', 'o', 'w', '</w>', 'e', 'r', 'n', 'est</w>', 'i', 'd'])
Number of tokens: 10


In [74]:
# Let's check how tokenization will be for a known word
word_given_known = 'mountains</w>'
#word_given_known = 'JJJ'
word_given_unknown = 'Ilikeeatingapples!</w>'

sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print(sorted_tokens)

word_given = word_given_known 

['est</w>', 'w', 'e', 'l', 'o', '</w>', 'n', 'i', 'd', 'r']


In [75]:
for i in vocab_tokenization:
    print(i)

low</w>
lower</w>
newest</w>
widest</w>


In [88]:
print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenizating of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

Tokenizing word: mountains</w>...
Tokenizating of the unknown word:
TTT ['est</w>', 'w', 'e', 'l', 'o', '</w>', 'n', 'i', 'd', 'r']
matched_positions []
matched_positions [(11, 12)]
TTT ['e', 'l', 'o', '</w>', 'n', 'i', 'd', 'r']
matched_positions []
matched_positions []
matched_positions [(1, 2)]
TTT ['</w>', 'n', 'i', 'd', 'r']
matched_positions []
matched_positions []
matched_positions []
matched_positions []
matched_positions []
TTT ['</w>', 'n', 'i', 'd', 'r']
matched_positions []
matched_positions [(1, 2), (5, 6)]
TTT ['i', 'd', 'r']
matched_positions []
matched_positions []
matched_positions []
TTT ['i', 'd', 'r']
matched_positions [(2, 3)]
TTT ['d', 'r']
matched_positions []
matched_positions []
TTT ['d', 'r']
TTT ['i', 'd', 'r']
matched_positions []
matched_positions []
matched_positions []
TTT ['e', 'l', 'o', '</w>', 'n', 'i', 'd', 'r']
matched_positions []
matched_positions []
matched_positions []
matched_positions []
matched_positions []
matched_positions []
matched_positio

In [44]:
tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>')

['o', 'n', 'i', 'n', 'w']

In [45]:
word_given = word_given_unknown 

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenizating of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

Tokenizing word: Ilikeeatingapples!</w>...
Tokenizating of the unknown word:
['l', 'i', 'e', 'e', 'i', 'n', 'l', 'e', 'w']
