# Counting length of a string
Let's use a neural net to see if we can detect the parity of a word's length: does the word have an even or an odd number of characters?

### Import the data and grok it

In [1]:
import json
from pathlib import Path

from fastai2.vision.all import *
import pandas as pd
import torch

In [2]:
# This is a list of over 370,000 "English words", from https://github.com/dwyl/english-words
# Note that these aren't necessarily useful words; a lot of them look like gibberish.
# NOTE(review): hardcoded absolute path -- point it at wherever words_dictionary.json lives locally.
path = Path('/storage/data/words_dictionary.json')

In [3]:
# `words` is a dict object; the keys are the words, the values are irrelevant.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which differs between machines).
with path.open(encoding='utf-8') as f:
    words = json.load(f)

In [4]:
# peek at the first ten entries (iterating a dict yields its keys)
from itertools import islice
print(list(islice(words, 10)))

['a', 'aa', 'aaa', 'aah', 'aahed', 'aahing', 'aahs', 'aal', 'aalii', 'aaliis']


In [5]:
# materializing all the keys as a list lets us index into them and draw random words
from random import choice, randrange, sample

word_list = [*words]
print(sample(word_list, 10))

['camelina', 'stigmatoid', 'polyzoarial', 'trachodont', 'nonauthentication', 'phragmosis', 'pseudaconine', 'unemerging', 'sewery', 'microfibrillar']


In [6]:
# let's define some function to learn.
# here, I'm going to learn whether the word has an even or odd number of letters in it

# as applied to a word or list of words
def learn_fn(words):
    """Return the length parity (0 = even, 1 = odd) of a word or of each word in an iterable.

    A single `str` gives a single int; anything else is treated as an
    iterable of words and gives a list of ints, one per word.
    """
    if not isinstance(words, str):
        return [len(w) % 2 for w in words]
    return len(words) % 2

# as applied to a tensor
def label_fn(vec):
    """Return the parity (0 = even, 1 = odd) of the number of nonzero entries in `vec`.

    For a zero-padded word vector this is the parity of the word's length.
    """
    # nonzero() yields one row per nonzero entry, so its first dimension is the count
    nonzero_count = vec.nonzero().size(0)
    return nonzero_count % 2

def label_str(label):
    """Turn a parity label into a readable name: 1 -> 'odd', anything else -> 'even'."""
    if label == 1:
        return 'odd'
    return 'even'

In [8]:
# spot-check: (word, length, parity label) for ten randomly sampled words
[(w, len(w), learn_fn(w)) for w in sample(word_list, 10)]

[('tinamou', 7, 1),
 ('holocausts', 10, 0),
 ('retinoscopically', 16, 0),
 ('nesquehonite', 12, 0),
 ('amphistomoid', 12, 0),
 ('cowpony', 7, 1),
 ('renographic', 11, 1),
 ('homogeneization', 15, 1),
 ('germanite', 9, 1),
 ('reflate', 7, 1)]

In [9]:
# Let's try image techniques. Every word becomes a numerical vector of the same
# length: the ord() code of each character, followed by zeros padding it out to
# the length of the longest word in the dataset.

# NOTE: for this dataset, we find maxlen = 31
maxlen = max(map(len, words))
word_vecs = [
    tensor(list(map(ord, word)) + [0] * (maxlen - len(word)))
    for word in words
]

In [10]:
# get the word back

def vec2word(vec):
    """Decode a zero-padded vector of character codes back into its word.

    Entries that are not greater than zero (the padding) are dropped; every
    other entry is converted with chr().
    """
    chars = []
    for n in vec:
        if n > 0:
            chars.append(chr(n))
    return ''.join(chars)

# decode a randomly chosen vector to confirm the round trip gives back a word
vec2word(choice(word_vecs))

'precommune'

In [11]:
# Sanity checks on the encoded vectors.
from collections import Counter

# every vector should have the same padded length, so expect a single key here
print(Counter(map(len, word_vecs)))

# the nonzero entries of a random vector should be the character codes of its word
w = choice(word_vecs)
print(f'{vec2word(w)}\n{w}')

Counter({31: 370101})
topminnows
tensor([116, 111, 112, 109, 105, 110, 110, 111, 119, 115,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0])


### baseline using pointwise distance to mean

In [12]:
# If we treat the words as "images", we want the entries to be in [0, 1]; that means we need to know
# how large the entries can be. The entries are ord() values of the characters, so if every character
# lies in a-z it is enough to scale by ord('z').
# NOTE(review): a character above 'z' would scale to a value > 1 -- confirm the dataset is a-z only.
maxchar = ord('z')
maxchar

122

In [30]:
# Let's construct a training and validation set (80% / 20%).
# (Normally we should be careful to split each class into training/validation;
# here the classes are large and of about equal size so we should be fine.)
#
# The split is drawn from a dedicated, seeded generator so that it -- and every
# number computed from it below -- is the same on each run, without touching
# the global `random` state used by the exploratory cells.
from random import Random

SPLIT_SEED = 42

words_range = range(len(words))
valid_inds = Random(SPLIT_SEED).sample(words_range, int(0.2 * len(words)))
valid_set = set(valid_inds)  # set gives O(1) membership tests for the filter below
train_inds = [ind for ind in words_range if ind not in valid_set]

# scale the character codes into [0, 1] and gather the rows of each split
words_stack = torch.stack(word_vecs).float()/maxchar
train_stack = words_stack[train_inds]
valid_stack = words_stack[valid_inds]

In [32]:
# Separate the odds and evens out in our training set.
# The parity of every training row is computed in one vectorised pass (the
# number of nonzero entries per row, mod 2 -- the same rule label_fn applies to
# a single vector) instead of calling label_fn once per row in two Python loops.
# Boolean-mask indexing keeps the rows in their train_stack order.
train_labels = (train_stack != 0).sum(1) % 2
odds = train_stack[train_labels == 1]
evens = train_stack[train_labels == 0]

# the size of a datapoint will be the total number of letters possible; maxlen from earlier
print(f'all: {words_stack.size()}')
print(f'train: {train_stack.size()}')
print(f'odd train: {odds.size()}')
print(f'even train: {evens.size()}')

all: torch.Size([370101, 31])
train: torch.Size([296081, 31])
odd train: torch.Size([147604, 31])
even train: torch.Size([148477, 31])


In [33]:
# Per-position mean value over all training words and over each class,
# shown side by side as a shaded table.
train_mean, odd_mean, even_mean = (
    stack.mean(0) for stack in (train_stack, odds, evens)
)
df = pd.DataFrame({'train': train_mean, 'odd': odd_mean, 'even': even_mean})
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')

Unnamed: 0,train,odd,even
0,0.882253,0.883518,0.883953
1,0.882186,0.882094,0.882263
2,0.885521,0.886372,0.884434
3,0.875855,0.871932,0.879827
4,0.859042,0.872147,0.845863
5,0.820686,0.796139,0.84497
6,0.749674,0.794501,0.70516
7,0.650787,0.597201,0.70408
8,0.527413,0.595606,0.459664
9,0.39967,0.341291,0.457694


In [15]:
# Baseline idea: measure the MSE between a word and each class mean; the
# closer mean is the prediction. Try it on one randomly chosen word.
ind = randrange(len(words_stack))
vec = words_stack[ind]
print(f'{word_list[ind]} is {label_str(label_fn(vec))}. odd mse: {F.mse_loss(vec, odd_mean):.4f}, even mse: {F.mse_loss(vec, even_mean):.4f}')

logopedic is odd. odd mse: 0.0135, even mse: 0.0167


In [40]:
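# overall performance by comparing each word to the class means
# (the helper below takes the square root of the MSE, i.e. RMSE; that gives the same ranking as MSE)
# I don't see an obvious way to use mse_loss on a stack vs. an individual vector (i.e., the broadcast semantics)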
def word_dist(a,b):
    """Root-mean-square difference between `a` and `b`, reduced over the last dimension.

    The subtraction broadcasts, so a stack of vectors can be compared against
    a single vector; the result then has one distance per row.
    """
    squared_diff = (a - b).pow(2)
    return squared_diff.mean(dim=-1).sqrt()

def is_odd(x):
    """Predict 'odd' wherever `x` is strictly closer to the odd-class mean than to the even-class mean.

    Distances come from word_dist against the module-level `odd_mean` and
    `even_mean`; ties are classified as even.
    """
    dist_to_odd = word_dist(x, odd_mean)
    dist_to_even = word_dist(x, even_mean)
    return dist_to_odd < dist_to_even

In [41]:
# Classes are about the same size, so it's not insane to be lazy and average
# the two per-class validation accuracies.
# The parity of every validation row is computed in one vectorised pass (the
# number of nonzero entries per row, mod 2 -- the same rule label_fn applies to
# a single vector) instead of calling label_fn once per row in two Python loops.
valid_labels = (valid_stack != 0).sum(1) % 2
odds_valid = valid_stack[valid_labels == 1]
evens_valid = valid_stack[valid_labels == 0]

# fraction of truly-odd words predicted odd, and of truly-even words predicted even
odd_accuracy = is_odd(odds_valid).float().mean()
even_accuracy = (1 - is_odd(evens_valid).float()).mean()

odd_accuracy, even_accuracy, (odd_accuracy + even_accuracy)/2

(tensor(0.9269), tensor(0.9241), tensor(0.9255))

not bad! ~92.5% accuracy

### train a fastai Learner

In [9]:
# let's turn this data into a dataframe to prepare for turning it into a fastai2.text.TextDataLoader
# NOTE: this rebinds `df`, which an earlier cell used for the table of per-class means
import pandas as pd

# one row per word: the word itself and its length-parity label (0 = even, 1 = odd)
df = pd.DataFrame({'text': word_list, 'label': learn_fn(word_list)})