# Counting length of a string
Let's use a neural net to see if we can detect the parity of a word's length: does it have an even or odd number of characters?

### Import the data and grok it

In [2]:
import json
from pathlib import Path

from fastai2.vision.all import *
import pandas as pd
import torch

In [3]:
# Source data: a list of over 370,000 "English words", from https://github.com/dwyl/english-words
# Note that these aren't necessarily useful words; a lot of them look like gibberish.
# NOTE(review): this is a hard-coded absolute path; point it at wherever the JSON file lives locally.
path = Path('/storage/data/words_dictionary.json')

In [4]:
# `words` is a dict: its keys are the words themselves, its values are not used
words = json.loads(path.read_text())

In [5]:
# peek at the first few words without materialising the whole key list
from itertools import islice
print(list(islice(words, 10)))

['a', 'aa', 'aaa', 'aah', 'aahed', 'aahing', 'aahs', 'aal', 'aalii', 'aaliis']


In [6]:
# materialise every word into a list so that we can draw random samples from it
from random import choice, randrange, sample

word_list = [*words]
print(sample(word_list, k=10))

['drawtubes', 'frantically', 'stilliest', 'vapulatory', 'scrotal', 'nonresistively', 'anthropogeographic', 'antimoniated', 'overhang', 'sundra']


In [7]:
# let's define some function to learn.
# here, I'm going to learn whether the word has an even or odd number of letters in it

# as applied to a word or list of words
def learn_fn(words):
    """Return the length parity (0 = even, 1 = odd) of a word, or of each word in an iterable.

    A single ``str`` yields one int; any other iterable yields a list of ints.
    """
    def parity(word):
        return len(word) % 2

    if not isinstance(words, str):
        return list(map(parity, words))
    return parity(words)

# as applied to a tensor
def label_fn(vec):
    """Return the parity (0 = even, 1 = odd) of the number of non-zero entries in tensor ``vec``."""
    n_nonzero = int((vec != 0).sum())
    return n_nonzero % 2

def label_str(label):
    """Human-readable name for a parity label: 'odd' for 1, 'even' for anything else."""
    if label == 1:
        return 'odd'
    return 'even'

In [8]:
# show (word, length, parity label) for a random handful of words
[(w, len(w), learn_fn(w)) for w in sample(word_list, 10)]

[('attroopment', 11, 1),
 ('foresense', 9, 1),
 ('scablike', 8, 0),
 ('wite', 4, 0),
 ('churchwomen', 11, 1),
 ('submanic', 8, 0),
 ('incohesive', 10, 0),
 ('shysters', 8, 0),
 ('smuggled', 8, 0),
 ('iodophors', 9, 1)]

In [9]:
# Let's try image techniques: encode every character of a word as its ASCII
# code, then right-pad with zeros so that all vectors share the length of the
# longest word present in the dataset.

# NOTE: for this dataset, we find maxlen = 31
maxlen = max(map(len, words))
word_vecs = [
    tensor(list(map(ord, word)) + [0] * (maxlen - len(word)))
    for word in words
]

In [10]:
# get the word back

def vec2word(vec):
    """Decode a zero-padded sequence of character codes back into its word."""
    letters = []
    for code in vec:
        # entries that are not positive are padding, not characters
        if code > 0:
            letters.append(chr(code))
    return ''.join(letters)

# round-trip check: decode one randomly chosen vector back into its word
vec2word(choice(word_vecs))

'conjecturally'

In [11]:
# sanity check that the encoding was built correctly

# every vector should have the same (padded) length
from collections import Counter
print(Counter(map(len, word_vecs)))

# the non-zero entries should be the character codes of the word
w = choice(word_vecs)
print(f'{vec2word(w)}\n{w}')

Counter({31: 370101})
microcosmically
tensor([109, 105,  99, 114, 111,  99, 111, 115, 109, 105,  99,  97, 108, 108,
        121,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0])


### baseline using pointwise distance to mean

In [12]:
# If we treat the words as "images", we want the entries to be in [0, 1]; that means we need to know
# how large the entries can be. As they are ASCII ord() values for characters in [a-z], we just need to
# scale by ord('z').
maxchar = ord('z')
maxchar

122

In [56]:
# the tensors we construct need to be returned to integers before displaying
def tens2word(tens):
    """Decode a scaled word tensor (character codes divided by ``maxchar``) into its word.

    Zero entries are treated as padding and skipped.
    """
    # Dividing by maxchar and multiplying back is not exact in floating point,
    # so round to the nearest code point rather than truncating, and hand
    # `chr` a genuine int (it does not accept floats).
    return ''.join(chr(round(float(n) * maxchar)) for n in tens if n > 0)

In [13]:
# Build a training and a validation set.
# (Normally we should be careful to split each class into training/validation;
# here the classes are large and of about equal size, so a plain random split should be fine.)

words_range = range(len(words))
n_valid = int(0.2 * len(words))  # hold out 20% of the words for validation
valid_inds = sample(words_range, n_valid)
valid_set = set(valid_inds)  # set membership keeps the filter below cheap
train_inds = list(filter(lambda ind: ind not in valid_set, words_range))

# scale the character codes by 1/maxchar
words_stack = torch.stack(word_vecs).float().div(maxchar)
train_stack, valid_stack = words_stack[train_inds], words_stack[valid_inds]

In [14]:
# separate the odds and evens out in our training set
train_labels = [label_fn(word_vecs[i]) for i in train_inds]  # label every training word once
odds = words_stack[[i for i, lab in zip(train_inds, train_labels) if lab == 1]]
evens = words_stack[[i for i, lab in zip(train_inds, train_labels) if lab == 0]]

# the size of a datapoint will be the total number of letters possible; maxlen from earlier
print(f'all: {words_stack.size()}')
print(f'train: {train_stack.size()}')
print(f'odd train: {odds.size()}')
print(f'even train: {evens.size()}')

all: torch.Size([370101, 31])
train: torch.Size([296081, 31])
odd train: torch.Size([147869, 31])
even train: torch.Size([148212, 31])


In [15]:
# per-position mean of the scaled character codes, overall and for each class
train_mean, odd_mean, even_mean = (stack.mean(dim=0) for stack in (train_stack, odds, evens))
df = pd.DataFrame({'train': train_mean, 'odd': odd_mean, 'even': even_mean})
# render the table in a small font with a greyscale background gradient
styled = df.style.set_properties(**{'font-size': '6pt'})
styled.background_gradient('Greys')

Unnamed: 0,train,odd,even
0,0.882319,0.883605,0.884008
1,0.882161,0.88213,0.882181
2,0.885533,0.886314,0.884494
3,0.875868,0.871911,0.879888
4,0.859043,0.872066,0.845894
5,0.820721,0.796266,0.845006
6,0.749339,0.794724,0.704108
7,0.649917,0.596948,0.702796
8,0.527484,0.595354,0.459819
9,0.39986,0.341841,0.457735


In [16]:
# we can use the MSE between a word and each class mean as a metric.
ind = randrange(len(words_stack))
vec = words_stack[ind]
odd_mse, even_mse = F.mse_loss(vec, odd_mean), F.mse_loss(vec, even_mean)
print(f'{word_list[ind]} is {label_str(label_fn(vec))}. odd mse: {odd_mse:.4f}, even mse: {even_mse:.4f}')

shadchanim is even. odd mse: 0.0215, even mse: 0.0179


In [17]:
# overall performance by comparing to mean via MSE
# don't see an obvious way to use mse_loss on stack vs. individual (i.e., the broadcast semantics)
def word_dist(a,b):
    """Root-mean-square difference between ``a`` and ``b``, taken over the last dimension.

    The subtraction broadcasts, so a stack of vectors can be compared against a single vector.
    """
    squared_diff = torch.pow(a - b, 2)
    return torch.sqrt(squared_diff.mean(dim=-1))

def is_odd(x):
    """True where ``x`` is closer (by ``word_dist``) to ``odd_mean`` than to ``even_mean``."""
    dist_to_odd = word_dist(x, odd_mean)
    dist_to_even = word_dist(x, even_mean)
    return dist_to_odd < dist_to_even

In [18]:
# classes are the same size so it's not insane to be lazy and average the per-class validation accuracy
valid_labels = [label_fn(word_vecs[i]) for i in valid_inds]  # label every validation word once
odds_valid = words_stack[[i for i, lab in zip(valid_inds, valid_labels) if lab == 1]]
evens_valid = words_stack[[i for i, lab in zip(valid_inds, valid_labels) if lab == 0]]

# 1.0 where the nearest-mean rule gets the class right, 0.0 where it does not
odd_hits = is_odd(odds_valid).float()
even_hits = 1 - is_odd(evens_valid).float()
odd_accuracy, even_accuracy = odd_hits.mean(), even_hits.mean()

odd_accuracy, even_accuracy, (odd_accuracy + even_accuracy)/2

(tensor(0.8938), tensor(0.9244), tensor(0.9091))

Not bad! ~91% accuracy, averaged over the two classes.

### train a fastai Learner

In [64]:
# let's construct a Dataset of (input, label) pairs and a DataLoader that batches it
dset = L([(tens, label_fn(tens)) for tens in words_stack])
dl = DataLoader(dset, shuffle=True, batch_size=64)

In [65]:
# let's examine a batch from the DataLoader: each input decoded back to its word, next to its label
xb, yb = first(dl)
[(tens2word(x), y.item()) for x, y in zip(xb, yb)]

[('disorganised', 0),
 ('furnarius', 1),
 ('rehabilitate', 0),
 ('untallied', 1),
 ('premiated', 1),
 ('digitinerved', 0),
 ('savorlessness', 1),
 ('concernedly', 1),
 ('dalf', 0),
 ('moism', 1),
 ('epirrheme', 1),
 ('humours', 1),
 ('flotson', 1),
 ('cointers', 0),
 ('caup', 0),
 ('techne', 0),
 ('ingenu', 0),
 ('raffish', 1),
 ('trehala', 1),
 ('allocatee', 1),
 ('hypopharyngeal', 0),
 ('epicyesis', 1),
 ('quart', 1),
 ('characterological', 1),
 ('discepts', 0),
 ('stabilization', 1),
 ('priestship', 0),
 ('biennia', 1),
 ('altars', 0),
 ('nonextensible', 1),
 ('recomprehend', 0),
 ('autodialed', 0),
 ('berzeliite', 0),
 ('sultanates', 0),
 ('thiostannic', 1),
 ('demibrute', 1),
 ('hyperpyramid', 0),
 ('anthrapurpurin', 0),
 ('exantlation', 1),
 ('contentiously', 1),
 ('puddingheaded', 1),
 ('pecksniffian', 0),
 ('cowwheat', 0),
 ('misgauges', 1),
 ('karting', 1),
 ('anxiety', 1),
 ('sphenocephalous', 1),
 ('celtillyrians', 1),
 ('debituminize', 0),
 ('gimmer', 0),
 ('haleness', 0),


In [None]:
# we need a loss function, and a metric to track
# for the loss, want to push 

In [None]:
# let's start with a linear model: one weight per padded character position
# (each input vector has maxlen entries), producing a single output logit
linear_model = nn.Linear(maxlen, 1)

# a Learner needs a training and a validation loader; reuse the index split made earlier
train_dl = DataLoader(dset[train_inds], batch_size=64, shuffle=True)
valid_dl = DataLoader(dset[valid_inds], batch_size=64)
dls = DataLoaders(train_dl, valid_dl)

# NOTE(review): binary cross-entropy on the single logit, tracked with thresholded
# accuracy, is one reasonable choice for this 0/1 target -- confirm it matches the
# experiment that was intended here (the original left both arguments blank)
learn = Learner(dls, linear_model, opt_func=SGD,
                loss_func=BCEWithLogitsLossFlat(), metrics=accuracy_multi)

In [70]:
# scratch: a flattened cross-entropy loss, random scores of shape (32, 5, 10),
# and random integer targets in [0, 10) of shape (32, 5)
tst = CrossEntropyLossFlat()
out = torch.randn((32, 5, 10))
tar = torch.randint(low=0, high=10, size=(32, 5))

In [72]:
# shape of the random scores
out.shape

torch.Size([32, 5, 10])

In [76]:
# which words are exactly 28 characters long?
print(list(filter(lambda w: len(w) == 28, words)))

['antidisestablishmentarianism', 'hydroxydehydrocorticosterone']


In [9]:
# let's turn this data into a dataframe to prepare for turning it into a fastai2.text.TextDataLoader
import pandas as pd

parity_labels = learn_fn(word_list)  # 0 = even length, 1 = odd length
df = pd.DataFrame({'text': word_list, 'label': parity_labels})