# Software 2.0

In 2017, Andrej Karpathy wrote a seminal [blog post](https://karpathy.medium.com/software-2-0-a64152b37c35) where he introduced the concept of "software 2.0". In a nutshell, he argued that machine learning was just another tool in the software engineer's toolbox, and that there shouldn't really be a difference between software engineer and machine learning engineer. The innovation of machine learning is to substitute imperative programming with what I would call "programming by example". Since I've tried to make this point to many people over the years, I figured I would try to make it crystal clear with a super simple (and somewhat dumb) example.

A friend was recently preparing for a technical interview and he shared with me one of the coding exercises, which was a simple function to convert Roman numerals to integers. For example:
- IV -> 4
- MCXCIII -> 1193
- DXLVIII -> 548
- MMMDCXVI -> 3616
- XXI -> 21

I'll first write code for the "classic" way of programming this ("imperative programming"). Then I'll contrast it with the "machine learning way" or "programming by example". Finally, I'll draw some general conclusions.

In [None]:
class RomanToInteger:
    def __init__(self):
        self.mapping = {
            'I': 1, 
            'V': 5, 
            'X': 10, 
            'L': 50, 
            'C': 100, 
            'D': 500, 
            'M': 1000
        }

    def __call__(self, s):
        l = len(s)
        tot = 0
        prev_n = 0
        for i in range(l):
            current_n = self.mapping[s[i]]
            next_n = self.mapping[s[i+1]] if i+1 < l else 0
            if current_n >= next_n:
                tot += (current_n - prev_n)
                prev_n = 0
            else:
                prev_n = current_n
        return tot

In [2]:
class IntegerToRoman:
    def __init__(self):
        self.mapping = {
            1: 'I', 
            5: 'V', 
            10: 'X', 
            50: 'L', 
            100: 'C', 
            500: 'D', 
            1000: 'M'
        }

    def __call__(self, n):
        div = 1
        while n >= div: div *= 10
        div //= 10
        out = []
        while n:
            d = n // div
            if d < 4:
                o = self.mapping[div]*d
            elif d == 4:
                o = self.mapping[div] + self.mapping[div*5]
            elif d < 9:
                o = self.mapping[div*5] + (d-5)*self.mapping[div]
            else:
                o = self.mapping[div] + self.mapping[div*10]
            out.append(o)
            n = n % div
            div //= 10
        return ''.join(out)

In [178]:
import random
from pathlib import Path

default_data_path = Path('data')

def make_data(splits, path=None, max_num=3999):
    if path is None: path = default_data_path
    if not path.exists(): path.mkdir()
    mapper = IntegerToRoman()
    for split, size in splits.items():
        with open(path/(split+'.txt'), 'w') as o:
            for _ in range(int(size)):
                i = random.randint(1, max_num)
                r = mapper(i)
                o.write(' '.join([str(i), r]) + '\n')

In [272]:
splits  = {
    'train': 1e6,
    'valid': 1e5
}

make_data(splits)

In [273]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import IterableDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from functools import partial

vocab = build_vocab_from_iterator(
    list(map(str,range(10)))
    + ['I','V','X','L','C','D','M'],
    specials=['<bos>', '<pad>', '<eos>']
)

pad_idx = vocab['<pad>']

class NumberDataset(IterableDataset):
    valid_targets = ['roman','integer']

    def __init__(self, split, target='roman', root=None, extension='txt'):
        if root is None: root = default_data_path   
        self.url = root/('.'.join([split,extension]))
        assert target in self.valid_targets, f'Target needs to be one of {self.valid_targets}'
        self.target = target

    def __iter__(self):
        try:
            with open(self.url) as f:
                for line in f:
                    i, r = line.split()
                    yield (i,r) if self.target == 'roman' else (r,i)
        except StopIteration:
            pass

def transform(vocab, x):
    seq = ['<bos>'] + list(x) + ['<eos>']
    return [vocab[tok] for tok in seq]

def untransform(vocab, x):
    out = []
    for idx in x:
        tok = vocab.lookup_token(idx)
        if tok == '<bos>': continue
        elif tok == '<eos>': return ''.join(out)
        else: out.append(tok)
    return ''.join(out)

def collate(batch):
    src_lst, tgt_lst = [], []
    for src, tgt in batch:
        src, tgt = map(partial(transform, vocab), [src, tgt])
        src, tgt = map(torch.tensor, [src, tgt])
        src_lst.append(src)
        tgt_lst.append(tgt)
    return list(map(partial(pad_sequence, padding_value=pad_idx, batch_first=True), [src_lst, tgt_lst]))

In [274]:
train_iter = NumberDataset('train')
valid_iter = NumberDataset('valid')
train_dl = DataLoader(list(train_iter), batch_size=8, collate_fn=collate)
valid_dl = DataLoader(list(valid_iter), batch_size=8, collate_fn=collate)

In [275]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_size=100, hidden_size=100, dropout=.2, n_layers=2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = self.dropout(self.emb(x))
        out, (h, c) = self.rnn(x)
        return h, c

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_size=100, hidden_size=100, dropout=.2, n_layers=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, batch_first=True, dropout=dropout)
        self.lin = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, h, c):
        # x is a single token (tensor shape N) 
        # but we need to feed the RNN a seq (tensor shape Nx1)
        # -> need to unsqueeze x on the time dim (dim 1)
        x = self.dropout(self.emb(x.unsqueeze(1)))
        out, (h, c) = self.rnn(x, (h, c))
        out = self.lin(out.squeeze(1)) # we squeeze the time dim back out of existence
        return out, h, c

class Net(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, tgt, teacher_forcing_proba=.5):
        bs = tgt.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.vocab_size
        h, c = self.encoder(src)
        x = tgt[:,0] # <sos> token
        outputs = []
        for t in range(1, tgt_len):
            out, h, c = self.decoder(x, h, c)
            outputs.append(out)
            teacher_force = random.random() < teacher_forcing_proba
            x = tgt[:,t] if teacher_force else out.argmax(-1)
        return torch.stack(outputs, 1)

In [276]:
vocab_size = len(vocab)
enc = Encoder(vocab_size)
dec = Decoder(vocab_size)
mdl = Net(enc, dec)
opt = optim.Adam(mdl.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [277]:
def train(mdl, dl, opt, loss_fn):
    mdl.train()
    epoch_loss = 0
    for src, tgt in dl:
        opt.zero_grad()
        out = mdl(src, tgt)
        out_dim = out.size(-1)
        assert out.size(1) == tgt.size(1)-1 # we skipped the first element in the output
        # collapse seq and batch dims
        out = out.view(-1, out_dim)
        tgt = tgt[:,1:].contiguous().view(-1) # skip the first element in the ground truth
        loss = loss_fn(out, tgt)
        loss.backward()
        opt.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dl)

In [278]:
def evaluate(mdl, dl, loss_fn):
    mdl.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, tgt in dl:
            out = mdl(src, tgt, 0) # turn off teacher forcing
            out_dim = out.size(-1)
            out = out.view(-1, out_dim)
            tgt = tgt[:,1:].contiguous().view(-1)
            loss = loss_fn(out, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(dl)

In [279]:
epochs = 2
best_valid_loss = float('inf')
for epoch in range(epochs):    
    train_loss = train(mdl, train_dl, opt, criterion)
    valid_loss = evaluate(mdl, valid_dl, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(mdl.state_dict(), 'model.pt')
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')

Epoch: 01
	Train Loss: 0.008
	Valid Loss: 0.000
Epoch: 02
	Train Loss: 0.001
	Valid Loss: 0.000


In [280]:
tests = [('4','IV'), ('1193', 'MCXCIII'), ('548', 'DXLVIII'), ('3616', 'MMMDCXVI'), ('21', 'XXI')]

for src, tgt in tests:
    s, t = map(partial(transform, vocab), [src, tgt])
    s, t = map(torch.tensor, [s, t])
    s, t = map(lambda x: x.unsqueeze(0), [s, t])
    logits = mdl(s, t, 0)
    pred = logits.squeeze(0).argmax(-1).tolist()
    pred = untransform(vocab, pred)
    print(f'Input: {src}')
    print(f'Predicted: {pred}')
    print(f'Expected: {tgt}\n')

Input: 4
Predicted: MCL
Expected: IV

Input: 1193
Predicted: MCXCIII
Expected: MCXCIII

Input: 548
Predicted: MDXLVIII
Expected: DXLVIII

Input: 3616
Predicted: MMMDCXVI
Expected: MMMDCXVI

Input: 21
Predicted: MCXX
Expected: XXI

