# Document classification

Your task is to classify documents into the following 7 categories:

* `AA`: Amazon Appliances
* `AB`: Amazon Books
* `BC`: Broadcasting Conversations
* `BN`: Broadcasting News
* `CT`: Conversation Texts
* `NW`: Newswires
* `WB`: Web Blogs

Three files are provided:

* [`docs.trn.tsv`](res/docs.trn.tsv): training set
* [`docs.dev.tsv`](res/docs.dev.tsv): development set
* [`docs.tst.tsv`](res/docs.tst.tsv): test set

## 1. Reading

In [21]:
from typing import List, Tuple

def read_docs(filename: str) -> List[Tuple[str, List[str]]]:
    """Read a tab-separated document file into (label, tokens) pairs.

    Each non-blank line is split on tabs: the first field is the label and
    the second field is the document text, which is tokenized on whitespace.
    Any further tab-separated fields on the line are ignored.

    :param filename: path to the TSV file to read.
    :return: one (label, list of tokens) tuple per non-blank line, in file order.
    :raises IndexError: if a non-blank line contains no tab separator.
    """
    docs = []

    # The context manager guarantees the file is closed, even if a line is malformed.
    with open(filename) as fin:
        for line in fin:
            # Skip completely empty lines (e.g. a trailing blank line at EOF)
            # instead of failing on the missing second field.
            if not line.rstrip('\r\n'):
                continue

            fields = line.split('\t')
            label = fields[0]
            doc = fields[1].split()
            docs.append((label, doc))

    return docs

In [22]:
# Load the training, development and test splits, in that order.
trn_docs, dev_docs, tst_docs = map(
    read_docs,
    ('res/docs.trn.tsv', 'res/docs.dev.tsv', 'res/docs.tst.tsv'),
)

In [23]:
# Report how many documents each split contains.
print('TRN: %d, DEV: %d, TST: %d' % tuple(map(len, (trn_docs, dev_docs, tst_docs))))

TRN: 2355, DEV: 297, TST: 297


## 2. Indexing

In [24]:
from typing import Dict

def index_maps(docs: List[Tuple[str, List[str]]]) -> Tuple[Dict[str, int], Dict[str, int]]:
    """Assign a dense integer ID to every distinct label and token in ``docs``.

    IDs are handed out in order of first appearance, starting from 0.

    :param docs: (label, tokens) pairs.
    :return: a pair ``(label -> ID, token -> ID)``; note the label map comes first.
    """
    label_ids = {}
    token_ids = {}

    for label, tokens in docs:
        if label not in label_ids:
            label_ids[label] = len(label_ids)

        for token in tokens:
            if token not in token_ids:
                token_ids[token] = len(token_ids)

    return label_ids, token_ids

In [25]:
# Build both index maps from the training set only; index_maps returns the
# label map first and the token map second.
map_y, map_x = index_maps(trn_docs)

In [26]:
# Report the number of distinct labels and distinct tokens that were indexed.
print('labels: %d, token types: %d' % tuple(len(m) for m in (map_y, map_x)))

labels: 7, token types: 38741


## 3. Vectorizing

In [27]:
def vec_yxs(docs: List[Tuple[str, List[str]]], map_y: Dict[str, int], map_x: Dict[str, int]) -> List[Tuple[int, List[Tuple[int, int]]]]:
    """Convert (label, tokens) documents into (label ID, sparse feature list) pairs.

    Every token found in ``map_x`` contributes one ``(token ID, 1)`` entry, so a
    token that occurs several times yields several identical entries. Tokens
    missing from ``map_x`` are dropped, and a label missing from ``map_y`` is
    encoded as -1. Each feature list is sorted.

    :param docs: (label, tokens) pairs.
    :param map_y: label -> ID map.
    :param map_x: token -> ID map.
    :return: one (label ID, sorted feature list) pair per document, in input order.
    """
    vectors = []

    for label, tokens in docs:
        label_id = map_y[label] if label in map_y else -1
        features = sorted((map_x[token], 1) for token in tokens if token in map_x)
        vectors.append((label_id, features))

    return vectors

In [28]:
# Vectorize all three splits with the same label and token maps.
trn_yxs, dev_yxs, tst_yxs = (
    vec_yxs(docs, map_y, map_x) for docs in (trn_docs, dev_docs, tst_docs)
)

## 4. Learning

In [29]:
import numpy as np

def score(w: np.array, x: List[Tuple[int, int]]):
    """Return the dot product of weight vector ``w`` with the sparse vector ``x``.

    :param w: weights, indexable by feature index.
    :param x: (feature index, value) pairs.
    :return: the sum of ``w[index] * value`` over all pairs; 0 when ``x`` is empty.
    """
    total = 0
    for index, value in x:
        total += w[index] * value
    return total

def update(w: np.array, x: List[Tuple[int, int]], gradient: float):
    """Add ``gradient * value`` to ``w[index]`` for every pair in ``x``, in place.

    :param w: weights to modify; mutated by this call.
    :param x: (feature index, value) pairs; a repeated index is updated once per occurrence.
    :param gradient: signed step applied to each touched weight.
    """
    for index, value in x:
        w[index] = w[index] + gradient * value

In [30]:
def train(yxs: List[Tuple[int, List[Tuple[int, int]]]], ws: List[np.array], learning_rate):
    """Run one pass over ``yxs``, adjusting the per-label weights on every mistake.

    The predicted label is the index of the highest-scoring weight vector.
    When it differs from the gold label, the gold label's weights are moved by
    ``+learning_rate`` and every other label's weights by ``-learning_rate``
    along the instance's features.

    :param yxs: (gold label ID, sparse feature list) pairs.
    :param ws: one weight vector per label ID; mutated in place.
    :param learning_rate: magnitude of each weight adjustment.
    """
    for gold, features in yxs:
        predicted = np.argmax([score(weights, features) for weights in ws])
        if predicted == gold:
            continue

        for label_id, weights in enumerate(ws):
            step = learning_rate if label_id == gold else -learning_rate
            update(weights, features, step)

In [31]:
def evaluate(yxs: List[Tuple[int, List[Tuple[int, int]]]], ws: List[np.array]):
    """Count the instances in ``yxs`` whose predicted label equals the gold label.

    The predicted label is the index of the highest-scoring weight vector.

    :param yxs: (gold label ID, sparse feature list) pairs.
    :param ws: one weight vector per label ID.
    :return: the number of correctly classified instances.
    """
    return sum(
        1
        for gold, features in yxs
        if gold == np.argmax([score(weights, features) for weights in ws])
    )

In [32]:
# Hyperparameters.
epochs = 50
learning_rate = 0.01

# One zero-initialized weight vector per label, one weight per token type.
ws = [np.zeros(len(map_x)) for _ in map_y]

# Best development-set result seen so far, and a snapshot of the weights that produced it.
best_ws = None
best_correct = 0

for epoch in range(epochs):
    train(trn_yxs, ws, learning_rate)
    correct = evaluate(dev_yxs, ws)
    print('%4d: %5.2f (%d/%d)' % (epoch, 100.0 * correct / len(dev_yxs), correct, len(dev_yxs)))

    # Only a strict improvement replaces the snapshot; the weights are copied
    # because train() keeps mutating ws in place.
    if correct > best_correct:
        best_correct = correct
        best_ws = [w.copy() for w in ws]

print('==========')
print('Best: %5.2f (%d/%d)' % (100.0 * best_correct / len(dev_yxs), best_correct, len(dev_yxs)))

   0: 13.13 (39/297)
   1: 18.52 (55/297)
   2: 22.22 (66/297)
   3: 25.25 (75/297)
   4: 25.93 (77/297)
   5: 26.26 (78/297)
   6: 28.62 (85/297)
   7: 28.62 (85/297)
   8: 29.97 (89/297)
   9: 33.67 (100/297)
  10: 32.66 (97/297)
  11: 35.35 (105/297)
  12: 40.40 (120/297)
  13: 40.74 (121/297)
  14: 44.78 (133/297)
  15: 47.14 (140/297)
  16: 47.81 (142/297)
  17: 47.81 (142/297)
  18: 48.82 (145/297)
  19: 49.16 (146/297)
  20: 52.19 (155/297)
  21: 53.20 (158/297)
  22: 54.21 (161/297)
  23: 56.57 (168/297)
  24: 56.57 (168/297)
  25: 56.57 (168/297)
  26: 58.25 (173/297)
  27: 58.25 (173/297)
  28: 58.59 (174/297)
  29: 58.92 (175/297)
  30: 58.92 (175/297)
  31: 59.60 (177/297)
  32: 59.60 (177/297)
  33: 60.61 (180/297)
  34: 60.94 (181/297)
  35: 59.60 (177/297)
  36: 62.63 (186/297)
  37: 60.61 (180/297)
  38: 62.29 (185/297)
  39: 62.96 (187/297)
  40: 61.28 (182/297)
  41: 62.29 (185/297)
  42: 61.62 (183/297)
  43: 61.95 (184/297)
  44: 62.63 (186/297)
  45: 62.96 (187/297)

In [33]:
# Score the development set with the best weight snapshot kept during training.
correct = evaluate(yxs=dev_yxs, ws=best_ws)
print('DEV: %5.2f (%d/%d)' % (100.0 * correct / len(dev_yxs), correct, len(dev_yxs)))

DEV: 63.64 (189/297)


In [34]:
# Score the test set with the same best weight snapshot.
correct = evaluate(yxs=tst_yxs, ws=best_ws)
print('TST: %5.2f (%d/%d)' % (100.0 * correct / len(tst_yxs), correct, len(tst_yxs)))

TST: 64.98 (193/297)
