### Imports and loads

In [1]:
from typing import List

import pickle as pkl
import numpy as np
import html
from pathlib import Path

from fastai.text import *
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = Path('DATA/')

### Dataset properties, inspection, tokenization

In [3]:
DATASET_NAME = 'x_and_y_cleaned.pkl'
with open(DATA_PATH/DATASET_NAME, 'rb') as f:
    articles, categories = pkl.load(f)

In [4]:
# Label None as 'none'
categories = ['none' if not x else x for x in categories]

In [5]:
CLASSES = sorted(list(set(categories)))
ARTICLE_COUNT = len(articles)
CLASS_COUNT = len(CLASSES)
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
MAX_SIZE = 250

max_vocab = 60000
min_freq = 5

print(ARTICLE_COUNT)
print(CLASS_COUNT)

48514
138


#### inspect

In [6]:
# Class balance check:
freq = Counter(o for o in categories)
freq.most_common(25)

[('uudised/eesti', 16451),
 ('melu/elu', 5883),
 ('uudised/maailm', 4285),
 ('uudised/krimi', 2211),
 ('televeeb/tvuudised', 1462),
 ('arvamus/kommentaar', 1342),
 ('naine/naised', 1163),
 ('naine/suhted', 1155),
 ('melu/seltskond', 969),
 ('sport/jalgpall', 885),
 ('naine/ilu', 817),
 ('uudised/kiiksud', 544),
 ('sport/korvpall', 541),
 ('tervis/keha', 502),
 ('blogid/londonilustiblogi', 480),
 ('uudised/ilm', 461),
 ('arvamus/juhtkiri', 454),
 ('sport/varia', 413),
 ('raha/kodu', 353),
 ('meedia/galeriid', 334),
 ('arvamus/repliik', 324),
 ('melu/saund', 323),
 ('melu/sunagukolab', 322),
 ('sport/kergejoustik', 316),
 ('meedia/videod', 315)]

In [7]:
print(CLASSES)

['aiaeri', 'arvamus', 'arvamus/intervjuu', 'arvamus/juhtkiri', 'arvamus/karikatuur', 'arvamus/kommentaar', 'arvamus/koomiks', 'arvamus/lugejakiri', 'arvamus/nadalatipud', 'arvamus/repliik', 'arvamus/seisukoht', 'blogid/avastaeestimaad', 'blogid/aveameerikas', 'blogid/filmiblogi', 'blogid/hollandiblogi', 'blogid/indoneesiablogi', 'blogid/jumestusblogi', 'blogid/korvpallimm', 'blogid/lehesaba', 'blogid/londonilustiblogi', 'blogid/malluka', 'blogid/meistriteblogi', 'blogid/moeajakiri', 'blogid/moekeeris', 'blogid/motteid', 'blogid/muusikablogi', 'blogid/opetajablogi', 'blogid/psyhholoogiablogi', 'blogid/pulmablogi', 'blogid/raamatublogi', 'blogid/raha', 'blogid/seljakotigablogi', 'blogid/spordiblogi', 'blogid/teleblogi', 'blogid/trenniblogi', 'blogid/valdojahilo', 'blogid/yksikvanem', 'eestinaine/elud-inimesed', 'eriline/horoskoop', 'eriline/mystika', 'joulud', 'kroonika/eesti', 'lemmikloom', 'linnaleht/arvamus', 'linnaleht/dilaila', 'linnaleht/karikatuur', 'linnaleht/kodusedlood', 'linna

In [8]:
# Dataset examples:
index = 0
print('ARTICLE: ', articles[index][0:110], '...')
print('CATEGORY: ', categories[index])

ARTICLE:  Kas parima aastavahetuse programmi pani eetrisse ETV, Kanal 2 või hoopis TV3? ETVst näegid vaatajad saateid "V ...
CATEGORY:  televeeb/tvuudised


In [54]:
# Get median/average word count
print(np.median([len(x.split(' ')) for x in articles]))
print(np.mean([len(x.split(' ')) for x in articles]))

261.0
387.0457805994146


In [6]:
# One hot encoding
# labels = []
# for x in categories:
#     y = [0 for x in range(CLASS_COUNT)]
#     y[CLASSES.index(x)] = 1
#     labels.append(y)

# Class index encoding
labels = []
for x in categories:
    y = CLASSES.index(x)
    labels.append(y)

In [7]:
np.random.seed(42)
train_texts, val_texts, train_labels, val_labels = train_test_split(articles, labels, test_size=0.1, random_state=42)
pickle.dump([train_texts, val_texts, train_labels, val_labels], open(DATA_PATH/'tokens'/'trnx_valx_trny_valy_ind_split.pkl', 'wb'))

### Tokenize

In [59]:
tok_train = Tokenizer(lang='xx').proc_all_mp(partition_by_cores(train_texts))
tok_val = Tokenizer(lang='xx').proc_all_mp(partition_by_cores(val_texts))

In [60]:
freq = Counter(p for o in tok_train for p in o)
print(len(tok_train))
freq.most_common(25)

43662


[(',', 657926),
 ('.', 559252),
 ('"', 217175),
 ('ja', 210514),
 ('on', 197759),
 ('et', 150766),
 ('ei', 106727),
 ('kui', 74991),
 ('ta', 66639),
 ('ka', 58212),
 ('oli', 51101),
 ('oma', 46727),
 ('-', 46020),
 ('ning', 45314),
 ('see', 45285),
 ('xbos', 43662),
 ('xfld', 43662),
 ('0', 42597),
 ('aga', 38936),
 ('t_up', 31812),
 ('mis', 31436),
 ('ma', 30478),
 ('siis', 29830),
 ('kes', 29218),
 ('tema', 28739)]

In [61]:
print(tok_val[5])

['xbos', 'vehklemisliidu', 'president', ',', 'riigikogu', 'liige', 'margus', 'hanson', 'tõdes', ',', 'et', 'naiskond', 'vehkles', 'kaunilt', 'kuni', 'finaalini', '.', '"', 'naised', 'olid', 'väga', 'tublid', '.', 'meil', 'on', 'noor', ',', 'perspektiivikas', 'ja', 'arenev', 'võistkond', ':', 'teise', 'kohaga', 'tuleb', 'igati', 'rahul', 'olla', ',', 'sest', 'ega', 'jõu', 'ja', 'võimu', 'vastu', 'ei', 'saa', '!', '"', 'hanson', 'lisas', ',', 'et', 'teda', 'rõõmustab', 'sten', 'priinitsa', 'individuaalturniiril', 'saadud', 'kaheksas', 'koht', ',', 'millega', 'mees', 'suurendab', 'ka', 't_up', 'eok', 'toetusraha', '.', '"', 'meie', 'vehklejad', 'on', 'tõestanud', ',', 'et', 'neid', 'saab', 'usaldada', '.', 'sportlased', 'seavad', 'kõrged', 'sihid', 'ja', 'on', 'võimelised', 'neid', 'täitma', ';', '"', 'kinnitas', 'ta', '.', 'ühtlasi', 'märkis', 'hanson', ',', 'et', 'suur', 'on', 'treener', 'igor', 'tšikinjovi', 'panus', '.', '"', 'ta', 'on', 'toonud', 'värsket', 'verd', 'ja', 'hingamist',

In [62]:
freq_val = Counter(p for o in tok_val for p in o)
print(len(tok_val))
freq_val.most_common(25)

4852


[(',', 72534),
 ('.', 62293),
 ('"', 23741),
 ('ja', 23599),
 ('on', 21847),
 ('et', 16600),
 ('ei', 11625),
 ('kui', 8402),
 ('ta', 7294),
 ('ka', 6594),
 ('oli', 5541),
 ('ning', 5219),
 ('oma', 5142),
 ('-', 5101),
 ('see', 4893),
 ('xbos', 4852),
 ('xfld', 4852),
 ('0', 4743),
 ('aga', 4345),
 ('mis', 3589),
 ('t_up', 3475),
 ('ma', 3323),
 ('tema', 3264),
 ('eesti', 3248),
 ('siis', 3235)]

In [63]:
np.save(DATA_PATH/'tokens/tok_train_pad.npy', tok_train)
np.save(DATA_PATH/'tokens/tok_val_pad.npy', tok_val)

In [64]:
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [65]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

In [97]:
train_lm = np.array([[stoi[o] for o in p] for p in tok_train])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [117]:
# Pad and crop values
train_lm_pad = [x[:MAX_SIZE] if len(x) > MAX_SIZE else x + [0 for i in range(MAX_SIZE - len(x))] for x in train_lm]
val_lm_pad = [x[:MAX_SIZE] if len(x) > MAX_SIZE else x + [0 for i in range(MAX_SIZE - len(x))] for x in val_lm]

In [118]:
np.save(DATA_PATH/'tokens'/'trn_ids.npy', train_lm_pad) # Oversaved all as padded
np.save(DATA_PATH/'tokens'/'val_ids.npy', val_lm_pad)
pickle.dump(itos, open(DATA_PATH/'tokens'/'itos.pkl', 'wb'))

### Load tokenized data

In [9]:
train_texts, val_texts, train_labels, val_labels = pickle.load(open(DATA_PATH/'tokens'/'trnx_valx_trny_valy_ind_split.pkl', 'rb'))
train_lm = np.load(DATA_PATH/'tokens'/'trn_ids.npy')
val_lm = np.load(DATA_PATH/'tokens'/'val_ids.npy')
itos = pickle.load(open(DATA_PATH/'tokens'/'itos.pkl', 'rb'))

#### Display data

In [91]:
print(train_texts[0])

xbos Peaminister Taavi Rõivas jätab võimutüli tõttu ära visiidid Leedusse ja Rootsi, teda asendab väliskaubandus- ja ettevõtlusminister Anne Sulling.  Valitsuse pressiesindaja kinnitas pühapäeva pärastlõunal, et Rõivas ei sõida esmaspäeval visiidile Leetu ja Rootsi. Pressiesindaja teatel jäävad visiidid ära "seoses ametikohustustega Eestis". Reformierakonna esimees, peaminister Taavi Rõivas pidi esmaspäeval koos teiste Balti riikide valitsusjuhtidega osalema Leedus Klaipedas aset leidval LNG ujuvterminali saabumistseremoonial. Enne tseremooniat pidi aset leidma peaministrite ning Ameerika Ühendriikide esindajate ühine töölõuna. Pärastlõunal pidi Rõivas suunduma edasi Stockholmi, kus toimub Balti- ja Põhjamaade tippkohtumine. Rootsi, Soome, Norra, Islandi, Taani, Eesti, Läti ja Leedu peaministrite kohtumisel räägitakse majanduse olukorrast Euroopas, transatlantilistest suhetest ning Ukrainaga seotud arengutest. Pühapäeval kohtuvad Reformierakonna ja Sotsiaaldemokraatliku Erakonna esimeh

In [105]:
print(train_lm[0])

[17, 425, 524, 658, 2109, 0, 254, 63, 48013, 0, 5, 563, 2, 84, 28902, 0, 5, 53171, 1428, 26646, 3, 64, 755, 588, 438, 2029, 1368, 2, 7, 658, 8, 7061, 661, 4845, 17602, 5, 563, 3, 588, 704, 1070, 48013, 63, 4, 552, 0, 136, 4, 3, 813, 829, 2, 425, 524, 658, 388, 661, 79, 383, 555, 1197, 0, 3403, 5258, 0, 1815, 0, 21, 11591, 0, 0, 3, 105, 50278, 388, 1815, 3905, 31416, 15, 542, 1406, 7017, 3196, 0, 3, 1368, 388, 658, 0, 180, 3805, 2, 45, 638, 50279, 5, 9650, 11347, 3, 563, 2, 322, 2, 954, 2, 5409, 2, 2147, 2, 27, 2, 662, 5, 1109, 31416, 2357, 2275, 3893, 3806, 938, 2, 0, 5920, 15, 12824, 433, 43618, 3, 679, 10646, 813, 5, 7930, 871, 48014, 2, 7, 3819, 1726, 1064, 194, 0, 7214, 1899, 3, 18, 37, 7215, 19, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [10]:
print(train_labels[0])
print(CLASSES[train_labels[0]])
# print(CLASSES[train_labels[0].index(1)]) # for one hot

130
uudised/eesti


#### Dataloader

In [10]:
bs=128

class TokDataset(Dataset):
    def __init__(self, x, y):
        self.x = x; self.y = y
        self.len = len(self.x)
        self.x_data = torch.from_numpy(self.x); self.x_data = self.x_data.long()
        self.y_data = torch.from_numpy(self.y); self.y_data = self.y_data.long()
        print('x shape', self.x_data.shape)
        print('y shape', self.y_data.shape)
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
ds = TokDataset(train_lm, np.asarray(train_labels))
ds_val = TokDataset(val_lm, np.asarray(val_labels))
dl = torch.utils.data.DataLoader(dataset=ds, batch_size=bs, shuffle=True, num_workers=0)
dl_val = torch.utils.data.DataLoader(dataset=ds_val, batch_size=bs, shuffle=True, num_workers=0)


x shape torch.Size([43662, 250])
y shape torch.Size([43662])
x shape torch.Size([4852, 250])
y shape torch.Size([4852])


In [11]:
test_values = iter(dl)

In [12]:
xs, ys = next(test_values)

In [13]:
print(xs)

tensor([[   17, 11407, 10604,  ...,     0,     0,     0],
        [   17, 12445,  1979,  ...,     0,     0,     0],
        [   17,     4, 35948,  ...,   362,     0,     0],
        ...,
        [   17,   176,   807,  ...,    59,     0,     0],
        [   17,     4,  3036,  ...,  5367,     0,     0],
        [   17,   526,    92,  ...,     0,     0,     0]])


### Feed-forward NN

In [15]:
torch.__version__

'0.4.1'

In [40]:
class SimpleFNN(nn.Module):
    def __init__(self, input_size, vocab_size, num_outputs, num_l, neurons: List[int], e_size=200):
        super(SimpleFNN, self).__init__()
        self.e = nn.Embedding(vocab_size, e_size)
        self.input_l = nn.Linear(e_size * input_size, neurons[0])
        self.middle_l = nn.ModuleList()
        for i in range(num_l):
            self.middle_l.append(nn.Linear(neurons[i], neurons[i+1]))
        self.output_l = nn.Linear(neurons[-1], num_outputs)
        
    def forward(self, x):
        i_sz = x.shape[-1]
        x = F.relu(self.e(x))
        x = x.view(-1,  i_sz * x.shape[-1])
        x = F.relu(self.input_l(x))
        for l in self.middle_l:
            x = F.relu(l(x))
        return self.output_l(x) # No softmax for crossentropy
        

In [42]:
fnn = SimpleFNN(MAX_SIZE, max_vocab, CLASS_COUNT, 4, [200, 300, 100, 50, 20]).cuda()
print(fnn)

SimpleFNN(
  (e): Embedding(60000, 200)
  (input_l): Linear(in_features=50000, out_features=200, bias=True)
  (middle_l): ModuleList(
    (0): Linear(in_features=200, out_features=300, bias=True)
    (1): Linear(in_features=300, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=50, bias=True)
    (3): Linear(in_features=50, out_features=20, bias=True)
  )
  (output_l): Linear(in_features=20, out_features=138, bias=True)
)


In [43]:
# Test pass through
fnn(xs.cuda()).shape

torch.Size([32, 138])

In [None]:
crit = torch.nn.CrossEntropyLoss()
opt = optim.Adam(fnn.parameters(), 1e-3)

### Fit function

In [14]:
def fit(epochs, model, train_dl, val_dl, crit, opt, verb=250):
    for ep in range(epochs):
        model.train()
        for i, data in tqdm(enumerate(train_dl)):
            x, y = data
            x = x.cuda(); y = y.cuda()

            y_h = model(x)
            loss = crit(y_h, y)
            
            # For accuracy
            total = 0
            correct = 0
            total += y.size(0)
            correct += (torch.argmax(y_h, 1) == y).sum().item()

            if i % verb == 0:
                print(f' Epoch: {ep} | b_loss: {loss.item():.{4}f}, b_acc: {100 * correct / total}')

            opt.zero_grad()
            loss.backward()
            opt.step()
        
        # Validate
        val_correct = 0
        model.eval()
        for i, data_val in enumerate(val_dl):
            x_val, y_val = data_val
            x_val = x_val.cuda(); y_val = y_val.cuda()
            
            # .eval() doesn't turn off gradient tracking
            with torch.no_grad():
                y_h_val = model(x_val)
                val_correct += (torch.argmax(y_h_val, 1) == y_val).sum().item()
        print(f'\nEPOCH {ep} - Val acc: {100 * val_correct / len(val_dl.dataset)}\n')
            

### CNN model

In [20]:
# https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
# https://arxiv.org/pdf/1408.5882.pdf

# Draft implementation
class SimpleCNN(nn.Module):
    def __init__(self, input_size, vocab_size, num_outputs, e_size=300, k_num=100, k_sizes=[3, 4, 5]):
        super(SimpleCNN, self).__init__()
        self.e = nn.Embedding(vocab_size, e_size)
        self.dropout_e = nn.Dropout(0.3)
        self.convs = nn.ModuleList([nn.Conv2d(1, k_num, (k, e_size)) for k in k_sizes])
        
        self.dropout = nn.Dropout(0.5)
        self.output_l = nn.Linear(len(k_sizes)*k_num, num_outputs)

    def forward(self, x):
        x = self.e(x)  # (N, W, D)
        x = self.dropout_e(x)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] 
        x = torch.cat(x, 1)

        x = self.dropout(x)  # (N, len(Ks)*Co)
        
        return self.output_l(x)

    
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x
        

In [21]:
cnn = SimpleCNN(MAX_SIZE, max_vocab, CLASS_COUNT).cuda()
print(cnn)

SimpleCNN(
  (e): Embedding(60000, 300)
  (dropout_e): Dropout(p=0.3)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (output_l): Linear(in_features=300, out_features=138, bias=True)
)


In [22]:
cnn(xs.cuda()).shape

torch.Size([128, 138])

In [24]:
crit = torch.nn.CrossEntropyLoss()
opt = optim.Adam(cnn.parameters(), 1e-3)

In [25]:
fit(15, cnn, dl, dl_val, crit, opt)

0it [00:00, ?it/s] Epoch: 0 | b_loss: 6.4499, b_acc: 0.0
250it [00:11, 21.01it/s] Epoch: 0 | b_loss: 2.2496, b_acc: 48.4375
342it [00:16, 21.05it/s]

EPOCH 0 - Val acc: 66.32316570486397

0it [00:00, ?it/s] Epoch: 1 | b_loss: 1.9087, b_acc: 57.8125
250it [00:12, 20.78it/s] Epoch: 1 | b_loss: 1.6934, b_acc: 61.71875
342it [00:16, 20.64it/s]

EPOCH 1 - Val acc: 76.95795548227535

0it [00:00, ?it/s] Epoch: 2 | b_loss: 1.5263, b_acc: 66.40625
249it [00:12, 20.14it/s] Epoch: 2 | b_loss: 1.0044, b_acc: 75.78125
342it [00:16, 20.12it/s]

EPOCH 2 - Val acc: 82.46084089035449

0it [00:00, ?it/s] Epoch: 3 | b_loss: 1.0977, b_acc: 74.21875
250it [00:12, 20.62it/s] Epoch: 3 | b_loss: 0.6896, b_acc: 85.15625
342it [00:16, 20.80it/s]

EPOCH 3 - Val acc: 85.6760098928277

0it [00:00, ?it/s] Epoch: 4 | b_loss: 0.6964, b_acc: 81.25
249it [00:12, 20.74it/s] Epoch: 4 | b_loss: 1.0089, b_acc: 78.125
342it [00:16, 20.70it/s]

EPOCH 4 - Val acc: 87.92250618301732

0it [00:00, ?it/s] Epoch: 5 | b_loss: 0.681

In [34]:
torch.save(cnn, DATA_PATH/'models/cnn_es128_knum100_ksz345_90d8acc')

  "type " + obj.__name__ + ". It won't be checked "


### RNN Model

In [33]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, e_size, bs, n_layers, n_outputs, n_hidden=128):
        super().__init__()
        self.vocab_size, self.n_layers, self.n_hidden, self.n_outputs = vocab_size, n_layers, n_hidden, n_outputs
        self.e = nn.Embedding(vocab_size, e_size)
        self.rnn = nn.LSTM(e_size, n_hidden, n_layers, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, n_outputs)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        outp = outp[:,-1,:] # all batches, last word, all output values
        #self.h = repackage_var(h)
        #return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.n_outputs)
        return self.l_out(outp)
            
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.n_layers, bs, self.n_hidden)),
                  V(torch.zeros(self.n_layers, bs, self.n_hidden)))

In [34]:
rnn = LSTM(max_vocab, 300, bs, 2, CLASS_COUNT).cuda()
print(rnn)

LSTM(
  (e): Embedding(60000, 300)
  (rnn): LSTM(300, 128, num_layers=2, dropout=0.5)
  (l_out): Linear(in_features=128, out_features=138, bias=True)
)


In [35]:
rnn(xs.cuda()).shape

torch.Size([128, 138])

In [36]:
#crit = F.nll_loss
crit = torch.nn.CrossEntropyLoss()
opt = optim.Adam(rnn.parameters(), 1e-3)

In [37]:
fit(15, rnn, dl, dl_val, crit, opt)

0it [00:00, ?it/s] Epoch: 0 | b_loss: 4.9263, b_acc: 0.0
250it [00:14, 17.30it/s] Epoch: 0 | b_loss: 2.8786, b_acc: 32.03125
342it [00:19, 17.41it/s]

EPOCH 0 - Val acc: 32.8318219291014

0it [00:00, ?it/s] Epoch: 1 | b_loss: 2.9183, b_acc: 36.71875
250it [00:14, 17.52it/s] Epoch: 1 | b_loss: 3.1635, b_acc: 26.5625
342it [00:19, 17.57it/s]

EPOCH 1 - Val acc: 32.8318219291014

0it [00:00, ?it/s] Epoch: 2 | b_loss: 2.8324, b_acc: 34.375
250it [00:14, 17.75it/s] Epoch: 2 | b_loss: 2.8677, b_acc: 37.5
342it [00:19, 17.78it/s]

EPOCH 2 - Val acc: 32.8318219291014

0it [00:00, ?it/s] Epoch: 3 | b_loss: 3.0707, b_acc: 32.8125
250it [00:14, 17.55it/s] Epoch: 3 | b_loss: 3.1012, b_acc: 29.6875
342it [00:19, 17.58it/s]

EPOCH 3 - Val acc: 32.81121187139324

0it [00:00, ?it/s] Epoch: 4 | b_loss: 2.7873, b_acc: 37.5
250it [00:14, 17.58it/s] Epoch: 4 | b_loss: 2.8920, b_acc: 36.71875
342it [00:19, 17.60it/s]

EPOCH 4 - Val acc: 32.81121187139324

0it [00:00, ?it/s] Epoch: 5 | b_loss: 2.9444, b_acc

# TODO:

##### Implementation
- Disable dropout during test time (proper eval)+
- Try a properly tuned model
    - seems to get 92.47% acc, high variance (overfitting) on training data
        - e_size=300, bs=128, lr=1e-3 got same acc in 6 epochs opposed to ~40
    - might need a bigger batch size
- Try learning rate finder
- Learning rate cosine annealing
- SGD with restarts
- Try to predict multiple labels

##### Analysis
+ Can also check how balanced the classes are+
    - Rework unbalanced classes or set class weights
+ Try a simple shallow learning model+
    - 94.5% accuracy, without fiddling with hyperparams and models much.




### Shallow SGD Classifier

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-5, random_state=42,
                                           max_iter=25, tol=None)),])

In [47]:
text_clf.fit(train_texts, train_labels)  
predicted = text_clf.predict(val_texts)      

In [48]:
np.mean(predicted == np.asarray(val_labels))

0.9453833470733718