### Imports and loads

In [1]:
from typing import List

import pickle
import numpy as np
import html
from pathlib import Path

#from fastai.text import *
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, CuDNNLSTM
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import BatchNormalization
from keras.layers.embeddings import Embedding

from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
DATA_PATH = Path('DATA/')

### Dataset properties, inspection, tokenization

In [3]:
DATASET_NAME = 'x_and_y_cleaned.pkl'
with open(DATA_PATH/DATASET_NAME, 'rb') as f:
    articles, categories = pickle.load(f)

In [4]:
# Label None as 'none'
categories = ['none' if not x else x for x in categories]

In [5]:
CLASSES = sorted(list(set(categories)))
ARTICLE_COUNT = len(articles)
CLASS_COUNT = len(CLASSES)
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
MAX_SIZE = 250

MAX_VOCAB = 60000
min_freq = 5

print(ARTICLE_COUNT)
print(CLASS_COUNT)

48514
138


#### inspect

In [8]:
# Class balance check:
from collections import Counter
freq = Counter(o for o in categories)
freq.most_common(138)

[('uudised/eesti', 16451),
 ('melu/elu', 5883),
 ('uudised/maailm', 4285),
 ('uudised/krimi', 2211),
 ('televeeb/tvuudised', 1462),
 ('arvamus/kommentaar', 1342),
 ('naine/naised', 1163),
 ('naine/suhted', 1155),
 ('melu/seltskond', 969),
 ('sport/jalgpall', 885),
 ('naine/ilu', 817),
 ('uudised/kiiksud', 544),
 ('sport/korvpall', 541),
 ('tervis/keha', 502),
 ('blogid/londonilustiblogi', 480),
 ('uudised/ilm', 461),
 ('arvamus/juhtkiri', 454),
 ('sport/varia', 413),
 ('raha/kodu', 353),
 ('meedia/galeriid', 334),
 ('arvamus/repliik', 324),
 ('melu/saund', 323),
 ('melu/sunagukolab', 322),
 ('sport/kergejoustik', 316),
 ('meedia/videod', 315),
 ('sport/talisport', 295),
 ('blogid/spordiblogi', 288),
 ('raha/tarbija', 267),
 ('arvamus/seisukoht', 251),
 ('naine/toit', 251),
 ('tervis/uudised', 236),
 ('tervis/toitumine', 202),
 ('lemmikloom', 200),
 ('mehele/tehnika', 198),
 ('joulud', 197),
 ('melu/film', 171),
 ('blogid/teleblogi', 159),
 ('tervis', 152),
 ('sport/vormel', 149),
 ('uu

In [7]:
print(CLASSES)

['aiaeri', 'arvamus', 'arvamus/intervjuu', 'arvamus/juhtkiri', 'arvamus/karikatuur', 'arvamus/kommentaar', 'arvamus/koomiks', 'arvamus/lugejakiri', 'arvamus/nadalatipud', 'arvamus/repliik', 'arvamus/seisukoht', 'blogid/avastaeestimaad', 'blogid/aveameerikas', 'blogid/filmiblogi', 'blogid/hollandiblogi', 'blogid/indoneesiablogi', 'blogid/jumestusblogi', 'blogid/korvpallimm', 'blogid/lehesaba', 'blogid/londonilustiblogi', 'blogid/malluka', 'blogid/meistriteblogi', 'blogid/moeajakiri', 'blogid/moekeeris', 'blogid/motteid', 'blogid/muusikablogi', 'blogid/opetajablogi', 'blogid/psyhholoogiablogi', 'blogid/pulmablogi', 'blogid/raamatublogi', 'blogid/raha', 'blogid/seljakotigablogi', 'blogid/spordiblogi', 'blogid/teleblogi', 'blogid/trenniblogi', 'blogid/valdojahilo', 'blogid/yksikvanem', 'eestinaine/elud-inimesed', 'eriline/horoskoop', 'eriline/mystika', 'joulud', 'kroonika/eesti', 'lemmikloom', 'linnaleht/arvamus', 'linnaleht/dilaila', 'linnaleht/karikatuur', 'linnaleht/kodusedlood', 'linna

In [8]:
# Dataset examples:
index = 0
print('ARTICLE: ', articles[index][0:110], '...')
print('CATEGORY: ', categories[index])

ARTICLE:  Kas parima aastavahetuse programmi pani eetrisse ETV, Kanal 2 või hoopis TV3? ETVst näegid vaatajad saateid "V ...
CATEGORY:  televeeb/tvuudised


In [54]:
# Get median/average word count
print(np.median([len(x.split(' ')) for x in articles]))
print(np.mean([len(x.split(' ')) for x in articles]))

261.0
387.0457805994146


In [6]:
# One hot encoding
# labels = []
# for x in categories:
#     y = [0 for x in range(CLASS_COUNT)]
#     y[CLASSES.index(x)] = 1
#     labels.append(y)

# Class index encoding
labels = []
for x in categories:
    y = CLASSES.index(x)
    labels.append(y)

In [7]:
np.random.seed(42)
train_texts, val_texts, train_labels, val_labels = train_test_split(articles, labels, test_size=0.1, random_state=42)
pickle.dump([train_texts, val_texts, train_labels, val_labels], open(DATA_PATH/'tokens'/'trnx_valx_trny_valy_ind_split.pkl', 'wb'))

### Tokenize

In [59]:
tok_train = Tokenizer(lang='xx').proc_all_mp(partition_by_cores(train_texts))
tok_val = Tokenizer(lang='xx').proc_all_mp(partition_by_cores(val_texts))

In [60]:
freq = Counter(p for o in tok_train for p in o)
print(len(tok_train))
freq.most_common(25)

43662


[(',', 657926),
 ('.', 559252),
 ('"', 217175),
 ('ja', 210514),
 ('on', 197759),
 ('et', 150766),
 ('ei', 106727),
 ('kui', 74991),
 ('ta', 66639),
 ('ka', 58212),
 ('oli', 51101),
 ('oma', 46727),
 ('-', 46020),
 ('ning', 45314),
 ('see', 45285),
 ('xbos', 43662),
 ('xfld', 43662),
 ('0', 42597),
 ('aga', 38936),
 ('t_up', 31812),
 ('mis', 31436),
 ('ma', 30478),
 ('siis', 29830),
 ('kes', 29218),
 ('tema', 28739)]

In [61]:
print(tok_val[5])

['xbos', 'vehklemisliidu', 'president', ',', 'riigikogu', 'liige', 'margus', 'hanson', 'tõdes', ',', 'et', 'naiskond', 'vehkles', 'kaunilt', 'kuni', 'finaalini', '.', '"', 'naised', 'olid', 'väga', 'tublid', '.', 'meil', 'on', 'noor', ',', 'perspektiivikas', 'ja', 'arenev', 'võistkond', ':', 'teise', 'kohaga', 'tuleb', 'igati', 'rahul', 'olla', ',', 'sest', 'ega', 'jõu', 'ja', 'võimu', 'vastu', 'ei', 'saa', '!', '"', 'hanson', 'lisas', ',', 'et', 'teda', 'rõõmustab', 'sten', 'priinitsa', 'individuaalturniiril', 'saadud', 'kaheksas', 'koht', ',', 'millega', 'mees', 'suurendab', 'ka', 't_up', 'eok', 'toetusraha', '.', '"', 'meie', 'vehklejad', 'on', 'tõestanud', ',', 'et', 'neid', 'saab', 'usaldada', '.', 'sportlased', 'seavad', 'kõrged', 'sihid', 'ja', 'on', 'võimelised', 'neid', 'täitma', ';', '"', 'kinnitas', 'ta', '.', 'ühtlasi', 'märkis', 'hanson', ',', 'et', 'suur', 'on', 'treener', 'igor', 'tšikinjovi', 'panus', '.', '"', 'ta', 'on', 'toonud', 'värsket', 'verd', 'ja', 'hingamist',

In [62]:
freq_val = Counter(p for o in tok_val for p in o)
print(len(tok_val))
freq_val.most_common(25)

4852


[(',', 72534),
 ('.', 62293),
 ('"', 23741),
 ('ja', 23599),
 ('on', 21847),
 ('et', 16600),
 ('ei', 11625),
 ('kui', 8402),
 ('ta', 7294),
 ('ka', 6594),
 ('oli', 5541),
 ('ning', 5219),
 ('oma', 5142),
 ('-', 5101),
 ('see', 4893),
 ('xbos', 4852),
 ('xfld', 4852),
 ('0', 4743),
 ('aga', 4345),
 ('mis', 3589),
 ('t_up', 3475),
 ('ma', 3323),
 ('tema', 3264),
 ('eesti', 3248),
 ('siis', 3235)]

In [63]:
np.save(DATA_PATH/'tokens/tok_train_pad.npy', tok_train)
np.save(DATA_PATH/'tokens/tok_val_pad.npy', tok_val)

In [64]:
itos = [o for o,c in freq.most_common(MAX_VOCAB) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [65]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

In [97]:
train_lm = np.array([[stoi[o] for o in p] for p in tok_train])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [117]:
# Pad and crop values
train_lm_pad = [x[:MAX_SIZE] if len(x) > MAX_SIZE else x + [0 for i in range(MAX_SIZE - len(x))] for x in train_lm]
val_lm_pad = [x[:MAX_SIZE] if len(x) > MAX_SIZE else x + [0 for i in range(MAX_SIZE - len(x))] for x in val_lm]

In [118]:
np.save(DATA_PATH/'tokens'/'trn_ids.npy', train_lm_pad) # Oversaved all as padded
np.save(DATA_PATH/'tokens'/'val_ids.npy', val_lm_pad)
pickle.dump(itos, open(DATA_PATH/'tokens'/'itos.pkl', 'wb'))

### Load tokenized data

In [6]:
train_texts, val_texts, train_labels, val_labels = pickle.load(open(DATA_PATH/'tokens'/'trnx_valx_trny_valy_ind_split.pkl', 'rb'))
train_lm = np.load(DATA_PATH/'tokens'/'trn_ids.npy')
val_lm = np.load(DATA_PATH/'tokens'/'val_ids.npy')
itos = pickle.load(open(DATA_PATH/'tokens'/'itos.pkl', 'rb'))

#### Display data

In [91]:
print(train_texts[0])

xbos Peaminister Taavi Rõivas jätab võimutüli tõttu ära visiidid Leedusse ja Rootsi, teda asendab väliskaubandus- ja ettevõtlusminister Anne Sulling.  Valitsuse pressiesindaja kinnitas pühapäeva pärastlõunal, et Rõivas ei sõida esmaspäeval visiidile Leetu ja Rootsi. Pressiesindaja teatel jäävad visiidid ära "seoses ametikohustustega Eestis". Reformierakonna esimees, peaminister Taavi Rõivas pidi esmaspäeval koos teiste Balti riikide valitsusjuhtidega osalema Leedus Klaipedas aset leidval LNG ujuvterminali saabumistseremoonial. Enne tseremooniat pidi aset leidma peaministrite ning Ameerika Ühendriikide esindajate ühine töölõuna. Pärastlõunal pidi Rõivas suunduma edasi Stockholmi, kus toimub Balti- ja Põhjamaade tippkohtumine. Rootsi, Soome, Norra, Islandi, Taani, Eesti, Läti ja Leedu peaministrite kohtumisel räägitakse majanduse olukorrast Euroopas, transatlantilistest suhetest ning Ukrainaga seotud arengutest. Pühapäeval kohtuvad Reformierakonna ja Sotsiaaldemokraatliku Erakonna esimeh

In [105]:
print(train_lm[0])

[17, 425, 524, 658, 2109, 0, 254, 63, 48013, 0, 5, 563, 2, 84, 28902, 0, 5, 53171, 1428, 26646, 3, 64, 755, 588, 438, 2029, 1368, 2, 7, 658, 8, 7061, 661, 4845, 17602, 5, 563, 3, 588, 704, 1070, 48013, 63, 4, 552, 0, 136, 4, 3, 813, 829, 2, 425, 524, 658, 388, 661, 79, 383, 555, 1197, 0, 3403, 5258, 0, 1815, 0, 21, 11591, 0, 0, 3, 105, 50278, 388, 1815, 3905, 31416, 15, 542, 1406, 7017, 3196, 0, 3, 1368, 388, 658, 0, 180, 3805, 2, 45, 638, 50279, 5, 9650, 11347, 3, 563, 2, 322, 2, 954, 2, 5409, 2, 2147, 2, 27, 2, 662, 5, 1109, 31416, 2357, 2275, 3893, 3806, 938, 2, 0, 5920, 15, 12824, 433, 43618, 3, 679, 10646, 813, 5, 7930, 871, 48014, 2, 7, 3819, 1726, 1064, 194, 0, 7214, 1899, 3, 18, 37, 7215, 19, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [10]:
print(train_labels[0])
print(CLASSES[train_labels[0]])
# print(CLASSES[train_labels[0].index(1)]) # for one hot

130
uudised/eesti


### Preprocess, trim, one-hot, calculate class weights

##### trimmed classes

In [7]:
# Remove small count classes
min_count = 50

def trim(x_set, y_set, min_count):
    trim_set_x = []
    trim_set_y = []
    for x, y in zip(x_set, y_set):
        if y_set.count(y)>= min_count:
            trim_set_y.append(y)
            trim_set_x.append(x)

    return (np.asarray(trim_set_x), trim_set_y)
    
trim_train_x, trim_train_y = trim(train_lm, train_labels, min_count)
trim_val_x, trim_val_y = trim(val_lm, val_labels, min_count)
class_counts = [trim_train_y.count(x) for x in set(trim_train_y)]

##### class weights

In [117]:
class_counts = [val_labels.count(x) for x in set(val_labels)]

In [118]:
print(class_counts)

[8, 41, 147, 11, 4, 33, 39, 5, 1, 1, 1, 1, 12, 56, 7, 3, 4, 2, 3, 5, 2, 26, 14, 7, 6, 3, 1, 16, 1, 14, 8, 2, 2, 3, 2, 2, 4, 1, 1, 2, 32, 28, 12, 13, 571, 4, 15, 28, 7, 88, 40, 4, 6, 87, 113, 126, 30, 16, 9, 4, 8, 38, 13, 26, 1, 11, 5, 3, 95, 8, 2, 32, 58, 4, 1, 6, 5, 10, 2, 28, 13, 3, 40, 3, 7, 12, 170, 13, 8, 1, 50, 4, 8, 2, 3, 21, 1, 22, 1593, 44, 59, 213, 450, 16, 6]


In [63]:
class_counts = [train_labels.count(x) for x in set(train_labels)]
balanced_class_count = len(train_labels) / CLASS_COUNT
class_weights = {k: balanced_class_count / train_labels.count(k) for k in set(train_labels)}

In [64]:
class_weights

{0: 316.39130434782606,
 1: 6.327826086956521,
 2: 39.54891304347826,
 3: 0.7660806400673754,
 4: 79.09782608695652,
 5: 0.26476259778060757,
 6: 105.46376811594202,
 7: 3.132587171760654,
 8: 15.819565217391304,
 9: 1.087255341401464,
 10: 1.4924118129614437,
 11: 5.455022488755622,
 12: 35.15458937198068,
 13: 17.57729468599034,
 15: 45.19875776397515,
 16: 63.278260869565216,
 17: 52.73188405797101,
 18: 3.295742753623188,
 19: 0.7462059064807218,
 20: 3.101875532821824,
 21: 26.365942028985504,
 22: 16.652173913043477,
 23: 105.46376811594202,
 24: 15.819565217391304,
 25: 158.19565217391303,
 26: 316.39130434782606,
 27: 10.20617110799439,
 28: 6.591485507246376,
 29: 158.19565217391303,
 30: 35.15458937198068,
 31: 9.887228260869565,
 32: 1.2076003982741452,
 33: 2.182008995502249,
 34: 316.39130434782606,
 35: 7.030917874396135,
 36: 13.756143667296785,
 37: 316.39130434782606,
 38: 12.655652173913042,
 40: 1.7480182560653375,
 41: 316.39130434782606,
 42: 1.701028517999065,
 43

##### One hot encode

In [8]:
trim_train_y = to_categorical(trim_train_y)
trim_val_y = to_categorical(trim_val_y, num_classes=trim_train_y.shape[-1])

### Models

In [9]:
bs=128

#### Model 1

###### CuDNNLSTM

In [12]:
# CuDNNLSTM
e_size = 300
model = Sequential()
model.add(Embedding(MAX_VOCAB, e_size, input_length=MAX_SIZE))
model.add(CuDNNLSTM(100))
model.add(Dense(CLASS_COUNT, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Use the retry module or similar alternatives.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 300)          18000000  
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 100)               160800    
_________________________________________________________________
dense_2 (Dense)              (None, 138)               13938     
Total params: 18,174,738
Trainable params: 18,174,738
Non-trainable params: 0
_________________________________________________________________
None


###### LSTM

In [99]:
# Regular
e_size = 300
model = Sequential()
model.add(Embedding(MAX_VOCAB, e_size, input_length=MAX_SIZE))
model.add(LSTM(100))
model.add(Dense(CLASS_COUNT, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 250, 300)          18000000  
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_6 (Dense)              (None, 138)               13938     
Total params: 18,174,338
Trainable params: 18,174,338
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(trim_train_x, trim_train_y, validation_data=(trim_val_x, trim_val_y), epochs=10, batch_size=bs)#, class_weight=class_weights)

Train on 42427 samples, validate on 3876 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x204a80ed208>

In [112]:
model.fit(trim_train_x, trim_train_y, validation_data=(trim_val_x, trim_val_y), epochs=10, batch_size=bs)#, class_weight=class_weights)

Train on 42427 samples, validate on 3876 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15756ff7898>

#### Model 2

In [113]:
model_2 = Sequential()
model_2.add(Embedding(MAX_VOCAB, e_size, input_length=MAX_SIZE))
model_2.add(Dropout(0.25))
model_2.add(Conv1D(64, 5, padding='valid', activation='relu', strides=1))
model_2.add(MaxPooling1D(pool_size=4))
model_2.add(LSTM(70))
model_2.add(Dense(CLASS_COUNT, activation='softmax'))
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [116]:
model_2.fit(trim_train_x, trim_train_y, validation_data=(trim_val_x, trim_val_y), epochs=10, batch_size=bs)#, class_weight=class_weights)

Train on 42427 samples, validate on 3876 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1596e8a9710>

#### Model 3

In [101]:
model_3 = Sequential()
model_3.add(Embedding(MAX_VOCAB, e_size, input_length=MAX_SIZE))
model_3.add(Dropout(0.5))
model_3.add(Conv1D(128, 5, padding='valid', activation='relu', strides=1))
model_3.add(MaxPooling1D(pool_size=4))
model_3.add(Dropout(0.5))
model_3.add(Flatten())
model_3.add(Dense(256, activation='relu'))
model_3.add(Dense(CLASS_COUNT, activation='softmax'))
model_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


#### Model 4

In [141]:
model_4 = Sequential()
model_4.add(Embedding(MAX_VOCAB, e_size, input_length=MAX_SIZE))
model_4.add(Dropout(0.5))
model_4.add(Conv1D(100, 3, padding='valid', activation='relu', strides=1))
model_4.add(BatchNormalization())
model_4.add(Conv1D(100, 4, padding='valid', activation='relu', strides=1))
model_4.add(BatchNormalization())
model_4.add(Conv1D(100, 5, padding='valid', activation='relu', strides=1))
model_4.add(Dropout(0.75))
model_4.add(Flatten())
model_4.add(Dense(CLASS_COUNT, activation='softmax'))
model_4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [143]:
model_4.fit(trim_train_x, trim_train_y, validation_data=(trim_val_x, trim_val_y), epochs=10, batch_size=bs)#, class_weight=class_weights)

Train on 42427 samples, validate on 3876 samples
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[60000,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training_13/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_18/beta_2/read, training_13/Adam/Variable_13/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: dense_21/BiasAdd/_3075 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_589_dense_21/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'training_13/Adam/mul_3', defined at:
  File "C:\Users\Ranet\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Ranet\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-142-f7d0ebd02bf2>", line 1, in <module>
    model_4.fit(trim_train_x, trim_train_y, validation_data=(trim_val_x, trim_val_y), epochs=10, batch_size=bs)#, class_weight=class_weights)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\keras\engine\training.py", line 1008, in fit
    self._make_train_function()
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\keras\engine\training.py", line 498, in _make_train_function
    loss=self.total_loss)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\keras\optimizers.py", line 492, in get_updates
    v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 790, in _run_op
    return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 971, in binary_op_wrapper
    return func(x, y, name=name)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1198, in _mul_dispatch
    return gen_math_ops.mul(x, y, name=name)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 4991, in mul
    "Mul", x=x, y=y, name=name)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "C:\Users\Ranet\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[60000,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training_13/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_18/beta_2/read, training_13/Adam/Variable_13/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: dense_21/BiasAdd/_3075 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_589_dense_21/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



# TODO:

- class weights+
- refactor classes seperate all, use tags with softmax top two with some certain probability

In [107]:
trim_train_texts, trim_train_labels = trim(train_texts, train_labels, min_count)

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-5, random_state=42,
                                           max_iter=25, tol=None)),])

In [110]:
text_clf.fit(trim_train_texts, trim_train_labels)  
predicted = text_clf.predict(val_texts)      

In [111]:
np.mean(predicted == np.asarray(val_labels))

0.9225061830173125