In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from functools import reduce
from tqdm import tqdm
import tensorflow as tf
import gensim.downloader

def get_newsgroups_data(categories, samples_per_category=500):
    """Collect up to ``samples_per_category`` 20-newsgroups posts per category.

    Parameters
    ----------
    categories : iterable of str
        Newsgroup names to load, e.g. ``'alt.atheism'``.
    samples_per_category : int, default 500
        Maximum number of posts kept for each category.

    Returns
    -------
    data : list of str
        Post texts with headers and footers removed, grouped by category in
        the order given by ``categories``.
    targets : list of str
        Category name for each entry of ``data``; always the same length as
        ``data``.
    """
    data = []
    targets = []
    # Iterate over the `categories` argument (not a module-level list) so the
    # function honours whatever the caller passes in.
    for category in categories:
        docs = fetch_20newsgroups(
            categories=[category],
            remove=('headers', 'footers'),
        )['data'][:samples_per_category]
        data += docs
        # Label by the number of posts actually returned: a category may hold
        # fewer than `samples_per_category` posts, and labels must stay aligned.
        targets += [category] * len(docs)
    return data, targets

# Newsgroups that serve as the classes of the classification task.
categ = [
    'alt.atheism',
    'comp.graphics',
    'rec.sport.baseball',
]

# Keep 50 posts per newsgroup.
data, targets = get_newsgroups_data(categ, samples_per_category=50)

In [None]:
# Load the pretrained 'word2vec-google-news-300' embeddings via gensim's
# downloader; `w2v` is indexed by word in `encode_sentence`.
w2v = gensim.downloader.load('word2vec-google-news-300')



In [None]:
def encode_sentence(sent, embeddings=None):
    """Map a sentence to a matrix of word vectors, one row per known word.

    Parameters
    ----------
    sent : str or iterable of str
        Raw text or pre-tokenized words. A string is split on whitespace
        first; iterating a string directly would look up single characters
        instead of words.
    embeddings : mapping, optional
        Object supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the module-level ``w2v`` model.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_known_words, dim)``. When no word is known the
        result has zero rows instead of raising.
    """
    if embeddings is None:
        embeddings = w2v
    words = sent.split() if isinstance(sent, str) else sent
    vectors = []
    for word in words:
        if not word:
            continue
        try:
            vectors.append(embeddings[word])
        except KeyError:
            # Out-of-vocabulary words are skipped on purpose.
            continue
    if not vectors:
        # np.stack rejects an empty list; return an empty matrix so callers
        # can still pad this document. Falls back to 300 columns when the
        # embedding object does not expose `vector_size`.
        dim = getattr(embeddings, 'vector_size', 300)
        return np.zeros((0, dim), dtype='float32')
    return np.stack(vectors)


def prepare_dataset(dataset):
    """Encode every document and pad the results into one float32 array.

    Each item of ``dataset`` is passed to ``encode_sentence``; the resulting
    per-document matrices are then padded with 300-dimensional zero vectors
    by ``pad_sequences`` (using its default padding settings) so they can be
    returned as a single array.
    """
    per_doc_matrices = [encode_sentence(document) for document in tqdm(dataset)]

    zero_vector = np.zeros((300,))
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        per_doc_matrices,
        value=zero_vector,
        dtype='float32',
    )
    return padded

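Padding example: three documents with 9, 5 and 7 word vectors are all padded to the longest length (9):

1. (9, 300) --> (9, 300)
2. (5, 300) --> (9, 300)
3. (7, 300) --> (9, 300)

Document 2 after padding (zero vectors are added at the front):

2. [z, z, z, z, e1, e2, e3, e4, e5]
where z = [0, 0, 0, ...] is a 300-dimensional zero vector and e1..e5 are the word vectors.

The padded documents are then stacked into a single array:

[(9, 300), (9, 300), (9, 300)]  -- stack --> (3, 9, 300)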

In [None]:
# Encode and pad all documents into a single array.
X = prepare_dataset(data)

100%|██████████| 150/150 [00:00<00:00, 253.47it/s]


In [None]:
# Inspect the shape and dtype of the encoded dataset.
X.shape, X.dtype

((150, 10938, 300), dtype('float32'))

`(num_docs, max_seq_length, embedding_dim)`

In [None]:
# Take the first encoded document and check its shape.
doc1 = X[0]
doc1.shape

(10938, 300)

In [None]:
# Display a single row (one embedding vector) of the first document.
# NOTE(review): index 7033 is hard-coded — presumably chosen to land past the
# zero padding at the front of the sequence; confirm.
doc1[7033]

array([ 0.07910156, -0.0050354 ,  0.11181641,  0.21289062,  0.13085938,
       -0.01470947, -0.03540039, -0.07763672,  0.04077148,  0.11474609,
        0.00147247, -0.29101562,  0.00457764, -0.20019531, -0.19238281,
        0.08007812,  0.10107422,  0.04858398,  0.15722656, -0.09521484,
       -0.05004883,  0.25      ,  0.33007812, -0.09716797, -0.05566406,
       -0.0071106 , -0.16796875, -0.13574219,  0.05102539, -0.00598145,
        0.10791016,  0.16503906, -0.03955078, -0.03955078,  0.04321289,
        0.12060547,  0.13476562,  0.09375   ,  0.00909424,  0.1640625 ,
        0.21289062, -0.05322266,  0.33398438,  0.01586914,  0.10449219,
        0.24121094, -0.0189209 , -0.04199219,  0.05834961,  0.03271484,
        0.09863281,  0.18945312,  0.04125977,  0.01501465, -0.05883789,
        0.10253906,  0.01538086,  0.03198242,  0.02722168, -0.13769531,
        0.12695312,  0.06396484, -0.13574219, -0.012146  ,  0.07617188,
       -0.02319336, -0.21191406,  0.20996094, -0.01953125,  0.02

In [None]:
# One-hot encode the labels: map each category name to its position in
# `categ`, then expand to one column per category. The class count is derived
# from `categ` so this cell stays correct if the category list changes.
y = tf.keras.utils.to_categorical(
    pd.Series(targets).map(dict(zip(categ, range(len(categ))))),
    num_classes=len(categ),
)

In [None]:
# Label matrix shape: one row per document, one column per class.
y.shape  # (150, 3)

(150, 3)

In [None]:
# Show the one-hot label matrix.
y

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [None]:
# Sequence classifier: an LSTM reads each document's sequence of word vectors,
# followed by a dense hidden layer and a softmax output over the classes.
# The input shape (timesteps, embedding_dim) and the number of output units
# are taken from X and y rather than hard-coded, so the model stays in sync
# with the padded sequence length and the label matrix.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X.shape[1:]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

`(None, 10938, 300)`

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 64)                93440     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 3)                 195       
                                                                 
Total params: 97,795
Trainable params: 97,795
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss to match
# the one-hot labels, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model on the full encoded dataset.
# NOTE(review): no `epochs` argument is passed here, yet the saved output reads
# "Epoch 1/5" — the output looks stale; confirm the intended epoch count.
model.fit(X, y)

Epoch 1/5


In [None]:
# Alternative architecture: a bidirectional LSTM followed by a dense hidden
# layer and a single sigmoid output unit. This rebinds `model`, replacing the
# LSTM classifier defined earlier.
# NOTE(review): the output layer has 1 sigmoid unit, whereas `y` above has 3
# one-hot columns — presumably intended for a binary task or different labels;
# confirm before fitting it on `y`.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])