In [2]:
import tensorflow as tf
import numpy as np

In [3]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import csv

In [6]:
# Hyperparameters and preprocessing settings used by the cells below.
vocab_size = 6000        # `num_words` limit given to the Keras Tokenizer and Embedding input size
embedding_dim = 64       # embedding width; also reused as the LSTM and hidden Dense unit count
max_length = 200         # every encoded article is padded/truncated to this many tokens
trunc_type = 'post'      # pad_sequences `truncating` mode (was the typo 'pist', which is not a valid mode)
padding_type = 'post'    # pad_sequences `padding` mode
oov_tok = 'OOV'          # token the Tokenizer substitutes for out-of-vocabulary words
training_portion = 0.8   # fraction of the articles used for training
STOPWORDS = set(stopwords.words('english'))  # NLTK English stop words, as a set for fast lookup

In [7]:
# Load the dataset: column 0 of each row is the category label, column 1 the article text.
articles = []
labels = []

# An explicit encoding keeps the text independent of the platform default
# (assumes the CSV is UTF-8 -- TODO confirm); newline='' is what the csv
# module recommends for files handed to csv.reader.
with open("bbc-text.csv", 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first row (presumably the header)
    for row in reader:
        labels.append(row[0])
        # Remove stop words token by token. The previous substring replacement
        # (' word ' -> ' ') missed repeated/adjacent stop words and those at the
        # start or end of the text, iterated a set in arbitrary order, and its
        # whitespace clean-up step replaced a single space with itself.
        kept_words = [word for word in row[1].split() if word not in STOPWORDS]
        articles.append(' '.join(kept_words))
print(len(labels))
print(len(articles))

2225
2225


In [8]:
# Sequential split: the first `training_portion` of the rows is used for
# training, the remainder for validation (no shuffling is applied here).
train_size = int(training_portion * len(articles))

train_articles, validation_articles = articles[:train_size], articles[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Report the split point followed by the size of each resulting list.
for count in (
    train_size,
    len(train_articles),
    len(train_labels),
    len(validation_articles),
    len(validation_labels),
):
    print(count)

1780
1780
1780
445
445


In [9]:
# Learn the vocabulary from the training articles only, so the validation
# text does not influence the word indices.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_articles)

# Mapping from word to the integer index assigned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Preview only the first few entries; displaying the full mapping floods the output.
dict(list(word_index.items())[:10])

In [10]:
# Convert each training article into its sequence of integer word indices.
text_sequences = tokenizer.texts_to_sequences(train_articles)

In [None]:
# Show the first encoded sequence, then the article text it was produced from.
for samples in (text_sequences, train_articles):
    print(samples[:1])

In [None]:
'''
The model requires every sequence to have the same length, so we use max_length, which is 200:
sequences longer than 200 tokens are truncated to 200, and sequences shorter than 200 are padded with 0.
'''

In [11]:
# Bring every training sequence to exactly `max_length` tokens: longer ones
# are cut at the end, shorter ones are zero-padded at the end.
train_pad = pad_sequences(
    text_sequences,
    maxlen=max_length,
    truncating='post',
    padding='post',
)
train_pad

array([[  91,  160, 1141, ..., 5319,  294,  756],
       [1857,  558,  222, ...,    0,    0,    0],
       [4293,    1, 3655, ...,    0,    0,    0],
       ...,
       [   1, 2672, 2212, ...,  629,  171,    1],
       [1931,   67,    1, ...,   81, 1977,    1],
       [  91, 3813,    1, ...,    0,    0,    0]])

In [12]:
# Encode the validation articles with the tokenizer fitted on the training
# data, then pad/truncate them with the same settings as the training set.
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    truncating='post',
    padding='post',
)

# Number of validation sequences, then the shape of the padded array.
print(len(validation_sequences))
print(validation_padded.shape)

445
(445, 200)


In [13]:
# Encode the category names as integers with a second tokenizer fitted on
# every label (training and validation), so both splits share one mapping.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# texts_to_sequences returns one list per label, which np.array turns into
# a 2-D integer array per split.
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(split_labels))
    for split_labels in (train_labels, validation_labels)
)

In [14]:
# Invert word_index so integer ids can be mapped back to their words.
rev_word_ind = {index: word for word, index in word_index.items()}
rev_word_ind

{1: 'OOV',
 2: 'said',
 3: 'mr',
 4: 'would',
 5: 'year',
 6: 'also',
 7: 'people',
 8: 'new',
 9: 'us',
 10: 'one',
 11: 'could',
 12: 'last',
 13: 'first',
 14: 'time',
 15: 'two',
 16: 'government',
 17: 'world',
 18: 'uk',
 19: 'best',
 20: 'years',
 21: 'make',
 22: 'film',
 23: 'told',
 24: 'made',
 25: 'get',
 26: 'music',
 27: 'game',
 28: 'like',
 29: 'back',
 30: 'many',
 31: '000',
 32: 'labour',
 33: 'three',
 34: 'well',
 35: '1',
 36: 'next',
 37: 'bbc',
 38: 'take',
 39: 'set',
 40: 'number',
 41: 'added',
 42: 'way',
 43: 'market',
 44: '2',
 45: 'company',
 46: 'may',
 47: 'says',
 48: 'election',
 49: 'home',
 50: 'party',
 51: 'good',
 52: 'going',
 53: 'much',
 54: 'work',
 55: '2004',
 56: 'still',
 57: 'win',
 58: 'show',
 59: 'think',
 60: 'games',
 61: 'go',
 62: 'top',
 63: 'second',
 64: 'million',
 65: '6',
 66: 'england',
 67: 'firm',
 68: 'since',
 69: 'week',
 70: 'say',
 71: 'play',
 72: 'part',
 73: 'public',
 74: 'use',
 75: 'blair',
 76: '3',
 77: 'wan

In [15]:
def decode_articles(text):
    """Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in ``rev_word_ind``; ids that are not present in
    that mapping are rendered as '???'.
    """
    words = []
    for token_id in text:
        words.append(rev_word_ind.get(token_id, '???'))
    return ' '.join(words)


decode_articles(train_pad[1])

'worldcom boss left books alone former worldcom boss OOV ebbers accused OOV 11bn â£5 8bn fraud never made accounting decisions witness told jurors david OOV made comments questioning defence lawyers arguing mr ebbers responsible worldcom problems phone company collapsed 2002 prosecutors claim losses hidden protect firm shares mr OOV already pleaded guilty fraud OOV prosecutors monday defence lawyer reid OOV tried distance client allegations cross OOV asked mr OOV ever knew mr ebbers make accounting decision aware mr OOV replied ever know mr ebbers make accounting entry worldcom books mr OOV OOV replied witness mr OOV admitted ordered false accounting entries request former worldcom chief financial officer scott sullivan defence lawyers trying OOV mr sullivan admitted fraud OOV later trial OOV behind worldcom accounting house cards mr ebbers team meanwhile looking OOV OOV boss admission OOV OOV economist whatever OOV mr ebbers OOV worldcom OOV unknown OOV telecoms giant investor OOV lat

In [None]:
# LSTM model

In [16]:
# Bidirectional-LSTM text classifier, assembled layer by layer.
model = tf.keras.Sequential()

# Embedding: looks up an `embedding_dim`-wide vector for each word id; the
# input vocabulary size is `vocab_size`, set in the configuration cell.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))

# A single LSTM wrapped in Bidirectional so the sequence is read in both
# directions (a second stacked Bidirectional LSTM(32) was tried and left out).
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))

# Fully connected hidden layer; ReLU is used here instead of tanh.
model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))

# Output layer: softmax turns the 6 outputs into a probability distribution.
# Keras Tokenizer indices start at 1, so the label ids never use index 0;
# hence one more unit than there are categories.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          384000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 458,694
Trainable params: 458,694
Non-trainable params: 0
_________________________________________________________________


In [17]:
# The labels are integer class ids, so the sparse variant of categorical
# cross-entropy is used; Adam is the optimizer and accuracy is tracked.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

num_epochs = 10
history = model.fit(
    train_pad,
    training_label_seq,
    validation_data=(validation_padded, validation_label_seq),
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/10
56/56 - 8s - loss: 1.5705 - accuracy: 0.2596 - val_loss: 1.3396 - val_accuracy: 0.3865
Epoch 2/10
56/56 - 6s - loss: 1.1419 - accuracy: 0.5360 - val_loss: 0.6676 - val_accuracy: 0.7506
Epoch 3/10
56/56 - 7s - loss: 0.4081 - accuracy: 0.8612 - val_loss: 0.3661 - val_accuracy: 0.8989
Epoch 4/10
56/56 - 6s - loss: 0.1186 - accuracy: 0.9702 - val_loss: 0.3043 - val_accuracy: 0.9169
Epoch 5/10
56/56 - 7s - loss: 0.0480 - accuracy: 0.9865 - val_loss: 0.2502 - val_accuracy: 0.9213
Epoch 6/10
56/56 - 7s - loss: 0.0239 - accuracy: 0.9944 - val_loss: 0.2666 - val_accuracy: 0.9281
Epoch 7/10
56/56 - 6s - loss: 0.0281 - accuracy: 0.9910 - val_loss: 0.4531 - val_accuracy: 0.8652
Epoch 8/10
56/56 - 7s - loss: 0.0267 - accuracy: 0.9927 - val_loss: 0.3216 - val_accuracy: 0.9236
Epoch 9/10
56/56 - 6s - loss: 0.0165 - accuracy: 0.9966 - val_loss: 0.3237 - val_accuracy: 0.9146
Epoch 10/10
56/56 - 7s - loss: 0.0056 - accuracy: 0.9989 - val_loss: 0.3580 - val_accuracy: 0.9281


In [19]:
# Classify a new piece of text with the trained model.
txt = ["A WeWork shareholder has taken the company to court over the near-$1.7bn (£1.3bn) leaving package approved for ousted co-founder Adam Neumann."]

# Apply the same preprocessing the training articles received: stop words
# were removed from them, so they must be removed here too (lower-casing
# first so capitalised stop words such as "A" match the lower-case list).
txt_clean = [
    ' '.join(word for word in doc.lower().split() if word not in STOPWORDS)
    for doc in txt
]
seq = tokenizer.texts_to_sequences(txt_clean)

# Pad/truncate at the end, exactly as the training and validation data were;
# the pad_sequences defaults ('pre') would put the zeros on the other side.
padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')

pred = model.predict(padded)
# Index of the most probable class (an id assigned by label_tokenizer).
np.argmax(pred)

2

{'business', 'entertainment', 'politics', 'sport', 'tech'}