In [1]:
import tensorflow as tf
import csv
import numpy as np

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [3]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

articles = []
labels = []

with open("bbc-text.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)
print(len(labels))
print(len(articles))

2225
2225


In [4]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

In [5]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

dictionary = dict(list(word_index.items())[0:10])
print(dictionary)
train_sequences = tokenizer.texts_to_sequences(train_articles)

print(train_sequences[10])

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


{'<OOV>': 1, 'said': 2, 'mr': 3, 'would': 4, 'year': 5, 'also': 6, 'people': 7, 'new': 8, 'us': 9, 'one': 10}
[2431, 1, 225, 4996, 22, 641, 587, 225, 4996, 1, 1, 1662, 1, 1, 2431, 22, 565, 1, 1, 140, 278, 1, 140, 278, 796, 823, 662, 2307, 1, 1144, 1693, 1, 1720, 4997, 1, 1, 1, 1, 1, 4739, 1, 1, 122, 4515, 1, 2, 2874, 1505, 352, 4740, 1, 52, 341, 1, 352, 2173, 3962, 41, 22, 3795, 1, 1, 1, 1, 543, 1, 1, 1, 835, 631, 2366, 347, 4741, 1, 365, 22, 1, 787, 2367, 1, 4303, 138, 10, 1, 3665, 682, 3532, 1, 22, 1, 414, 823, 662, 1, 90, 13, 633, 1, 225, 4996, 1, 599, 1, 1693, 1021, 1, 4998, 808, 1864, 117, 1, 1, 1, 2974, 22, 1, 99, 278, 1, 1608, 4999, 543, 492, 1, 1445, 4742, 778, 1320, 1, 1861, 10, 33, 641, 319, 1, 62, 478, 565, 301, 1506, 22, 479, 1, 1, 1665, 1, 797, 1, 3066, 1, 1365, 6, 1, 2431, 565, 22, 2971, 4736, 1, 1, 1, 1, 1, 850, 39, 1825, 675, 297, 26, 979, 1, 882, 22, 361, 22, 13, 301, 1506, 1343, 374, 20, 63, 883, 1096, 4304, 247]


In [6]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

print(set(labels))

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

print(training_label_seq.shape)
print(validation_label_seq.shape)

445
(445, 200)
{'business', 'tech', 'entertainment', 'politics', 'sport'}
(1780, 1)
(445, 1)


In [8]:
def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])


In [9]:
print(set(labels))

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

print(training_label_seq.shape)
print(validation_label_seq.shape)



reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

{'business', 'tech', 'entertainment', 'politics', 'sport'}
(1780, 1)
(445, 1)


In [10]:
print(decode_article(train_padded[10]))
print('---')
print(train_articles[10])

berlin <OOV> anti nazi film german movie anti nazi <OOV> <OOV> drawn <OOV> <OOV> berlin film festival <OOV> <OOV> final days <OOV> final days member white rose movement <OOV> 21 arrested <OOV> brother hans <OOV> <OOV> <OOV> <OOV> <OOV> tyranny <OOV> <OOV> director marc <OOV> said feeling responsibility keep legacy <OOV> going must <OOV> keep ideas alive added film drew <OOV> <OOV> <OOV> <OOV> trial <OOV> <OOV> <OOV> east germany secret police discovery <OOV> behind film <OOV> worked closely <OOV> relatives including one <OOV> sisters ensure historical <OOV> film <OOV> members white rose <OOV> group first started <OOV> anti nazi <OOV> summer <OOV> arrested dropped <OOV> munich university calling day <OOV> <OOV> <OOV> regime film <OOV> six days <OOV> arrest intense trial saw <OOV> initially deny charges ended <OOV> appearance one three german films <OOV> top prize festival south african film version <OOV> <OOV> opera <OOV> shot <OOV> town <OOV> language also <OOV> berlin festival film en

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # using softmax for multi-class classification 
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10

history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)
#history = model.fit(train_padded, training_label_seq, epochs=num_epochs, verbose=2)

Epoch 1/10
56/56 - 4s - loss: 1.5469 - accuracy: 0.3096 - val_loss: 1.3418 - val_accuracy: 0.4494
Epoch 2/10
56/56 - 4s - loss: 0.9147 - accuracy: 0.6640 - val_loss: 0.7493 - val_accuracy: 0.7438
Epoch 3/10
56/56 - 4s - loss: 0.2930 - accuracy: 0.9140 - val_loss: 0.2718 - val_accuracy: 0.9079
Epoch 4/10
56/56 - 4s - loss: 0.0858 - accuracy: 0.9770 - val_loss: 0.1829 - val_accuracy: 0.9416
Epoch 5/10
56/56 - 4s - loss: 0.0286 - accuracy: 0.9944 - val_loss: 0.2231 - val_accuracy: 0.9393
Epoch 6/10
56/56 - 4s - loss: 0.0136 - accuracy: 0.9972 - val_loss: 0.2066 - val_accuracy: 0.9393
Epoch 7/10
56/56 - 4s - loss: 0.0134 - accuracy: 0.9966 - val_loss: 0.1685 - val_accuracy: 0.9551
Epoch 8/10
56/56 - 4s - loss: 0.0075 - accuracy: 0.9989 - val_loss: 0.1570 - val_accuracy: 0.9573
Epoch 9/10
56/56 - 4s - loss: 0.0065 - accuracy: 0.9989 - val_loss: 0.2005 - val_accuracy: 0.9551
Epoch 10/10
56/56 - 4s - loss: 0.0013 - accuracy: 1.0000 - val_loss: 0.2251 - val_accuracy: 0.9528


In [13]:
test_loss, test_acc = model.evaluate(validation_padded, validation_label_seq)
print(test_acc)

0.9528089761734009


In [14]:
CLASSES = set(labels)
print(CLASSES)

{'business', 'tech', 'entertainment', 'politics', 'sport'}


In [18]:
from sklearn.metrics import classification_report, confusion_matrix
predictions = model.predict_classes(validation_padded)
y_pred = predictions 

type(y_pred)
cm = confusion_matrix(validation_label_seq, y_pred)
print(cm)

print(classification_report(validation_label_seq, y_pred, target_names = CLASSES))

[[ 97   1   0   1   2]
 [  0 100   4   2   0]
 [  0   1  82   2   1]
 [  1   0   0  85   0]
 [  0   1   2   3  60]]
               precision    recall  f1-score   support

     business       0.99      0.96      0.97       101
         tech       0.97      0.94      0.96       106
entertainment       0.93      0.95      0.94        86
     politics       0.91      0.99      0.95        86
        sport       0.95      0.91      0.93        66

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445

