# Importing the packages First!

In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# Deciding on the hyper parameters of the Model

**Vocab_Size**: The number of words that it'll use to train the model

**embedding_dim**: No of the vectors for each word

**max_length**: Truncating the sentence to 200 word length

In [2]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = 0.6
validation_portion = 0.2
test_portion = 0.2

# Loading the dataset

Splitting the sentences and it's sub classes into articles and labels

In [3]:
articles = []
labels = []

with open("bbc-text.csv","r") as csvfile:
    reader = csv.reader(csvfile,delimiter=',')
    
    #Skipping the heading section
    next(reader)
    
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token,' ')
            article = article.replace(' ',' ')
        articles.append(article)
        
#The length of the dataset
print(len(labels))
print(len(articles))

2251
2251


In [4]:
labels

['tech',
 'business',
 'sport',
 'sport',
 'entertainment',
 'politics',
 'politics',
 'sport',
 'sport',
 'entertainment',
 'entertainment',
 'business',
 'business',
 'politics',
 'sport',
 'business',
 'politics',
 'sport',
 'business',
 'tech',
 'tech',
 'tech',
 'sport',
 'sport',
 'tech',
 'sport',
 'entertainment',
 'tech',
 'politics',
 'entertainment',
 'politics',
 'tech',
 'entertainment',
 'entertainment',
 'business',
 'politics',
 'tech',
 'entertainment',
 'politics',
 'business',
 'politics',
 'sport',
 'business',
 'sport',
 'tech',
 'entertainment',
 'politics',
 'politics',
 'politics',
 'business',
 'sport',
 'politics',
 'business',
 'business',
 'sport',
 'politics',
 'business',
 'sport',
 'sport',
 'business',
 'business',
 'sport',
 'business',
 'sport',
 'business',
 'tech',
 'business',
 'entertainment',
 'tech',
 'business',
 'politics',
 'business',
 'politics',
 'sport',
 'business',
 'tech',
 'business',
 'sport',
 'sport',
 'business',
 'business',
 'spo

# Splitting the Training and Test Dataset

In [5]:
train_size = int(len(articles) * training_portion)
validation_size = train_size + int(len(articles) * validation_portion)
test_size = validation_size + int(len(articles) * test_portion)

train_articles = articles[0:train_size]
train_labels = labels[0:train_size]

validation_articles = articles[train_size:validation_size]
validation_labels = labels[train_size:validation_size]

test_articles = articles[validation_size:test_size]
test_labels = labels[validation_size:test_size]

# Intializing Tokenizer

Tokenizer is used to tokenize the words. In other words, we get how much words are there in the overall dataset.

In [6]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'said': 2,
 'mr': 3,
 'would': 4,
 'year': 5,
 'also': 6,
 'people': 7,
 'new': 8,
 'us': 9,
 'one': 10}

# Converting the text to sequences

The tokenizer maps each unique word in the dataset to an index

In [7]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
print(train_sequences[10])

validation_sequences = tokenizer.texts_to_sequences(validation_articles)
test_sequences = tokenizer.texts_to_sequences(test_articles)

[2328, 1, 263, 4173, 19, 648, 529, 263, 4173, 1, 1, 1604, 1, 1, 2328, 19, 491, 1, 1, 130, 257, 1, 130, 257, 878, 742, 656, 2329, 1, 993, 1745, 1, 1501, 1, 1, 1, 1, 1, 1, 3970, 1, 1, 105, 4441, 1, 2, 4442, 1394, 322, 4174, 1, 56, 373, 1, 322, 2479, 3773, 40, 19, 3586, 1, 1, 1, 1, 521, 1, 1, 1, 793, 772, 2171, 416, 4175, 1, 326, 19, 1, 744, 2409, 1, 1, 140, 10, 1, 4443, 671, 4176, 1, 19, 1, 456, 742, 656, 1, 83, 13, 599, 1, 263, 4173, 1, 588, 1, 1745, 954, 1, 1, 806, 2051, 113, 1, 1, 1, 2841, 19, 1, 128, 257, 1, 1790, 1, 521, 492, 1, 1495, 4444, 794, 1175, 1, 1885, 10, 36, 648, 261, 1, 73, 517, 491, 289, 1502, 19, 503, 1, 1, 1926, 1, 825, 1, 3251, 1, 1273, 6, 1, 2328, 491, 19, 3245, 1, 1, 1, 1, 1, 1, 826, 43, 1991, 570, 279, 24, 819, 1, 807, 19, 353, 19, 13, 289, 1502, 1361, 474, 20, 68, 761, 1203, 4177, 267]


# Padding the Text Sequences

In [8]:
train_padded = pad_sequences(train_sequences,maxlen=max_length,
                            padding=padding_type,truncating=trunc_type)
validation_padded = pad_sequences(validation_sequences,maxlen=max_length,
                                 padding=padding_type,truncating=trunc_type)
test_padded = pad_sequences(test_sequences,maxlen=max_length,
                           padding=padding_type,truncating=trunc_type)

In [9]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

label_seq = label_tokenizer.texts_to_sequences(train_labels)

#print(label_seq)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))
test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

In [10]:
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

def decode_article(text):
    return ' '.join([reverse_word_index.get(i,'?') for i in text])

decode_article(train_sequences[10])

'berlin <OOV> anti nazi film german movie anti nazi <OOV> <OOV> drawn <OOV> <OOV> berlin film festival <OOV> <OOV> final days <OOV> final days member white rose movement <OOV> 21 arrested <OOV> brother <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> tyranny <OOV> <OOV> director marc <OOV> said feeling responsibility keep legacy <OOV> going must <OOV> keep ideas alive added film drew <OOV> <OOV> <OOV> <OOV> trial <OOV> <OOV> <OOV> east germany secret police discovery <OOV> behind film <OOV> worked closely <OOV> <OOV> including one <OOV> sisters ensure historical <OOV> film <OOV> members white rose <OOV> group first started <OOV> anti nazi <OOV> summer <OOV> arrested dropped <OOV> <OOV> university calling day <OOV> <OOV> <OOV> regime film <OOV> six days <OOV> arrest <OOV> trial saw <OOV> initially deny charges ended <OOV> appearance one three german films <OOV> top prize festival south african film version <OOV> <OOV> opera <OOV> shot <OOV> town <OOV> language also <OOV> berlin festival film entitle

# Defining the Neural Net

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(embedding_dim,activation='relu'),
    tf.keras.layers.Dense(6,activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss='sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

# Training the dataset

In [13]:
num_epochs=10
history = model.fit(train_padded,training_label_seq, 
                    epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), 
                    verbose=2)

Train on 1350 samples, validate on 450 samples
Epoch 1/10
1350/1350 - 5s - loss: 1.6421 - accuracy: 0.2378 - val_loss: 1.5101 - val_accuracy: 0.3978
Epoch 2/10
1350/1350 - 1s - loss: 1.3306 - accuracy: 0.4607 - val_loss: 0.9883 - val_accuracy: 0.6489
Epoch 3/10
1350/1350 - 1s - loss: 0.7848 - accuracy: 0.6911 - val_loss: 0.6720 - val_accuracy: 0.7467
Epoch 4/10
1350/1350 - 1s - loss: 0.5779 - accuracy: 0.7852 - val_loss: 0.5747 - val_accuracy: 0.8578
Epoch 5/10
1350/1350 - 1s - loss: 0.3483 - accuracy: 0.9044 - val_loss: 0.5319 - val_accuracy: 0.8111
Epoch 6/10
1350/1350 - 1s - loss: 0.3389 - accuracy: 0.8622 - val_loss: 0.5080 - val_accuracy: 0.7689
Epoch 7/10
1350/1350 - 1s - loss: 0.1243 - accuracy: 0.9622 - val_loss: 0.3793 - val_accuracy: 0.9022
Epoch 8/10
1350/1350 - 1s - loss: 0.1261 - accuracy: 0.9659 - val_loss: 0.3999 - val_accuracy: 0.8644
Epoch 9/10
1350/1350 - 1s - loss: 0.0813 - accuracy: 0.9778 - val_loss: 0.2824 - val_accuracy: 0.9156
Epoch 10/10
1350/1350 - 1s - loss: 

In [14]:
results = model.evaluate(test_padded, test_label_seq, verbose=1)
print("test loss, test acc:", results)

test loss, test acc: [nan, 0.88]


In [15]:
eg = ["Indian team has won the test match against australia"]
#print(training_label_seq[:10])


model.predict_classes(tokenizer.texts_to_sequences(eg))

array([1], dtype=int64)