In [None]:
# Line magic that selects the TensorFlow 2.x runtime (see the "TensorFlow 2.x selected." output);
# it has to run before TensorFlow is imported further down.
%tensorflow_version 2.x

TensorFlow 2.x selected.


# BBC Article Categorization Model

Data source: https://www.kaggle.com/yufengdev/bbc-fulltext-and-category

**Goal: Train model to predict category [sport/business/politics/tech/entertainment] of BBC article, given article text.**

## Import Data

In [None]:
# Upload kaggle.json file
# The recorded output shows the file being saved as kaggle.json in the working directory,
# which is the path the credentials cell below passes to colab_helper.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
# Import dependencies for downloading datasets from Kaggle platform
# NOTE(review): this clones the default branch of a third-party repository and imports code
# from it without pinning a commit — consider checking out a fixed revision.
! git clone https://github.com/mdda/colab_helper
from colab_helper import utils as chu

In [None]:
# Install the Kaggle command-line client (unpinned), then hand the uploaded API token to colab_helper.
# NOTE(review): presumably kaggle_credentials places the token where the CLI looks for it —
# confirm against colab_helper's source.
! pip install kaggle
chu.kaggle_credentials(file='./kaggle.json')

In [None]:
# Import BBC article dataset from Kaggle platform
## Description page : https://www.kaggle.com/yufengdev/bbc-fulltext-and-category
# The archive lands in the current working directory (/content in the recorded run).
! kaggle datasets download -d yufengdev/bbc-fulltext-and-category
# Printed unconditionally: this line does not check that the download actually succeeded.
print('Dataset downloaded')

Downloading bbc-fulltext-and-category.zip to /content
  0% 0.00/1.83M [00:00<?, ?B/s]
100% 1.83M/1.83M [00:00<00:00, 60.5MB/s]
Dataset downloaded


## Unzip Data and Split into Train / Test DataFrames

In [None]:
# Extract the downloaded archive quietly (-qq); presumably this yields bbc-text.csv, which is read below.
! unzip -qq bbc-fulltext-and-category.zip

In [None]:
# Delete the archive now that it has been extracted.
# NOTE(review): after this, re-running the unzip cell requires running the download cell again first.
! rm -r bbc-fulltext-and-category.zip

In [None]:
import pandas as pd
# Load the full dataset; later cells rely on its `category` and `text` columns.
ds = pd.read_csv('bbc-text.csv')

In [None]:
!wget -qq https://www.dropbox.com/s/v14xhvjmfniraf3/glove6b100dtxt.zip
# !wget -qq http://nlp.stanford.edu/data/glove.6B.zip
  
!unzip glove6b100dtxt.zip

Archive:  glove6b100dtxt.zip
  inflating: glove.6B.100d.txt       


In [None]:
# Class-balance check: number of articles per category label.
ds.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [None]:
# Replace the category strings with their integer categorical codes
# (the recorded split output shows tech articles carrying code 4).
ds['category'] = pd.Categorical(ds['category']).codes

In [None]:
# Stratified split: shuffle each category on its own, send the first 90% of its rows to
# train and ALL remaining rows to test (test / train = 10:90).
import math
from sklearn.utils import shuffle

TRAIN_FRACTION = 0.9

train_parts = []
test_parts = []

for i in ds.category.unique():
  category_data = ds[ds.category == i]
  # Fixed seed keeps the split reproducible across runs.
  new_category_data = shuffle(category_data, random_state = 0)
  # One cut point for both slices. The previous floor()-for-train / ceil()-for-test pair
  # silently dropped the row in between whenever 90% of the category size was fractional.
  split_at = math.floor(len(new_category_data) * TRAIN_FRACTION)
  train_parts.append(new_category_data.iloc[:split_at])
  test_parts.append(new_category_data.iloc[split_at:])

# Concatenate once instead of growing a DataFrame inside the loop.
train_data = pd.concat(train_parts)
test_data = pd.concat(test_parts)

# Every input row must end up in exactly one of the two splits.
assert len(train_data) + len(test_data) == len(ds)

print(train_data.head())
print(test_data.head())

      category                                               text
1146         4  bbc leads interactive bafta wins the bbc and t...
1828         4  go-ahead for new internet names the internet c...
512          4  digital guru floats sub-$100 pc nicholas negro...
446          4  broadband challenges tv viewing the number of ...
      category                                               text
1634         4  sony psp handheld console hits us the latest h...
2074         4  apple sues  tiger  file sharers apple has take...
859          4  finding new homes for old phones re-using old ...
1605         4  sony psp console hits us in march us gamers wi...
2103         4  ds aims to touch gamers the mobile gaming indu...


## Separate Prediction Target from Features

In [None]:
# Detach the label column from both splits. pop() removes the column in place, so this cell
# cannot be re-run without rebuilding train_data / test_data first.
train_target, test_target = train_data.pop('category'), test_data.pop('category')

## Tokenize Training Data (Article Text)

In [None]:
import os
import sys
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant

In [None]:
# Preview data in Train DataFrame (only the text column is left once the labels have been popped)
train_data.head()

Unnamed: 0,text
1146,bbc leads interactive bafta wins the bbc and t...
1828,go-ahead for new internet names the internet c...
402,warnings about junk mail deluge the amount of ...
512,digital guru floats sub-$100 pc nicholas negro...
446,broadband challenges tv viewing the number of ...


In [None]:
# Collect the training article texts, in row order, into a plain Python list
train_text_list = list(train_data.text)

In [None]:
# Collect the test article texts, in row order, into a plain Python list
test_text_list = list(test_data.text)

In [None]:
# Text-preprocessing and embedding hyperparameters
# Length (in token ids) passed as maxlen to pad_sequences for every article
MAX_SEQUENCE_LENGTH = 5000
# Vocabulary cap handed to the Tokenizer as num_words
MAX_NUM_WORDS = 20000
# Size of each word vector; matches the glove.6B.100d file loaded later
EMBEDDING_DIM = 100

In [None]:
# Fit the vocabulary on the training texts only, then encode both splits with it
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_text_list)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_text_list, test_text_list)
)

# word_index lists every token seen while fitting (28667 in the recorded run),
# not just the MAX_NUM_WORDS most frequent ones
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 28667 unique tokens.


In [None]:
# Preview first sequence: the first 20 token ids of the first training article
train_sequences[0][:20]

[110,
 2957,
 2025,
 2680,
 1154,
 1,
 110,
 4,
 1,
 313,
 1267,
 20,
 625,
 1,
 1164,
 21,
 33,
 41,
 7,
 2025]

In [None]:
# Prepare Training dataset: one fixed-width row of MAX_SEQUENCE_LENGTH token ids per article
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Prepare Test dataset (named X_val because it is what model.fit receives as validation data)
X_val = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Prepare Training Target dataset
## Copy the integer category codes into a list, then one-hot encode them
train_target_list = list(train_target)

y_train = to_categorical(np.asarray(train_target_list))

In [None]:
# Prepare Test Target dataset
## Copy the integer category codes into a list, then one-hot encode them
test_target_list = list(test_target)

y_val = to_categorical(np.asarray(test_target_list))

In [None]:
# Sanity check: padded inputs should be (n_articles, MAX_SEQUENCE_LENGTH) and the
# one-hot labels (n_articles, n_categories)
print('Shape of train data tensor:', X_train.shape)
print('Shape of label tensor:', y_train.shape)

Shape of train data tensor: (2000, 5000)
Shape of label tensor: (2000, 5)


## Prepare Embedding Matrix

In [None]:
# Use word embeddings index from pre-trained GloVe
print('Indexing word vectors.')

# The GloVe file is expected in the current working directory (both paths resolve to '').
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '')

# Maps each GloVe token to its float32 coefficient vector.
embeddings_index = {}
# The file contains non-ASCII tokens, so state the encoding explicitly instead of
# depending on whatever the platform's default text encoding happens to be.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line holds the token followed by its whitespace-separated coefficients.
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [None]:
print('Preparing embedding matrix.')

# Row r of the matrix holds the GloVe vector of the token whose tokenizer index is r.
# The extra row (+ 1) presumably accounts for index 0, which word_index does not use — TODO confirm.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    # Only tokens inside the vocabulary cap get a row.
    if row <= MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        # Tokens without a GloVe entry keep their all-zero row.
        if vector is not None:
            embedding_matrix[row] = vector

# Wrap the pre-trained vectors in an Embedding layer; trainable=False keeps them
# fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.


## Test GloVe Embedding

In [None]:
# Spot-check the loaded vectors with one word; a plain dict lookup, so a missing word raises KeyError.
embeddings_index['news']

array([-0.66842  , -0.41713  ,  0.42473  , -0.9329   , -0.36823  ,
       -0.26647  , -0.10715  ,  0.093359 ,  0.25288  , -0.42413  ,
        0.67356  ,  0.092664 ,  0.43201  , -0.25714  , -0.11222  ,
        0.059157 ,  0.33147  , -1.2479   , -0.35577  , -0.21875  ,
       -0.22346  ,  0.10209  , -0.4843   ,  0.7824   ,  0.3118   ,
       -0.083924 ,  0.56489  ,  0.98637  , -0.12308  ,  0.92539  ,
        0.28811  ,  0.4003   , -0.64225  ,  0.12647  , -0.27778  ,
        0.045568 , -0.18598  , -0.15247  , -0.42322  ,  0.29807  ,
       -0.68476  , -0.11121  , -1.1391   ,  0.072205 , -0.038877 ,
       -0.54775  , -0.0032873, -0.85587  ,  0.3267   , -0.79493  ,
        0.33434  ,  0.29464  ,  0.44074  ,  0.69114  , -0.10615  ,
       -2.5303   , -0.5923   ,  0.4648   ,  2.2093   ,  0.77166  ,
       -0.60216  ,  0.46264  , -0.70728  , -1.1414   ,  0.40916  ,
       -0.31745  ,  0.41431  ,  0.49908  ,  0.49434  ,  1.0044   ,
       -0.37273  , -0.16246  ,  0.23608  , -0.71456  ,  0.5331

## Define Model and Compile

In [None]:
# Input: one row of MAX_SEQUENCE_LENGTH integer token ids per article
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Frozen GloVe lookup: every token id becomes an EMBEDDING_DIM-sized vector
embedded_sequences = embedding_layer(sequence_input)
# The LSTM condenses the whole sequence into one 128-wide vector (see the summary output shapes)
x = LSTM(128)(embedded_sequences)
x = Dropout(0.3)(x)
# One softmax output per distinct category code found in ds
preds = Dense(len(ds.category.unique()), activation='softmax')(x)

In [None]:
# Tie the token-id input to the softmax predictions to form the trainable model
model = Model(sequence_input, preds)

In [None]:
# Layer-by-layer overview; the embedding weights show up as non-trainable parameters
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 5000, 100)         2000100   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
Total params: 2,117,993
Trainable params: 117,893
Non-trainable params: 2,000,100
_________________________________________________________________


In [None]:
# categorical_crossentropy pairs with the one-hot targets produced by to_categorical
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [None]:
# Train for 10 epochs. NOTE(review): validation_data is the held-out test split, so the
# per-epoch validation metrics are computed on the only data kept out of training;
# no separate validation set exists.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(X_val, y_val))

Train on 2000 samples, validate on 221 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd42c028048>