<a href="https://colab.research.google.com/github/chrismoroney/natural-language-processing/blob/main/BBC_News_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!gdown --id 14oUWdE8AL6ahSDMgGkR6KkMeW2IKjDre

Downloading...
From: https://drive.google.com/uc?id=14oUWdE8AL6ahSDMgGkR6KkMeW2IKjDre
To: /content/learn-ai-bbc.zip
100% 1.94M/1.94M [00:00<00:00, 39.8MB/s]


In [77]:
import os
import zipfile
import pandas

import string
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder


In [12]:
local_zip = './learn-ai-bbc.zip'
zip_file = zipfile.ZipFile(local_zip, 'r')
zip_file.extractall()
zip_file.close()

In [19]:
train_tweets = os.path.join('./BBC News Train.csv')
test_tweets = os.path.join('./BBC News Test.csv')

In [20]:
train_df = pandas.read_csv(train_tweets)
test_df = pandas.read_csv(test_tweets)
print(train_df[:1])
print(test_df[:1]) # No Category provided

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
   ArticleId                                               Text
0       1018  qpr keeper day heads for preston queens park r...


In [72]:
train_headlines = []
train_labels = []
for i in range(len(train_df.index)):
  train_headlines.append(train_df.iloc[i]['Text'])
  train_labels.append(train_df.iloc[i]['Category'])

train_headlines = np.array(train_headlines)
train_labels = np.array(train_labels)

print(train_headlines.shape)
print(train_labels.shape)

(1490,)
(1490,)


In [103]:
vocab_size = 80000
embedding_dim = 12
max_length = 200

padding_type='post'
trunc_type='post'
OOV_token = "<OOV>"

In [106]:
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

train_labels_encoded = label_encoder.transform(train_labels)

num_classes = len(label_encoder.classes_)
train_labels_one_hot = tf.keras.utils.to_categorical(train_labels_encoded, num_classes)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]]


business: [1, 0, 0, 0, 0]
entertainment: [0, 1, 0, 0, 0]
politics: [0, 0, 1, 0, 0]
sport: [0, 0, 0, 1, 0]
tech: [0, 0, 0, 0, 1]


In [42]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=OOV_token)
tokenizer.fit_on_texts(train_headlines)

word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_headlines)
train_padded = pad_sequences(train_sequences, maxlen = max_length, padding=padding_type, truncating = trunc_type)

In [97]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(5, activation = 'softmax'),
])

In [98]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 200, 12)           960000    
                                                                 
 global_average_pooling1d_1   (None, 12)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_3 (Dropout)         (None, 12)                0         
                                                                 
 flatten_9 (Flatten)         (None, 12)                0         
                                                                 
 dense_15 (Dense)            (None, 5)                 65        
                                                                 
Total params: 960,065
Trainable params: 960,065
Non-trainable params: 0
________________________________________________

In [99]:
num_epochs = 20
model.fit(train_padded, train_labels_one_hot, epochs=num_epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x79a99f849b40>

In [154]:
sentence = [

"sony wares win innovation award sony has taken the prize for top innovator at the annual awards of pc pro magazine.  it won the award for taking risks with products and for its  brave  commitment to good design. conferring the award  pc pro s staff picked out sony s pcg-x505/p vaio laptop as a  stunning piece of engineering . the electronics giant beat off strong competition from toshiba and chip makers amd and intel to take the gong.  paul trotter  news and features editor of pc pro  said several sony products helped it to take the innovation award.  he said sony s clie peg ux50 media player with its swivel screen and qwerty keyboard  broke the design rules yet again . other sony products that helped included the vaio w1 desktop computer and the ra-104 media server. mr trotter said sony s combining of computer  screen and keyboard in the w1 was likely to be widely copied in future home pcs. the company has also become one of the first to use organic leds in its products.  while not always inventing new technology itself  sony was never afraid to innovate around various formats   said mr trotter.  other awards decided by pc pro s staff and contributors included one for canon s eos 300d digital camera in the most wanted hardware category.  microsoft s media player 10 took the award for most wanted software. this year was the 10th anniversary of the pc pro awards  which splits its prizes into two sections. the first are chosen by the magazine s writers and consultants  the second are voted for by readers. mr trotter said more than 13 000 people voted for the reliability and service awards  twice as many as in 2003. net-based memory and video card shop crucial shared the award for online vendor of the year with novatech."

]
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[645, 7945, 136, 2211, 397, 645, 19, 459, 2, 779, 9, 138, 1, 22, 2, 686, 314, 4, 564, 2580, 1063, 12, 144, 2, 397, 9, 354, 2043, 18, 747, 5, 9, 43, 6957, 2264, 3, 128, 1212, 17287, 2, 397, 564, 2580, 8, 1115, 1448, 55, 645, 8, 1, 1, 2041, 1, 2371, 20, 6, 4845, 3250, 4, 2998, 2, 1263, 723, 587, 130, 349, 630, 28, 4881, 5, 1636, 1617, 12545, 5, 2237, 3, 118, 2, 13657, 715, 10310, 189, 5, 1835, 1510, 4, 564, 2580, 14, 533, 645, 747, 788, 12, 3, 118, 2, 2211, 397, 16, 14, 645, 8, 1, 8828, 1, 296, 356, 18, 43, 1, 856, 5, 9061, 3183, 1554, 2, 1212, 735, 425, 367, 71, 645, 747, 11, 788, 831, 2, 1, 1, 1955, 321, 5, 2, 1, 5511, 296, 5146, 31, 10310, 14, 645, 8, 5903, 4, 321, 856, 5, 3183, 7, 2, 1, 15, 295, 3, 17, 907, 9828, 7, 286, 124, 892, 2, 140, 19, 46, 368, 51, 4, 2, 64, 3, 145, 8151, 1, 7, 43, 747, 105, 29, 455, 1, 48, 178, 777, 645, 15, 398, 5813, 3, 24837, 242, 2725, 4346, 14, 31, 10310, 71, 314, 995, 23, 564, 2580, 8, 1115, 5, 6396, 831, 51, 9, 1, 8, 1, 1, 229, 1040, 7, 2, 115, 704, 2

In [155]:
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))
vals = model.predict(padded)

[[0.19692634 0.18344161 0.08454067 0.06698115 0.46811026]]


In [156]:
print(vals)
print()
print("index:")
for i, class_name in enumerate(label_encoder.classes_):
    one_hot = [0] * num_classes
    one_hot[i] = 1
    print(f"{class_name}: {one_hot}")

print()
largest_val_idx = np.argmax(vals)
print("Answer:")
if largest_val_idx == 0:
  print("The headline classifies as: business")
elif largest_val_idx == 1:
  print("The headline classifies as: entertainment")
elif largest_val_idx == 2:
  print("The headline classifies as: politics")
elif largest_val_idx == 3:
  print("The headline classifies as: sport")
elif largest_val_idx == 4:
  print("The headline classifies as: tech")

[[0.19692634 0.18344161 0.08454067 0.06698115 0.46811026]]

index:
business: [1, 0, 0, 0, 0]
entertainment: [0, 1, 0, 0, 0]
politics: [0, 0, 1, 0, 0]
sport: [0, 0, 0, 1, 0]
tech: [0, 0, 0, 0, 1]

Answer:
The headline classifies as: tech
