In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from   tensorflow.keras.preprocessing.text import Tokenizer
from   tensorflow.keras.preprocessing.sequence import pad_sequences

%matplotlib inline

In [2]:
tf.__version__

'2.0.0'

In [3]:
!ls ../../../data

aclImdb_v1.tar.gz bbc-text.csv      [34mnmt[m[m               [34msurnames[m[m
[34mag_news[m[m           [34mbooks[m[m             sarcasm.json      [34myelp[m[m


In [4]:
#!head -5 ../../../data/bbc-text.csv

In [5]:
DATA = '../../../data'

In [6]:
VOCAB_SIZE = 10000
EMBEDDING_DIM = 50
MAX_LEN = 150
TRUNC_TYPE = 'post'
PADDING_TYPE = 'post'
OOV_TOK = '<UNK>'
TRAINING_PORTION = 0.8

In [7]:
sentences = []
labels = []
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "as", "at", "be", "because", "been", "before", 
    "being", "below", "between", "both", "but", "by", "could", "did", 
    "do", "does", "doing", "down", "during", "each", "few", "for", "from",
    "further", "had", "has", "have", "having", "he", "he'd", "he'll", 
    "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", 
    "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", 
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", 
    "other", "ought", "our", "ours", "ourselves", "out", "over", "own", 
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", 
    "such", "than", "that", "that's", "the", "their", "theirs", "them", 
    "themselves", "then", "there", "there's", "these", "they", "they'd", 
    "they'll", "they're", "they've", "this", "those", "through", "to", 
    "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
    "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", 
    "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves"]
len(stopwords)

153

In [8]:
with open(f'{DATA}/bbc-text.csv', 'r') as f:
    i = 0
    for line in f:
        if i != 0:
            lab, sent = line.split(',')
            sentences.append(sent[:-1])
            labels.append(lab)
        i += 1

In [9]:
print(len(labels), len(sentences))

2225 2225


In [10]:
sentences[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [11]:
train_size = 1780

train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# Expected output (if training_portion=.8)
# 1780
# 1780
# 1780
# 445
# 445

1780
1780
1780
445
445


In [12]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(
    train_sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

# Expected Ouput
# 449
# 120
# 200
# 120
# 192
# 120

750
150
300
150
311
150


In [13]:
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(
    validation_sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)

print(len(validation_sequences))
print(validation_padded.shape)

# Expected output
# 445
# (445, 120)

445
(445, 150)


In [14]:
set(labels)

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [15]:
label_tokenizer = Tokenizer(num_words=len(set(labels)), oov_token=OOV_TOK)
label_tokenizer.fit_on_texts(train_labels)
label_index = label_tokenizer.word_index

training_label_seq = pad_sequences(
    label_tokenizer.texts_to_sequences(train_labels),
    maxlen=1,
    truncating=TRUNC_TYPE)
validation_label_seq = pad_sequences(
    label_tokenizer.texts_to_sequences(validation_labels),
    maxlen=1,
    truncating=TRUNC_TYPE)

print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

# Expected output
# [4]
# [2]
# [1]
# (1780, 1)
# [5]
# [4]
# [3]
# (445, 1)

[1]
[3]
[2]
(1780, 1)
[1]
[1]
[4]
(445, 1)


In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(6, activation='sigmoid')])

model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 50)           500000    
_________________________________________________________________
flatten (Flatten)            (None, 7500)              0         
_________________________________________________________________
dense (Dense)                (None, 24)                180024    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 150       
Total params: 680,174
Trainable params: 680,174
Non-trainable params: 0
_________________________________________________________________


In [17]:
EPOCHS = 30
history = model.fit(
    train_padded,
    training_label_seq,
    validation_data=(validation_padded, validation_label_seq),
    epochs=EPOCHS)

Train on 1780 samples, validate on 445 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [18]:
history.history['accuracy']

[0.3494382,
 0.44662923,
 0.9831461,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382,
 0.9994382]

In [None]:
plt.plot(history.history['accuracy'])
## ??? For some reason, this cell keeps killing my kernel!!

In [18]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])

In [None]:
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")