using embeddings to convert words into a T x D matrix of numbers<br>
** read about word3vec and GloVe

<ol>
<li>Convert words into integers (indices into the word embedding matrix)</li>
<li>Mapping from words to integer</li>
<li>integers start from 1 not 0 because 0 is used for padding</li>
<li>convert strings to sequences</li>
<li>Tokenize: each string is a single word</li>
<li>The process is string to tokens to integers to vectors</li>
<li>Padding: to make sequences the same length</li>
<li>N is the number of lists of integers</li>
<li>T is the length of the sequence</li>
<li>Truncating pre or post</li>
<li>Padding: pre or post</li>
</ol>

We have an N x T matrix of word indices. When passed by the embedding layer, we get N x T x D tensor converting it into a word vector. 

In [20]:
# preprocessing text
import csv
from nltk.corpus import stopwords
import numpy as np 
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# using list of stop words to remove from text
stop_words = stopwords.words("english")
print(len(stop_words))

153


In [6]:
# preprocessing text
texts = []
labels = []
# reading the data. 
# for each row, the first item is the label
with open('data/bbc-text.csv','r') as file: # reading the files
    data = csv.reader(file)
    next(data, None)
    for row in data:
        labels.append(row[0])
        sentence = row[1]
        for word in stop_words: # replacing tokens with an empty space, and then replacing double space with a single space
            token = ' ' + word + ' '
            sentence = sentence.replace(token,' ')
            sentence = sentence.replace('  ', ' ')
        texts.append(sentence)

        
print(len(labels))
print(len(texts))


2225
2225


In [45]:
MAX_VOCAB_SIZE = 10000
# out of words token
oov_tok = '<OOV>'
# padding
padding_type = 'post'
trunc_type = 'post'
max_len = 120

# splitting data into train and validation
training_size = int(np.ceil(0.8 * len(texts)))
X_train, X_valid = texts[:training_size], texts[training_size:]
y_train, y_valid = labels[:training_size], labels[training_size:]

tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
# a dictionary of words as key and integer as value
word_index = tokenizer.word_index
V = len(word_index)
print(f"There are {V} unique tokens")
# convert to sequences
sequences = tokenizer.texts_to_sequences(X_train)
# padding
padded = pad_sequences(sequences, maxlen=max_len ,truncating=trunc_type, padding=trunc_type)
print(f"Shape of padded sequences: {padded.shape}")

valid_seq = tokenizer.texts_to_sequences(X_valid)
valid_padded = pad_sequences(valid_seq, maxlen=max_len, padding=padding_type, truncating=trunc_type)
print(f"Validation data shape: {valid_padded.shape}")

There are 27269 unique tokens
Shape of padded sequences: (1780, 120)
Validation data shape: (445, 120)


In [42]:
# tokenizing labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(y_train)
label_index = label_tokenizer.word_index
print(label_index)
label_seq = np.array(label_tokenizer.texts_to_sequences(y_train))
print(label_seq.shape)
valid_label_seq = np.array(label_tokenizer.texts_to_sequences(y_valid))
print(valid_label_seq.shape)



{'sport': 1, 'business': 2, 'politics': 3, 'entertainment': 4, 'tech': 5}
(1780, 1)
(445, 1)


# Modeling

In [46]:
from tensorflow.keras.layers import Input, GlobalMaxPool1D, Dense
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.models import Model

In [48]:
# model

# embedding dimensions
T = max_len
D = 16
M = 24 # hidden state dimensions
O = len(label_index)
i = Input(shape=(T,)) # input layer
# size of embedding is V+1 because index starts from 1 instead of 0
# If the final index of the embedding matrix is V, then it must have size V+1
x = Embedding(V +1, D)(i)
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(O, activation='sigmoid')(x)

model = Model(i,x)


In [49]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)