# Building Vocabulary

In [None]:
# book example on Listing 6.1 (one-hot encoding words)
import numpy as np
# Two sentences serve as the input corpus
samples = ['I study at CityU', 'I study at CityU at Seattle']
# Vocabulary: maps each unique word (key) to a running index (value), 0 .. N-1.
# (Assign len(token_index) + 1 instead to start the numbering at 1.)
token_index = {}
for sentence in samples:
    # split() breaks the sentence into words on whitespace
    for token in sentence.split():
        if token in token_index:
            continue  # this word already has an index
        token_index[token] = len(token_index)
        print(token_index[token], token)


0 I
1 study
2 at
3 CityU
4 Seattle


# Decide maximum number of words for a feature


In [None]:
# Number of word positions reserved per sample. A fixed value (e.g. max_length = 6)
# would also work; here it is derived from the data as the word count of the
# longest sentence.
sentence_lengths = [len(words) for words in map(str.split, samples)]
max_length = max(sentence_lengths)

print("max_length: ", max_length)
# 3D tensor of zeros: samples x max_length x number of tokens
# (indices start at 0, so the token axis needs the largest index + 1 slots)
tensor_shape = (len(samples), max_length, max(token_index.values()) + 1)
results = np.zeros(shape=tensor_shape)

max_length:  6


# One-hot encoding

In [None]:
# Fill `results` (samples x max_length x tokens) with the one-hot encoding of each sample.
for i, sample in enumerate(samples):
    # Only the first max_length words have a slot in the tensor; extra words are dropped.
    for j, word in enumerate(sample.split()[:max_length]):
        # Look up the index of the current word in the token_index dictionary
        index = token_index.get(word)
        if index is None:
            # Out-of-vocabulary word: leave its row all zeros. Indexing with None
            # would be interpreted by NumPy as np.newaxis and set the WHOLE row to 1.0.
            continue
        # One-hot: switch on the single element that corresponds to this word
        results[i, j, index] = 1.

# The results array now contains the one-hot encoded representation of the samples
results

array([[[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.]]])

# Building Vocabulary and one-hot-encoding using keras

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
samples = ['I study at CityU', 'I study at CityU at Seattle']

# Longest sentence length in words. This is a sequence length, not a vocabulary
# size, so it is deliberately NOT passed to the Tokenizer as `num_words`.
sentence_lengths = [len(sentence.split()) for sentence in samples]
max_length = max(sentence_lengths)

# Create a tokenizer without a vocabulary cap. (`num_words=N` would keep only the
# N-1 most frequent words; e.g. Tokenizer(num_words=1000) for a large corpus.)
# With no cap, texts_to_matrix() produces len(word_index) + 1 columns; column 0
# is reserved and never assigned to a word.
tokenizer = Tokenizer()
# Updates internal vocabulary based on a list of texts. Words are lower-cased and
# indexed by descending frequency, starting at 1,
# e.g. word_index["at"] = 1, word_index["i"] = 2 -> word(key):index(value)
tokenizer.fit_on_texts(samples)
# Transforms those strings into a sequence of integer indices.
# Basically, it takes each word in the text and replaces it with
# its corresponding integer value from the word_index dictionary
sequences = tokenizer.texts_to_sequences(samples)
# Get the binary bag-of-words representation of the given sentences: one row per
# sentence, 1.0 in the column of every word it contains (mode='count' gives counts)
one_hot_results = tokenizer.texts_to_matrix(samples, mode = 'binary')
# Obtain the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))
print("word_index: ", tokenizer.word_index)
print("\033[1m\033[94m{ Sequences: }\033[0m ", sequences)
print("\n\033[1m\033[94m{ one hot results: }\033[0m\n", one_hot_results)



Found 5 unique tokens.
word_index:  {'at': 1, 'i': 2, 'study': 3, 'cityu': 4, 'seattle': 5}
[1m[94m{ Sequences: }[0m  [[2, 3, 1, 4], [2, 3, 1, 4, 1, 5]]

[1m[94m{ one hot results: }[0m
 [[0. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1.]]


In [None]:
import sys
# Raise NumPy's summarization threshold so arrays are printed in full instead of
# being abbreviated with "..."; this print option is global and stays in effect
# for the cells that follow.
np.set_printoptions(threshold=sys.maxsize)
# Last expression of the cell: display the complete one_hot_results matrix
one_hot_results


array([[0., 1., 1., 1., 1., 0.],
       [0., 1., 1., 1., 1., 1.]])

Word embedding example using an Embedding layer in Keras:
learning the embedding as part of the model
source: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/


In [None]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding
# Import Sequential from tensorflow.keras like everything else, so that all
# Keras objects in this notebook come from the same package.
from tensorflow.keras.models import Sequential

# define documents
docs = ['Well done!',           # + = 1
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Fine work!',
        'Bravo!',
        'Tremendous idea',
        'Awesome!',
        'Perfect work',
        'Weak',                 # - = 0
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.',
        'Sucks',
        'Inferior to your previous work',
        'Substandard',
        'Faulty thoughts',
        'Terrible work to be presented'
        ]
# define class labels (1 = positive, 0 = negative), aligned with docs
labels = array([1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0])
# integer encode the documents (hashing trick)
# hyper parameter#1: size of the hash space. Keep it well above the number of
# distinct words, otherwise different words collide on the same index.
vocab_size = 1000
# one_hot converts an input sentence into a list of integer indices in [1, vocab_size).
# NOTE: it relies on Python's built-in hash(), which is not stable between
# interpreter runs, so the printed indices change from run to run.
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 8 words (index 0 is used as the padding value)
max_length = 8 # every sequence is padded (at the end) or truncated to this length
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()
# The Embedding layer has vocab_size (1000) entries and receives sequences of
# max_length (8) indices. Each index is mapped to a small dense vector.
# Despite its name, num_layer is the embedding dimension (vector size per word),
# not a number of layers.
num_layer = 8 # hyper parameter#2 {8, 16, 24, 128 ...}
model.add(Embedding(vocab_size, num_layer, input_length=max_length))
# The output of the Embedding layer is max_length x num_layer = 8 vectors of
# 8 dimensions each, one per word position. Flatten turns this into a single
# 64-element vector for the Dense output layer.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model (summary() prints by itself and returns None,
# so wrapping it in print() would add a stray "None" to the output)
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model (on the training data itself, so this is not a generalization estimate)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

# Save the model architecture, weights and training configuration (optimizer,
# losses, metrics) together in a single file in the native `.keras` format.
model.save('my_model.keras')

[[432, 957], [895, 588], [117, 971], [559, 588], [282], [198, 588], [126], [640, 264], [566], [43, 588], [188], [38, 971], [303, 895], [38, 588], [315, 936, 957, 630], [806], [257, 435, 267, 510, 588], [659], [985, 445], [65, 588, 435, 301, 954]]
[[432 957   0   0   0   0   0   0]
 [895 588   0   0   0   0   0   0]
 [117 971   0   0   0   0   0   0]
 [559 588   0   0   0   0   0   0]
 [282   0   0   0   0   0   0   0]
 [198 588   0   0   0   0   0   0]
 [126   0   0   0   0   0   0   0]
 [640 264   0   0   0   0   0   0]
 [566   0   0   0   0   0   0   0]
 [ 43 588   0   0   0   0   0   0]
 [188   0   0   0   0   0   0   0]
 [ 38 971   0   0   0   0   0   0]
 [303 895   0   0   0   0   0   0]
 [ 38 588   0   0   0   0   0   0]
 [315 936 957 630   0   0   0   0]
 [806   0   0   0   0   0   0   0]
 [257 435 267 510 588   0   0   0]
 [659   0   0   0   0   0   0   0]
 [985 445   0   0   0   0   0   0]
 [ 65 588 435 301 954   0   0   0]]




None
Accuracy: 100.000000


How to use pre-trained word embeddings (GloVe) in Keras

In [None]:
import requests
import zipfile
import os

# The URL for the GloVe 6B dataset (only the 100d file is extracted from it).
# This is a common source, but URLs can change. If this link fails,
# you might need to find an alternative source or download it manually.
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
zip_file_name = "glove.6B.zip"
extracted_file_name = "glove.6B.100d.txt"

try:
    if os.path.exists(extracted_file_name):
        # The embeddings are already on disk: re-running the cell must not
        # download the large archive again.
        print(f"{extracted_file_name} already exists, skipping download.")
    else:
        if os.path.exists(zip_file_name):
            print(f"Using previously downloaded {zip_file_name}")
        else:
            print(f"Attempting to download GloVe embeddings from: {glove_url}")
            # Download to a temporary name first so that an interrupted transfer
            # never leaves a truncated archive under the final file name.
            partial_file_name = zip_file_name + ".part"
            # timeout=(connect, read) in seconds keeps the cell from hanging forever
            # on a stalled connection; the `with` block releases the connection.
            with requests.get(glove_url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
                with open(partial_file_name, 'wb') as f:
                    # Stream in 1 MiB chunks so the archive is never held in memory at once
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_file_name, zip_file_name)
            print(f"Downloaded {zip_file_name}")

        # Extract the specific file
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            if extracted_file_name in zip_ref.namelist():
                zip_ref.extract(extracted_file_name)
                print(f"Extracted {extracted_file_name}")
            else:
                print(f"Error: {extracted_file_name} not found in the zip archive.")
                print("Available files in the archive:", zip_ref.namelist())

        # Clean up the zip file (optional)
        # os.remove(zip_file_name)
        # print(f"Removed temporary zip file {zip_file_name}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading the file: {e}")
except zipfile.BadZipFile:
    print(f"Error: Downloaded file {zip_file_name} is not a valid zip file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Attempting to download GloVe embeddings from: https://nlp.stanford.edu/data/glove.6B.zip
Downloaded glove.6B.zip
Extracted glove.6B.100d.txt


In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
# Corrected imports from tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels (1 = positive, 0 = negative), aligned with docs
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
# +1 because word indices start at 1; index 0 is reserved for padding
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print("\033[1m\033[94m{ Encoded Documents: }\033[0m ", encoded_docs)

# Pad every document to the length of the longest one. The lengths are taken from
# the tokenized sequences (not from a raw str.split()), so they always match what
# the tokenizer actually produced and no document gets truncated.
sentence_lengths = [len(sequence) for sequence in encoded_docs]

# Find the maximum sentence length
max_length = max(sentence_lengths)

padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("\033[1m\033[94m{ Padded Documents: }\033[0m ", padded_docs)

# Size of each GloVe vector; must match the file that is loaded (glove.6B.100d.txt)
embedding_dim = 100

# load the whole embedding into memory: word -> vector
embeddings_index = dict()
# Ensure the path to the GloVe file is correct for your environment
# If the file is not in the same directory, update the path accordingly
# Example: open('/path/to/your/glove.6B.100d.txt', encoding='utf8')
try:
    # `with` closes the file even if parsing a line raises
    with open('glove.6B.100d.txt', encoding='utf8') as f:
        for line in f:
            # each line: the word followed by its vector components
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
except FileNotFoundError:
    print("Error: glove.6B.100d.txt not found.")
    print("Please download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/")
    print("and place the glove.6B.100d.txt file in the same directory as the notebook,")
    print("or update the file path in the code.")
    # Without the file every embedding row stays zero and, because the layer is
    # frozen, the model below cannot learn anything from the words.
    print("WARNING: continuing with an all-zero embedding matrix; the results below are not meaningful.")
    # Exit or handle the error appropriately if the file is crucial
    # sys.exit("GloVe file not found.") # Uncomment this line to stop execution if the file is missing

# create a weight matrix for words in training docs:
# row i holds the GloVe vector of the word with index i (zeros if GloVe lacks it)
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# define model
model = Sequential()
# Use the computed max_length (not a hard-coded 4) so the layer always matches the
# padded input; trainable=False keeps the pre-trained vectors fixed during training.
e = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model (summary() prints by itself and returns None,
# so wrapping it in print() would add a stray "None" to the output)
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model (on the training data itself, so this is not a generalization estimate)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

[1m[94m{ Encoded Documents: }[0m  [[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[1m[94m{ Padded Documents: }[0m  [[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]
Loaded 400000 word vectors.


None
Accuracy: 100.000000
