# Text Preprocessing with Keras

In [83]:
#conda install -c conda-forge tensorflow

In [84]:
#pip install tensorflow --ignore-installed --user

In [85]:
#pip install tensorflow --user

In [86]:
# importing libraries

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
import numpy as np
import tsensor


ModuleNotFoundError: No module named 'tsensor'

## Tokenization

In [None]:
# Tokenising sentences
sentences = [
    'The quick brown fox jumps over the lazy dog.'
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [None]:
tokenizer.word_index

{'the': 1,
 'quick': 2,
 'brown': 3,
 'fox': 4,
 'jumps': 5,
 'over': 6,
 'lazy': 7,
 'dog': 8}

In [None]:
train_sequence = tokenizer.texts_to_sequences(sentences)
train_sequence = np.array(train_sequence)
print(train_sequence)

[[1 2 3 4 5 6 1 7 8]]


## Creating Embedding Layer

In [None]:
# Create a random embedding layer

embedding = Embedding(input_dim=len(train_sequence[0]), output_dim=128)

In [None]:
# Get the embeddings of the train sample

train_sample = embedding(train_sequence)

In [None]:
train_sequence.shape

(1, 9)

In [None]:
train_sample.shape

TensorShape([1, 9, 128])

In [None]:
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    train_sample = embedding(train_sequence)

AttributeError: module 'tensorflow' has no attribute 'explain'

In [None]:
train_sample[0]

<tf.Tensor: shape=(9, 128), dtype=float32, numpy=
array([[-0.02754259, -0.00622163,  0.01395966, ...,  0.02350713,
        -0.02449713, -0.03975808],
       [ 0.03644684, -0.02320917, -0.02154242, ...,  0.03704382,
         0.02428811,  0.01571624],
       [-0.03918707, -0.00645021, -0.01573436, ...,  0.00229568,
         0.03425712,  0.03940317],
       ...,
       [-0.02754259, -0.00622163,  0.01395966, ...,  0.02350713,
        -0.02449713, -0.03975808],
       [ 0.0080936 ,  0.02572003, -0.02612194, ...,  0.02001256,
         0.01147754, -0.01811665],
       [-0.00085966,  0.01514722, -0.04220143, ..., -0.00865097,
         0.01689402, -0.01373191]], dtype=float32)>

## Averaging across tokens

In [None]:
GlobalAveragePooling1D()(train_sample)

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.00755447,  0.00638426, -0.00048863,  0.00334065,  0.01660661,
        -0.01373461,  0.00454175, -0.00855707,  0.00276748, -0.00557863,
        -0.00344877,  0.00337631, -0.00176419, -0.01563725,  0.00422146,
        -0.00960024, -0.00052694, -0.002578  ,  0.00889957, -0.00852455,
         0.01651548,  0.01202281,  0.0046281 , -0.01135564, -0.01165041,
        -0.00187996,  0.00624368,  0.00379701,  0.00149153, -0.03422722,
         0.00485646, -0.00706385,  0.00679624,  0.01282168,  0.01323583,
        -0.013819  ,  0.00361721,  0.00443395, -0.01033955,  0.00320042,
         0.01193966,  0.01031262, -0.00589713, -0.00364272, -0.00298244,
         0.01321912,  0.00921641,  0.00424303,  0.01250464, -0.0076866 ,
        -0.00703624,  0.01517027,  0.00675028,  0.00272987, -0.00418889,
         0.01971414, -0.00696675, -0.01278776, -0.00327972,  0.00908293,
        -0.0087077 , -0.01367136,  0.00418269,  0.01186581,  0.00475907,
 

![](images/Emb6.png)

In [None]:
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    z = GlobalAveragePooling1D()(train_sample)

AttributeError: module 'tensorflow' has no attribute 'explain'

## Creating Word Embeddings for more than one sentence

In [None]:
# More than one sentence

test_corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'The quick brown fox.',
    'The lazy dog.',
    'The dog.',
    'Dog and the fox.',
    'Hello, world!'
]
encoded_sentences = tokenizer.texts_to_sequences(test_corpus)
for sentence, encoded_sentence in zip(test_corpus, encoded_sentences):
    print(sentence, encoded_sentence)

The quick brown fox jumps over the lazy dog. [1, 2, 3, 4, 5, 6, 1, 7, 8]
The quick brown fox. [1, 2, 3, 4]
The lazy dog. [1, 7, 8]
The dog. [1, 8]
Dog and the fox. [8, 1, 4]
Hello, world! []


## Padding Sequences

In [None]:
# Length of each sentence in the corpus

[len(sentence) for sentence in encoded_sentences]

[9, 4, 3, 2, 3, 0]

In [None]:
# Length of the longest sentence

max([len(sentence) for sentence in encoded_sentences])

9

In [None]:
MAX_SEQUENCE_LENGTH = 9

In [None]:
# Padding sequences that are shorter than the longest sequence

X = pad_sequences(encoded_sentences, maxlen=MAX_SEQUENCE_LENGTH)
X

array([[1, 2, 3, 4, 5, 6, 1, 7, 8],
       [0, 0, 0, 0, 0, 1, 2, 3, 4],
       [0, 0, 0, 0, 0, 0, 1, 7, 8],
       [0, 0, 0, 0, 0, 0, 0, 1, 8],
       [0, 0, 0, 0, 0, 0, 8, 1, 4],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]])

## Embedding Layer

In [None]:
# Training data with more than 1 sentences

X.shape

(6, 9)

In [None]:
# Embeddings of the larger corpus

X_embedded = embedding(X)

In [None]:
X_embedded.shape

TensorShape([6, 9, 128])

In [None]:
X_embedded

<tf.Tensor: shape=(6, 9, 128), dtype=float32, numpy=
array([[[-0.02754259, -0.00622163,  0.01395966, ...,  0.02350713,
         -0.02449713, -0.03975808],
        [ 0.03644684, -0.02320917, -0.02154242, ...,  0.03704382,
          0.02428811,  0.01571624],
        [-0.03918707, -0.00645021, -0.01573436, ...,  0.00229568,
          0.03425712,  0.03940317],
        ...,
        [-0.02754259, -0.00622163,  0.01395966, ...,  0.02350713,
         -0.02449713, -0.03975808],
        [ 0.0080936 ,  0.02572003, -0.02612194, ...,  0.02001256,
          0.01147754, -0.01811665],
        [-0.00085966,  0.01514722, -0.04220143, ..., -0.00865097,
          0.01689402, -0.01373191]],

       [[-0.04908931, -0.00467245, -0.02280524, ...,  0.04522233,
         -0.04453618,  0.01031039],
        [-0.04908931, -0.00467245, -0.02280524, ...,  0.04522233,
         -0.04453618,  0.01031039],
        [-0.04908931, -0.00467245, -0.02280524, ...,  0.04522233,
         -0.04453618,  0.01031039],
        ...,
 

In [87]:
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    x_em = embedding(X)

AttributeError: module 'tensorflow' has no attribute 'explain'

In [None]:
X.shape

(6, 9)

In [None]:
x_em.shape

NameError: name 'x_em' is not defined

## Averaging across tokens

![](images/Emb6.png)

In [None]:
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    z = GlobalAveragePooling1D()(x_em)

AttributeError: module 'tensorflow' has no attribute 'explain'

In [None]:
z.shape

NameError: name 'z' is not defined