In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string
import sys
import re
import os

In [2]:
np.set_printoptions(threshold=sys.maxsize, suppress = True)

In [3]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br/', ' ')
    final_data = tf.strings.regex_replace(stripped_html, re.escape(string.punctuation), '')
    return final_data

In [4]:
path = os.path.dirname("aclImdb_v1.tar.gz")
print(path)
dataset_dir  = os.path.join(path, "aclImdb")
print(dataset_dir)
train_dir    = os.path.join(dataset_dir, "train")
print(train_dir)


aclImdb
aclImdb\train


In [5]:
vocab_size      = 10000
sequence_length = 100
batch_size      = 1024
seed            = 64
train_ds        = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size = batch_size, validation_split=0.2 , subset="training", seed = seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [6]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Vectorization Layer

In [7]:
vectorize_layer_int = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens  = vocab_size,
    output_mode = 'int', 
    output_sequence_length = 100
)
text_ds = train_ds.map(lambda x, y: x)

In [8]:
vectorize_layer_int.adapt(text_ds)

In [9]:
model_vec_int = tf.keras.models.Sequential()
model_vec_int.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vec_int.add(vectorize_layer_int)
pred_vec_int = model_vec_int.predict(["the Movie was really koko bad and boring"])
pred_vec_int

array([[  2,  20,  14,  62,   1,  97,   4, 511,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [10]:
pred_vec_int.shape

(1, 100)

In [11]:
model_vec_int.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [12]:
vectorize_layer_int_without_len = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens  = vocab_size,
    output_mode = 'int'
)

In [13]:
vectorize_layer_int_without_len.adapt(text_ds)

In [14]:
model_vec_int_without_len = tf.keras.models.Sequential()
model_vec_int_without_len.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vec_int_without_len.add(vectorize_layer_int_without_len)
pred_vec_int_without_len = model_vec_int_without_len.predict(["the Movie was really koko bad and boring", "the Movie was really koko bad and boring"])
pred_vec_int_without_len

array([[  2,  20,  14,  62,   1,  97,   4, 511],
       [  2,  20,  14,  62,   1,  97,   4, 511]], dtype=int64)

In [18]:
model_vec_int_without_len.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, None)              0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [15]:
vectorize_layer_tf_idf = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens  = vocab_size,
    output_mode = 'tf-idf'
)
vectorize_layer_tf_idf.adapt(text_ds)
model_vec_tf_idf = tf.keras.models.Sequential()
# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model_vec_tf_idf.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vec_tf_idf.add(vectorize_layer_tf_idf)
pred_vec_tf_idf = model_vec_tf_idf.predict(["this movie is bad"])
pred_vec_tf_idf

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.75165707, 0.        , 0.        , 0.75328904,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.060146  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [16]:
pred_vec_tf_idf.shape

(1, 10000)

In [17]:
model_vec_tf_idf.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_2 (TextVe (None, 10000)             0         
Total params: 10,000
Trainable params: 0
Non-trainable params: 10,000
_________________________________________________________________


In [19]:
vectorize_layer_binary = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens  = vocab_size,
    output_mode = 'binary'
)
vectorize_layer_binary.adapt(text_ds)
model_vec_binary = tf.keras.models.Sequential()
# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model_vec_binary.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vec_binary.add(vectorize_layer_binary)
pred_vec_binary = model_vec_binary.predict(["a deepak Movie was really bad and boring"])
pred_vec_binary

array([[1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [20]:
pred_vec_binary.shape

(1, 10000)

In [21]:
model_vec_binary.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_3 (TextVe (None, 10000)             0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [22]:
vectorize_layer_count = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens  = vocab_size,
    output_mode = 'count'
)
vectorize_layer_count.adapt(text_ds)
model_vec_count = tf.keras.models.Sequential()
# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model_vec_count.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vec_count.add(vectorize_layer_count)
pred_vec_count = model_vec_count.predict(["the a deepak a a a a a Movie was really bad and boring"])
pred_vec_count



array([[1., 1., 6., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [23]:
pred_vec_count.shape

(1, 10000)

In [24]:
model_vec_count.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_4 (TextVe (None, 10000)             0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [25]:
vectorize_layer_count.get_vocabulary()

['[UNK]',
 'the',
 'a',
 'and',
 'of',
 'to',
 'is',
 'in',
 'i',
 'this',
 'that',
 'it',
 '/><br',
 'was',
 'as',
 'for',
 'with',
 'but',
 'on',
 'movie',
 'his',
 'are',
 'not',
 'film',
 'you',
 'have',
 'he',
 'be',
 'at',
 'one',
 'by',
 'an',
 'they',
 'from',
 'all',
 'who',
 'like',
 'so',
 'just',
 'or',
 'has',
 'about',
 'her',
 "it's",
 'some',
 'if',
 'out',
 'what',
 'very',
 'when',
 'more',
 'there',
 'she',
 'would',
 'good',
 'even',
 'my',
 'only',
 'no',
 'their',
 'had',
 'really',
 'which',
 'can',
 'up',
 'were',
 'see',
 'than',
 '-',
 'we',
 'been',
 'get',
 'will',
 'into',
 'story',
 'because',
 'much',
 'most',
 'how',
 'other',
 'also',
 'its',
 "don't",
 'time',
 'do',
 'first',
 'great',
 'people',
 'me',
 'could',
 'make',
 'any',
 '/>the',
 'after',
 'made',
 'then',
 'bad',
 'think',
 'him',
 'many',
 'never',
 'being',
 'two',
 'too',
 'where',
 'little',
 'well',
 '<br',
 'way',
 'watch',
 'your',
 'it.',
 'did',
 'does',
 'them',
 'know',
 'best',

In [26]:
len(vectorize_layer_count.get_vocabulary())

10000

# Embedding Layers With Int Vetorization

In [27]:
embedding_dim= 16
vocab_size   = 10000
model = tf.keras.models.Sequential([
  vectorize_layer_int,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
])

In [28]:
pred = model.predict(["this movie is bad"])



In [29]:
# vectorize_layer_int.get_vocabulary()

In [30]:
pred.shape

(1, 100, 16)

In [31]:
weights = model.get_layer('embedding').get_weights()

In [32]:
len(weights[0])

10000

In [33]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
Total params: 160,000
Trainable params: 160,000
Non-trainable params: 0
_________________________________________________________________


In [34]:
pred

array([[[-0.03964602,  0.01462117, -0.01722366, -0.00461922,
          0.04142122, -0.02802364, -0.01637769,  0.02637789,
          0.01143032, -0.01346372,  0.04435874,  0.0004735 ,
          0.00219128,  0.03345858, -0.02376347,  0.04832763],
        [-0.02721878, -0.03058792, -0.0286446 ,  0.0438813 ,
         -0.04952816, -0.03319709,  0.02221153,  0.02126617,
          0.0427759 ,  0.03776583, -0.01262645,  0.03785534,
          0.00313846, -0.00665363, -0.03051694,  0.03481789],
        [-0.02758157, -0.02625328,  0.04044187, -0.01228838,
         -0.01220066,  0.00250106,  0.01609298, -0.02561284,
          0.0317313 , -0.02091286,  0.0413949 ,  0.02654613,
         -0.04525752, -0.03071308,  0.0174776 , -0.01911584],
        [ 0.04398331, -0.0461146 ,  0.00761453, -0.04490256,
         -0.04219859, -0.02417998,  0.04580816,  0.03462455,
         -0.00550918,  0.00089792, -0.0173601 ,  0.01831504,
          0.00510139,  0.00468586, -0.02777874, -0.01396937],
        [ 0.02460277

# Embedding Layer With TF-IDF TextVectorizer

In [35]:
embedding_dim=16

model_tf_idf = tf.keras.models.Sequential([
  vectorize_layer_tf_idf,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
])

In [36]:
pred_tf_idf = model_tf_idf.predict(["this movie is bad"])



In [37]:
pred_tf_idf.shape

(1, 10000, 16)

In [38]:
# pred_tf_idf

# Embedding Layer with count vectorization

In [39]:
embedding_dim=16

model_count = tf.keras.models.Sequential([
  vectorize_layer_count,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
])

In [40]:
pred_count = model_count.predict(["this movie is bad"])
pred_count.shape



(1, 10000, 16)

In [41]:
pred_count

array([[[ 0.03551934,  0.03760685,  0.01972163, -0.00410461,
          0.03136972,  0.04518351, -0.01120218, -0.03450096,
          0.01148633,  0.03220774, -0.01547889, -0.01336026,
          0.04035569,  0.02740279, -0.02665987, -0.01954621],
        [ 0.03551934,  0.03760685,  0.01972163, -0.00410461,
          0.03136972,  0.04518351, -0.01120218, -0.03450096,
          0.01148633,  0.03220774, -0.01547889, -0.01336026,
          0.04035569,  0.02740279, -0.02665987, -0.01954621],
        [ 0.03551934,  0.03760685,  0.01972163, -0.00410461,
          0.03136972,  0.04518351, -0.01120218, -0.03450096,
          0.01148633,  0.03220774, -0.01547889, -0.01336026,
          0.04035569,  0.02740279, -0.02665987, -0.01954621],
        [ 0.03551934,  0.03760685,  0.01972163, -0.00410461,
          0.03136972,  0.04518351, -0.01120218, -0.03450096,
          0.01148633,  0.03220774, -0.01547889, -0.01336026,
          0.04035569,  0.02740279, -0.02665987, -0.01954621],
        [ 0.03551934

In [42]:
model_count.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_4 (TextVe (None, 10000)             0         
_________________________________________________________________
embedding (Embedding)        (None, 10000, 16)         160000    
Total params: 160,000
Trainable params: 160,000
Non-trainable params: 0
_________________________________________________________________


# Embedding Layer with Binary vectorization

In [43]:
embedding_dim=16

model_binary = tf.keras.models.Sequential([
  vectorize_layer_binary,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
])
pred_binary = model_binary.predict(["this movie is bad"])
pred_binary.shape



(1, 10000, 16)

In [44]:
# pred_binary

In [45]:
model_binary.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_3 (TextVe (None, 10000)             0         
_________________________________________________________________
embedding (Embedding)        (None, 10000, 16)         160000    
Total params: 160,000
Trainable params: 160,000
Non-trainable params: 0
_________________________________________________________________


# Embedding Layer With Dense Layer

In [None]:
embedding_dim= 16
vocab_size   = 10000
model = tf.keras.models.Sequential([
  vectorize_layer_int,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding")
#   tf.keras.layers.GlobalAveragePooling1D()  
])

In [None]:
pred_int_dense = model.predict(["The Movie Was Bad"])

In [None]:
pred_int_dense.shape

In [None]:
pred_int_dense

In [None]:
vectorize_layer_int.get_vocabulary()

In [None]:
tf.print(model.get_layer("embedding").get_weights())

In [None]:
embedding_dim= 16
vocab_size   = 10000
model_with_dense = tf.keras.models.Sequential([
  vectorize_layer_int,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
#   tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(150, activation = 'relu'),
#   tf.keras.layers.Dense(1, activation  = 'sigmoid')
])

In [None]:
q = model_with_dense.predict(["The Movie Was Bad"])

In [None]:
q.shape

In [None]:
model_with_dense.summary()

In [None]:
# %%capture cap --no-stderr
z = model_with_dense.get_layer("embedding").get_weights()
print(z[0])
# tf.print(model_with_dense.get_layer("embedding").get_weights())

In [None]:
# with open('output.txt', 'w') as f:
#     f.write(cap.stdout)

In [None]:
model_with_dense.compile(optimizer='adam', loss=tf.keras.losses.binary_crossentropy, metrics=["accuracy"])

In [None]:
model_with_dense.fit(x=[["the movie was bad"],["the movie was bad"],["the movie was bad"],["the movie was bad"]], y =[1, 1, 1, 1],  epochs=10, batch_size = 1 )

In [None]:
weights =  model_with_dense.get_layer("embedding").get_weights()

In [None]:
tf.print(weights[0])

In [None]:
# %%capture cap --no-stderr
print(weights[0])

In [None]:
# with open('output1.txt', 'w') as f:
#     f.write(cap.stdout)

# Embedding Layer With Dense Layer And TF-IDF VECTORIZATION

In [None]:
embedding_dim= 16
vocab_size   = 10000
model_with_dense_tf_idf = tf.keras.models.Sequential([
  vectorize_layer_tf_idf,
  tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding"),
  tf.keras.layers.Dense(150, activation = 'relu'),
])

In [None]:
model_with_dense_tf_idf.compile(optimizer='adam', loss=tf.keras.losses.mean_squared_error, metrics=["accuracy"])

In [None]:
output = model_with_dense_tf_idf.predict(["The Movie Was Bad"])
weights_tf_idf = model_with_dense_tf_idf.get_layer("embedding").get_weights()

In [None]:
output

In [None]:
output.shape

In [None]:
len(weights_tf_idf[0][1])

In [None]:
weights_tf_idf[0]

In [None]:
model_with_dense_tf_idf.fit(x=[["the movie was bad"],["the movie was bad"],["the movie was bad"],["the movie was bad"]], y =[1, 1, 1, 1],  epochs=10, batch_size = 2 )

In [None]:
weights_tf_idf1 = model_with_dense_tf_idf.get_layer("embedding").get_weights()

In [None]:
len(weights_tf_idf1[0][1])

In [None]:
weights_tf_idf1

In [None]:
model_with_dense_tf_idf.summary()