# Integer Encoding

In [38]:
import numpy as np

# Toy corpus: ten short slogans used to demonstrate integer encoding
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

In [39]:
#tokenize these sentences
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Tokenize every document with NLTK: one list of word tokens per document
tokenized_docs = [word_tokenize(text) for text in docs]

print(tokenized_docs)

[['go', 'india'], ['india', 'india'], ['hip', 'hip', 'hurray'], ['jeetega', 'bhai', 'jeetega', 'india', 'jeetega'], ['bharat', 'mata', 'ki', 'jai'], ['kohli', 'kohli'], ['sachin', 'sachin'], ['dhoni', 'dhoni'], ['modi', 'ji', 'ki', 'jai'], ['inquilab', 'zindabad']]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# Tokenize directly with the Keras Tokenizer shipped with TensorFlow

from tensorflow.keras.preprocessing.text import Tokenizer
# oov_token: placeholder string that stands in for out-of-vocabulary words;
# it gets its own entry in the word index (index 1 in the output further down)
tokenizer = Tokenizer(oov_token='<nothing>')


**`oov_token='<nothing>'`:** this parameter is passed so that every new word (one not seen while fitting the tokenizer) is replaced with this placeholder token. Any string can be used here.

**oov :**  out of vocabulary

In [41]:
tokenizer.fit_on_texts(docs)  # build the vocabulary: assign an integer index to each unique word
# Word -> index mapping; in the output below the OOV token takes index 1 and
# more frequent words receive smaller indices
tokenizer.word_index

{'<nothing>': 1,
 'india': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'kohli': 7,
 'sachin': 8,
 'dhoni': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'modi': 15,
 'ji': 16,
 'inquilab': 17,
 'zindabad': 18}

In [42]:
# How many times each word occurred in the fitted documents
tokenizer.word_counts

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [43]:
tokenizer.document_count  # number of documents (rows / sentences) the tokenizer was fitted on

10

In [44]:
# Replace every word of every document with its integer index from word_index
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

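**Sequence:** each word is replaced by its index from `word_index`, which produces one list of integers per sentence.

Example: `go` → 10 and `india` → 2, so the sentence "go india" becomes

[[10, 2]]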

In [45]:
from keras.utils import pad_sequences  # padding = adding 0s before/after a sequence so that all sequences have the same size
# padding='post' appends the zeros after the tokens (see the output below)
sequences = pad_sequences(sequences,padding='post')
sequences

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]], dtype=int32)

# Import data from Keras

In [46]:
from keras.datasets import imdb #imdb is a predefined dataset there is reviews
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [47]:
# Load the IMDB dataset
(X_train,y_train),(X_test,y_test) = imdb.load_data()  # the data already arrives as arrays of integer-encoded reviews

In [48]:
# 1-D array with one entry per training review; each entry is a variable-length list
X_train.shape

(25000,)

In [49]:
# Same layout for the test split: one entry per review
X_test.shape

(25000,)

In [50]:
X_train  # the data is already preprocessed and integer-encoded

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [51]:
# Test reviews use the same integer encoding as the training reviews
X_test

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 7216, 10626, 4, 8463, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 16873, 19, 861, 1074, 5, 1987, 17975, 45, 55, 221, 15, 670, 5304, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 5045, 5304, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 8463, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 6936, 185, 132, 1988, 5304, 1799, 488, 2693, 47, 6, 392, 173, 4, 21686, 4

In [52]:
# Lengths of the first two training reviews
for review in X_train[:2]:
    print(len(review))
print('length of each sentence is different')


218
189
length of each sentence is different


In [53]:
# Padding
# Without `maxlen`, pad_sequences pads to the longest sequence of whatever it
# is given, so padding train and test independently can produce two different
# widths. Pad the training split first, then force the test split to the same
# width so both can be fed to the same model.
X_train = pad_sequences(X_train,padding='post')
X_test = pad_sequences(X_test,padding='post',maxlen=X_train.shape[1])

In [54]:
# First review after padding: trailing zeros fill it up to the common length
X_train[0]

array([ 1, 14, 22, ...,  0,  0,  0], dtype=int32)

In [55]:
# Check that the padded reviews now share one length
for padded_review in X_train[:2]:
    print(len(padded_review))

2494
2494


In [56]:
model = Sequential()  # explanation in copy

# Take the number of timesteps from the padded data instead of hard-coding 50:
# at this point X_train is as wide as its longest review, so a fixed
# input_shape=(50, 1) does not match the data. Each timestep carries a single
# feature, the integer word index.
timesteps = X_train.shape[1]
model.add(SimpleRNN(32,input_shape=(timesteps,1),return_sequences=False))
model.add(Dense(1,activation='sigmoid'))  # single sigmoid unit for the binary label

model.summary()

  super().__init__(**kwargs)


**explanation in copy**

In [74]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported as a metric
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# NOTE(review): this cell's execution count is 74, i.e. it was last run after
# the later cells of the notebook. The logged training accuracy already starts
# at ~0.94, which suggests `model` had been trained before this run - re-check
# the output after Restart & Run All.
# The test split is used as validation data here.
model.fit(X_train,y_train,epochs=5,validation_data=(X_test,y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 16ms/step - accuracy: 0.9440 - loss: 0.1853 - val_accuracy: 0.7681 - val_loss: 0.6289
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9677 - loss: 0.1026 - val_accuracy: 0.7773 - val_loss: 0.6374
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.9798 - loss: 0.0669 - val_accuracy: 0.7677 - val_loss: 0.7531
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - accuracy: 0.9863 - loss: 0.0426 - val_accuracy: 0.7610 - val_loss: 0.8338
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 18ms/step - accuracy: 0.9908 - loss: 0.0328 - val_accuracy: 0.7672 - val_loss: 0.9349


<keras.src.callbacks.history.History at 0x78c5fc94f610>

# Embedding Technique

In [58]:
# The same toy corpus, redefined for the embedding walkthrough
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

In [59]:
from tensorflow.keras.preprocessing.text import Tokenizer
# No oov_token this time, so the vocabulary holds only the corpus words
tokenizer = Tokenizer()

In [60]:
# Build the word index from the corpus
tokenizer.fit_on_texts(docs)

In [61]:
# Number of distinct words in the corpus
len(tokenizer.word_index)

17

In [62]:
# Integer-encode each sentence using the fitted word index
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[9, 1],
 [1, 1],
 [3, 3, 10],
 [2, 11, 2, 1, 2],
 [12, 13, 4, 5],
 [6, 6],
 [7, 7],
 [8, 8],
 [14, 15, 4, 5],
 [16, 17]]

In [63]:
from keras.utils import pad_sequences
# Zero-pad at the end ('post') so every row is as long as the longest sentence
# (5 tokens in the output below)
sequences = pad_sequences(sequences,padding='post')
sequences

array([[ 9,  1,  0,  0,  0],
       [ 1,  1,  0,  0,  0],
       [ 3,  3, 10,  0,  0],
       [ 2, 11,  2,  1,  2],
       [12, 13,  4,  5,  0],
       [ 6,  6,  0,  0,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [14, 15,  4,  5,  0],
       [16, 17,  0,  0,  0]], dtype=int32)

In [64]:
# compile()'s first two positional parameters are (optimizer, loss), so the
# previous positional call passed 'accuracy' as the loss. Use keywords: a real
# loss for the single-sigmoid output, with accuracy tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

# Sample documents
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

# Tokenize the texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(docs)

# Pad sequences to a fixed length (the longest sentence has 5 words)
max_len = 5
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Word indices start at 1 and 0 is the padding value, so the embedding table
# needs len(word_index) + 1 rows. Deriving it keeps the layer correct if the
# corpus changes, instead of relying on a hard-coded 18.
vocab_size = len(tokenizer.word_index) + 1

# Define a model with an Embedding layer.
# `input_length` is omitted: it is deprecated in Keras 3, and the sequence
# length is supplied through build() below.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2))

# Build the model by specifying input shape
model.build(input_shape=(None, max_len))

# Check model summary
model.summary()

# Look up the (untrained, randomly initialised) 2-D vector of every token
pred = model.predict(padded_sequences)
print(pred)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[[[ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.00979711  0.03821543]
  [ 0.039379    0.01192134]]

 [[ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.039379    0.01192134]
  [ 0.039379    0.01192134]]

 [[ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [-0.03365226 -0.04131786]
  [-0.03365226 -0.04131786]
  [-0.02034224 -0.04174242]]

 [[-0.02106109 -0.04663775]
  [ 0.02019498  0.00348751]
  [-0.02106109 -0.04663775]
  [ 0.039379    0.01192134]
  [-0.02106109 -0.04663775]]

 [[ 0.00880583  0.03563062]
  [ 0.04094407 -0.02936649]
  [ 0.01663606 -0.04750391]
  [-0.03271724  0.04261116]
  [ 0.03328173 -0.02267591]]

 [[ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [-0.00017355 -0.03612006]
  [-0.00017355 -0.03612006]]

 [[ 0.00880583  0.03563062]
  [ 0.00880583  0.03563062]
  [ 0.00880583  0.03

In [66]:
# Sample documents
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

# Tokenize the texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(docs)

# Pad sequences to have the required input length (5)
max_len = 5
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Add 1 to the vocabulary size to accommodate the 0 index used for padding.
# The size is computed from the tokenizer so it cannot drift from the corpus.
vocab_size = len(tokenizer.word_index) + 1

# Define a model with an Embedding layer.
# `input_length` is not passed: it is deprecated in Keras 3, and build() below
# already fixes the sequence length.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2))

# Build the model by specifying input shape
model.build(input_shape=(None, max_len))

# Check model summary
model.summary()

# Predict using the padded sequences: one 2-D vector per token position
pred = model.predict(padded_sequences)
print(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[[[ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]
  [-1.7429210e-02 -3.0310346e-02]
  [-3.1916983e-03 -9.2140585e-04]]

 [[ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]
  [-3.1916983e-03 -9.2140585e-04]
  [-3.1916983e-03 -9.2140585e-04]]

 [[ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]
  [-5.0061122e-03 -1.7297208e-02]
  [-5.0061122e-03 -1.7297208e-02]
  [-3.7775505e-02  3.1552579e-02]]

 [[-3.8846113e-02  3.3400688e-02]
  [ 1.0874581e-02 -2.9828239e-02]
  [-3.8846113e-02  3.3400688e-02]
  [-3.1916983e-03 -9.2140585e-04]
  [-3.8846113e-02  3.3400688e-02]]

 [[ 1.9289553e-05  2.1523032e-02]
  [ 4.8761260e-02 -3.5371840e-02]
  [ 2.8873947e-02  3.1344187e-02]
  [-1.6391087e-02  4.6880055e-02]
  [-3.4388434e-02  1.2965072e-02]]

 [[ 1.9289553e-05  2.1523032e-02]
  [ 1.9289553e-05  2.1523032e-02]


In [67]:
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [68]:
# Reload the raw (unpadded) IMDB reviews; no vocabulary cap is requested
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [69]:
# Force every review to exactly 50 tokens: shorter reviews get trailing zeros
# (padding='post'), longer ones are cut down to 50.
# NOTE(review): which end gets cut is controlled by pad_sequences' `truncating`
# argument, left at its default here - confirm that is the intended end.
X_train = pad_sequences(X_train,padding='post',maxlen=50)
X_test = pad_sequences(X_test,padding='post',maxlen=50)

In [70]:
# Now a regular 2-D array: one fixed-width row per review
X_train.shape

(25000, 50)

**Explanation in copy**

In [71]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Model

# Sequence length comes from the padded data instead of a hard-coded 50
seq_len = X_train.shape[1]

# The reviews were loaded without a vocabulary cap, so word indices go far
# beyond 4000 (the raw training data contains values such as 31050). An
# embedding table with only 4000 rows would receive out-of-range indices, so
# size it from the largest index present in either split (+1 because index 0
# is also a valid row, used for padding).
vocab_size = int(max(X_train.max(), X_test.max())) + 1

# Input layer: one integer word index per timestep
inputs = Input(shape=(seq_len,))

# Embedding layer: maps each word index to a 2-dimensional vector
embedding = Embedding(input_dim=vocab_size, output_dim=2)(inputs)

# SimpleRNN layer: returns only the final hidden state (return_sequences=False)
rnn = SimpleRNN(32, return_sequences=False)(embedding)

# Output layer: single sigmoid unit for the binary label
outputs = Dense(1, activation='sigmoid')(rnn)


# Create the model
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [72]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Model

# 1. Determine the maximum index value across BOTH splits. Looking at the
#    training data alone can miss a larger index that only occurs in the test
#    reviews, which would then be out of range for the embedding table.
max_index_value = int(max(X_train.max(), X_test.max()))

# 2. Adjust input_dim to accommodate the maximum index value
#    Add 1 to include the maximum index (row 0 is the padding index)
input_dim = max_index_value + 1

# Input layer: sequence length taken from the padded data
inputs = Input(shape=(X_train.shape[1],))

# Embedding layer with adjusted input_dim
embedding = Embedding(input_dim=input_dim, output_dim=2)(inputs)

# SimpleRNN layer: returns only the final hidden state
rnn = SimpleRNN(32, return_sequences=False)(embedding)

# Output layer: single sigmoid unit for the binary label
outputs = Dense(1, activation='sigmoid')(rnn)

# Create the model
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [73]:
# Fit the model.
# validation_split=0.2 holds out 20% of the training data for validation, so
# each epoch trains on 20,000 reviews = 625 batches of 32 (matches the log).
# In the recorded log val_loss rises after the second epoch while training
# accuracy keeps climbing, i.e. the model starts to overfit early.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.5435 - loss: 0.6819 - val_accuracy: 0.7850 - val_loss: 0.4626
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8209 - loss: 0.4037 - val_accuracy: 0.8008 - val_loss: 0.4365
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9046 - loss: 0.2456 - val_accuracy: 0.7984 - val_loss: 0.4504
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.9448 - loss: 0.1596 - val_accuracy: 0.7942 - val_loss: 0.5448
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.9649 - loss: 0.1060 - val_accuracy: 0.7878 - val_loss: 0.6085
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9795 - loss: 0.0652 - val_accuracy: 0.7740 - val_loss: 0.6887
Epoch 7/10
[1m625/6