<a href="https://colab.research.google.com/github/ejdogar/NLP/blob/main/Integer_Encoding_Simple_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np

In [11]:
with open("/content/drive/MyDrive/Atomcamp/NLP/CNN/labels.txt", "r") as f:
  labels = f.read()

with open("/content/drive/MyDrive/Atomcamp/NLP/CNN/reviews.txt", "r") as f:
  reviews = f.read()

In [12]:
print("Labels : ", labels[:10])
print("Reviews: ", reviews[:100])

Labels :  positive
n
Reviews:  bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life


# Step 1: Data Preprocessing

#### 1.1 Removing Punctuations

In [13]:
from string import punctuation
import re as r

reviews = reviews.lower()


all_text = "".join([c for c in reviews if c not in punctuation])

#splitting the all_text based on line split so that we can get out all_reviews array
all_reviews = all_text.split("\n")

#getting our vocabulary
all_words = all_text.split()
len(all_words)

6020196

In [14]:
labels = labels.split("\n")

In [15]:
print(f"Length of reviews before removing outliers: {len(all_reviews)}")
print(f"Length of labels before removing outliers: {len(labels)}")
zero_indexes = []
for index, review in enumerate(all_reviews):
  if (len(review) == 0):
    zero_indexes.append(index)
    print("Reviews length at ",index," is : ",len(reviews[index]))
    print("Label length at ",index," is : ",len(labels[index]))

for i in zero_indexes:
  labels.pop(i)
  all_reviews.pop(i)

print(f"Length of reviews after removing outliers: {len(all_reviews)}")
print(f"Length of labels after removing outliers: {len(labels)}")

Length of reviews before removing outliers: 25001
Length of labels before removing outliers: 25001
Reviews length at  25000  is :  1
Label length at  25000  is :  0
Length of reviews after removing outliers: 25000
Length of labels after removing outliers: 25000


#### 1.2 Converting Words into Integers

In [16]:
from collections import Counter

#Counter function will return of number of occurence of each word in the text
counts = Counter(all_words)

#sorting the Counter based on values and saving it into the list
vocab = sorted(counts, key=counts.get, reverse = True)

vocab_to_int = {word: index for index, word in enumerate(vocab, start = 1)}


### 1.3 Converting reviews to integers

In [17]:
all_reviews_int = [] #tokenizzed reviews
for review in all_reviews:
  temp_array = []
  for word in review.split():
    temp_array.append(vocab_to_int[word])
  all_reviews_int.append(temp_array)

In [18]:
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))
print()

# print tokens in first review
print('Tokenized review: \n', all_reviews_int[:1])

Unique words:  74072

Tokenized review: 
 [[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]


#### 1.4 Encoding Labels

In [19]:
encoded_labels = np.array([1 if label == "positive" else 0 for label in labels])
encoded_labels[0:4]

array([1, 0, 1, 0])

#### 1.5 Padding

Padding/ truncating the data, so that all our reviews have same length



In [20]:
def pad_features(all_reviews_int, seq_length):
  features = np.zeros((len(all_reviews_int), seq_length), dtype=int)

  for index, review in enumerate(all_reviews_int):
    #-len(review):] indicates that the row would be added in the matrix from right to left
    features[index, -len(review):] = np.array(review)[:seq_length]

  return features


In [21]:
seq_length = 200
#setting all sequence length to 200
features = pad_features(all_reviews_int, seq_length=seq_length)

In [22]:
assert len(features)==len(all_reviews_int), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

In [23]:
print(features[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 21025   308     6     3  1050   207     8  2138    32     1   171    57
    15    49    81  5785    44   382   110   140    15  5194    60   154
     9     1  4975  5852   475    71     5   260    12 21025   308    13
  1978     6    74  2395     5   613    73     6  5194     1 24103     5
  1983 10166     1  5786  1499    36    51    66   204   145    67  1199
  5194 19869     1 37442     4     1   221   883    31  2988    71     4
     1  5787    10   686     2    67  1499    54    10   216     1   383
     9    62     3  1406  3686   783     5  3483   180     1   382    10
  1212 13583    32   308     3   349   341  2913   

# Step 2: Training, Validation, Test

With our data in nice shape, we'll split it into training, validation, and test sets.

In [24]:
split_frac = 0.8

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [28]:
from keras.layers import Dense, SimpleRNN
from keras.models import Sequential

In [29]:
model = Sequential()
#input_shape = timesteps x no of features feature
# return_sequence = False because we want out output at one end and don't want to collect it after every hidden_state
model.add(SimpleRNN(32, input_shape=(200, 1), return_state = False))
model.add(Dense(1, activation = "sigmoid"))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

model.fit(train_x, train_y, epochs = 5, validation_data = (val_x, val_y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7eb964408e80>