In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
import os

In [3]:
#processing the labels of the raw imdb data
imdb_dir='../Downloads/IMDB'
train_dir=os.path.join(imdb_dir,'train')

labels=[]
texts=[]

for label_type in ['neg','pos']:
    dir_name=os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:]=='.txt':
            f=open(os.path.join(dir_name,fname))
            texts.append(f.read())
            f.close()
            if label_type=='neg':
                labels.append(0)
            else:
                labels.append(1)
        

In [4]:
texts[0]

"Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form."

In [9]:
len(texts)

25000

In [10]:
#tokenize the raw text
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen=100
num_words=10000
training_samples=200 #since we are using pretrained word embeddings
validation_samples=10000

tokenizer=Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)

sequences=tokenizer.texts_to_sequences(texts)


In [11]:
sequences[0]

[777,
 16,
 28,
 4,
 1,
 115,
 2278,
 6887,
 11,
 19,
 1025,
 5,
 27,
 5,
 42,
 2425,
 1861,
 128,
 2270,
 5,
 3,
 6985,
 308,
 7,
 7,
 3383,
 2373,
 1,
 19,
 36,
 463,
 3169,
 2,
 222,
 3,
 1016,
 174,
 20,
 49,
 808]

In [12]:
word_index=tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 88582 unique tokens


In [13]:
#padding the sequences to equal length of 100
data=pad_sequences(sequences,maxlen=maxlen)


In [14]:
labels=np.asarray(labels)

In [17]:
print('shape of data tensor',data.shape)
print('shape of label tensor',labels.shape)

shape of data tensor (25000, 100)
shape of label tensor (25000,)


In [21]:
indices=np.arange(data.shape[0])
np.random.shuffle(indices)

data=data[indices]
labels=labels[indices]

x_train=data[:10000]
y_train=labels[:10000]

x_val=data[10000:12000]
y_val=labels[10000:12000]


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Flatten,Dense

In [24]:
model=Sequential([
    Embedding(10000,30,input_length=maxlen),
    Flatten(),
    Dense(1,activation='sigmoid')
])

In [26]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [27]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 30)           300000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 3000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 3001      
Total params: 303,001
Trainable params: 303,001
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))

Train on 10000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x13cdeedd8>

## model 2

In [30]:
model1=Sequential([
    Embedding(10000,50,input_length=maxlen),
    Flatten(),
    Dense(1,activation='sigmoid')
])

In [31]:
model1.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [32]:
model1.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_val,y_val))

Train on 10000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x13f28dcf8>