In [8]:
# Dependencies: TensorFlow/Keras for the model and text preprocessing, TFDS for
# the dataset, NumPy for the label arrays and matplotlib for the accuracy plots.
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
print(tf.__version__)  # show which TensorFlow version the kernel is using
import matplotlib.pyplot as plt

2.0.0


In [9]:
# Load the "imdb_reviews" dataset in supervised form, together with its
# dataset-info object.
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

In [10]:
# Pick the two splits out of the loaded dataset dictionary.
train = imdb['train']
test = imdb['test']

In [11]:
# Empty accumulators for the review texts and their labels (train / test).
training_sentences, train_labels = [], []
test_sentences, test_labels = [], []

In [12]:
# Copy both dataset splits into plain Python lists of texts and labels.
# The lists are rebuilt from scratch here so that re-running this cell never
# appends a second copy of the data (and still works after the label lists
# have been converted to NumPy arrays further down).
training_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for s, l in train:
    # s.numpy() is a bytes object: decode it rather than calling str(), which
    # would store the literal "b'...'" repr (prefix and escape sequences
    # included) and feed that noise to the tokenizer.
    training_sentences.append(s.numpy().decode('utf-8'))
    train_labels.append(l.numpy())
for s, l in test:
    test_sentences.append(s.numpy().decode('utf-8'))
    test_labels.append(l.numpy())

# Sanity check: each split must have as many labels as sentences.
print(len(training_sentences), len(train_labels))
print(len(test_sentences), len(test_labels))

25000 25000
25000 25000


In [13]:
# Convert the label lists to NumPy arrays before they are handed to model.fit.
train_labels, test_labels = np.asarray(train_labels), np.asarray(test_labels)

In [14]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
vocab_size = 10000
max_length = 300
embedding_dim = 16
oov_token = '<OOV>'
trunc_type = 'post'

# Fit the vocabulary on the training reviews only.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode both splits as integer sequences, then pad/truncate each one to
# max_length entries.
train_sequences = tokenizer.texts_to_sequences(training_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
train_padded = pad_sequences(train_sequences, truncating=trunc_type, maxlen=max_length)
test_padded = pad_sequences(test_sequences, truncating=trunc_type, maxlen=max_length)

In [15]:
# First classifier: embedding -> Flatten -> Dense(6, relu) -> Dense(1, sigmoid).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

In [16]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 4800)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 28806     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 188,813
Trainable params: 188,813
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Training configuration: RMSprop optimiser, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, validating on the padded test split after each epoch.
# The returned history object is kept in z1 for the accuracy plot.
z1 = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 3776/25000 [===>..........................] - ETA: 3s - loss: 0.1200 - accuracy: 0.9619

In [None]:
# Second classifier: same stack as before, but GlobalAveragePooling1D takes the
# place of Flatten. Note that this rebinds `model`.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(6, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

In [None]:
# Print the layer summary of the model currently bound to `model`.
model.summary()

In [None]:
# Training configuration: RMSprop optimiser, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, validating on the padded test split after each epoch.
# The returned history object is kept in z2 for the accuracy plot.
z2 = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
)

In [None]:
# Per-epoch accuracy recorded in z1: training curve in red, validation curve
# (labelled "test_accuracy") in the default colour.
plt.plot(z1.history['accuracy'], 'r', label='train_accuracy')
plt.plot(z1.history['val_accuracy'], label='test_accuracy')
plt.legend()
plt.title("Using Flatten")

In [None]:
# Per-epoch accuracy recorded in z2: training curve in red, validation curve
# (labelled "test_accuracy") in the default colour.
plt.plot(z2.history['accuracy'], 'r', label='train_accuracy')
plt.plot(z2.history['val_accuracy'], label='test_accuracy')
plt.legend()
plt.title("Using Global Average")

In [58]:
# Take the first layer of `model` and read its first weight array.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)  # (vocab_size, embedding_dim)

(10000, 16)


In [62]:
# Invert the tokenizer vocabulary so an integer id maps back to its word.
reversed_word_index = {idx: token for token, idx in word_index.items()}

In [64]:
import io

In [67]:
# Write the embeddings to two text files: vecs.tsv gets one tab-separated
# vector per line, meta.tsv gets the matching word on the same line number.
# The loop starts at id 1, so row 0 of `weights` is not exported
# (presumably the padding id, which has no word_index entry - verify).
# `with` guarantees both files are closed even if the loop raises, e.g. a
# KeyError when an id is missing from reversed_word_index.
with io.open('vecs.tsv', mode='w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', mode='w', encoding='utf-8') as out_m:
    for i in range(1, vocab_size):
        word = reversed_word_index[i]
        embeddings = weights[i]
        # Use the real escape sequences '\t' and '\n'. The two-character
        # strings '/t' and '/n' would glue every vector into a single
        # unparseable line.
        out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')
        out_m.write(word + '\n')