# ws 02 IMDB csv

- Download the CSV from Kaggle: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/version/1


In [None]:
import numpy as np
import pandas as pd
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, GRU, LSTM, Dense

gensim.__version__ , np.__version__

In [None]:
# NOTE(review): gensim is already imported in an earlier cell, so an upgrade
# installed here presumably only takes effect after a runtime restart — confirm.
!pip install --upgrade gensim

## Load pre-trained word vectors (GloVe)

In [None]:
# gensim's downloader module loads pre-trained models by name.
import gensim.downloader as api

# Load the 'glove-wiki-gigaword-100' word vectors. `wv` is used below for
# similarity queries and later to fill the embedding matrix.
wv = api.load('glove-wiki-gigaword-100')   
wv

In [None]:
# Number of keys (words) in the pre-trained vocabulary.
len(wv.index_to_key)

In [None]:
# Length of each word vector; reused later as the embedding size.
wv.vector_size

In [None]:
# Sanity check: words whose vectors are most similar to 'horrible'.
wv.similar_by_word('horrible')

In [None]:
# Sanity check: words whose vectors are most similar to 'movie'.
wv.similar_by_word('movie')

In [None]:
# The 8 most similar words to 'crap' (topn limits the number of results).
wv.most_similar('crap',topn=8)

## Upload CSV

In [None]:
# upload IMDb CSV
# Uses the google.colab helper, so this cell is meant to run inside Google Colab.
from google.colab import files

uploaded = files.upload()
# List the working directory so the uploaded file name can be checked.
%ls

In [None]:
# Read the uploaded dataset; later cells use its `review` and `sentiment` columns.
df = pd.read_csv('IMDB Dataset.csv', encoding='utf-8') 
df.head()

In [None]:
# Binary target: 1 for reviews marked 'positive', 0 for everything else.
df['label'] = (df['sentiment'] == 'positive').astype(int)
df.sample(10)

## Preprocess and tokenize

In [None]:
# Work on a lower-cased copy so the raw `review` column stays untouched.
df['review2'] = df.review.str.lower()

In [None]:
# Inspect a few hand-picked rows, selected by position.
df.iloc[[1,25319,19574,49804]]

In [None]:
# !pip install beautifulsoup4
from bs4 import BeautifulSoup

def remove_tag(x):
    """Return the text of *x* with HTML markup removed.

    Args:
        x: A string that may contain HTML tags (e.g. ``<br />``).

    Returns:
        The text content of *x*; text fragments that were separated by
        tags are joined with a single space.
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(x, 'html.parser')
    # Join with a space so "end.<br /><br />Start" does not fuse into "end.Start".
    return soup.get_text(separator=' ')

# Strip HTML markup from every lower-cased review.
df['review2'] = df.review2.apply(remove_tag)

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
# Drop gensim's built-in stopwords from every review.
# NOTE(review): that list appears to include negations such as "not", which
# can matter for sentiment — confirm this is intended.
df['review2'] = df.review2.apply(remove_stopwords)

In [None]:
# Keras Tokenizer created with its default settings.
tk = Tokenizer()

# Build the word -> integer index from the cleaned reviews.
tk.fit_on_texts(df.review2)  

In [None]:
# The word -> integer index mapping learned by fit_on_texts.
tk.word_index


In [None]:
# Look up the integer index the tokenizer assigned to a few sample words.
# A word that never occurred in the fitted corpus raises KeyError here.
testwords = ['i', 'like', 'this', 'movie']  # alternative: ['this', 'movie', 'is', 'great']
for word in testwords:
    print(tk.word_index[word])

# The same lookup as a list, shown as the cell's output.
[tk.word_index[w] for w in testwords]

In [None]:
# +1 leaves room for index 0, which the Keras Tokenizer reserves (no word is
# mapped to it) and which pad_sequences uses as its default padding value.
vocab_size = len(tk.word_index)+1
vocab_size

In [None]:
# Convert every cleaned review into a list of word indices.
sents_enc = tk.texts_to_sequences(df.review2)

In [None]:
# First 15 word indices of the second review.
sents_enc[1][:15]

In [None]:
# Number of tokens in each encoded review.
sents_len = list(map(len, sents_enc))

# Summary statistics of the review lengths (the std is rounded to an integer).
for caption, stat in (
    ("Average Review length:", np.mean(sents_len)),
    ("Standard Deviation:", round(np.std(sents_len))),
    ('Min length:', np.min(sents_len)),
    ('Max length:', np.max(sents_len)),
):
    print(caption, stat)

In [None]:
# Token counts of the first 5 reviews.
sents_len[:5]

In [None]:
import pandas as pd

# One-column frame of review lengths, used for summary statistics and plotting.
df2 = pd.DataFrame(sents_len, columns=['words'])
# Only the last expression of a notebook cell is displayed, so print the
# summary explicitly; otherwise it is computed and silently discarded.
print(df2.describe().round(2))
df2.head()

In [None]:
# Histogram of review lengths (in tokens), 50 bins.
df2.hist(bins=50)

In [None]:
# Token counts of the first 10 reviews.
sents_len[:10]

In [None]:
# Fixed sequence length passed to pad_sequences: reviews are truncated or padded to it.
max_len = 200

In [None]:
# Cut reviews longer than max_len at the end and pad shorter ones at the end,
# so every row has exactly max_len entries.
sents_pad = pad_sequences(sents_enc, truncating='post', padding='post', maxlen=max_len)

In [None]:
# Decode the first two padded reviews back to text to sanity-check the encoding.
check_reviews = sents_pad[:2]
tk.sequences_to_texts(check_reviews)

### save and load Tokenizer tk

In [None]:
import pickle

# Persist the fitted tokenizer so new text can later be encoded with the same word index.
with open('IMDB_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tk, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('IMDB_tokenizer.pickle', 'rb') as handle:
    tk2 = pickle.load(handle)

# Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the padded reviews (test_size=.5) for testing;
# random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(sents_pad, df.label, test_size=.5, random_state=1)

In [None]:
# Dimensions of the training and test feature arrays.
X_train.shape, X_test.shape

## Embedding matrix

In [None]:
embed_size = wv.vector_size  # one embedding column per pre-trained vector dimension

# Row idx holds the starting vector for the word whose tokenizer index is idx.
# Rows that are never assigned below keep their initial zeros.
embedding_matrix = np.zeros((vocab_size, embed_size))
for token, idx in tk.word_index.items():
    # Skip indices that would fall outside the matrix.
    if idx >= vocab_size:
        continue
    if token in wv:
        # Known word: start from its pre-trained vector.
        row = wv[token][:embed_size]
    else:
        # Word missing from the pre-trained vocabulary: random start
        # drawn from a normal distribution with std sqrt(0.25).
        row = np.random.normal(0, np.sqrt(0.25), embed_size)
    embedding_matrix[idx] = row

embedding_matrix.shape

In [None]:
# (vocab_size, embed_size), as allocated with np.zeros.
embedding_matrix.shape

In [None]:
# First 25 values of row 0 — presumably still all zeros, since no word is
# expected to map to index 0 (confirm against tk.word_index).
embedding_matrix[0][:25]

## Model

#### LSTM

In [None]:
# model=Sequential()
# model.add(Embedding(vocab_size, embed_size, 
#                      weights=[embedding_matrix],
#                      trainable=True # False #,
#                 #    input_shape=(X_train.shape[1],) #
#                     ))
# # model.add(LSTM(units=128, activation='tanh'))
# model.add(Bidirectional(LSTM(units=128, activation='tanh')))
# model.add(Dense(units=1, activation='sigmoid'))
# model.summary()

#### GRU

In [None]:
from keras.layers import GRU, Bidirectional

# Embedding layer initialised from the pre-trained matrix (and left trainable),
# followed by a bidirectional GRU and a single sigmoid output unit.
model = Sequential([
    Embedding(
        vocab_size,
        embed_size,
        weights=[embedding_matrix],
        trainable=True,
    ),
    Bidirectional(GRU(units=128, activation='tanh')),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split is passed as validation data here, so a later
# evaluation on X_test/y_test is not measured on unseen data — consider a
# separate validation split.
history = model.fit(X_train, y_train, epochs=10, batch_size=128,
                    verbose=1, validation_data=(X_test,y_test))

In [None]:
# Score the trained model on the held-out split and report accuracy to 3 decimals.
loss, acurracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {acurracy:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation curves, one panel per metric, side by side.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 3.5))

ax_loss.set_title('Loss')
ax_loss.plot(history.history['loss'], label='Training Loss')
ax_loss.plot(history.history['val_loss'], 'r', lw=3.2, label='Validation loss')
ax_loss.legend()

ax_acc.set_title('Accuracy')
ax_acc.plot(history.history['accuracy'], label='Training')
ax_acc.plot(history.history['val_accuracy'], 'r', lw=3.2, label='Validation')
ax_acc.legend()

plt.show()

## Predict

In [None]:
from keras.preprocessing.text import text_to_word_sequence

# define the document
# Three hand-written sample reviews used to try out the trained model.
comment1 = 'this film is about politics. i do not like it. It sucks, horrible. Can i get money back i do not recommend it'
comment2 = 'this film is about politics. people talk about it. the film is not good. i do not recommend it'
comment3= 'this film is about politics. people like it. my opinion is different.'

# Split comment1 into word tokens to preview how Keras tokenizes it.
words_tk = text_to_word_sequence(comment1)
words_tk

In [None]:
# Indices the fitted tokenizer assigned to 'this' and 'film'.
tk.word_index['this'] , tk.word_index['film']

In [None]:
# Clean the new text exactly like the training reviews (lower-case, strip HTML,
# drop stopwords) before encoding it; otherwise the model is fed tokens that
# were removed from its training data.
comment2_clean = remove_stopwords(remove_tag(comment2.lower()))
new_words_enc = tk.texts_to_sequences([comment2_clean])
new_words_enc

In [None]:

# Pad/truncate the new sequence with the same settings used for the training data.
X_new = pad_sequences(new_words_enc, truncating='post', padding='post', maxlen=max_len)

X_new

In [None]:

# Decode the padded sequence back to text to see which words the tokenizer kept.
tk.sequences_to_texts(X_new) 

In [None]:
# Model output for the padded review, thresholded at 0.5 into a sentiment label.
result = model.predict(X_new)
print(np.squeeze(result).round(3))
verdict = 'Positive' if result[0, 0] > 0.5 else 'Negative'
print(verdict)

In [None]:
# Collect the three sample comments and echo them, one per line.
reviews_new = [comment1, comment2, comment3]
print(*reviews_new, sep='\n')

In [None]:
# Apply the training-time cleaning (lower-case, strip HTML, drop stopwords)
# so the model sees the kind of token sequences it was trained on.
reviews_clean = [remove_stopwords(remove_tag(r.lower())) for r in reviews_new]
new_words_enc = tk.texts_to_sequences(reviews_clean)
X_new = pad_sequences(new_words_enc, truncating='post', padding='post', maxlen=max_len)
# One model output per review, in the order of reviews_new.
results = model.predict(X_new)
print(results.round(3))

In [None]:
# True labels of the first 15 test reviews.
y_test[:15].tolist()

In [None]:
# Model outputs for the first 15 test reviews, rounded to 2 decimals.
result = model.predict(X_test[:15])
print(np.squeeze(result).round(2))