### Questions

### Objectives
You will be able to (YWBAT):
- explain how GRU and LSTM models work
- compare and contrast them
- build them in Keras
- build an LSTM/GRU model using a previously trained embedding from gensim

### When do we use LSTMs and GRUs?
- Natural Language Processing (Classify Text)
- Classify patterns

### Outline

In [67]:
import pandas as pd
import numpy as np

import keras
from keras.layers import LSTM, GRU, Dense, GlobalMaxPool1D, Embedding, Dropout
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.activations import relu, sigmoid, softmax
from keras.optimizers import adam
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split



import matplotlib.pyplot as plt

In [20]:
# Load the cleaned news text produced in the previous lesson.
# The path is relative, so it is resolved against the kernel's working directory.

df = pd.read_json("News_Dataset_Cleaned.json")
df.head()

Unnamed: 0,authors,category,clean_combined_text
0,Melissa Jeltsen,CRIME,there were 2 mass shootings in texas last week...
1,Andy McDonald,ENTERTAINMENT,will smith joins diplo and nicky jam for the 2...
10,Sebastian Murdock,ENTERTAINMENT,justin timberlake visits texas school shooting...
100,Kevin Robillard,POLITICS,the koch network is going after one of trumps ...
1000,"Mubasher Bukhari and Asif Shahzad, Reuters",WORLD NEWS,pakistani interior minister ahsan iqbal shot b...


In [57]:
# Give every distinct category an integer id, numbered in order of first appearance.
label_dict = {category: index for index, category in enumerate(df["category"].unique())}

# Number of distinct categories (one id per dictionary entry).
num_labels = len(label_dict)

In [58]:
# Translate each category name into its integer id from label_dict.
df["category_num"] = df["category"].map(label_dict)
df.head()

Unnamed: 0,authors,category,clean_combined_text,category_num
0,Melissa Jeltsen,CRIME,there were 2 mass shootings in texas last week...,0
1,Andy McDonald,ENTERTAINMENT,will smith joins diplo and nicky jam for the 2...,1
10,Sebastian Murdock,ENTERTAINMENT,justin timberlake visits texas school shooting...,1
100,Kevin Robillard,POLITICS,the koch network is going after one of trumps ...,2
1000,"Mubasher Bukhari and Asif Shahzad, Reuters",WORLD NEWS,pakistani interior minister ahsan iqbal shot b...,3


In [52]:
# One output column per distinct category.
# NOTE(review): `n_values` is deprecated in newer scikit-learn releases - confirm the pinned version.
enc = OneHotEncoder(n_values=len(df["category"].unique()))

In [61]:
# The encoder expects a 2-D column, so add a trailing axis to the 1-D id array.
labels = enc.fit_transform(df["category_num"].values[:, np.newaxis])
labels

<200853x41 sparse matrix of type '<class 'numpy.float64'>'
	with 200853 stored elements in Compressed Sparse Row format>

In [62]:
# Expect (number of articles, number of categories).
labels.shape

(200853, 41)

### Building LSTM with Keras

In [21]:
# Cap the tokenizer's vocabulary at 20000 words via Keras' `num_words`;
# the Embedding layer built further down relies on the same number.
tokenizer = text.Tokenizer(num_words=20000)

In [23]:
# Build the tokenizer's word index from the cleaned article text.
tokenizer.fit_on_texts(df["clean_combined_text"])

In [None]:
# Convert every article into a list of integer token ids using the fitted tokenizer.
list_tokenized_train = tokenizer.texts_to_sequences(df["clean_combined_text"])

In [28]:
# Sanity check: there should be one token sequence per row of df.
len(list_tokenized_train), df.shape

(200853, (200853, 3))

In [29]:
# The tokenizer output is a plain Python list, not an array; it is padded into an array below.
type(list_tokenized_train)

list

In [38]:
# Pad or truncate every sequence to exactly 100 token ids so they stack into one 2-D array.
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=100)

In [66]:
# Features and one-hot labels must have the same number of rows before splitting.
X_t.shape, labels.shape

((200853, 100), (200853, 41))

In [68]:
# Hold out 15% of the articles for testing.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_t, labels, train_size=0.85, random_state=42
)

In [69]:
# LSTM Model: embedding -> LSTM -> max-pool over time -> dense -> softmax

lstm_model = Sequential()

# Learn a 30-dimensional vector for every token id; the input size is taken from
# the tokenizer so it always matches the vocabulary cap used to build X_t.
lstm_model.add(Embedding(tokenizer.num_words, 30))

# Emit the LSTM output at every timestep so the pooling layer can take the
# maximum of each of the 50 features across the whole sequence.
lstm_model.add(LSTM(50, return_sequences=True))
lstm_model.add(GlobalMaxPool1D())
# Optional regularisation: Dropout(0.5) zeroes a random half of the pooled
# activations on each training step (it does not drop training samples).
# lstm_model.add(Dropout(0.5))

# Fully connected layer on top of the pooled LSTM features.
lstm_model.add(Dense(50, activation='relu'))

# Optional regularisation on the dense layer's activations.
# lstm_model.add(Dropout(0.5))

# Classify: one softmax output per category, sized from num_labels so the
# layer width always matches the one-hot encoded labels.
lstm_model.add(Dense(num_labels, activation='softmax'))

In [70]:
# Show each layer's output shape and parameter count.
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 30)          600000    
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 50)          16200     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 41)                2091      
Total params: 620,841
Trainable params: 620,841
Non-trainable params: 0
_________________________________________________________________


In [74]:
# Configure training: Adam with its default settings, multi-class cross-entropy
# against the one-hot labels, and accuracy reported for every epoch.
optimizer = adam()
lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [75]:
# Train for 4 epochs, holding back 10% of the training data for validation.
# The returned History is kept so the per-epoch loss and accuracy can be
# inspected or plotted afterwards instead of being thrown away.
lstm_history = lstm_model.fit(X_train, y_train, epochs=4, validation_split=0.10)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 153652 samples, validate on 17073 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1a5064b9e8>

In [5]:
# Load the Word2Vec model trained in the previous lesson.
# Word2Vec is imported here because no earlier cell of this notebook imports it;
# without the import this cell fails with a NameError on a fresh kernel.
from gensim.models import Word2Vec

embedding_model = Word2Vec.load("w2v_model_s30")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
# Check the word-vector size; it sets the embedding dimension for the LSTM model's input layer.
embedding_model.wv.vector_size

30

In [14]:
# Number of distinct words stored in the loaded Word2Vec model.
vocab_size = len(embedding_model.wv.vocab)
vocab_size

41760

In [16]:
# One row per word in the Word2Vec vocabulary, one column per embedding dimension.
# The width is read from the model rather than hard-coded, so the matrix cannot
# drift out of sync with the vectors that were actually loaded.
embedding_matrix = np.zeros(
    (len(embedding_model.wv.vocab), embedding_model.wv.vector_size)
)

In [17]:
# Copy every word's vector into the row that matches its position in gensim's
# index2word list. Looking up a word taken from the model's own vocabulary
# always yields a vector, so no None check is needed.
# NOTE(review): rows follow gensim's word order, not the Keras tokenizer's
# word_index - remap the rows before using this matrix as Embedding weights.
for row, word in enumerate(embedding_model.wv.index2word):
    embedding_matrix[row] = embedding_model.wv[word]

### Assessment