# Text generation using an LSTM

## New York Times Comments

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Seed both NumPy and TensorFlow with the same fixed value so that random
# draws (e.g. weight initialization) are repeatable across runs.
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the articles CSV, decoding it as Latin-1, and display the frame.
df = pd.read_csv('ArticlesApril2018.csv', encoding = 'latin1')
df

### Preprocessing

In [None]:
# List the available columns; only `headline` is used below.
df.columns

In [None]:
# Count the rows whose headline is missing.
df.headline.isnull().sum()

In [None]:
# Collect every headline into a plain Python list and report how many there are.
headlines = list(df.headline.values)
len(headlines)

In [None]:
# Peek at the first five raw headlines.
headlines[:5]

#### Remove noisy data

In [None]:
# Drop placeholder entries whose headline is exactly 'Unknown', then re-count.
headlines = [title for title in headlines if title != 'Unknown']
len(headlines)

In [None]:
#### Remove punctuation and convert to lowercase

In [None]:
from string import punctuation
# Show the punctuation characters that prepro removes.
punctuation

In [None]:
def prepro(s):
    """Return *s* as lowercase ASCII text with punctuation removed.

    Non-ASCII characters are dropped first (UTF-8 encode, then ASCII decode
    with errors ignored). Every character found in ``string.punctuation`` is
    then deleted, without inserting a space, and the rest is lowercased.
    """
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    # A translation table that maps each punctuation character to "deleted".
    strip_table = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_table).lower()

In [None]:
# Normalize every headline with prepro and show the first five results.
headlines = list(map(prepro, headlines))
headlines[:5]

#### Build the vocabulary and check its size

In [None]:
# Fit a Keras tokenizer on the headlines to build the word -> index mapping.
t = Tokenizer()
t.fit_on_texts(headlines)
# One more than the number of distinct words: Keras word indices start at 1,
# and index 0 is reserved (it is also the default padding value).
vocab_size = len(t.word_index) + 1
vocab_size

#### Build the training sequences

In [None]:
# Expand each headline into its growing prefixes of word indices
# (length 2, 3, ..., full length). Headlines shorter than two words
# contribute nothing.
sequences = []
for encoded in t.texts_to_sequences(headlines):
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
sequences[:5]

In [None]:
# Length of the longest training sequence.
max_len = max(map(len, sequences))
max_len

In [None]:
# Pad on the left up to max_len so that the final word of every sequence
# lands in the last column.
sequences = pad_sequences(sequences, maxlen = max_len, padding = 'pre')
sequences[:5]

In [None]:
# Split each padded sequence into its context (all but the last column) and
# its next-word label (the last column).
X = sequences[:, : -1]
y = sequences[:, -1]
# Pin the one-hot width to vocab_size. Without num_classes, to_categorical
# infers the width as max(y) + 1, which is narrower than the model's
# Dense(vocab_size) output whenever the highest word index never occurs as a
# label (e.g. a rare word that only ever appears first in a headline).
Y = to_categorical(y, num_classes = vocab_size)
X.shape, Y.shape

### Processing

- Embedding
- LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

#### Embedding dimension 10, LSTM with 128 units

In [None]:
# Next-word model: 10-dimensional word embeddings -> 128-unit LSTM ->
# softmax over the vocabulary. The input length is max_len - 1 because the
# last column of each sequence is split off as the label.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length = max_len -1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation = 'softmax'))
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets in Y.
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# Train for 200 epochs; verbose = 2 logs one summary line per epoch.
history = model.fit(X, Y, epochs = 200, verbose = 2)

Epoch 1/200
244/244 - 3s - loss: 0.2641 - accuracy: 0.9163
Epoch 2/200
244/244 - 3s - loss: 0.2615 - accuracy: 0.9161
Epoch 3/200
244/244 - 3s - loss: 0.2612 - accuracy: 0.9170
Epoch 4/200
244/244 - 3s - loss: 0.2622 - accuracy: 0.9161
Epoch 5/200
244/244 - 3s - loss: 0.2617 - accuracy: 0.9176
Epoch 6/200
244/244 - 3s - loss: 0.2618 - accuracy: 0.9167
Epoch 7/200
244/244 - 3s - loss: 0.2713 - accuracy: 0.9157
Epoch 8/200
244/244 - 3s - loss: 0.2668 - accuracy: 0.9166
Epoch 9/200
244/244 - 3s - loss: 0.2765 - accuracy: 0.9154
Epoch 10/200
244/244 - 3s - loss: 0.2652 - accuracy: 0.9164
Epoch 11/200
244/244 - 3s - loss: 0.2601 - accuracy: 0.9152
Epoch 12/200
244/244 - 3s - loss: 0.2619 - accuracy: 0.9171
Epoch 13/200
244/244 - 3s - loss: 0.2590 - accuracy: 0.9190
Epoch 14/200
244/244 - 3s - loss: 0.2594 - accuracy: 0.9152
Epoch 15/200
244/244 - 3s - loss: 0.2597 - accuracy: 0.9168
Epoch 16/200
244/244 - 3s - loss: 0.2588 - accuracy: 0.9176
Epoch 17/200
244/244 - 3s - loss: 0.2586 - accura

#### Verify model

In [None]:
from my_util import sentence_generation

In [None]:
# Print one generated sentence per seed word; the trailing 10 is passed
# straight through to sentence_generation.
for start_word in ('i', 'epa', 'former'):
    print(sentence_generation(model, t, max_len, start_word, 10))

i want to be rich and im not sorry president scott
epa sheriff indulged pruitt as security spending mounted behind the streets
former nfl cheerleaders settlement offer 1 and a meeting with goodell
