# Text generation using an LSTM

## New York Times Comments

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Fix the NumPy and TensorFlow random seeds with one shared value so that
# repeated runs start from the same random state.
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [5]:
# Load the April 2018 article metadata; the headline column is the corpus used below.
# NOTE(review): the displayed text contains sequences such as 'â\x80\x99', which look
# like UTF-8 bytes decoded as latin1 — confirm whether the file is really UTF-8.
# The non-ASCII characters are stripped later in prepro, so training is unaffected.
df = pd.read_csv('ArticlesApril2018.csv', encoding = 'latin1')
df

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleadersâ Settlement Offer...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"âI understand that they could meet with us, ...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,Whatâs it like to eat at the second incarnat...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Torontoâs ...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1319,5ae82c93068401528a2ab969,1004,By CLAIRE CAIN MILLER,article,This Common Question Reinforces the Gender Pay...,"['Discrimination', 'Wages and Salaries', 'Labo...",68,Upshot,3,2018-05-01 09:00:01,Unknown,Several states and cities have ordered employe...,The New York Times,News,https://www.nytimes.com/2018/05/01/upshot/how-...
1320,5ae82c95068401528a2ab96b,1043,By TRACY J. GATES,article,"Anna, Llama and Me","['Friendship', 'Dewdney, Anna', 'Writing and W...",65,Well,0,2018-05-01 09:00:02,Family,"The beginning, middle and end of a picture boo...",The New York Times,News,https://www.nytimes.com/2018/05/01/well/family...
1321,5ae82c9d068401528a2ab96d,659,Interview by AUDIE CORNISH,article,Gen. Michael Hayden Has One Regret: Russia,"['Classified Information and State Secrets', '...",66,Magazine,70,2018-05-01 09:00:06,Unknown,"The former N.S.A. and C.I.A. chief on Trump, S...",The New York Times,News,https://www.nytimes.com/2018/05/01/magazine/ge...
1322,5ae82c9f068401528a2ab96f,1155,By JASON ROBERT BROWN,article,There Is Nothinâ Like a Tune,"['Books and Literature', 'Purdum, Todd S', 'Th...",68,BookReview,17,2018-05-01 09:00:07,Book Review,"In âSomething Wonderful,â Todd S. Purdum a...",The New York Times,Review,https://www.nytimes.com/2018/05/01/books/revie...


### Preprocessing

In [6]:
# List the available columns; only `headline` is used in the following cells.
df.columns

Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

In [7]:
# Count missing headlines before building the corpus.
df['headline'].isnull().sum()

0

In [8]:
# Pull every headline out of the frame into a plain Python list.
headlines = list(df['headline'].values)
len(headlines)

1324

In [9]:
# Peek at the first few raw headlines (note the 'Unknown' placeholders).
headlines[:5]

['Former N.F.L. Cheerleadersâ\x80\x99 Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

#### Remove noisy data

In [10]:
# Drop the placeholder entries whose headline is exactly 'Unknown'.
headlines = [title for title in headlines if title != 'Unknown']
len(headlines)

1214

In [11]:
#### Remove punctuation and convert to lowercase

In [12]:
# `punctuation` is the set of ASCII punctuation characters that prepro removes.
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def prepro(s) :
    """Normalise one headline for tokenisation.

    Steps, in order:
      1. drop every non-ASCII character (encode to UTF-8, decode as ASCII
         while ignoring bytes that cannot be decoded);
      2. delete every character listed in ``string.punctuation``;
      3. lowercase the result.

    Punctuation is deleted, not replaced by a space, so 'Self-Expression'
    becomes 'selfexpression' and existing whitespace is left untouched.

    Args:
        s: the raw headline string.

    Returns:
        The cleaned, lowercased string.
    """
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    # A translation table whose third argument lists the characters to delete.
    without_punct = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punct.lower()

In [14]:
# Clean every headline with prepro (ASCII only, no punctuation, lowercase).
headlines = list(map(prepro, headlines))
headlines[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

#### Build the vocabulary and check its size

In [15]:
# Fit a word-level tokenizer on the cleaned headlines.
t = Tokenizer()
t.fit_on_texts(headlines)
# One extra slot on top of the known words: index 0 is the value used for
# padding further down, so it must be a valid index too.
vocab_size = len(t.word_index) + 1
vocab_size

3494

#### Build the training sequences

In [16]:
# For every headline, emit each prefix of length >= 2 of its token ids.
# The last id of a prefix is later used as the word to predict from the
# ids that precede it.
sequences = [
    encoded[:end]
    for encoded in t.texts_to_sequences(headlines)
    for end in range(2, len(encoded) + 1)
]
sequences[:5]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52]]

In [17]:
# Length of the longest prefix; every sequence is padded to this length.
max_len = max(map(len, sequences))
max_len

24

In [18]:
# Left-pad ('pre') every prefix with zeros up to max_len, so the final column
# of each row always holds the word to predict.
# Note: this rebinds `sequences` from a list of lists to a 2-D integer array.
sequences = pad_sequences(sequences, maxlen = max_len, padding = 'pre')
sequences[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,   99,  269,  371,
        1115,  582],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,   99,  269,  371, 1115,
         582,   52]], dtype=int32)

In [19]:
# Split every padded row into the input context (all tokens but the last)
# and the target word (the last token).
X = sequences[:, : -1]
y = sequences[:, -1]
# Pin the one-hot width to vocab_size. Without num_classes, to_categorical
# infers the width as max(y) + 1, which would be narrower than the model's
# Dense(vocab_size) output whenever the highest word index never occurs as
# a target, and training would then fail with a shape mismatch.
Y = to_categorical(y, num_classes = vocab_size)
X.shape, Y.shape

((7803, 23), (7803, 3494))

### Processing

- Embedding
- LSTM

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [21]:
#### Embedding vector of size 10, LSTM with 128 units

In [22]:
# Next-word model: token ids -> 10-dimensional embeddings -> LSTM(128)
# -> softmax over the whole vocabulary.
model = Sequential()
# Inputs are the max_len - 1 context tokens (the last token is the target).
model.add(Embedding(vocab_size, 10, input_length = max_len -1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation = 'softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 23, 10)            34940     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               71168     
_________________________________________________________________
dense (Dense)                (None, 3494)              450726    
Total params: 556,834
Trainable params: 556,834
Non-trainable params: 0
_________________________________________________________________


In [23]:
# One-hot targets, so use categorical cross-entropy; track accuracy as well.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [24]:
# Train for 200 epochs on all samples. No validation data is passed, so the
# accuracy logged below is training accuracy only. verbose = 2 prints one
# line per epoch.
history = model.fit(X, Y, epochs = 200, verbose = 2)

Epoch 1/200
244/244 - 9s - loss: 7.6493 - accuracy: 0.0300
Epoch 2/200
244/244 - 1s - loss: 7.1262 - accuracy: 0.0302
Epoch 3/200
244/244 - 1s - loss: 6.9862 - accuracy: 0.0313
Epoch 4/200
244/244 - 1s - loss: 6.8683 - accuracy: 0.0414
Epoch 5/200
244/244 - 1s - loss: 6.7320 - accuracy: 0.0437
Epoch 6/200
244/244 - 1s - loss: 6.5834 - accuracy: 0.0473
Epoch 7/200
244/244 - 1s - loss: 6.4172 - accuracy: 0.0470
Epoch 8/200
244/244 - 1s - loss: 6.2377 - accuracy: 0.0536
Epoch 9/200
244/244 - 1s - loss: 6.0490 - accuracy: 0.0578
Epoch 10/200
244/244 - 1s - loss: 5.8634 - accuracy: 0.0633
Epoch 11/200
244/244 - 1s - loss: 5.6900 - accuracy: 0.0686
Epoch 12/200
244/244 - 1s - loss: 5.5270 - accuracy: 0.0729
Epoch 13/200
244/244 - 1s - loss: 5.3665 - accuracy: 0.0787
Epoch 14/200
244/244 - 1s - loss: 5.2182 - accuracy: 0.0848
Epoch 15/200
244/244 - 1s - loss: 5.0705 - accuracy: 0.0951
Epoch 16/200
244/244 - 1s - loss: 4.9297 - accuracy: 0.1030
Epoch 17/200
244/244 - 1s - loss: 4.7958 - accura

#### Verify model

In [25]:
from my_util import sentence_generation

In [26]:
# Generate 10 further words from each start word and print the result.
for start_word in ('i', 'epa', 'former'):
    print(sentence_generation(model, t, max_len, start_word, 10))

i want to be rich and im not sorry case plans
epa sheriff indulged pruitt as security spending mounted over slide up
former judge to review files seized in searches war say francis
