# Importing necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM
from tensorflow.keras.utils import to_categorical
import re

In [2]:
os.chdir('E:\Data\Text Generation')
tokenizer = Tokenizer()

# Getting Data

In [3]:
all_headlines = []
curr_dir = 'E:\Data\Text Generation'
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(curr_dir + '\\' + filename)
        all_headlines.extend(list(article_df.headline.values))
        break
        
all_headlines = [h for h in all_headlines if h != "Unknown"]

In [4]:
len(all_headlines)

831

# Cleaning Data

In [5]:
def clean_text(all_headlines):
    for i in range(len(all_headlines)):
        all_headlines[i] = re.sub('[^a-zA-Z]', ' ', all_headlines[i])
    new_list = []
    tokenizer = Tokenizer()
    for i in range(len(all_headlines)):
        line = all_headlines[i]
        tokens = line.split()
        tokens = [token.lower() for token in tokens]
        tokens = [token for token in tokens if token not in string.punctuation]
        new_list.append(tokens)
    return new_list

new_list = clean_text(all_headlines)

# Generating Sequence 

In [6]:
def get_sequence(new_list):
    tokenizer.fit_on_texts(new_list)
    word_len = len(tokenizer.word_index) + 1
    sequences = tokenizer.texts_to_sequences(new_list)
    
    return sequences, word_len

sequences, word_len = get_sequence(new_list)

# Padding the sequence

In [7]:
def generate_padded_sequence(sequences):
    max_len = max([len(line) for line in sequences])
    sequences = np.array(pad_sequences(sequences, maxlen = max_len, padding = 'pre'))
    X, y = sequences[:, :-1], sequences[:,-1]
    y = to_categorical(y, num_classes = word_len)
    return X, y, sequences, max_len

X, y, sequences, max_len = generate_padded_sequence(sequences)

In [8]:
sequences

array([[   0,    0,    0, ...,  176,    6,  676],
       [   0,    0,    0, ...,  677,    9,  678],
       [   0,    0,    0, ...,  680,  130,  681],
       ...,
       [   0,    0,    0, ...,   25, 2345, 2346],
       [   0,    0,    0, ..., 2347,   30,   41],
       [   0,    0,    0, ...,  374,   96, 2349]])

In [9]:
X

array([[   0,    0,    0, ...,  675,  176,    6],
       [   0,    0,    0, ...,    1,  677,    9],
       [   0,    0,    0, ...,    4,  680,  130],
       ...,
       [   0,    0,    0, ...,   23,   25, 2345],
       [   0,    0,    0, ...,   48, 2347,   30],
       [   0,    0,    0, ...,   59,  374,   96]])

In [10]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [11]:
y.shape

(831, 2350)

In [12]:
X.shape

(831, 19)

# Creating Model

In [13]:
def create_model(word_len, max_len, X):
    model = Sequential()
    model.add(Embedding(word_len, output_dim = X.shape[1], input_length = max_len-1))
    model.add(LSTM(100, return_sequences = True,  activation = 'relu'))
    model.add(Dropout(0.1))

    model.add(LSTM(100, return_sequences = True, activation = 'relu'))
    model.add(Dropout(0.1))

    model.add(LSTM(100, return_sequences = True, activation = 'relu'))
    model.add(Dropout(0.1))

    model.add(LSTM(100,  activation = 'relu'))

    # Add Output Layer
    model.add(Dense(word_len, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = create_model(word_len, max_len, X)

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 19, 19)            44650     
_________________________________________________________________
lstm (LSTM)                  (None, 19, 100)           48000     
_________________________________________________________________
dropout (Dropout)            (None, 19, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 19, 100)           80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 19, 100)           80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 19, 100)           0

In [15]:
model.fit(X, y, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x24b74dce850>

# Getting Output From Model

In [16]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()
   
print (generate_text("united states", 5, model, max_len))
print (generate_text("preident trump", 4, model, max_len))
print (generate_text("donald trump", 4, model, max_len))
print (generate_text("india and china", 4, model, max_len))
print (generate_text("new york", 4, model, max_len))
print (generate_text("science and technology", 5, model, max_len))



United States Whodunit Fraud Spice Seconds Seconds
Preident Trump Corrections Out Crisis Uniforms
Donald Trump Song Inflation Sing K
India And China Inflation Chats Task Elections
New York Out Obamacare Uniforms Sip
Science And Technology Crisis Dutch Forevermore Inflation Buddha
