In [1]:
# import
import numpy as np
import pandas as pd
from string import punctuation

import tensorflow as tf
import keras

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Dense, LSTM

# Data Preprocessing

In [2]:
# read the April 2018 articles CSV and preview the first three rows
csv_path = '../data/ArticlesApril2018.csv'
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...


In [3]:
# check for missing values in the data we actually use.
# NOTE: `df.columns.isnull()` only inspects the column *labels*, so it cannot
# reveal missing cells; the check below looks at the 'headline' values instead.
print(df.columns.values)
print(df["headline"].isnull().any())

# choose the 'headline' column; drop missing entries so that every item is a
# string (the later text preprocessing calls str methods on each headline)
headlines = df["headline"].dropna().tolist()
print(headlines[:5]) # 'Unknown' is in the list

# remove the placeholder 'Unknown' headlines
print(len(headlines))
headlines = [headline for headline in headlines if headline != "Unknown"]
print(len(headlines))

['articleID' 'articleWordCount' 'byline' 'documentType' 'headline'
 'keywords' 'multimedia' 'newDesk' 'printPage' 'pubDate' 'sectionName'
 'snippet' 'source' 'typeOfMaterial' 'webURL']
False
['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell', 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.', 'The New Noma, Explained', 'Unknown', 'Unknown']
1324
1214


In [4]:
# remove non-ascii and punctuation in headlines
def preprocessing(sentence):
    """Normalize a headline: ASCII-only, punctuation deleted, lowercased.

    Non-ASCII characters (e.g. curly quotes, Chinese characters) are dropped,
    every character in ``string.punctuation`` is deleted (not replaced by a
    space, so hyphenated words are joined), and the result is lowercased.
    Whitespace is left exactly as it was.
    """
    # round-trip through bytes so that anything outside ASCII is discarded
    ascii_only = sentence.encode('UTF-8').decode('ascii', 'ignore')
    # translation table that maps every punctuation character to "deleted"
    strip_punctuation = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punctuation).lower()

# normalize every headline, then show a small sample and the total count
headlines_proccessed = list(map(preprocessing, headlines))
print(headlines_proccessed[:5])
print(len(headlines_proccessed))

['former nfl cheerleaders settlement offer 1 and a meeting with goodell', 'epa to unveil a new rule its effect less science in policymaking', 'the new noma explained', 'how a bag of texas dirt  became a times tradition', 'is school a place for selfexpression']
1214


In [5]:
# fit a word-level tokenizer on the cleaned headlines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(headlines_proccessed)

# one extra slot on top of the known words: index 0 is reserved for padding
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

3494


In [12]:
# build the training samples: every prefix (length >= 2) of every encoded
# headline; the last token of each prefix is later used as the target
sequences = []

# encode all headlines in one call, then slice out the growing prefixes
for encoded in tokenizer.texts_to_sequences(headlines_proccessed):
    for end in range(2, len(encoded) + 1):
        sequences.append(encoded[:end])

print(sequences[:11])

[[99, 269], [99, 269, 371], [99, 269, 371, 1115], [99, 269, 371, 1115, 582], [99, 269, 371, 1115, 582, 52], [99, 269, 371, 1115, 582, 52, 7], [99, 269, 371, 1115, 582, 52, 7, 2], [99, 269, 371, 1115, 582, 52, 7, 2, 372], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116], [100, 3]]


In [10]:
# invert the tokenizer vocabulary (word -> index) into index -> word
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [17]:
# left-pad every sequence with zeros up to the length of the longest one
max_len = max(map(len, sequences))
sequences_padded = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences_padded[:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [21]:
# inputs are all tokens except the last one; the label is the final token
data = np.array(sequences_padded)
X, y = data[:, :-1], data[:, -1]

print(X[:3])
print(y[:3])

# turn the integer labels into one-hot vectors over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  99]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  99 269]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  99 269 371]]
[ 269  371 1115]


# Modeling

In [22]:
# hyperparameters
imbedding_dim = 10
hidden_units = 128

# embedding -> LSTM -> softmax over the whole vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=imbedding_dim),
    LSTM(units=hidden_units),
    Dense(units=vocab_size, activation="softmax"),
])
model.summary()

# labels are one-hot encoded, hence categorical cross-entropy
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, epochs=200, verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 10)          34940     
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dense (Dense)               (None, 3494)              450726    
                                                                 
Total params: 556834 (2.12 MB)
Trainable params: 556834 (2.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/200
244/244 - 3s - loss: 7.6362 - accuracy: 0.0283 - 3s/epoch - 14ms/step
Epoch 2/200
244/244 - 3s - loss: 7.1062 - accuracy: 0.0306 - 3s/epoch - 11ms/step
Epoch 3/200
244/244 - 3s - loss: 6.9765 - accuracy: 0.0355 - 3s/epoch - 11ms/step
Epoch 4/200
244/244 - 3s - loss: 6.850

<keras.src.callbacks.History at 0x29be39f50>

In [32]:
def predict_next_word(model, tokenizer, word, n):
    """Greedily extend ``word`` with up to ``n`` predicted words.

    At each step the current text is tokenized, left-padded to
    ``max_len - 1`` tokens, passed to ``model.predict``, and the index with
    the highest score is mapped back to a word. ``max_len`` and
    ``index_to_word`` are notebook-level variables that must already exist.

    Args:
        model: model whose ``predict`` returns one score per vocabulary index.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        word: seed text to start from.
        n: maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no entry in
        ``index_to_word`` (e.g. the padding index 0).
    """
    sentence = word
    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')
        scores = model.predict(encoded, verbose=0)
        # single-sample batch: take the arg-max of its only row as a plain int
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # dictionary lookup keeps the index -> word step O(1); index 0 (padding)
        # has an output unit but no word, so direct indexing would raise KeyError
        next_word = index_to_word.get(predicted_index)
        if next_word is None:
            break
        sentence += " " + next_word

    return sentence

In [35]:
# generate ten more words starting from the seed word 'how'
generated = predict_next_word(model, tokenizer, 'how', 10)
print(generated)

how to make a crossword puzzle floods gave up with a
