# Text Completion and Prediction Using LSTM Model

Data: Euro 2024 elections

In [1]:
# import libraries
import numpy as np
#import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path to text file
file_path = "/content/2024_European_Parliament_election.txt"

In [5]:
# Open the file to read its contents
with open(file_path, 'r', encoding='utf-8') as file:
    wiki_post = file.read()

In [6]:
# Print the content to ensure it's read correctly
print(wiki_post)

The 2024 European Parliament election was held in the European Union (EU) between 6 and 9 June 2024. It was the tenth parliamentary election since the first direct elections in 1979, and the first European Parliament election after Brexit. A total of 720 Members of the European Parliament (MEPs) were elected to represent more than 450 million people from 27 member states. This election also coincided with a number of other elections in some European Union member states.

On 9 June 2024, the European People's Party led by Ursula von der Leyen won the most seats in the European Parliament. The pro-EU centrist, liberal and environmentalist parties suffered significant losses, while anti-EU right-wing populist parties made substantial gains. The right-wing European Conservatives and Reformists group overtook the centrist Renew Europe group to win the third most seats. In addition, several new or Non-Inscrits parties gained seats in the Parliament.

In the previous election, held on 23–26 M

In [7]:
# Tokenize the text
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts([wiki_post])
total_words = len(text_tokenizer.word_index) + 1

In [8]:
# Display the word index
text_tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'in': 4,
 'to': 5,
 'european': 6,
 'on': 7,
 'a': 8,
 'for': 9,
 'was': 10,
 'parliament': 11,
 'from': 12,
 'with': 13,
 'party': 14,
 'as': 15,
 'by': 16,
 'group': 17,
 '2024': 18,
 'election': 19,
 'parties': 20,
 'that': 21,
 'its': 22,
 'is': 23,
 'be': 24,
 'candidate': 25,
 'eu': 26,
 'had': 27,
 'their': 28,
 'june': 29,
 'it': 30,
 'meps': 31,
 'president': 32,
 'at': 33,
 'elected': 34,
 'which': 35,
 'europe': 36,
 'commission': 37,
 'not': 38,
 'this': 39,
 'der': 40,
 'debate': 41,
 'new': 42,
 'political': 43,
 'council': 44,
 'national': 45,
 'announced': 46,
 'von': 47,
 'leyen': 48,
 'would': 49,
 'were': 50,
 'member': 51,
 'other': 52,
 'right': 53,
 'or': 54,
 'epp': 55,
 'held': 56,
 'union': 57,
 'elections': 58,
 'after': 59,
 'seats': 60,
 'an': 61,
 'also': 62,
 'renew': 63,
 'they': 64,
 'who': 65,
 '2023': 66,
 'been': 67,
 'candidates': 68,
 'march': 69,
 'movement': 70,
 'countries': 71,
 'have': 72,
 'during': 73,
 'democ

In [9]:
# Create input sequences

text_input_sequences = []

for line in wiki_post.split("/n"):
  print(line)
  token_list = text_tokenizer.texts_to_sequences([line])[0]
  for n in range(1, len(token_list)):
    n_gram_sequence = token_list[:n+1]
    #print(n_gram_sequence)
    text_input_sequences.append(n_gram_sequence)

The 2024 European Parliament election was held in the European Union (EU) between 6 and 9 June 2024. It was the tenth parliamentary election since the first direct elections in 1979, and the first European Parliament election after Brexit. A total of 720 Members of the European Parliament (MEPs) were elected to represent more than 450 million people from 27 member states. This election also coincided with a number of other elections in some European Union member states.

On 9 June 2024, the European People's Party led by Ursula von der Leyen won the most seats in the European Parliament. The pro-EU centrist, liberal and environmentalist parties suffered significant losses, while anti-EU right-wing populist parties made substantial gains. The right-wing European Conservatives and Reformists group overtook the centrist Renew Europe group to win the third most seats. In addition, several new or Non-Inscrits parties gained seats in the Parliament.

In the previous election, held on 23–26 M

In [10]:
# Pad sequences to make them of equal length
max_sequence_len = max([len(seq) for seq in text_input_sequences])
input_sequences = np.array(pad_sequences(text_input_sequences, maxlen=max_sequence_len, padding="pre"))


In [11]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,   18],
       [   0,    0,    0, ...,    1,   18,    6],
       [   0,    0,    0, ...,   18,    6,   11],
       ...,
       [   0,    0,    1, ..., 1297, 1298,  303],
       [   0,    1,   18, ..., 1298,  303,    2],
       [   1,   18,    6, ...,  303,    2,   36]], dtype=int32)

In [12]:
# Separate predictors (X) and label (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [13]:
X[0]

array([0, 0, 0, ..., 0, 0, 1], dtype=int32)

In [14]:
X[1]

array([ 0,  0,  0, ...,  0,  1, 18], dtype=int32)

In [15]:
y

array([ 18,   6,  11, ..., 303,   2,  36], dtype=int32)

In [16]:
# convert our label y to one-hot vectors
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [17]:
y[1]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [18]:
# Define the model

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation="softmax"))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4754, 100)         129900    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 1299)              196149    
                                                                 
Total params: 476649 (1.82 MB)
Trainable params: 476649 (1.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [19]:
# Compile the model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7d558c934fd0>

In [21]:
# predict the next word

input_text = "France"
predict_next_words = 30

for n in range(predict_next_words):
  token_list = text_tokenizer.texts_to_sequences({input_text})[0]
  print(token_list)
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding="pre")
  predicted = np.argmax(model.predict(token_list), axis=-1)
  output_word = ""
  for word, index in text_tokenizer.word_index.items():
      if index == predicted:
        output_word = word
        break
  input_text += " " + output_word

print(input_text)

[277]
[277, 4]
[277, 4, 18]
[277, 4, 18, 1]
[277, 4, 18, 1, 6]
[277, 4, 18, 1, 6, 413]
[277, 4, 18, 1, 6, 413, 14]
[277, 4, 18, 1, 6, 413, 14, 34]
[277, 4, 18, 1, 6, 413, 14, 34, 45]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194, 259]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194, 259, 77]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194, 259, 77, 47]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194, 259, 77, 47, 40]
[277, 4, 18, 1, 6, 413, 14, 34, 45, 20, 33, 1, 6, 44, 271, 45, 194, 259, 77,