In [3]:
!nvidia-smi

Wed Jan 31 06:07:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import reuters
from tensorflow.keras.layers import Embedding,LSTM,GRU,Bidirectional,Dense,Attention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow_addons.seq2seq import BahdanauAttention
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import warnings
warnings.filterwarnings('ignore')


### Data loading


In [9]:
# Load the Medium articles dataset; only the 'title' column is used downstream.
data = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/Next-word-pred/medium_data.csv')

# Fail fast with a clear message if the expected column is absent (the previous
# bare `data['title']` expression displayed nothing because it was not the last
# line of the cell).
assert 'title' in data.columns, "expected a 'title' column in medium_data.csv"
print(data.shape)

(6508, 10)


### Preprocessing

In [10]:
# Replace the non-breaking space (U+00A0) and hair space (U+200A) found in the
# titles with a plain ASCII space before tokenising.
_space_fixes = str.maketrans({u'\xa0': u' ', '\u200a': ' '})
data['title'] = data['title'].apply(lambda title: title.translate(_space_fixes))

# Fit the word-level tokenizer on every title.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['title'])

# One extra slot beyond the number of entries in word_index.
total_words = 1 + len(tokenizer.word_index)

print("Total number of words: ", total_words)

Total number of words:  8237


In [11]:
# For every title, emit each leading prefix of its token ids that has at least
# two tokens (prefix lengths 2 .. len(title)).
input_sequences = [
    token_ids[:prefix_len]
    for title in data['title']
    for token_ids in tokenizer.texts_to_sequences([title])
    for prefix_len in range(2, len(token_ids) + 1)
]

print("total input sequences:",len(input_sequences))

total input sequences: 48461


In [12]:
# Left-pad every prefix to the length of the longest one so they stack into a matrix.
max_sequence_length = max(map(len, input_sequences))
print("Max Sequence Length:", max_sequence_length)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
)

# All columns but the last are the model input; the last column is the target
# word id, which is one-hot encoded over the vocabulary.
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)


Max Sequence Length: 40


### Bi-LSTM

In [15]:
# Bi-LSTM next-word classifier: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model_bilstm = Sequential([
    Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_length - 1),
    Bidirectional(LSTM(units=150)),
    Dense(units=total_words, activation='softmax'),
])
adam = Adam(learning_rate=0.01)
model_bilstm.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_bilstm.summary()

# Train on the full set of padded prefixes.
history_bilstm = model_bilstm.fit(X, y, batch_size=32, epochs=50, verbose=1)
print(model_bilstm)

# Persist the trained model to the working directory.
model_bilstm.save('bilstm_model.h5')

# Persist the fitted tokenizer next to it as JSON.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer_json)


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 39, 100)           823700    
                                                                 
 bidirectional_2 (Bidirecti  (None, 300)               301200    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 8237)              2479337   
                                                                 
Total params: 3604237 (13.75 MB)
Trainable params: 3604237 (13.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch

In [18]:
# Write the Bi-LSTM model to the Models folder under /content/drive.
bilstm_model_file = '/content/drive/MyDrive/ColabNotebooks/Next-word-pred/Models/bilstm_model.h5'
model_bilstm.save(bilstm_model_file)

# Write the fitted tokenizer to the same folder as JSON.
tokenizer_file = '/content/drive/MyDrive/ColabNotebooks/Next-word-pred/Models/tokenizer.json'
tokenizer_json = tokenizer.to_json()
with open(tokenizer_file, 'w', encoding='utf-8') as out:
    out.write(tokenizer_json)

In [23]:
# Score the Bi-LSTM on X, y -- the same arrays it was fitted on, so this is a
# training-set score rather than a held-out one.
evaluation_bilstm = model_bilstm.evaluate(x=X, y=y)
print("BiLSTM Model Evaluation:", evaluation_bilstm)

BiLSTM Model Evaluation: [2.0924787521362305, 0.5615030527114868]


In [17]:
%time history_bilstm = model_bilstm.fit(X, y, epochs=5, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 26s, sys: 8.38 s, total: 1min 34s
Wall time: 2min 25s


In [19]:
%%time
from tensorflow.keras.callbacks import ModelCheckpoint

logs_path = '/content/drive/MyDrive/ColabNotebooks/Next-word-pred/logs/'

checkpoint_filepath = logs_path + 'model_bilstm_checkpoint.h5'
model_checkpoint = ModelCheckpoint(
    checkpoint_filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Train the model with the ModelCheckpoint callback
history_bilstm_model = model_bilstm.fit(
    X, y,
    epochs=5,
    callbacks=[model_checkpoint]
)

Epoch 1/5



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5



CPU times: user 1min 24s, sys: 9.37 s, total: 1min 33s
Wall time: 1min 39s


In [20]:
# Folder prefix (with trailing slash) that model filenames are appended to when saving.
models_path= '/content/drive/MyDrive/ColabNotebooks/Next-word-pred/Models/'

### GRU


In [21]:
# GRU next-word classifier: embedding -> single GRU layer -> softmax over the vocabulary.
model_gru = Sequential([
    Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_length - 1),
    GRU(units=150),
    Dense(units=total_words, activation='softmax'),
])
adam = Adam(learning_rate=0.01)
model_gru.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_gru.summary()

# Train on the full set of padded prefixes.
history_gru = model_gru.fit(X, y, batch_size=32, epochs=50, verbose=1)

print(model_gru)

# Persist the trained model under models_path.
model_gru.save(models_path + 'gru_model.h5')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 39, 100)           823700    
                                                                 
 gru (GRU)                   (None, 150)               113400    
                                                                 
 dense_3 (Dense)             (None, 8237)              1243787   
                                                                 
Total params: 2180887 (8.32 MB)
Trainable params: 2180887 (8.32 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
E

### Attention

In [None]:
# Disabled experiment: Bi-LSTM with an attention layer.
# NOTE(review): this looks unrunnable as written -- keras.layers.Attention expects
# a list of inputs ([query, value]), which a Sequential stack does not provide, and
# with return_sequences=True the final Dense would emit one distribution per
# timestep while y holds a single one-hot target per sample. Confirm before
# re-enabling (the functional API is probably needed).
# # Attention Model
# model_attention = Sequential()
# model_attention.add(Embedding(total_words, 100, input_length=max_sequence_length - 1))
# model_attention.add(Bidirectional(LSTM(150, return_sequences=True)))
# model_attention.add(Attention(use_scale=True))
# model_attention.add(Dense(total_words, activation='softmax'))
# model_attention.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# history_attention = model_attention.fit(X, y, epochs=50, verbose=1,batch_size=32)
# model_attention.save(models_path + 'attention_model.h5')

### Transformer

In [None]:
# # Transformer Model using GPT-2
# tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
# model_gpt2 = TFGPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer_gpt2.eos_token_id)

# # Save GPT-2 model (replace 'path_to_save' with your desired path)
# model_gpt2.save_pretrained('path_to_save')

# # Example usage for generating next words using GPT-2
# inputs = tokenizer_gpt2.encode("your seed text", return_tensors='tf')
# outputs = model_gpt2.generate(inputs, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95)
# predicted_text = tokenizer_gpt2.decode(outputs[0], skip_special_tokens=True)
# print("Predicted next words (GPT-2):", predicted_text)

## Prediction

In [None]:
def predict_next_word(seed_text, model, tokenizer, max_sequence_length, num_words=3):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Args:
        seed_text: Text to continue.
        model: Object whose ``predict(batch, verbose=0)`` returns one
            probability vector over word ids per input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_length: Sequence length used when building the training
            data; inputs are left-padded to ``max_sequence_length - 1``.
        num_words: How many words to append (defaults to 3, the previously
            hard-coded value).

    Returns:
        ``seed_text`` followed by up to ``num_words`` predicted words, each
        separated by a single space.
    """
    for _ in range(num_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_length - 1, padding='pre')
        predicted_probs = model.predict(model_input, verbose=0)[0]

        # Id 0 is the padding value and has no entry in index_word, so picking
        # it would raise KeyError; take the argmax over real word ids only.
        predicted_id = int(np.argmax(predicted_probs[1:])) + 1

        predicted_word = tokenizer.index_word.get(predicted_id)
        if predicted_word is None:
            # The tokenizer has no word for this id; stop extending.
            break
        seed_text += " " + predicted_word
    return seed_text

# seed_text = "hypothesis"
# predicted_text = predict_next_word(seed_text, model_bilstm, tokenizer, max_sequence_length)
# print("Seed text:", seed_text)
# print("Predicted next words:", predicted_text)


Seed text: hypothesis
Predicted next words: hypothesis testing glossary for
