In [1]:
!pip install spacy nltk
!spacy download en_core_web_sm
!pip uninstall tensorflow-gpu -y
!pip uninstall tensorflow -y
!pip install --upgrade tensorflow

[0m^C
2023-09-22 23:08:41.139724: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-22 23:08:41.466750: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-22 23:08:43.048715: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-22 23:08:43.050647: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
import spacy
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Embedding
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers.schedules import ExponentialDecay
import tensorflow as tf
from nltk.util import ngrams
import time
import csv

2023-09-22 23:08:54.305265: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-22 23:08:54.307852: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-22 23:08:54.440678: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-22 23:08:54.442086: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## utilize TPUs
the code below will create a TPU strategy if there are any TPUs availabe

In [2]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver('') 
    print(tpu)
    print('Running on TPU ', tpu.master())
except Exception as e:
    print(e)
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() 

tf.config.experimental.list_physical_devices()

Please provide a TPU Name to connect to.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## utilize multiple GPUs
the code below will create a mirror stategy which will allow multiple CPUs to work together.

In [3]:
mirrored_strategy = tf.distribute.MirroredStrategy()
#tf.config.set_soft_device_placement(True)
tf.test.is_gpu_available()
print('DEVICES AVAILABLE: {}'.format(mirrored_strategy.num_replicas_in_sync))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
DEVICES AVAILABLE: 1


## loading the small english core model from spacy
in the code bellow we are loading the small english web core from spacy while disabling parser, tagger, ner, since we are not gonna use these features in our model, disabling them will make the text processing faster.

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
nlp.add_pipe('sentencizer')
nlp.max_length = 6_000_000_000

spacy.prefer_gpu()

False

## folders and files we are gonna use for this model
you need to create a stories folder and add at least one text file.

In [5]:
word_index_file = "word_index.json"
stories_folder = "stories"
stories_files = os.listdir(stories_folder)

model_progress = 'result/model_progress.json'
models_path = 'result/models'
history_folder = 'result/history'

## function that will allow us to preprocess the data
transforming text data into raw numbers that our module can understand and process.

In [6]:
# def create_embedded_sequence(sequence):
#     embedded_sequence = []
#     for token in sequence: 
#         if token in w2v_model:
#             embedded_sequence.append(w2v_model[token])
#         else: 
#             embedded_sequence.append(np.zeros(w2v_model.vector_size))
#     return embedded_sequence

def create_inputs_targets(text, window_size=3):
    doc = nlp(text)
    windows = []

    for sent in doc.sents:
        tokens = [token.text for token in sent if token.is_alpha or token.like_num]
        if len(tokens) < window_size: 
            continue
        n_gram = ngrams(tokens, window_size) 
        for window in n_gram: 
            windows.append(list(window))

    words_targets = [w[-1] for w in windows]
    sequences_inputs =  [w[:-1] for w in windows]
    
    with open(word_index_file, "r") as json_file:
        word_index = json.load(json_file)
    
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.word_index = word_index

    targets = np.array(tokenizer.texts_to_sequences([words_targets])).flatten()
    sequences_inputs = np.array(tokenizer.texts_to_sequences(sequences_inputs))

    #sequences_inputs = np.array([create_embedded_sequence(sequence) for sequence in sequences_inputs])
    
    return sequences_inputs, targets

preprocessing the first text file for test

In [7]:
for file in stories_files[:1]:
    with open(os.path.join(stories_folder, file), "r", encoding='utf-8') as f: 
        text = f.read()
        inputs, targets = create_inputs_targets(text, 7)
        print(len(inputs))
        print(len(targets))

325
325




## getting the vocab size

In [8]:
def get_vocab_size():
    with open(word_index_file, "r") as json_file:
        vocab_size = len(json.load(json_file)) + 1
    del json_file
    return vocab_size
get_vocab_size()

762

## model parameters

In [16]:
abjust_lr = 0.0

embedding_dim = 128
num_epochs = 150
batch_size = 128
training_batch_size = 2048
vocab_size = get_vocab_size()
lr = (0.01 * (batch_size / 32) ** -0.5) + abjust_lr
dr = 0.2
early_stopping_patience = 10
l1_r = 0.1
window_size = 10
initializer = GlorotUniform()

print(f'''

embedding dimention: {embedding_dim}
vocabulare size: {vocab_size}
num of the epochs: {num_epochs}

learning rate: {lr}
dropout rate: {dr}
batch_size: {batch_size}
training_batch_size: {training_batch_size}
early stopping patience: {early_stopping_patience}
window size: {window_size}
L1 regularization: {l1_r}
''')



embedding dimention: 128
vocabulare size: 762
num of the epochs: 150

learning rate: 0.005
dropout rate: 0.2
batch_size: 128
training_batch_size: 2048
early stopping patience: 10
window size: 10
L1 regularization: 0.1



## a function to create the LSTM model

In [10]:
def create_model():
    with mirrored_strategy.scope():
        print('creat a new model')
        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=window_size - 1))
        model.add(Bidirectional(LSTM(256, return_sequences=True, kernel_initializer=GlorotUniform()), input_shape=(window_size - 1, embedding_dim)))
        model.add(Dropout(dr))
        # model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_initializer=initializer)))
        # model.add(Dropout(dr))
        model.add(Bidirectional(LSTM(128, kernel_initializer=GlorotUniform())))
        model.add(Dropout(dr))
        model.add(Dense(128, activation='relu', kernel_initializer=GlorotUniform()))
        model.add(Dense(vocab_size, activation='softmax', kernel_initializer=GlorotUniform()))

        adam = AdamW(learning_rate=lr)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['SparseCategoricalAccuracy'])
    return model

## handling the progress function
a function that help maintance the progress of the model.

In [17]:
def handle_progress(path):

    if os.path.exists(model_progress):
        with open(model_progress, "r") as f:
            model_progress_file = json.load(f)
            with mirrored_strategy.scope():
                model = load_model(model_progress_file["current_model"])
            models_number = len(model_progress_file["models"])
            model_path = os.path.join(models_path, f'''model-{models_number}.h5''')
    else:
        model_progress_file = {
            'current_model': '',
            'models': [],
            'trained_files': [],
            'history_logs': []
        }
        model = create_model()
        model_path = os.path.join(models_path, f'''model-0.h5''')

    def save_file():
        with open(model_progress, "w") as f:
            json.dump(model_progress_file, f)

    def save_history(history_path, history):
        if history == None: 
            return
        if not os.path.exists(history_folder):
            os.makedirs(history_folder)

        with open(history_path, 'w+', newline='') as f:

            writer = csv.writer(f)
            writer.writerow(history.history.keys())

            for epoch in range(len(history.epoch)):
                row = [history.history[metric][epoch]
                       for metric in history.history.keys()]
                writer.writerow(row)

    def update_model_progress(model, model_path, history, duration):
        id = len(model_progress_file["models"])

        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

        model_progress_file['models'].append({
            'id': id,
            'model': model_path,
            'last_file': path,
            'duration': duration,
            'finished_at': current_time
        })

        history_path = os.path.join(history_folder, f'model-{id}.csv')

        save_history(history_path, history)

        model_progress_file['trained_files'].append(path)
        model_progress_file['history_logs'].append(history_path)
        model_progress_file['current_model'] = model_path
        model.save(model_path)

        save_file()

    return model, update_model_progress, model_path

## creating the training function 
this function will combine all the function that we created above. the train_lstm function will be responsible of training the LSTM model.

In [22]:
def train_lstm(paths, folder_name=""):
    for index, path in enumerate(paths):
        print(f'''{path} is getting preprocess now...''')

        file_location = os.path.join(folder_name, path)

        if not os.path.exists(file_location):
            print(f'''{file_location} is not exist''')
            continue

        try:
            with open(file_location, "r") as f:
                text = f.read()
        except Exception as e:
            print(f'''failed to load the file: {file_location}''')
            print(e)
            continue

        try:
            inputs, targets = create_inputs_targets(text, window_size)
        except Exception as e:
            print(f'''failed to tokenize the file: {file_location}''')
            print(e)
            continue
        model, update_model_progress, model_path = handle_progress(path)

        # callbacks

        # a callback to save the progress of a model
        checkpoint_callback = ModelCheckpoint(
            filepath=model_path,
            save_weights_only=True,
            save_freq=1
        )

        # a callback to apply early stuping to the model to save the best progress
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            patience=early_stopping_patience,
            restore_best_weights=True
        )


        start_time = time.time()

        dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))

        buffer_size = len(targets)
        dataset = dataset.shuffle(buffer_size)

        batch_size = min(len(targets), training_batch_size)
        dataset = dataset.batch(batch_size)

        history = None

        for batch_inputs, batch_targets in dataset:
            with mirrored_strategy.scope():
                batch_history = model.fit(batch_inputs, batch_targets, epochs=num_epochs, batch_size=batch_size, callbacks=[
                                      checkpoint_callback, early_stopping])

#             if history is None:
#                 history = batch_history  # Initialize history with the first batch's history
#             else:
#                 # Merge the batch history with the existing history
#                 for metric, values in batch_history.history.items():
#                     history.history[metric].extend(values)
            
            tf.keras.backend.clear_session()
            del batch_inputs
            del batch_targets
        end_time = time.time()
        duration = end_time - start_time

        update_model_progress(model, model_path, history, duration)
        tf.keras.backend.clear_session()
        del dataset

In [23]:
train_lstm(stories_files, stories_folder)

test_4.txt is getting preprocess now...
creat a new model
Epoch 1/150


2023-09-22 23:15:59.548848: W tensorflow/core/framework/dataset.cc:956] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

2023-09-22 23:16:43.144571: W tensorflow/core/framework/dataset.cc:956] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

## utilizing the model and generate new text

In [24]:
with open(word_index_file, "r") as f: 
    word_index = json.load(f)

## the sample function
the simple function will turn the predictions into probabilty distrubotion resulting in selecting random words that will make our output more interesting.

In [25]:
def sample(preds, temperature = 1.0):
    preds = np.asarray(preds).astype("float64")
    preds = np.clip(preds, 1e-10, None)
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds[0], 1)
    return np.argmax(probas)

In [26]:
def create_generator(model, word_index):
    def generate_text(seed_text, sequence_length = 5, temperature= 1.0):
        if(len(seed_text) < window_size - 1):
            return 
        output_sent = ""
        tokenizer = Tokenizer(oov_token="<OOV>")
        tokenizer.word_index = word_index
        tokens = [token.text for token in nlp(seed_text)]
        seed_input_sequence = tokenizer.texts_to_sequences([tokens[-(window_size - 1):]])
        print(len(seed_input_sequence[0]))
        print(window_size -1)
        for _ in range(sequence_length):
            preds = model.predict(seed_input_sequence)
            next_word_index = sample(preds, temperature)
            output_word = ""
            for word, index in tokenizer.word_index.items():
                if index == next_word_index:
                    output_word = word
                    break
            output_sent += " " + output_word
            seed_input_sequence[0].append(next_word_index)
        
        return output_sent
    return generate_text

In [27]:
model = load_model("result/models/model-0.h5")
seed_text = "once upon a time, in a small village between two mountains I saw a "
generate_text = create_generator(model, word_index)
generate_text(seed_text)

9
9


' forest the forest shared lands'