In [6]:
!pip install spacy nltk
!spacy download en_core_web_sm
!pip uninstall tensorflow-gpu -y
!pip uninstall tensorflow -y
!pip install --upgrade tensorflow

Collecting typing-extensions>=4.6.1
  Using cached typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.8.0 which is incompatible.[0m[31m
[0mSuccessfully installed typing-extensions-4.8.0
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/confection/__init__.py", line 38, in <module>
    from pydantic.v1 import BaseModel, Extra, ValidationError, create_model
  File "/usr/local/lib/python3.11/dist-packages/pydantic/__init__.py", line 13, in <mo

In [7]:
import spacy
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Embedding
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers.schedules import ExponentialDecay
import tensorflow as tf
from nltk.util import ngrams
import time
import csv

## utilize TPUs
The code below creates a TPU strategy if any TPUs are available; otherwise it falls back to the default strategy.

In [8]:
# Probe for a TPU cluster. Any failure is printed and treated as "no TPU",
# in which case the default distribution strategy is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver('')
    print(tpu)
    print('Running on TPU ', tpu.master())
except Exception as e:
    print(e)
    tpu = None
    strategy = tf.distribute.get_strategy()
else:
    # A resolver was obtained: connect, initialise the TPU system and build
    # the TPU strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

# Last expression of the cell: displays the physical devices TensorFlow sees.
tf.config.experimental.list_physical_devices()

Please provide a TPU Name to connect to.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## utilize multiple GPUs
The code below creates a mirrored strategy, which allows multiple GPUs to work together.

In [9]:
# Strategy under which create_model() and handle_progress() build/load models.
# With no arguments MirroredStrategy chooses the devices itself.
mirrored_strategy = tf.distribute.MirroredStrategy()
#tf.config.set_soft_device_placement(True)
# tf.test.is_gpu_available() is deprecated and its result was being discarded;
# list the physical GPUs instead and actually report how many were found.
print('GPUS VISIBLE: {}'.format(len(tf.config.list_physical_devices('GPU'))))
print('DEVICES AVAILABLE: {}'.format(mirrored_strategy.num_replicas_in_sync))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
DEVICES AVAILABLE: 1


## loading the small english core model from spacy
In the code below we load spaCy's small English web pipeline with the parser, tagger and NER components disabled; we are not going to use these features in our model, and disabling them makes the text processing faster.

In [11]:
# prefer_gpu() must run before any pipeline is loaded for the pipeline to be
# allocated on the GPU; calling it after spacy.load() (as before) has no effect
# on the already-loaded model.
gpu_activated = spacy.prefer_gpu()

# Parser, tagger and NER are not needed here; disabling them speeds up processing.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
# With the parser disabled, the sentencizer supplies the sentence boundaries
# that create_inputs_targets() reads through doc.sents.
nlp.add_pipe('sentencizer')
# Raise spaCy's limit on input text length so whole story files can be processed.
nlp.max_length = 6_000_000_000

# Last expression of the cell: shows whether a GPU was activated.
gpu_activated

False

## folders and files we are gonna use for this model
you need to create a stories folder and add at least one text file.

In [13]:
# JSON file holding the word -> integer id mapping used by the tokenizer.
word_index_file = "word_index.json"
# Folder containing the raw training text files.
stories_folder = "stories"
# os.listdir() returns entries in arbitrary order; sort them so that
# "the first file" and the training order are the same on every run.
stories_files = sorted(os.listdir(stories_folder))

# Where training progress, saved models and per-model history CSVs are written.
model_progress = 'result/model_progress.json'
models_path = 'result/working/models'
history_folder = 'result/working/history'

## function that will allow us to preprocess the data
Transforming text data into raw numbers that our model can understand and process.

In [15]:
# def create_embedded_sequence(sequence):
#     embedded_sequence = []
#     for token in sequence: 
#         if token in w2v_model:
#             embedded_sequence.append(w2v_model[token])
#         else: 
#             embedded_sequence.append(np.zeros(w2v_model.vector_size))
#     return embedded_sequence

def create_inputs_targets(text, window_size=3):
    """Convert raw text into integer-encoded context windows and next-word targets.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    Within each sentence only tokens that are alphabetic or number-like are
    kept, and every run of ``window_size`` consecutive kept tokens becomes one
    sample: the first ``window_size - 1`` words are the input, the last word
    is the target. Sentences with fewer than ``window_size`` kept tokens
    contribute nothing.

    Words are mapped to ids with a Keras ``Tokenizer`` whose ``word_index`` is
    read from ``word_index_file``; ``"<OOV>"`` is configured as the
    out-of-vocabulary token.

    Args:
        text: The raw text to process.
        window_size: Number of words per sample, target word included.

    Returns:
        A tuple ``(sequences_inputs, targets)`` of NumPy arrays: the encoded
        input windows and the flattened encoded target words.
    """
    windows = []
    for sentence in nlp(text).sents:
        kept = [tok.text for tok in sentence if tok.is_alpha or tok.like_num]
        # Only sentences long enough to fill a whole window produce samples.
        if len(kept) >= window_size:
            windows.extend(list(gram) for gram in ngrams(kept, window_size))

    # Split each window into its context (all but the last word) and its target.
    context_words = [window[:-1] for window in windows]
    next_words = [window[-1] for window in windows]

    with open(word_index_file, "r") as json_file:
        word_index = json.load(json_file)

    # Reuse the stored vocabulary instead of fitting the tokenizer on this text.
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.word_index = word_index

    # All targets are encoded as one sequence, then flattened to a 1-D array.
    targets = np.array(tokenizer.texts_to_sequences([next_words])).flatten()
    sequences_inputs = np.array(tokenizer.texts_to_sequences(context_words))

    return sequences_inputs, targets

preprocessing the first text file for test

In [20]:
# Smoke test: preprocess only the first story file with 7-word windows and
# check that inputs and targets come out with the same number of samples.
for file in stories_files[:1]:
    story_path = os.path.join(stories_folder, file)
    with open(story_path, "r", encoding='utf-8') as f:
        text = f.read()
    inputs, targets = create_inputs_targets(text, 7)
    print(len(inputs))
    print(len(targets))

102
102


## getting the vocab size

In [21]:
def get_vocab_size():
    """Return the vocabulary size derived from the stored word index.

    Reads the JSON mapping in ``word_index_file`` and returns its number of
    entries plus one.
    """
    with open(word_index_file, "r") as handle:
        word_index = json.load(handle)
    # +1 on top of the entry count -- presumably because ids start at 1 and
    # index 0 is reserved; TODO confirm against how word_index.json is built.
    return len(word_index) + 1
get_vocab_size()

196

## model parameters

In [22]:
# Constant offset added to the computed learning rate below (0.0 = no adjustment).
abjust_lr = 0.0

# Size of each word vector produced by the Embedding layer in create_model().
embedding_dim = 128
num_epochs = 150
batch_size = 512
training_batch_size = 2048
# Number of entries in word_index.json plus one (see get_vocab_size()).
vocab_size = get_vocab_size()
# Base rate of 0.01 scaled by the inverse square root of (batch_size / 32),
# plus the manual offset; with batch_size = 512 this gives 0.0025.
lr = (0.01 * (batch_size / 32) ** -0.5) + abjust_lr
# Dropout rate used by the Dropout layers in create_model().
dr = 0.2
early_stopping_patience = 10
# NOTE(review): printed below, but create_model() as written applies no regularizer.
l1_r = 0.1
# Words per training sample including the target; create_model() uses
# window_size - 1 as the input sequence length.
window_size = 10
# NOTE(review): create_model() builds its own GlorotUniform() instances and
# does not reference this object.
initializer = GlorotUniform()

# Echo the configuration so it is recorded alongside the run's output.
print(f'''

embedding dimention: {embedding_dim}
vocabulare size: {vocab_size}
num of the epochs: {num_epochs}

learning rate: {lr}
dropout rate: {dr}
batch_size: {batch_size}
training_batch_size: {training_batch_size}
early stopping patience: {early_stopping_patience}
window size: {window_size}
L1 regularization: {l1_r}
''')



embedding dimention: 128
vocabulare size: 196
num of the epochs: 150

learning rate: 0.0025
dropout rate: 0.2
batch_size: 512
training_batch_size: 2048
early stopping patience: 10
window size: 10
L1 regularization: 0.1



## a function to create the LSTM model

In [23]:
def create_model():
    """Build and compile the bidirectional-LSTM next-word prediction model.

    Architecture, using the module-level hyper-parameters:
    ``Input(window_size - 1 token ids)`` -> ``Embedding(vocab_size,
    embedding_dim)`` -> ``BiLSTM(256, return_sequences=True)`` ->
    ``Dropout(dr)`` -> ``BiLSTM(128)`` -> ``Dropout(dr)`` ->
    ``Dense(128, relu)`` -> ``Dense(vocab_size, softmax)``.

    The model is created and compiled inside ``mirrored_strategy.scope()``
    with an ``AdamW`` optimizer at learning rate ``lr``, sparse categorical
    cross-entropy loss and sparse categorical accuracy as the metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    with mirrored_strategy.scope():
        print('creat a new model')
        model = Sequential([
            # Declare the input shape explicitly. This replaces
            # Embedding(input_length=...), which newer Keras releases
            # deprecate, and the redundant input_shape that was being passed
            # to the (non-first) Bidirectional layer.
            tf.keras.Input(shape=(window_size - 1,)),
            Embedding(input_dim=vocab_size, output_dim=embedding_dim),
            # First recurrent block keeps the full sequence for the next LSTM.
            Bidirectional(LSTM(256, return_sequences=True, kernel_initializer=GlorotUniform())),
            Dropout(dr),
            # Second recurrent block collapses the sequence to a single vector.
            Bidirectional(LSTM(128, kernel_initializer=GlorotUniform())),
            Dropout(dr),
            Dense(128, activation='relu', kernel_initializer=GlorotUniform()),
            # One probability per vocabulary id.
            Dense(vocab_size, activation='softmax', kernel_initializer=GlorotUniform()),
        ])

        optimizer = AdamW(learning_rate=lr)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['SparseCategoricalAccuracy'])
    return model

## handling the progress function
A function that helps maintain the training progress of the model: it loads or creates the model and records each finished training run.

In [None]:
def handle_progress(path):
    """Load or create the model and return it with a progress-recording callback.

    If the progress file (``model_progress``) exists, it is read and the model
    named by its ``"current_model"`` entry is loaded under
    ``mirrored_strategy.scope()``. Otherwise an empty progress record is
    started and a new model is built with ``create_model()``.

    Args:
        path: Value recorded as ``last_file`` and appended to
            ``trained_files`` when progress is saved (presumably the story
            file being trained on -- confirm against the caller).

    Returns:
        A tuple ``(model, update_model_progress, model_path)``.
        ``model_path`` is ``<models_path>/model-<n>.h5`` where ``n`` is the
        number of models already recorded (0 on a fresh start).
        ``update_model_progress(model, model_path, history, duration)`` saves
        the model, writes its history CSV and updates the progress file.
    """
    if os.path.exists(model_progress):
        # Read the record first and close the file before the slow model load.
        with open(model_progress, "r") as f:
            model_progress_file = json.load(f)
        with mirrored_strategy.scope():
            model = load_model(model_progress_file["current_model"])
        models_number = len(model_progress_file["models"])
        model_path = os.path.join(models_path, f'''model-{models_number}.h5''')
    else:
        model_progress_file = {
            'current_model': '',
            'models': [],
            'trained_files': [],
            'history_logs': []
        }
        model = create_model()
        model_path = os.path.join(models_path, f'''model-0.h5''')

    def save_file():
        """Write the in-memory progress record to the progress file."""
        with open(model_progress, "w") as f:
            json.dump(model_progress_file, f)

    def save_history(history_path, history):
        """Write a training history to CSV: one column per metric, one row per epoch.

        Does nothing when ``history`` is None.
        """
        if history is None:
            return
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(history_folder, exist_ok=True)

        metric_names = list(history.history.keys())
        with open(history_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(metric_names)
            for epoch in range(len(history.epoch)):
                writer.writerow([history.history[metric][epoch]
                                 for metric in metric_names])

    def update_model_progress(model, model_path, history, duration):
        """Record one finished training run and persist everything to disk.

        Appends an entry for the run to the progress record, writes the
        history CSV (if a history was given), saves the model to
        ``model_path`` and rewrites the progress file.
        """
        # Sequential id = number of models recorded so far (avoids shadowing `id`).
        model_id = len(model_progress_file["models"])

        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

        model_progress_file['models'].append({
            'id': model_id,
            'model': model_path,
            'last_file': path,
            'duration': duration,
            'finished_at': current_time
        })

        history_path = os.path.join(history_folder, f'model-{model_id}.csv')

        save_history(history_path, history)

        # The history path is logged even when no history was written, so
        # history_logs stays aligned one-to-one with models.
        model_progress_file['trained_files'].append(path)
        model_progress_file['history_logs'].append(history_path)
        model_progress_file['current_model'] = model_path
        model.save(model_path)

        save_file()

    return model, update_model_progress, model_path