## Windows WSL 2 Installation
```
conda create -n dl_project python=3.10
conda activate dl_project
python -m pip install tensorflow[and-cuda]
sudo apt install cmake
python -m pip install --upgrade keras-nlp
python -m pip install tensorflow_datasets
python -m pip install pyyaml h5py
```

## Guides

- [GPT-2 text generation with KerasNLP](https://keras.io/examples/generative/gpt2_text_generation_with_kerasnlp/) — the Keras example this notebook is based on.


In [44]:
import os

# Select the Keras backend. This is set before keras / keras_nlp are imported
# below, because the backend is chosen at import time.
os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"

import keras_nlp
import keras
import tensorflow as tf
import time

# Set the global Keras dtype policy to mixed precision before any model is built.
keras.mixed_precision.set_global_policy("mixed_float16")

In [45]:
# Turn on memory growth for every physical GPU TensorFlow can see, then report
# how many physical and logical GPUs are available. Does nothing when no GPU
# is found.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    # Listing logical devices happens only after memory growth has been set.
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    # Best-effort: the error is printed, not re-raised, so the notebook continues.
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [46]:
# To speed up training and generation, we use a preprocessor with a sequence
# length of 128 instead of the full length of 1024.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
# Load the "gpt2_base_en" causal language model and attach the shortened
# preprocessor created above. `gpt2_lm` is the model used by the later cells.
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [47]:
# Beware - uses 14GB of RAM
def read_and_filter_training_data(folder_path='bulgarian_books', encoding=None):
    """Read every file in ``folder_path`` and return the lines kept for training.

    A line is dropped when it is shorter than two characters (e.g. a bare
    newline) or when it starts with one of the prefixes ``>``, ``@``, ``D>``,
    ``D$``, ``E>``, ``E$``, ``P>``, ``P$`` (presumably markup/metadata lines of
    the book format -- confirm against the corpus). Carriage returns and tabs
    are removed from the lines that are kept; the trailing newline is preserved.

    Args:
        folder_path: Directory containing the raw book files.
        encoding: Text encoding passed to ``open``. ``None`` keeps the platform
            default, which is what the notebook used originally.

    Returns:
        list[str]: The kept lines of all files, in sorted file-name order.
    """
    skipped_prefixes = ('>', '@', 'D>', 'D$', 'E>', 'E$', 'P>', 'P$')
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting training data reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip sub-directories and other non-regular entries instead of
        # crashing in open().
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding=encoding) as file:
            # Iterate lazily so a whole book is never held in memory twice.
            for line in file:
                if len(line) < 2 or line.startswith(skipped_prefixes):
                    continue
                data.append(line.replace('\r', '').replace('\t', ''))
    return data

def develop_fast_load_training_data(max_lines=10000, data_file='bulgarian_training_data.txt'):
    """Load only the first lines of the preprocessed data file for quick experiments.

    Args:
        max_lines: Maximum number of lines to read from the start of the file.
        data_file: Path of the preprocessed training data file.

    Returns:
        list[str]: At most ``max_lines`` lines (newline included). If the file
        is shorter than ``max_lines``, all of its lines are returned instead of
        raising ``StopIteration``.

    Raises:
        FileNotFoundError: If ``data_file`` does not exist.
    """
    with open(data_file, 'r') as file:
        # zip stops as soon as either the counter or the file is exhausted, so
        # short files are handled and no line beyond the cap is read.
        return [line for _, line in zip(range(max_lines), file)]

def load_training_data(max_lines=1000000, data_file='bulgarian_training_data.txt'):
    """Return the training lines, building the preprocessed cache file if needed.

    If ``data_file`` does not exist, the raw books are read and filtered with
    ``read_and_filter_training_data()`` and the complete result is written to
    ``data_file``. Otherwise the lines are read straight from ``data_file``.
    In both cases at most ``max_lines`` lines are returned, so the result does
    not depend on whether the cache file already existed.

    Args:
        max_lines: Maximum number of lines to return, or ``None`` for no limit.
        data_file: Path of the preprocessed training data file.

    Returns:
        list[str]: The training lines (each including its trailing newline).
    """
    if not os.path.exists(data_file):
        data = read_and_filter_training_data()
        # The cache always receives the full data set; the cap applies only to
        # what is returned.
        with open(data_file, 'w') as file:
            file.writelines(data)
        if max_lines is not None:
            data = data[:max_lines]
    else:
        with open(data_file, 'r') as file:
            if max_lines is None:
                data = file.readlines()
            else:
                # Read lazily up to the cap. The previous code called
                # readlines() and then next(file), which always raised
                # StopIteration because the file was already exhausted.
                data = [line for _, line in zip(range(max_lines), file)]
    return data

In [48]:
def train_model(model, data, batch_size=256, num_epochs=6, initial_learning_rate=5e-4,
                checkpoint_path="training/cp-{epoch:02d}.weights.h5"):
    """Fine-tune ``model`` on ``data`` and save a weights checkpoint after each epoch.

    Args:
        model: The model to train; it is compiled here, so any previous
            compile configuration is replaced.
        data: Training examples accepted by
            ``tf.data.Dataset.from_tensor_slices`` (the notebook passes a list
            of text lines).
        batch_size: Number of examples per batch.
        num_epochs: Number of passes over the data set.
        initial_learning_rate: Starting learning rate of the decay schedule.
        checkpoint_path: File-name pattern for the per-epoch weight files;
            ``{epoch}`` is filled in by the checkpoint callback.

    Returns:
        The object returned by ``model.fit`` (the training history).
    """
    train_ds = (
        tf.data.Dataset.from_tensor_slices(data)
        .batch(batch_size)
        #.cache()
        .prefetch(tf.data.AUTOTUNE)
    )

    # Decay the learning rate from its initial value down to 0.0 over the
    # total number of training steps (batches per epoch * epochs).
    learning_rate = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate,
        decay_steps=train_ds.cardinality() * num_epochs,
        end_learning_rate=0.0,
    )
    # The loss is configured for raw logits (from_logits=True).
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=loss,
        weighted_metrics=["accuracy"],
    )

    # Save weights only (not the full model) at the end of every epoch.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                    save_weights_only=True,
                                                    verbose=1)

    return model.fit(train_ds, epochs=num_epochs, callbacks=[cp_callback])

In [51]:
# training_data = load_training_data()
# training_data = develop_fast_load_training_data()  # small subset for quick development runs

In [None]:
# train_model(gpt2_lm, training_data)

In [43]:
def ask_model(model, text, max_length=1000):
    """Generate a continuation of ``text`` with ``model``.

    Args:
        model: Object exposing ``generate(text, max_length=...)``; the notebook
            passes the GPT-2 causal language model.
        text: Prompt to continue.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 1000, the value previously hard-coded here.

    Returns:
        Whatever ``model.generate`` returns for the prompt.
    """
    return model.generate(text, max_length=max_length)


In [53]:
def print_model_output(model, text):
    """Run ``ask_model`` on the prompt ``text`` and print the result to stdout."""
    generated = ask_model(model, text)
    print(generated)

In [59]:
# Restore previously trained weights into the model and generate a continuation
# for a Bulgarian prompt.
# NOTE(review): this path differs from the "training/cp-{epoch:02d}.weights.h5"
# pattern that train_model writes -- confirm which checkpoint is meant to be loaded.
gpt2_lm.load_weights('training_overnight/cp.ckpt')
print_model_output(gpt2_lm, "Докато шофираш, за да превключиш скорост, трябва да")

Докато шофираш, за да превключиш скорост, трябва да го сториш следобед. Неговите хора само могат да го убиват — и на
