In [22]:
# For Colab only: mount Google Drive at /content/drive so files stored there
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Optional (disabled): change into the repo checkout on Drive and pull the latest changes.
# %cd /content/drive/MyDrive/Github/generative-text
# !git pull

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# import the necessary libraries
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL']='2' #Trying to reduce tensorflow warnings
import re
import math
import string
# import hw_utils # LOADS HW CODE (helps de-clutter this notebook)
import time
import json
import random
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from pathlib import Path

# useful structures and functions for experiments 
from time import sleep
from collections import Counter
from collections import defaultdict
from glob import glob

# specific machine learning functionality
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.utils.layer_utils import count_params
from sklearn.model_selection import train_test_split
from sklearn import manifold
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, confusion_matrix
try:
  from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
  from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
except:
  print("transformers library not installed, installing through pip")
  !pip install transformers
  from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
  from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

In [24]:
# Download nltk's punkt sentence tokenizer data.
nltk.download('punkt')
# Download nltk's stop-word lists. This is the last expression in the cell, so
# its return value is what the notebook displays as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Eager-execution toggle. Both switches below are commented out, so whatever
# TensorFlow defaults to is what gets reported.
# Reference: https://www.tensorflow.org/guide/eager
# Eager execution evaluates operations immediately instead of building graphs.

#tf.compat.v1.disable_eager_execution()
# tf.compat.v1.enable_eager_execution()

# Library versions and execution mode
print(f"tensorflow version {tf.__version__}")
print(f"keras version {tf.keras.__version__}")
print(f"Eager Execution Enabled: {tf.executing_eagerly()}")

# Distribution strategy: report how many replicas it synchronises across
strategy = tf.distribute.MirroredStrategy()
print(f"Number of replicas: {strategy.num_replicas_in_sync}")

# Devices visible to this runtime
devices = tf.config.experimental.get_visible_devices()
print(f"Devices: {devices}")
print(tf.config.experimental.list_logical_devices('GPU'))

print(f"GPU Available:  {tf.config.list_physical_devices('GPU')}")
print(f"All Physical Devices {tf.config.list_physical_devices()}")

# Let tf.data pick buffer sizes itself; used later when prefetching.
# Reference: https://www.tensorflow.org/guide/data_performance
AUTOTUNE = tf.data.experimental.AUTOTUNE

tensorflow version 2.7.0
keras version 2.7.0
Eager Execution Enabled: True
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of replicas: 1
Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
GPU Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
All Physical Devices [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [26]:
# Concatenate every training text in the data directory into one aggregate file.
p = Path('/content/drive/Othercomputers/macbook/generative-text/text_generator/data')
aggregate = p / "aggregate.txt"

# aggregate.txt is written into the same directory that is globbed. Once it
# exists (any re-run of this cell) it would be picked up as an input while it is
# simultaneously open for writing, feeding the output back into itself. Exclude
# it explicitly. sorted() fixes the concatenation order, which glob() does not
# guarantee, so repeated runs produce the same corpus.
training_files = sorted(
    txt_path for txt_path in p.glob('*.txt') if txt_path != aggregate
)

with open (aggregate, 'w') as a:
    for file in training_files:
        with open (file, 'r') as f:
            a.write(f.read())


In [27]:
# Read the aggregated corpus; the file handle is only needed for this one read.
with open (aggregate, 'r') as file_contents:
    raw_text = file_contents.read()

# Strip everything from "Page | <number> " to the end of that line, then
# flatten the text by turning each newline into a space.
training_data = re.sub(r'Page \| \d+ .*', '', raw_text).replace('\n', ' ')

# Load tokenizer and encode the whole corpus into one sequence of token ids
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
training_data_encoded = tokenizer.encode(training_data)

# Cut the token stream into consecutive, non-overlapping blocks of block_size
# tokens; a trailing partial block is dropped.
block_size = 100
training_chunks = [
    training_data_encoded[start:start + block_size]
    for start in range(0, len(training_data_encoded) - block_size + 1, block_size)
]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (344074 > 1024). Running this sequence through the model will result in indexing errors


In [28]:
# Next-token setup: each label sequence is its input sequence shifted left by
# one position (inputs drop the last token, labels drop the first).
inputs = [chunk[:-1] for chunk in training_chunks]
labels = [chunk[1:] for chunk in training_chunks]

print("inputs length:", len(inputs))
print("labels length:", len(labels))

inputs length: 3440
labels length: 3440


In [29]:
BATCH_SIZE = 12
# Shuffle buffer spans the whole dataset
TRAIN_SHUFFLE_BUFFER_SIZE = len(inputs)

# Training pipeline as a single chain:
# slices -> shuffle -> fixed-size batches (remainder dropped) -> prefetch
train_data = (
    tf.data.Dataset.from_tensor_slices((inputs, labels))
    .shuffle(buffer_size=TRAIN_SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(buffer_size=AUTOTUNE)
)

print("train_data", train_data)

train_data <PrefetchDataset shapes: ((12, 99), (12, 99)), types: (tf.int32, tf.int32)>


In [30]:
# Load the pretrained "distilgpt2" checkpoint (the same name the tokenizer was
# loaded from) as a TensorFlow GPT-2 language-model-head model to fine-tune.
model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [31]:
# Fine-tuning hyperparameters
learning_rate = 3e-5 
epsilon=1e-08
clipnorm=1.0
epochs = 30

# Adam with gradient-norm clipping
optimizer = keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=clipnorm)
# Loss: sparse categorical cross-entropy on raw logits (from_logits=True)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = keras.metrics.SparseCategoricalAccuracy('accuracy')

# Compile
# The loss list holds the real loss first, followed by one None per transformer
# layer (model.config.n_layer).
# NOTE(review): presumably this applies the loss only to the first model output
# (the logits) and ignores the per-layer outputs; this depends on the installed
# transformers version, so confirm before upgrading.
model.compile(loss=[loss, *[None] * model.config.n_layer],
                  optimizer=optimizer,
                  metrics=[metric])

# Train model and report wall-clock duration in minutes
start_time = time.time()
training_results = model.fit(
        train_data, # train_data.take(1000) for testing
        epochs=epochs, 
        verbose=1)
execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training execution time (mins) 40.878454033533735


In [33]:
# Save the trained model under a timestamped name (YYYYMMDD_HHMMSS) so repeated
# runs do not overwrite each other.
# NOTE(review): this import sits mid-notebook; the other imports are in the
# import cell near the top.
from datetime import datetime
right_now = datetime.today().strftime("%Y%m%d_%H%M%S")
# The model is written inside the data directory `p`, next to the training texts.
model_file = p / (right_now)
model.save(model_file, save_format="tf")



INFO:tensorflow:Assets written to: /content/drive/Othercomputers/macbook/generative-text/text_generator/data/20220213_070815/assets


INFO:tensorflow:Assets written to: /content/drive/Othercomputers/macbook/generative-text/text_generator/data/20220213_070815/assets


In [34]:
# Generate a sample continuation from the fine-tuned model.

# Input text (the prompt the model continues)
input_text = "Here we are again"

# Tokenize input; return_tensors='tf' returns the ids as a TensorFlow tensor
input_ids = tokenizer.encode(input_text, return_tensors='tf')
print("input_ids",input_ids)

# Generate output
# Sampling is enabled (do_sample=True) with top_p=0.80 and top_k=0; max_length=75
# bounds the generated sequence length.
# NOTE(review): no random seed is set in the visible cells, so the generated
# text will presumably differ from run to run.
outputs = model.generate(
    input_ids, 
    do_sample=True,
    max_length=75, 
    top_p=0.80, 
    top_k=0
)

# Decode the first (only) generated sequence back to text, dropping special tokens
print("Generated text:")
display(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


input_ids tf.Tensor([[4342  356  389  757]], shape=(1, 4), dtype=int32)
Generated text:


'Here we are again at an age when Europe is awash in the imagination. It is no secret that such an age can become very sophistical, that such a plan actually cannot be realized, because one has heard too much about it and too little about the meaning of such a plan. What we must prevent, however, is the truly tragic, for this is precisely what'

In [37]:
# Commit and push the notebook changes from the repo checkout on Drive.
%cd /content/drive/MyDrive/Github/generative-text/
# Stage everything in the working tree, then commit (-a also stages tracked modifications).
!git add .
!git commit -a -m "Finished training model on Colab"
# NOTE(review): the captured output shows this push failing with
# "could not read Username for 'https://github.com'", i.e. no credentials are
# available to git in this runtime.
!git push

/content/drive/MyDrive/Github/generative-text
[main 8b2278a] Finished training model on Colab
 1 file changed, 1 insertion(+)
 create mode 100644 text_generator/text_generation_notebook (1).ipynb
fatal: could not read Username for 'https://github.com': No such device or address


In [36]:
# Set the global git author identity (email and user name) for this runtime.
# NOTE(review): this cell's execution count (36) is lower than the commit cell's
# (37) even though it appears after it in the notebook; on a fresh
# top-to-bottom run it likely needs to come before the commit. Confirm and reorder.
!git config --global user.email evje.eric@gmail.com
!git config --global user.name ericevje
