In [1]:
from transformers import (
    GPT2Tokenizer,
    TFGPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
import tensorflow as tf
from tensorflow.keras import mixed_precision
from transformers import AutoTokenizer, TFBertForSequenceClassification
import pandas as pd
import re

In [2]:
"""STOP"""

'STOP'

In [3]:
# Load BERT model and tokenizer for sentiment analysis
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = TFBertForSequenceClassification.from_pretrained(sentiment_model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [4]:
# Load GPT-2 model and tokenizer for story generation
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Set the pad token to the EOS token
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
# Read and clean stories from "stories.csv"
df = pd.read_csv("../Resources/Datasets/folk_tales_deduplicated.csv", encoding='ISO-8859-1')
column_values = df.iloc[:, 3]
clean_values = column_values.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s.]', '', str(x)))
df.iloc[:, 3] = clean_values
stories = df["text"].tolist()
print(len(stories))
df.head()

2938


Unnamed: 0,source,nation,title,text
0,https://fairytalez.com/momotaro/,japanese,Momotaro,If youll believe me there was a time when the ...
1,https://fairytalez.com/the-birdcatcher/,serbian,The Birdcatcher,Near Constantinople there lived a man who knew...
2,https://fairytalez.com/sharing-joy-and-sorrow/,german,Sharing Joy and Sorrow,There was once a tailor who was a quarrelsome ...
3,https://fairytalez.com/the-punishment-of-gangana/,french,The Punishment of Gangana,Once upon a time there lived a king and queen ...
4,https://fairytalez.com/the-peace-with-the-snakes/,north_american_native,The Peace with the Snakes,In those days there was a Piegan chief named O...


In [6]:
# Preprocess the text
def preprocess_text(texts):
    return [text.lower().strip() for text in texts]

processed_stories = preprocess_text(stories)
print(len(processed_stories))

2938


In [7]:
# Function to create batches and pad them
def create_batches(texts, batch_size):
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Tokenize and pad each batch
        batch_inputs = sentiment_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="tf")
        yield batch_inputs

# Process each batch separately for sentiment analysis
sentiments = []
for batch_inputs in create_batches(processed_stories, batch_size=10):
    # Generate predictions for the batch
    outputs = sentiment_model(**batch_inputs)
    batch_predictions = tf.argmax(outputs.logits, axis=-1)
    sentiments.extend(batch_predictions.numpy())

# Sentiment labels
sentiment_labels = ["very negative", "negative", "neutral", "positive", "very positive"]
sentiment_counts = {label: 0 for label in sentiment_labels}

# Count occurrences of each sentiment
for sentiment in sentiments:
    sentiment_counts[sentiment_labels[sentiment]] += 1

# Show sentiment counts
print("Sentiment Counts:")
for label, count in sentiment_counts.items():
    print(f"{label}: {count}")

# Identify the highest count sentiment
highest_count_sentiment = max(sentiment_counts, key=sentiment_counts.get)
print(f"Highest Count Sentiment: {highest_count_sentiment}")

Sentiment Counts:
very negative: 667
negative: 441
neutral: 38
positive: 954
very positive: 838
Highest Count Sentiment: positive


In [8]:
# Def for dataset

def create_text_dataset(tokenizer, texts, block_size=512):
    tokenized_texts = [
        tokenizer(text, truncation=True, padding='max_length', max_length=block_size, return_tensors="tf")["input_ids"]
        for text in texts
    ]

    # Convert all tensors to int32, which is the expected type for input IDs
    tokenized_texts = [tf.cast(tensor, tf.int32) for tensor in tokenized_texts]

    # Filter out tensors with a length of 0
    filtered_texts = [tensor for tensor in tokenized_texts if tf.shape(tensor)[1] > 0]

    if len(filtered_texts) < 2:
        raise ValueError("Not enough valid sequences to create a dataset. Please check your input data.")

    inputs = tf.concat(filtered_texts[:-1], axis=0)
    labels = tf.concat(filtered_texts[1:], axis=0)

    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
    return dataset
    
# Create and prepare the dataset for training
dataset = create_text_dataset(gpt_tokenizer, processed_stories)
dataset = dataset.batch(2)


In [12]:
# Define the optimizer, loss, and metrics for training
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)  # Reduced by a factor of 50. previously 5e-5(1e-6 is what it was changed to). This offers a slower convergence which can help capture more fine details.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Enable mixed precision
policy = mixed_precision.Policy('float32')
mixed_precision.set_global_policy(policy)

# Compile the model with mixed precision
gpt_model.compile(optimizer='adam', loss=loss_fn)

In [13]:
"""STOP"""

'STOP'

In [14]:
# Fine-tune the model
gpt_model.fit(dataset, epochs=12)

Epoch 1/12
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tf_keras.src.callbacks.History at 0x2127a264990>

In [15]:
"""STOP"""

'STOP'

In [16]:
def generate_story(prompt, max_length=250):
    inputs = gpt_tokenizer(prompt, return_tensors="tf", truncation=True, max_length=512)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    outputs = gpt_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,  # Generate multiple sequences
        pad_token_id=gpt_tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        num_beams=5,  # Use beam search with 5 beams
        temperature=.7
    )

    stories = [gpt_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return stories


In [17]:
# Create a prompt for story generation
input_prompt = f"Once upon a time in a land filled with {highest_count_sentiment} emotions, there was a..."

# Generate and display a story
generated_story = generate_story(input_prompt, max_length=250)
print("Generated Story:")
generated_story

Generated Story:


['Once upon a time in a land filled with positive emotions, there was a... the. the the and the a the to.. a and. and and a. to the he. he the of the was the it the his the in the said the that the they the had the i the you the for the on the as the one the him the with the so the not the at the if the came the is the but the me the this the when the will the do the have the out the she the there the be the would the them the man the went the all the my the very the her thethe the some the who the by the up the then the\xa0 the are the little the were the into the down the from the good the no the what the o the go the which the time the we the day the an the about the over the can the get the away the']

In [45]:
tf.random.set_seed(12)

# encode context the generation is conditioned on
sample_input = 'The King heard a devil dwelled in the castle atop the hill'
input_ids = gpt_tokenizer.encode(sample_input, return_tensors='tf')

# use temperature to decrease the sensitivity to low probability candidates
sample_outputs = gpt_model.generate(
    input_ids, 
    do_sample=True, 
    max_length=250, 
    top_k=369, 
    top_p=0.96, 
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, gpt_tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: The King heard a devil dwelled in the castle atop the hill well way as journey of had get. had from and the to the it give and a this him were then the his you was his the but son one night cameman made his. each the have as a had of an what the to one he me were to his you for.. so and. said he of have gold in to. eat a. my he do me right the who water door he soon prince a in were and said which soon found what to goat had eyes will your not began now down a over when they it look the you. never here he gold the. and never it put he day looked ain a a i the say said out tail replied an a a youngd and may him to to made as that fox dog it the been from them get the it to his will am to back it left when out a goar. will the a with. g orwell soon some go. you with and where said it heard and and and that who the off want and to you this people would began be who the would him

In [30]:
tf.random.set_seed(2)

# encode context the generation is conditioned on
sample_input = 'Tell me a tale of the castle upon the hill...'
input_ids = gpt_tokenizer.encode(sample_input, return_tensors='tf')

# use temperature to decrease the sensitivity to low probability candidates
sample_output = gpt_model.generate(
    input_ids, 
    do_sample=True, 
    max_length=150, 
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(gpt_tokenizer.decode(sample_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Tell me a tale of the castle upon the hill... a had village three f. as in he tree side castmy the ofrow forestens put wereah dontak a high your against for and quite along came he sleepo something the human for some. we he needle i have and one i here he throughnowyou the he stone together the out of cast that mistress and mouse at my in as too friend beautiful them aah no and the tried father. off spotas boy o good sea that an his had the and the said egg the tomorrow but fast ind. i is quick be whatever. years to and a so in and women the and points and m indeed and good at gone the is tears you teach of their robbers the stopped a


In [29]:
tf.random.set_seed(0)

# encode context the generation is conditioned on
sample_input = 'King father daughter child...'
input_ids = gpt_tokenizer.encode(sample_input, return_tensors='tf')

# use temperature to decrease the sensitivity to low probability candidates
sample_output = gpt_model.generate(
    input_ids, 
    do_sample=True, 
    max_length=150, 
    top_p=0.92,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(gpt_tokenizer.decode(sample_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
King father daughter child... there time as lived and near river good beautifulra uncle land walking man but he whenevern number another soon a that they small was by the as. here for mothers he go of had him far and went was. could end met under. her loved came thought had day with for angry re back distress crept. togetherbutplei the. spring was of two the appeared soon of mans st disan that were the said cat outside but him dam get gave not these them. she the. her mistress in would happen in your it the their may to into sang large came stranger gave was ah a that my day. hill came how. a feet his of with nothing says and master that sorrow bright one darkness ill the and lost


In [20]:
# Save the GPT-2 model in the TensorFlow SavedModel format
gpt_model.save_pretrained('./pt_bert_bot')