In [8]:
from transformers import (
    GPT2Tokenizer,
    TFGPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
import tensorflow as tf
from transformers import AutoTokenizer, TFBertForSequenceClassification
import pandas as pd


In [9]:
# Load BERT model and tokenizer for sentiment analysis.
# The tokenizer and the classifier are loaded from the same checkpoint name so that
# the vocabulary used for encoding matches the one the classifier was trained with.
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = TFBertForSequenceClassification.from_pretrained(sentiment_model_name)


Some layers from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [10]:
# Load GPT-2 model and tokenizer for story generation.
# Both come from the base "gpt2" checkpoint; the model is the TensorFlow
# language-modeling head variant that is fine-tuned and sampled from later on.
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = TFGPT2LMHeadModel.from_pretrained("gpt2")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [11]:
# Set the pad token to the EOS token.
# The training dataset is later built with padding='max_length', which needs the
# tokenizer to have a pad token; EOS is reused for that purpose.
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

In [46]:
# Read the folk-tale corpus from "folk_tales_deduplicated.csv"
df = pd.read_csv("folk_tales_deduplicated.csv", encoding='ISO-8859-1')
# Empty cells are loaded by pandas as float NaN, which would break the later
# .lower()/.strip() preprocessing, so drop them and force every entry to str.
stories = df["text"].dropna().astype(str).tolist()


In [47]:

# Normalise raw story text before it is tokenized
def preprocess_text(texts):
    """Return a new list with each text lower-cased and stripped of surrounding whitespace.

    Args:
        texts: iterable of strings.

    Returns:
        list of str, one entry per input text, in the same order.
    """
    cleaned = []
    for raw in texts:
        cleaned.append(raw.lower().strip())
    return cleaned

# Lower-cased, whitespace-trimmed copy of the corpus; used for both the sentiment
# pass and the GPT-2 training dataset further down.
processed_stories = preprocess_text(stories)

In [48]:
# Lazily tokenize the texts in fixed-size chunks, padding within each chunk
def create_batches(texts, batch_size):
    """Yield tokenized batches of ``texts`` for the sentiment model.

    Args:
        texts: sliceable sequence of strings.
        batch_size: number of texts per batch (the final batch may be smaller).

    Yields:
        The output of ``sentiment_tokenizer`` for each consecutive slice of
        ``texts``, requested as TensorFlow tensors with padding and truncation on.
    """
    offsets = range(0, len(texts), batch_size)
    chunks = (texts[offset:offset + batch_size] for offset in offsets)
    for chunk in chunks:
        # Padding is applied per chunk, so each batch is only as wide as its longest text
        yield sentiment_tokenizer(chunk, padding=True, truncation=True, return_tensors="tf")

In [49]:
# Run the sentiment classifier batch by batch and collect one class index per story
sentiments = []
for batch_inputs in create_batches(processed_stories, batch_size=10):
    # Forward pass, then take the highest-scoring class along the last axis
    batch_logits = sentiment_model(**batch_inputs).logits
    sentiments.extend(tf.argmax(batch_logits, axis=-1).numpy())

In [50]:

# Human-readable names for the classifier's class indices (position in the list = index)
sentiment_labels = ["very negative", "negative", "neutral", "positive", "very positive"]
# Per-label tally, every label starting at zero
sentiment_counts = dict.fromkeys(sentiment_labels, 0)

In [51]:
# Count occurrences of each sentiment.
# The tally is rebuilt from zero inside this cell so that re-running the cell
# cannot add the same predictions on top of an earlier count.
sentiment_counts = {label: 0 for label in sentiment_labels}
for sentiment in sentiments:
    # Each prediction is a class index into sentiment_labels
    sentiment_counts[sentiment_labels[sentiment]] += 1

In [52]:
# Report how many stories were assigned to each sentiment label
print("Sentiment Counts:")
for label, count in sentiment_counts.items():
    print(f"{label}: {count}")

# The dominant sentiment is the label with the largest tally (first one wins on ties)
highest_count_sentiment = max(sentiment_counts.items(), key=lambda item: item[1])[0]
print(f"Highest Count Sentiment: {highest_count_sentiment}")

Sentiment Counts:
very negative: 588
negative: 399
neutral: 31
positive: 1002
very positive: 918
Highest Count Sentiment: positive


In [53]:
def create_text_dataset(tokenizer, texts, block_size=512):
    """Build a next-token-prediction ``tf.data.Dataset`` from raw texts.

    Every usable text is tokenized to exactly ``block_size + 1`` ids (truncated or
    padded) and then split into an input sequence and a label sequence that is
    the same text shifted left by one token, so position ``t`` of the input is
    trained to predict token ``t + 1`` of the *same* text.

    Args:
        tokenizer: tokenizer callable that accepts a list of strings with
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            and returns a mapping containing ``"input_ids"``. It must have a
            pad token configured because ``padding='max_length'`` is used.
        texts: iterable of strings. Blank strings and non-string entries are
            skipped, since they would only produce rows of padding.
        block_size: length of each input/label sequence.

    Returns:
        tf.data.Dataset of ``(inputs, labels)`` pairs, both int32 with
        ``block_size`` ids per example.

    Raises:
        ValueError: if ``block_size`` is smaller than 1 or no usable text remains.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer.")

    # Drop entries that cannot contribute any real tokens before tokenizing
    usable_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    if not usable_texts:
        raise ValueError("Not enough valid sequences to create a dataset. Please check your input data.")

    # One extra token per row so that inputs and labels both end up block_size long
    token_ids = tokenizer(
        usable_texts,
        truncation=True,
        padding='max_length',
        max_length=block_size + 1,
        return_tensors="tf",
    )["input_ids"]

    # int32 is the expected type for input IDs
    token_ids = tf.cast(token_ids, tf.int32)

    # Shift by one token within each sequence: labels[t] is the token following inputs[t]
    inputs = token_ids[:, :-1]
    labels = token_ids[:, 1:]

    # NOTE(review): padded positions are still scored when a plain Keras loss is
    # used; consider passing sample weights derived from the attention mask.
    return tf.data.Dataset.from_tensor_slices((inputs, labels))



In [71]:
# Build the (inputs, labels) training dataset from the preprocessed stories
# and group it into batches of two examples
dataset = create_text_dataset(gpt_tokenizer, processed_stories).batch(2)

In [72]:
# Define the optimizer and loss for training (no metrics are configured here).
# NOTE(review): an earlier comment on this line said the learning rate had been
# lowered to 1e-6, but the value actually in effect is 5e-5 — confirm which one
# is intended before training.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is given raw, un-normalised model scores
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [73]:
# Compile the model with the Adam optimizer and sparse cross-entropy loss
# defined in the previous cell
gpt_model.compile(optimizer=optimizer, loss=loss_fn)

In [74]:
def create_text_dataset(tokenizer, texts, block_size=128):  # Reduce block size
    """Build a next-token-prediction ``tf.data.Dataset`` with a smaller default block.

    Redefines ``create_text_dataset`` with ``block_size=128`` as the default so
    that each training example is shorter. Every usable text is tokenized to
    exactly ``block_size + 1`` ids (truncated or padded) and split into an
    input sequence and a label sequence shifted left by one token.

    Args:
        tokenizer: tokenizer callable that accepts a list of strings with
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            and returns a mapping containing ``"input_ids"``. It must have a
            pad token configured because ``padding='max_length'`` is used.
        texts: iterable of strings. Blank strings and non-string entries are
            skipped, since they would only produce rows of padding.
        block_size: length of each input/label sequence.

    Returns:
        tf.data.Dataset of ``(inputs, labels)`` pairs, both int32 with
        ``block_size`` ids per example.

    Raises:
        ValueError: if ``block_size`` is smaller than 1 or no usable text remains.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer.")

    # Drop entries that cannot contribute any real tokens before tokenizing
    usable_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    if not usable_texts:
        raise ValueError("Not enough valid sequences to create a dataset. Please check your input data.")

    # One extra token per row so that inputs and labels both end up block_size long
    token_ids = tokenizer(
        usable_texts,
        truncation=True,
        padding='max_length',
        max_length=block_size + 1,
        return_tensors="tf",
    )["input_ids"]

    # int32 is the expected type for input IDs
    token_ids = tf.cast(token_ids, tf.int32)

    # Shift by one token within each sequence: labels[t] is the token following inputs[t]
    inputs = token_ids[:, :-1]
    labels = token_ids[:, 1:]

    return tf.data.Dataset.from_tensor_slices((inputs, labels))





In [76]:
# NOTE(review): this import sits in the middle of the notebook; it would normally
# live with the other imports in the first cell.
from tensorflow.keras import mixed_precision

# Enable mixed precision by installing 'mixed_float16' as the global Keras policy
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
# NOTE(review): gpt_model was already constructed in an earlier cell. A global
# policy presumably only applies to layers created after it is set — confirm
# that the loaded model actually runs in mixed precision.

# Compile the model again, with the same optimizer and loss, now that the policy is set
gpt_model.compile(optimizer=optimizer, loss=loss_fn)


In [None]:
# Fine-tune the model: a single pass (one epoch) over the batched story dataset
gpt_model.fit(dataset, epochs=1)



In [42]:
def generate_story(prompt, max_length=250, num_return_sequences=1, num_beams=5):
    """Generate one or more story continuations for ``prompt`` with beam search.

    Args:
        prompt: non-empty text the story should start with. It is truncated to
            at most 512 tokens before generation.
        max_length: upper bound passed to ``generate`` for the length of each
            returned sequence; per the generation API this counts the prompt
            tokens as well as the newly generated ones.
        num_return_sequences: how many of the best beams to return.
        num_beams: beam width used for the search.

    Returns:
        list of str: the decoded sequences (prompt included), special tokens removed.

    Raises:
        ValueError: if ``prompt`` is blank or not a string, or if
            ``num_return_sequences`` is not between 1 and ``num_beams``.
    """
    # A blank prompt tokenizes to nothing and would only fail deep inside generate()
    if not isinstance(prompt, str) or not prompt.strip():
        raise ValueError("prompt must be a non-empty string.")
    # Beam search cannot return more sequences than it keeps beams
    if not 1 <= num_return_sequences <= num_beams:
        raise ValueError("num_return_sequences must be between 1 and num_beams.")

    inputs = gpt_tokenizer(prompt, return_tensors="tf", truncation=True, max_length=512)

    # No temperature is passed: sampling is not enabled, and the previous value
    # of 1 was the neutral setting anyway.
    outputs = gpt_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=gpt_tokenizer.eos_token_id,  # EOS doubles as the padding id
        no_repeat_ngram_size=2,  # forbid repeating any 2-gram within a sequence
        num_beams=num_beams,
    )

    return [gpt_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]



In [43]:
# Create a prompt for story generation.
# The most frequent sentiment label found in the corpus is spliced into the
# opening sentence so the story is seeded with that mood.
input_prompt = f"Once upon a time in a land filled with {highest_count_sentiment} emotions, there was a..."


In [44]:
# Generate and display a story.
# generate_story returns a list of decoded strings, so generated_story is a list.
generated_story = generate_story(input_prompt, max_length=250)
print("Generated Story:")
# Bare expression: the notebook renders the list as the cell output
generated_story

Generated Story:


['Once upon a time in a land filled with positive emotions, there was a... earth springs happiness springs everlasting bliss everlasting life everlasting happiness there happiness one happiness of happiness, bliss life happiness lived bliss lived of life lived happiness existed happinesst of existed bliss paradise life there paradise lived happy existed happyt last there life unhappy lived a life\n day happiness land paradise paradise of unhappy land there unhappyt. untilted paradise.\n to and happiness. to good.. good one happy land the happyted unhappy island happiness\n until., happiness sea happinessland happinessless ofents happiness toents world happiness unhappy.entsents of theents heents\nents the happiness and the world the unhappy,ents,,. happiness the the bliss of happy, the paradise happiness life the lonely of lonely. lonely, unhappy paradise\n\n happiness bliss land happiness happiness happy touched touched. touched world happy happiness touched the touched, touched his h

In [45]:
# Save the GPT-2 model to ./gpt2_saved_model via save_pretrained.
# NOTE(review): the earlier comment said "TensorFlow SavedModel format", but no
# saved_model option is passed here; a SavedModel export presumably requires
# saved_model=True — confirm which format is wanted. The tokenizer is not saved
# by this call.
gpt_model.save_pretrained('./gpt2_saved_model')

