In [1]:
import re

import pandas as pd
import tensorflow as tf
from transformers import (
    GPT2Tokenizer,
    TFGPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from transformers import AutoTokenizer, TFBertForSequenceClassification


In [2]:
# Sentiment analysis stack: tokenizer and classifier loaded from the same checkpoint
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer, sentiment_model = (
    AutoTokenizer.from_pretrained(sentiment_model_name),
    TFBertForSequenceClassification.from_pretrained(sentiment_model_name),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/670M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [3]:
# Story generation stack: GPT-2 tokenizer and language-model head from one checkpoint name
gpt_checkpoint = "gpt2"
gpt_tokenizer = GPT2Tokenizer.from_pretrained(gpt_checkpoint)
gpt_model = TFGPT2LMHeadModel.from_pretrained(gpt_checkpoint)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [4]:
# Reuse the end-of-sequence token as the padding token so the GPT-2 tokenizer can pad
# sequences (create_text_dataset below tokenizes with padding='max_length').
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

In [5]:
# Read the folk-tale corpus from "folk_tales_deduplicated.csv" (decoded as ISO-8859-1)
df = pd.read_csv("folk_tales_deduplicated.csv", encoding='ISO-8859-1')

# Drop rows with a missing "text" value and coerce the rest to str, so the later
# .lower()/.strip() preprocessing cannot fail on NaN or numeric cells.
stories = df["text"].dropna().astype(str).tolist()

# Build one long string from the fourth column (position 3), keeping only ASCII letters,
# digits, whitespace and periods.
# NOTE(review): concatenated_string is not used by any later cell visible in this notebook.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9\s.]')
column_values = df.iloc[:, 3]
clean_values = column_values.apply(lambda x: non_alnum_pattern.sub('', str(x)))
concatenated_string = ' '.join(clean_values)


In [6]:

# Text normalization applied before sentiment analysis and fine-tuning
def preprocess_text(texts):
    """Lower-case every text and trim its surrounding whitespace.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with the normalized strings, in the same order as the input.
    """
    normalized = []
    for raw_text in texts:
        normalized.append(raw_text.lower().strip())
    return normalized

# Normalized copy of the raw stories; `stories` itself is left untouched
processed_stories = preprocess_text(texts=stories)

In [7]:
# Function to create batches and pad them
def create_batches(texts, batch_size, tokenizer=None):
    """Yield tokenized, padded batches built from consecutive slices of ``texts``.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.
        batch_size: Positive number of texts per batch. The final batch may be smaller.
        tokenizer: Callable used to encode each batch. When omitted, the notebook-level
            ``sentiment_tokenizer`` is used.

    Yields:
        The tokenizer's output for each batch, requested as TensorFlow tensors with
        padding and truncation enabled.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a generator,
            the error surfaces when iteration starts, not when the function is called.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}.")
    if tokenizer is None:
        tokenizer = sentiment_tokenizer

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        # Tokenize and pad each batch
        yield tokenizer(batch_texts, padding=True, truncation=True, return_tensors="tf")

In [8]:
# Classify the stories ten at a time and collect the predicted class index of each story
sentiments = []
for encoded_batch in create_batches(processed_stories, batch_size=10):
    # The highest-scoring logit along the last axis is the predicted sentiment class
    batch_logits = sentiment_model(**encoded_batch).logits
    sentiments.extend(tf.argmax(batch_logits, axis=-1).numpy())

In [9]:

# Human-readable names for the predicted class indices (index 0 first), plus a zeroed tally
sentiment_labels = [
    "very negative",
    "negative",
    "neutral",
    "positive",
    "very positive",
]
sentiment_counts = dict.fromkeys(sentiment_labels, 0)

In [10]:
# Count occurrences of each sentiment.
# The tally is rebuilt from zero here so that re-running this cell does not add the
# same predictions to the counts a second time.
sentiment_counts = {label: 0 for label in sentiment_labels}
for sentiment in sentiments:
    sentiment_counts[sentiment_labels[sentiment]] += 1

In [11]:
# Report how many stories fell into each sentiment class
print("Sentiment Counts:")
for label in sentiment_counts:
    count = sentiment_counts[label]
    print(f"{label}: {count}")

# The most frequent sentiment (first one wins on ties) seeds the generation prompt later on
highest_count_sentiment = max(sentiment_counts, key=lambda name: sentiment_counts[name])
print(f"Highest Count Sentiment: {highest_count_sentiment}")

Sentiment Counts:
very negative: 588
negative: 399
neutral: 31
positive: 1002
very positive: 918
Highest Count Sentiment: positive


In [12]:
def create_text_dataset(tokenizer, texts, block_size=512):
    """Build a next-token-prediction dataset for causal language-model fine-tuning.

    Every text is tokenized to exactly ``block_size + 1`` token ids (truncated or padded).
    The first ``block_size`` ids become the model input and the same sequence shifted left
    by one token becomes the label, so the label at position ``t`` is the token that
    follows input position ``t`` *within the same text*. (Pairing one story's tokens with
    the next story's tokens, as an earlier version did, gives the loss nothing meaningful
    to learn.)

    Args:
        tokenizer: Tokenizer with a pad token configured; called with a list of strings.
        texts: Iterable of strings. Empty / whitespace-only entries are skipped.
        block_size: Number of tokens per training example.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, labels)`` pairs, each an int32 tensor of
        shape ``(block_size,)``.

    Raises:
        ValueError: If ``block_size`` is not positive or no usable text remains.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be positive, got {block_size!r}.")

    # With padding='max_length' every encoding has the same length, so emptiness has to
    # be checked on the text itself rather than on the tokenized tensor's shape.
    valid_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    if not valid_texts:
        raise ValueError("Not enough valid sequences to create a dataset. Please check your input data.")

    # One extra token per example so that inputs and shifted labels both have block_size ids.
    encoded = tokenizer(
        valid_texts,
        truncation=True,
        padding='max_length',
        max_length=block_size + 1,
        return_tensors="tf",
    )
    # Convert to int32, which is the expected type for input IDs
    token_ids = tf.cast(encoded["input_ids"], tf.int32)

    inputs = token_ids[:, :-1]
    labels = token_ids[:, 1:]
    # NOTE: padded positions are not masked here; with the plain
    # SparseCategoricalCrossentropy used in this notebook they contribute to the loss.
    return tf.data.Dataset.from_tensor_slices((inputs, labels))



In [13]:
# Tokenize the preprocessed stories and group the training examples into batches of two
dataset = create_text_dataset(gpt_tokenizer, processed_stories).batch(2)

In [14]:
# Optimizer and loss for fine-tuning GPT-2.
# NOTE(review): a previous comment here said the learning rate had been lowered to 1e-6
# for slower convergence, but the value actually passed is 5e-5 - confirm which is intended.
LEARNING_RATE = 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the loss is given raw scores rather than probabilities
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [15]:
# Attach the optimizer and loss defined above to the GPT-2 model
gpt_model.compile(
    loss=loss_fn,
    optimizer=optimizer,
)

In [16]:
def create_text_dataset(tokenizer, texts, block_size=128):  # Reduce block size
    """Build a next-token-prediction dataset using a smaller default block size.

    This redefinition replaces the earlier ``create_text_dataset`` for every later call.
    It used to end in a bare ``...`` placeholder, which made it return ``None``; it is
    now a complete implementation.

    NOTE: the ``dataset`` variable was already built by an earlier cell with the previous
    definition. Call this function again and re-batch the result if training should
    actually use the smaller block size.

    Args:
        tokenizer: Tokenizer with a pad token configured; called with a list of strings.
        texts: Iterable of strings. Empty / whitespace-only entries are skipped.
        block_size: Number of tokens per training example.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, labels)`` pairs, each an int32 tensor of
        shape ``(block_size,)``, where ``labels`` is ``inputs`` shifted left by one token
        within the same text.

    Raises:
        ValueError: If ``block_size`` is not positive or no usable text remains.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be positive, got {block_size!r}.")

    valid_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    if not valid_texts:
        raise ValueError("Not enough valid sequences to create a dataset. Please check your input data.")

    # One extra token per example so that inputs and shifted labels both have block_size ids.
    encoded = tokenizer(
        valid_texts,
        truncation=True,
        padding='max_length',
        max_length=block_size + 1,
        return_tensors="tf",
    )
    token_ids = tf.cast(encoded["input_ids"], tf.int32)

    inputs = token_ids[:, :-1]
    labels = token_ids[:, 1:]
    return tf.data.Dataset.from_tensor_slices((inputs, labels))





In [17]:
from tensorflow.keras import mixed_precision

# Enable mixed precision by making 'mixed_float16' the global Keras dtype policy.
# NOTE(review): gpt_model was created in an earlier cell, before this policy is set.
# Presumably a global policy only affects layers built afterwards - confirm that this
# cell actually changes how gpt_model trains, or set the policy before loading the model.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Re-compile the model with the same optimizer and loss objects as the earlier compile cell
gpt_model.compile(optimizer=optimizer, loss=loss_fn)


In [None]:
# Fine-tune GPT-2 on the batched story dataset
NUM_EPOCHS = 10
gpt_model.fit(dataset, epochs=NUM_EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [None]:
def generate_story(prompt, max_length=250):
    """Generate story text that continues ``prompt`` using the notebook's GPT-2 model.

    Args:
        prompt: Non-empty seed text.
        max_length: Value passed to ``generate`` as ``max_length``. The prompt is
            truncated to at most ``max_length - 1`` tokens (and never more than 512) so
            that the generation limit is always larger than the prompt length.

    Returns:
        A list of decoded strings, one per returned sequence (a single story here).

    Raises:
        ValueError: If ``prompt`` is empty / whitespace-only or ``max_length`` is below 2.
    """
    if not prompt or not prompt.strip():
        raise ValueError("prompt must be a non-empty string.")
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to leave room for generated tokens.")

    # The old fixed limit of 512 prompt tokens could exceed max_length (250 by default).
    prompt_token_limit = min(512, max_length - 1)
    inputs = gpt_tokenizer(
        prompt,
        return_tensors="tf",
        truncation=True,
        max_length=prompt_token_limit,
    )

    # Beam search without sampling: no sampling parameters (e.g. temperature) are passed.
    outputs = gpt_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,  # One story per prompt
        pad_token_id=gpt_tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        num_beams=5,  # Use beam search with 5 beams
    )

    return [gpt_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]



In [None]:
# Create a prompt for story generation; the most frequent sentiment label found by the
# sentiment analysis above (highest_count_sentiment) is interpolated into the opening line.
input_prompt = f"Once upon a time in a land filled with {highest_count_sentiment} emotions, there was a..."


In [None]:
# Generate from the prompt; generate_story returns a list of decoded sequences,
# which the cell displays as its last expression
generated_story = generate_story(prompt=input_prompt, max_length=250)
print("Generated Story:")
generated_story