<a href="https://colab.research.google.com/github/gkiflex/MSAI-630-A01/blob/master/GptWineReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# GPT Wine Review Generator - Google Colab Version
import numpy as np
import json
import re
import string
import pandas as pd
from IPython.display import display, HTML

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

print("TensorFlow version:", tf.__version__)

# Parameters
VOCAB_SIZE = 10000
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
BATCH_SIZE = 32
EPOCHS = 5

tf.random.set_seed(42)
np.random.seed(42)

# Load dataset from sample_data folder
df = pd.read_csv('/content/winemag-data-130k-v2.csv')
print(f"Dataset loaded: {df.shape}")

# Convert to wine data format
wine_data = []
for _, row in df.iterrows():
    wine_data.append({
        'country': row.get('country'),
        'province': row.get('province'),
        'variety': row.get('variety'),
        'description': row.get('description')
    })

print(f"Sample wine review:\n{wine_data[10]}")

# Filter and format dataset
filtered_data = [
    "wine review : "
    + str(x["country"])
    + " : "
    + str(x["province"])
    + " : "
    + str(x["variety"])
    + " : "
    + str(x["description"])
    for x in wine_data
    if x["country"] is not None
    and x["province"] is not None
    and x["variety"] is not None
    and x["description"] is not None
]

n_wines = len(filtered_data)
print(f"{n_wines} wine reviews processed")

# Tokenization - pad punctuation
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}, '\\n'])", r" \\1 ", s)
    s = re.sub(" +", " ", s)
    return s

text_data = [pad_punctuation(x) for x in filtered_data]

# Create TensorFlow dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

# Create vectorization layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

print("Vocabulary size:", len(vocab))

# Prepare training data
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

train_ds = text_ds.map(prepare_inputs)

# Causal attention mask
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)

# Transformer Block
class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(num_heads, key_dim, output_shape=embed_dim)
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output, attention_scores = self.attn(
            inputs, inputs, attention_mask=causal_mask, return_attention_scores=True
        )
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        config = super().get_config()
        config.update({
            "key_dim": self.key_dim,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

# Token and Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_len": self.max_len,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# Build GPT model
inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])

print("GPT Model created successfully!")
gpt.summary()

# Text Generator
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {word: index for index, word in enumerate(index_to_word)}

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({
                "prompt": start_prompt,
                "word_probs": probs,
                "atts": att[0, :, -1, :],
            })
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\\nGenerated text (temperature={temperature}):\\n{start_prompt}\\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("wine review", max_tokens=60, temperature=1.0)

# Training callbacks
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint.weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

text_generator = TextGenerator(vocab)

# Train model
print("Starting training...")
history = gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, text_generator],
    verbose=1
)

print("Training completed!")

# Generate with different temperatures
print("="*60)
print("GENERATION WITH DIFFERENT TEMPERATURES")
print("="*60)

temperatures = [0.2, 0.5, 0.8, 1.0, 1.2]
prompts = ["wine review : us", "wine review : italy", "wine review : france"]

results = {}

for temp in temperatures:
    print(f"\\n{'='*20} TEMPERATURE {temp} {'='*20}")
    results[temp] = {}

    for prompt in prompts:
        print(f"\\nPrompt: '{prompt}'")
        info = text_generator.generate(prompt, max_tokens=60, temperature=temp)
        results[temp][prompt] = info[-1]['prompt'] if info else "No generation"

# Analysis output
print("\\n" + "="*80)
print("TEMPERATURE ANALYSIS")
print("="*80)

for prompt in prompts:
    print(f"\\nPROMPT: {prompt}")
    print("-" * 40)
    for temp in temperatures:
        generated_text = results[temp].get(prompt, "No generation")
        display_text = generated_text[:100] + "..." if len(generated_text) > 100 else generated_text
        print(f"T={temp}: {display_text}")

# Save model
gpt.save("gpt_wine_model.keras")
print("\\n🍷 Model saved as 'gpt_wine_model.keras'")
print("Training and analysis complete!")

TensorFlow version: 2.19.0
Dataset loaded: (129971, 14)
Sample wine review:
{'country': 'US', 'province': 'California', 'variety': 'Cabernet Sauvignon', 'description': 'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.'}
129971 wine reviews processed
Vocabulary size: 10000
GPT Model created successfully!


Starting training...
Epoch 1/5
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 2.1564\nGenerated text (temperature=1.0):\nwine review \1 romania \1 \1 \1 \1 \1 \1 m \1 \1 grenache \1 \1 \1 \1 \1 \1 \1 \1 aged \1 in \1 both \1 the \1 dominant \1 bottle \1 \1 30 \1 \1 \1 \1 \1 \1 \1 \1 \1 and \1 chardonnay \1 5 \1 \1 bourboulenc \1 with \1 10 \1 \1 cabernet\n
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 39ms/step - loss: 2.1563
Epoch 2/5
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 1.4884\nGenerated text (temperature=1.0):\nwine review \1 \1 \1 \1 chile \1 \1 \1 colchagua \1 valley \1 \1 \1 red \1 blend \1 \1 \1 aromas \1 of \1 vanilla \1 \1 wood \1 grain \1 and \1 violet \1 lead \1 to \1 a \1 fresh \1 palate \1 with \1 red \1 plum \1 \1 berry \1 and \1 cassis \1\n
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 27ms/step - loss: 1.4884
Epoch 3/5
[1m4061/4062[0m