In [0]:
# Restart this notebook's Python process via the Databricks `dbutils` utility.
# Any Python state defined before this point is lost after the restart.
# NOTE(review): usually run right after installing/upgrading libraries so the
# new versions get imported; no install step is visible here - confirm intent.
dbutils.library.restartPython()


In [0]:
import numpy as np

# Environment check: report which NumPy version this Python process imports.
print(f"numpy: {np.__version__}")



In [0]:
# Environment check: confirm TensorFlow imports and report its version ...
import tensorflow as tf
print(f"tf: {tf.__version__}")

# ... then confirm the bundled Keras layers API imports as well.
from tensorflow.keras import layers
print("tf.keras OK")


In [0]:
import re
import tensorflow as tf
from tensorflow.keras import layers

# Read the `default.wiki_movie_plots_deduped` table through Spark, then
# convert it to a pandas DataFrame (toPandas() collects every row locally).
movies_sdf = spark.table("default.wiki_movie_plots_deduped")
df = movies_sdf.toPandas()

print(f"Rows: {len(df)}")

# clean the data
def clean_text(s):
    """Normalize a raw cell value into a single-line, trimmed string.

    Args:
        s: Any value; typically a string, ``None``, or a float NaN standing
            in for a missing cell.

    Returns:
        ``""`` when ``s`` is ``None`` or a float NaN. Otherwise ``str(s)``
        with non-breaking spaces replaced by regular spaces, every run of
        whitespace collapsed to a single space, and both ends trimmed.
    """
    # Missing values: None, or a float NaN (the only float that is not equal
    # to itself). Without the NaN check a missing cell would be turned into
    # the literal text "nan" and treated as real content downstream.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    # Force to a string and replace non-breaking spaces with regular spaces.
    s = str(s).replace("\u00a0", " ")
    # Collapse runs of spaces, tabs and newlines into one space, then remove
    # leading and trailing whitespace.
    s = re.sub(r"\s+", " ", s).strip()
    return s

def safe_year(y):
    """Render a release year as a string of digits, or ``""`` if unusable.

    The value is passed through ``int()``, so a float such as ``1999.0``
    becomes ``"1999"``. No check on the number of digits is performed.
    Anything ``int()`` cannot convert (``None``, NaN, non-numeric text)
    yields ``""``.
    """
    try:
        year_number = int(y)
    except Exception:
        # Best-effort field: an unusable year is simply left blank.
        return ""
    return str(year_number)

def take_cast(cast, n=5):
    """Return the first ``n`` names of a comma-separated cast string.

    The raw value is normalized with ``clean_text`` first. Names are split
    on commas and trimmed, blank entries are discarded, and the first ``n``
    remaining names are joined back together with ``", "``. A cast that is
    empty after cleaning yields ``""``.
    """
    cleaned = clean_text(cast)
    if not cleaned:
        return ""

    names = []
    for piece in cleaned.split(","):
        name = piece.strip()
        if name:
            names.append(name)
    return ", ".join(names[:n])

# Turn each movie (row) into a structured text block.
def row_to_doc(r):
    """Format one movie row as a labelled, multi-line text document.

    Args:
        r: Row object indexable by the column names "Title", "Release Year",
            "Origin/Ethnicity", "Director", "Genre", "Cast" and "Plot".

    Returns:
        ``""`` when the cleaned title or plot is empty. Otherwise one
        ``"Label: value"`` line per field, in the order Title, Year, Origin,
        Director, Cast, Genre, Plot, joined with newlines and stripped of
        surrounding whitespace. The cast is limited to its first 5 names.
    """
    movie_title = clean_text(r["Title"])
    release_year = safe_year(r["Release Year"])
    origin = clean_text(r["Origin/Ethnicity"])
    director = clean_text(r["Director"])
    genre = clean_text(r["Genre"])
    top_cast = take_cast(r["Cast"], n=5)
    synopsis = clean_text(r["Plot"])

    # A document without a title or a plot is not worth keeping.
    if not (movie_title and synopsis):
        return ""

    labelled_fields = (
        ("Title", movie_title),
        ("Year", release_year),
        ("Origin", origin),
        ("Director", director),
        ("Cast", top_cast),
        ("Genre", genre),
        ("Plot", synopsis),
    )
    return "\n".join(f"{label}: {value}" for label, value in labelled_fields).strip()

# Build one text document per row; rows that produced "" are dropped.
docs = [doc for doc in (row_to_doc(row) for _, row in df.iterrows()) if doc]

# Hyper-parameters for the unsupervised (next-token) training setup.
VOCAB_SIZE = 20_000  # max_tokens handed to the text vectorizer
SEQ_LEN = 128        # number of tokens per training sequence
BATCH_SIZE = 128     # examples per gradient update

# Wrap the documents in a tf.data pipeline and shuffle them.
text_ds = tf.data.Dataset.from_tensor_slices(docs)
text_ds = text_ds.shuffle(10000, seed=42)

# Tokenizer: lower-case, strip punctuation, split on whitespace, and map
# each token to an integer id.
vectorizer = layers.TextVectorization(
    output_mode="int",
    split="whitespace",
    standardize="lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
)
# Learn the vocabulary from the documents, 256 at a time.
vectorizer.adapt(text_ds.batch(256))
vocab = vectorizer.get_vocabulary()

print(f"Vocab size: {len(vocab)}")
print(f"Vocab sample: {vocab[:10]}")

def make_windows(token_ids):
    """Slice a sequence of token ids into fixed-length windows.

    Every window holds ``SEQ_LEN + 1`` ids and consecutive windows start
    ``SEQ_LEN`` ids apart, so neighbouring windows share one id. A trailing
    window with fewer than ``SEQ_LEN + 1`` ids is dropped.

    Args:
        token_ids: Tensor of ids, sliced along its first axis
            (assumed 1-D, as passed by ``doc_to_ds``).

    Returns:
        A ``tf.data.Dataset`` yielding one window tensor per element.
    """
    window_len = SEQ_LEN + 1
    id_ds = tf.data.Dataset.from_tensor_slices(token_ids)
    nested = id_ds.window(window_len, shift=SEQ_LEN, drop_remainder=True)
    # window() yields one sub-dataset per window; batching each sub-dataset
    # by the full window length turns it back into a single tensor.
    return nested.flat_map(lambda sub: sub.batch(window_len))

def split_xy(seq):
    """Split a window into (input, target) sequences shifted by one position.

    The input is every element except the last; the target is every element
    except the first, i.e. the element that follows each input position.
    """
    inputs = seq[:-1]
    targets = seq[1:]
    return inputs, targets

def doc_to_ds(doc):
    """Turn one document string into a dataset of (input, target) pairs.

    The document is tokenized with the module-level ``vectorizer``, ids equal
    to 0 are removed, the remaining ids are cut into windows by
    ``make_windows``, and each window is split by ``split_xy``.
    """
    # The vectorizer is called on a batch, so wrap the single document in a
    # batch of one and unwrap the single result row afterwards.
    batch_of_one = tf.expand_dims(doc, 0)
    token_ids = vectorizer(batch_of_one)[0]
    # Keep only ids greater than 0.
    kept_ids = tf.boolean_mask(token_ids, token_ids > 0)
    windows = make_windows(kept_ids)
    return windows.map(split_xy, num_parallel_calls=tf.data.AUTOTUNE)

# Build the next-token dataset of (input, target) windows and split it.
#
# The split below uses take()/skip() on the same dataset, and each of them
# iterates the upstream pipeline independently. tf.data's shuffle() reshuffles
# on every new iteration by default, so with the default the two splits would
# be cut from *different* orderings and overlap (validation windows leaking
# into training, and a different validation set every epoch). Every shuffle
# upstream of the split is therefore frozen with
# reshuffle_each_iteration=False. The pipeline starts from `docs` rather than
# `text_ds` because `text_ds` carries its own reshuffling shuffle.
doc_ds = tf.data.Dataset.from_tensor_slices(docs).shuffle(
    10000, seed=42, reshuffle_each_iteration=False
)
lm_ds = doc_ds.flat_map(doc_to_ds).shuffle(
    20000, seed=42, reshuffle_each_iteration=False
)

VAL_EXAMPLES = 5000

# The first VAL_EXAMPLES windows of the frozen ordering form the validation set.
val_ds = (lm_ds.take(VAL_EXAMPLES)
          .batch(BATCH_SIZE)
          .prefetch(tf.data.AUTOTUNE))

# Everything after them is training data. It is reshuffled on every pass
# *after* the split, so training order still varies without touching the
# validation windows.
train_ds = (lm_ds.skip(VAL_EXAMPLES)
            .shuffle(20000, seed=42)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE))

train_ds_rep = train_ds.repeat()
val_ds_rep   = val_ds.repeat()

# Pick explicit steps since cardinality is unknown (-2)
STEPS_PER_EPOCH = 500
# NOTE(review): 50 steps * 128 = 6400 examples, more than VAL_EXAMPLES (5000),
# so each validation run wraps around and counts some windows twice.
VAL_STEPS = 50

# Next-token language model: Embedding -> LSTM -> causal self-attention ->
# per-timestep logits over the vocabulary.
vocab_size = len(vocab)
inputs = layers.Input(shape=(SEQ_LEN,), dtype=tf.int32)
x = layers.Embedding(input_dim=vocab_size, output_dim=256)(inputs)
# Must keep return_sequences=True so attention can see all time steps
lstm_out = layers.LSTM(512, return_sequences=True)(x)
# Self-attention over the LSTM outputs (query=keys=values=lstm_out).
# The causal mask is required: the target at position t is the input token at
# position t+1, and the LSTM output at t+1 has already seen that token.
# Without the mask every position could attend to later positions and read
# its own answer, so the training loss would not reflect real prediction.
attn_out = layers.Attention()([lstm_out, lstm_out], use_causal_mask=True)
# Combine original LSTM signal + attended context
x = layers.Concatenate()([lstm_out, attn_out])
x = layers.Dropout(0.2)(x)
# Next-token logits at every timestep
outputs = layers.Dense(vocab_size)(x)
lm_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# The Dense head emits raw logits, hence from_logits=True in the loss.
lm_model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# Both datasets repeat forever, so the length of an epoch and of a validation
# run is set by the explicit step counts.
history = lm_model.fit(
    train_ds_rep,
    validation_data=val_ds_rep,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VAL_STEPS,
    epochs=2
)

import json

import mlflow
import mlflow.tensorflow

# NOTE(review): the experiment lives under one specific user's folder; anyone
# else running this notebook needs access to (or must change) this path.
mlflow.set_experiment("/Users/desiborisovab@gmail.com/movie_chatbot_experiment")

# Record the trained model, its vocabulary and the run's settings in one run.
with mlflow.start_run(run_name="lstm_attention_movie_lm") as run:
    run_id = run.info.run_id

    mlflow.tensorflow.log_model(lm_model, artifact_path="model")
    # The vocabulary is stored next to the model as a JSON list of tokens.
    mlflow.log_text(json.dumps(vocab), "vocab.json")

    # VOCAB_SIZE is logged as the actual vocabulary length, not the cap.
    for param_name, param_value in (
        ("SEQ_LEN", SEQ_LEN),
        ("VOCAB_SIZE", len(vocab)),
        ("BATCH_SIZE", BATCH_SIZE),
        ("STEPS_PER_EPOCH", STEPS_PER_EPOCH),
        ("VAL_STEPS", VAL_STEPS),
    ):
        mlflow.log_param(param_name, param_value)

print(f"Logged to MLflow. run_id = {run_id}")



