Simple SEDD Example

In [None]:
#@title 0. Dataset Example
# Toy corpus: short first-person sentences, one string per sentence.
# Downstream cells split each string on whitespace and truncate every sentence
# to the length of the shortest one.
raw_sentences = [
    "I love playing guitar",
    "I enjoy cooking meals",
    "I like painting landscapes",
    "I adore writing poetry",
    "I enjoy dancing daily",
    "I love traveling abroad",
    "I like reading books",
    "I enjoy playing chess",
    "I love coding projects",
    "I prefer hiking mountains",
    "I enjoy taking photos",
    "I like swimming pools",
    "I practice yoga daily",
    "I enjoy video games",
    "I love singing songs",
    "I enjoy night driving",
    "I like working out",
    "I enjoy gardening flowers",
    "I love baking cakes",
    "I like watching sunsets",
    "I enjoy learning languages",
    "I love sketching nature",
    "I like watching movies",
    "I enjoy fishing trips",
    "I love pet cuddles",
    "I practice meditation daily",
    "I enjoy riding bikes",
    "I like solving puzzles",
    "I enjoy board games",
    "I prefer nature photography",
    "I practice daily journaling",
    "I love trying foods",
    "I enjoy music playlists",
    "I adore volunteering work",
    "I love traveling solo",
    "I enjoy watching documentaries",
    "I practice skateboarding tricks",
    "I love magic tricks",
    "I prefer digital art",
    "I enjoy playing piano",
    "I adore home decorating",
    "I love nature walks",
    "I enjoy knitting sweaters",
    "I love exploring history",
    "I enjoy writing stories",
    "I like editing videos",
    "I enjoy studying cultures",
    "I love skydiving adventures",
    "I like building models",
    "I enjoy restoring furniture"
]


In [None]:
#@title 1. Generate Dictionary

import numpy as np

def preprocess_sentences(sentences):
    """
    Converts a list of sentences (as strings) into equal-length lists of vocabulary indices.

    Each sentence is split on whitespace, every distinct word gets an index
    (words are sorted so the mapping is deterministic), and all sentences are
    truncated to the length of the shortest one.

    Args:
    sentences (list of str): A list of sentences (each sentence is a string).

    Returns:
    truncated_sentences (list of list of int): Sentences converted to indices and cut to min_length.
    vocab (dict): Mapping of words to indices.
    vocab_size (int): Number of unique words in the vocabulary.
    min_length (int): Length (in words) shared by every returned sentence; 0 for empty input.
    """
    # Tokenize sentences into words
    tokenized_sentences = [sentence.split() for sentence in sentences]

    # Nothing to encode: return empty results instead of letting min() raise on an empty sequence
    if not tokenized_sentences:
        return [], {}, 0, 0

    # Create vocabulary (sorted for consistent indexing across runs)
    unique_words = sorted({word for sentence in tokenized_sentences for word in sentence})
    vocab = {word: idx for idx, word in enumerate(unique_words)}
    vocab_size = len(vocab)

    # Find the shortest sentence length
    min_length = min(len(sentence) for sentence in tokenized_sentences)

    # Convert to indices and truncate to the shortest length in one pass
    truncated_sentences = [
        [vocab[word] for word in sentence[:min_length]]
        for sentence in tokenized_sentences
    ]

    return truncated_sentences, vocab, vocab_size, min_length

# Example usage
# sentences = [
#     "hello world good morning",
#     "world is beautiful",
#     "good morning everyone today"
# ]
# Encode the toy corpus. `sentence_length` is the common (shortest) sentence
# length returned as the fourth value of preprocess_sentences.
sentences = raw_sentences
indexed_sentences, vocab, vocab_size, sentence_length = preprocess_sentences(sentences)

# Show the word -> index mapping, the encoded sentences and the vocabulary size.
print("Vocabulary:", vocab)
print("Indexed Sentences:", indexed_sentences)
print("Vocabulary Size:", vocab_size)


In [None]:
#@title Generate Training Data
import numpy as np

def create_transition_matrix(vocab_size, t, T, sigma_min=0.0000001, sigma_max=0.000001, schedule="linear", rate=0.01):
    """Creates a uniform transition-rate matrix Q_t for discrete diffusion.

    Every off-diagonal entry equals `rate`; each diagonal entry is minus the
    sum of the off-diagonal entries in its row, so every row (and, because the
    matrix is symmetric, every column) sums to zero. That is what keeps total
    probability constant under p + delta_t * Q_t @ p.

    Args:
        vocab_size (int): Number of states (vocabulary words).
        t, T, sigma_min, sigma_max, schedule: Accepted for interface
            compatibility with the scheduled variants of this function; this
            uniform version does not use them.
        rate (float): Off-diagonal transition rate (default 0.01).

    Returns:
        np.ndarray: Array of shape (vocab_size, vocab_size).
    """
    Q_t = np.full((vocab_size, vocab_size), float(rate))

    # Zero the diagonal first so that the row sums below contain only the
    # off-diagonal rates (summing the full row would make rows sum to -rate).
    diagonal = np.arange(vocab_size)
    Q_t[diagonal, diagonal] = 0.0
    Q_t[diagonal, diagonal] = -Q_t.sum(axis=1)

    return Q_t

def forward_diffusion(p_t, Q_t, delta_t):
    """Advance one word's distribution by a single explicit Euler step.

    Returns p_t + delta_t * (Q_t . p_t).
    """
    rate_of_change = np.dot(Q_t, p_t)
    return p_t + delta_t * rate_of_change

def compute_pairwise_ratios(p_t):
    """Build, for every word, the table of probability ratios between vocabulary entries.

    For word row x the result satisfies ratios[x, y, z] = p_t[x, z] / p_t[x, y].
    Entries whose denominator p_t[x, y] is not strictly positive are left at 0.

    Returns an array of shape (p_t.shape[0], vocab_size, vocab_size).
    """
    vocab_size = p_t.shape[1]
    ratios = np.zeros((p_t.shape[0], vocab_size, vocab_size))

    numerators = p_t[:, np.newaxis, :]    # varies along the last axis (z)
    denominators = p_t[:, :, np.newaxis]  # varies along the middle axis (y)

    # Divide only where the denominator is positive; masked cells keep their 0.
    np.divide(numerators, denominators, out=ratios, where=denominators > 0)

    return ratios

def sample_new_indices(p_t1):
    """Draw one vocabulary index per word.

    Each row of p_t1 is rescaled by its own sum and used as the sampling
    weights for that word. Returns a plain list with one index per row.
    """
    return [
        np.random.choice(len(word_dist), p=word_dist / np.sum(word_dist))
        for word_dist in p_t1
    ]

def generate_training_data(sentences, vocab_size, delta_t, num_steps, schedule="linear", verbose=False):
    """Generates training data by running forward diffusion on every sentence.

    For each sentence a near-one-hot distribution is built per word (1 at the
    word's index, 1e-6 elsewhere) and diffused for `num_steps` steps. At every
    step the function records:
      * X_train: the word indices sampled from the distribution *after* the step,
      * Y_train: the pairwise ratios of the distribution *before* the step,
      * Z_train: the distribution *before* the step.

    Args:
        sentences (list of list of int): Index-encoded sentences.
        vocab_size (int): Number of vocabulary entries.
        delta_t (float): Step size of the forward diffusion.
        num_steps (int): Diffusion steps per sentence.
        schedule (str): Passed through to create_transition_matrix.
        verbose (bool): If True, print the current sentence at every step
            (off by default; it prints one line per sentence per step).

    Returns:
        X_train, Y_train, Z_train (lists) and the last transition matrix Q_t
        used, or None for Q_t when no diffusion step was run (empty
        `sentences` or `num_steps == 0`).
    """
    X_train = []
    Y_train = []
    Z_train = []

    # Defined up front so the final `return` is valid even when no step runs.
    Q_t = None

    for sentence in sentences:
        p_t = np.full((len(sentence), vocab_size), 1e-6)
        for i, word in enumerate(sentence):
            p_t[i, word] = 1  # Start with a (near) delta distribution

        for step in range(num_steps):
            Q_t = create_transition_matrix(vocab_size, step, num_steps, schedule=schedule)
            pairwise_ratios = compute_pairwise_ratios(p_t)
            p_t1 = np.array([forward_diffusion(p_t[i], Q_t, delta_t) for i in range(len(sentence))])
            new_sentence = sample_new_indices(p_t1)

            X_train.append(new_sentence)
            Y_train.append(pairwise_ratios)
            Z_train.append(p_t)

            if verbose:
                print(sentence)

            sentence = new_sentence
            p_t = p_t1  # Update probabilities for the next step

    return X_train, Y_train, Z_train, Q_t

# Example usage
# Build the training set from the encoded corpus: 10 diffusion steps per
# sentence with step size delta_t. Q_t is the last transition matrix used and
# is reused by the backward-diffusion cells.
delta_t = 0.1
sentences = indexed_sentences  # Index-encoded sentences from the preprocessing cell
X_train, Y_train, Z_train, Q_t = generate_training_data(sentences, vocab_size, delta_t, num_steps=10)

# Print extracted training data
# print("X_train (word indices at t):", X_train)
# print("Y_train (pairwise ratios from t-1):", Y_train)
# print("Z_train (probabilities from t-1):", Z_train)


In [None]:
#@title 3. Train a model
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Reshape


# Global min/max of the ratio targets; used here for min-max scaling and by
# later cells to undo the scaling on model predictions.
Y_min = np.min(Y_train)
Y_max = np.max(Y_train)


# X and Y train from actual data
# Convert lists to numpy arrays
X_train = np.array(X_train)
Y_train = np.array(Y_train)
# NOTE(review): divides by zero if every ratio is identical (Y_max == Y_min).
Y_train_normalized = (Y_train - Y_min) / (Y_max - Y_min)  # Normalize to [0,1]

# Get output shape
output_shape = Y_train.shape[1:]  # Per-sample target shape: everything after the sample axis
output_units = np.prod(output_shape)  # Total number of output elements



# end of X and Y train from actual data




# Define the model: the sampled word indices go straight into dense layers and
# the flat output is reshaped to the per-sample ratio tensor.
model = Sequential([
    Dense(256, activation="relu", input_shape=(X_train.shape[1],)),  # First hidden layer
    Dense(256, activation="relu"),  # Second hidden layer
    Dense(output_units, activation="linear"),  # Output layer (flattened version of output_shape)
    Reshape(output_shape)  # Reshape back to output_shape
])

# Alternative, deeper model (kept for reference, not used)
# model = Sequential([
#     Dense(512, activation="relu", input_shape=(X_train.shape[1],)),  # First hidden layer
#     Dense(512, activation="relu"),  # Second hidden layer
#     Dense(256, activation="relu"),  # Third hidden layer
#     Dense(256, activation="relu"),  # Fourth hidden layer
#     Dense(128, activation="relu"),  # Fifth hidden layer
#     Dense(128, activation="relu"),  # Sixth hidden layer
#     Dense(output_units, activation="linear"),  # Output layer (flattened version of output_shape)
#     Reshape(output_shape)  # Reshape back to output_shape
# ])


# Compile the model
model.compile(optimizer="adam", loss="mse")

# Train the model on the min-max scaled ratios
model.fit(X_train, Y_train_normalized, epochs=20, batch_size=16, verbose=1)

# Predict
predictions = model.predict(X_train[1:3])  # Predict for two samples (rows 1 and 2)
print("Predicted Y_train shape:", predictions.shape)  # Should match (2,) + output_shape


In [None]:
#@title 4. Backward diffusion using model ratios

#@title 2. The Reversed Matrix Construction (forward and backward diffusion demo)

# Y_train[0]   #==> model output (ratios of a sequence)
# This function takes in the ratios of a word in the sequence. (ratios = Y_train[0][i])
def estimate_Qt_inv_from_ratios(Q_t, ratios):
    """Build the reversed transition-rate matrix for one word from its probability ratios.

    Off-diagonal entries are Q_inv[y, x] = ratios[x, y] * Q_t[x, y], where
    ratios[x, y] is read as p(y) / p(x). Each diagonal entry is minus the sum
    of the off-diagonal entries in its column, so every column sums to zero.

    Args:
        Q_t (np.ndarray): Square forward rate matrix, shape (V, V).
        ratios (array-like): Ratio table for a single word, at least (V, V).

    Returns:
        np.ndarray: float64 array of shape (V, V).
    """
    vocab_size = len(Q_t)
    forward_rates = np.asarray(Q_t, dtype=np.float64)
    ratio_table = np.asarray(ratios, dtype=np.float64)[:vocab_size, :vocab_size]

    # Elementwise product gives entry [x, y]; the transpose moves it to [y, x].
    Q_inv = (ratio_table * forward_rates).T.copy()

    # Select the diagonal by index (not by comparing values) and make each
    # column sum to zero.
    diagonal = np.arange(vocab_size)
    Q_inv[diagonal, diagonal] = 0.0
    Q_inv[diagonal, diagonal] = -Q_inv.sum(axis=0)

    return Q_inv

def backward_diffusion(P_new, Q_t, delta_t, ratios):   # on the word level
    """Performs one step of backward diffusion for a single word.

    The reversed rate matrix Q_t_bar is estimated from `ratios`, then
    (I - delta_t * Q_t_bar) @ P_old = P_new is solved for P_old.

    Args:
        P_new (np.ndarray): Distribution of the word at the later time, shape (V,).
        Q_t (np.ndarray): Forward rate matrix, shape (V, V).
        delta_t (float): Step size.
        ratios (array-like): Ratio table of this word, shape (V, V).

    Returns:
        np.ndarray: Estimated distribution at the earlier time.
    """
    identity = np.eye(len(Q_t))
    Q_t_bar = estimate_Qt_inv_from_ratios(Q_t, ratios)  # Reversed transition matrix

    # Solve the linear system directly rather than forming the explicit inverse:
    # P_old = (I - delta_t * Q_t_bar)^(-1) @ P_new
    P_old = np.linalg.solve(identity - delta_t * Q_t_bar, P_new)

    return P_old


# the following function takes in p_t of a sequence and returns p_t_reconvered (at t-1)
def sentence_backward_diffusion(p_t, sen_ratios, Q_t, delta_t):
    """Run one backward-diffusion step for every word of a sentence.

    Args:
        p_t (np.ndarray): Current distributions, one row per word.
        sen_ratios (array-like): Ratio tables for the sentence, indexed by word
            first. A leading batch axis of size 1 (e.g. the output of
            model.predict on a single sample) is accepted and removed.
        Q_t (np.ndarray): Forward rate matrix.
        delta_t (float): Step size.

    Returns:
        list of np.ndarray: Recovered distribution for each word, in order.
    """
    sen_ratios = np.asarray(sen_ratios)

    # Drop only the leading batch axis. A blanket squeeze would also remove a
    # word axis of length 1 and mis-index single-word sentences.
    if sen_ratios.ndim == 4 and sen_ratios.shape[0] == 1:
        sen_ratios = sen_ratios[0]

    sequence_len = p_t.shape[0]  # each word is a row
    p_t_recovered = []
    for i in range(sequence_len):
        P_recovered = backward_diffusion(p_t[i], Q_t, delta_t, sen_ratios[i])
        p_t_recovered.append(P_recovered)

    return p_t_recovered

# Initialize random probability distribution (assuming vocab_size = 3 for shape consistency)
def initialize_random_distribution(sentence_length, vocab_size):
    """Return a (sentence_length, vocab_size) array of random rows, each scaled to sum to 1."""
    raw_weights = np.random.rand(sentence_length, vocab_size)
    row_totals = raw_weights.sum(axis=1, keepdims=True)
    return raw_weights / row_totals

# p_t_original = np.array([[1.e-06, 1.e+00, 1.e-06],
#                           [1.e-06, 1.e-06, 1.e+00]])  # Added missing bracket

# p_t1 = np.array([[9.0000910e-02, 8.2000018e-01, 9.0000910e-02],
#                  [1.0000000e-06, 9.0000910e-02, 9.1000009e-01]])  # No extra bracket

# p_t_original = np.array([[9.0000910e-02, 8.2000018e-01, 9.0000910e-02],
#                          [1.0000000e-06, 9.0000910e-02, 9.1000009e-01]])  # Correct

# p_t1 = np.array([[0.15570084, 0.68860031, 0.15570084],
#                  [0.00810099, 0.15570084, 0.83620016]])  # Removed extra bracket

# p_t_original = Z_train[1]
# p_t1 = Z_train[2]

# sample = X_train[0].reshape(1, -1)  # Reshape to (1, input_dim) to keep batch dimension
# # print("sample is", sample)
# predictions_normalized = model.predict(sample)
# predictions = predictions_normalized * (Y_max - Y_min) + Y_min

# # print("Prediction", predictions)  # Expected output: (1, 2, 3, 3)

# p_t_recovered = sentence_backward_diffusion(p_t1, predictions, Q_t, delta_t)


# print("p_t + 1 is", p_t1)
# print("p_t original is", p_t_original)
# print("p_t is", p_t_recovered)

In [None]:
#@title 5 - SEED
# SEED = 100
# p_t = Z_train[SEED]
# Starting point for generation: one random distribution per word position.
p_t = initialize_random_distribution(sentence_length, vocab_size)

# List to store all samples (the generation-step cell appends to it)
all_samples = []


In [None]:
#@title 6 - NEXT STEP

#@title 5. Multi-Step Generation (saving all the steps)
import numpy as np

# Define function to sample new indices
def sample_new_indices(p_t1):
    """Draw one vocabulary index per word, tolerating negative entries.

    Negative values in each row are set to 0 *in place* (the caller's array is
    modified), then the row is rescaled by its sum and used as sampling weights.
    Returns a numpy array with one index per row.
    """
    chosen = []
    for row in p_t1:
        row[row < 0] = 0  # clamp negatives directly in the caller's array
        weights = row / np.sum(row)
        chosen.append(np.random.choice(len(row), p=weights))
    return np.array(chosen)


def indices_to_sentence(indexed_sentences, vocab):
    """
    Turn index-encoded sentences back into space-separated text.

    Args:
    indexed_sentences (iterable of iterable of int): Sentences as sequences of word indices.
    vocab (dict): Dictionary mapping words to indices.

    Returns:
    sentences (list of str): One reconstructed string per input sentence.
    """
    # Invert the vocabulary (index -> word)
    index_to_word = dict(zip(vocab.values(), vocab.keys()))

    sentences = []
    for sentence in indexed_sentences:
        words = [index_to_word[idx] for idx in sentence]
        sentences.append(" ".join(words))

    return sentences




# Generate new sample based on current probability distribution
# One generation step. Re-running this cell advances p_t by one backward step,
# because p_t is overwritten below.
# Sample word indices from the current distributions; shape (1, n_words) so it
# can be fed to the model as a batch of one.
sample = sample_new_indices(p_t).reshape(1, -1)
all_samples.append(sample)  # Save the sample
# print(f"Step {step+1} - Sample: {sample}")

# Predict the (scaled) pairwise ratios for this sample and undo the min-max scaling
predictions_normalized = model.predict(sample)
predictions = predictions_normalized * (Y_max - Y_min) + Y_min

# Update probability distribution with one backward-diffusion step per word
p_t = sentence_backward_diffusion(p_t, predictions, Q_t, delta_t)
p_t = np.array(p_t)


# Decode the sample drawn at the top of this cell (before the update) and show it
Final_Sentence = indices_to_sentence(sample, vocab)
print(Final_Sentence)

Other ---

In [None]:
#@title Forward and Backward diffusion (Final Code)(16-3-2025)(demo)

import numpy as np

def forward_diffusion(P_old, Q_t, delta_t):
    """One explicit Euler step of forward diffusion: P_old + delta_t * (Q_t . P_old)."""
    rate_of_change = np.dot(Q_t, P_old)
    return P_old + delta_t * rate_of_change

def estimate_Qt_inv(Q_t, p_t):
    """Estimate the reversed transition matrix from the true probabilities p_t.

    Off-diagonal entries are Q_inv[y, x] = (p_t[y] / p_t[x]) * Q_t[x, y]; each
    diagonal entry is minus the sum of the non-zero entries of its column.
    The resulting matrix is printed before it is returned.
    """
    n_states = len(Q_t)
    Q_inv = np.zeros_like(Q_t, dtype=np.float64)

    for src in range(n_states):
        for dst in range(n_states):
            if src == dst:
                continue
            Q_inv[dst, src] = (p_t[dst] / p_t[src]) * Q_t[src, dst]

    for state in range(n_states):
        column = Q_inv[:, state]
        # The diagonal is still 0 here, so this keeps the non-zero column entries.
        kept = column[column != Q_inv[state, state]]
        Q_inv[state, state] = -np.sum(kept)

    print("Q inverse is")
    print(Q_inv)

    return Q_inv

def backward_diffusion(P_new, Q_t, delta_t, p_t):
    """Performs one step of backward diffusion using the reversed matrix estimated from p_t.

    Solves (I - delta_t * Q_t_bar) @ P_old = P_new for P_old, where Q_t_bar is
    returned by estimate_Qt_inv(Q_t, p_t).

    Args:
        P_new (np.ndarray): Distribution after the forward step, shape (V,).
        Q_t (np.ndarray): Forward rate matrix, shape (V, V).
        delta_t (float): Step size.
        p_t (np.ndarray): Probabilities used to form the ratios p_t[y] / p_t[x].

    Returns:
        np.ndarray: Estimated distribution before the forward step.
    """
    identity = np.eye(len(Q_t))
    Q_t_bar = estimate_Qt_inv(Q_t, p_t)  # Reversed transition matrix

    # Solve the linear system directly rather than forming the explicit inverse:
    # P_old = (I - delta_t * Q_t_bar)^(-1) @ P_new
    P_old = np.linalg.solve(identity - delta_t * Q_t_bar, P_new)

    return P_old


# Three-state demo rate matrix: off-diagonal rate 1, rows and columns sum to zero.
# NOTE: this rebinds the module-level Q_t and delta_t used by other cells.
Q_t = np.array([[-2, 1, 1],
                [1, -2, 1],
                [1, 1, -2]])


# Initial probability distribution
# P_old = np.array([0.7, 0.2, 0.1])
P_old = np.array([0.5, 0.3, 0.2])

# Time step
delta_t = 0.1

# Forward diffusion
P_new = forward_diffusion(P_old, Q_t, delta_t)
print("P_new (after forward diffusion):", P_new)
print("Difference between P_new and P_old:", np.linalg.norm(P_new - P_old))

# Backward diffusion; the ratios are formed from the true P_old
P_recovered = backward_diffusion(P_new, Q_t, delta_t, P_old)
print("P_old (after backward diffusion, should be close to original):", P_recovered)
print("Difference between P_recovered and P_old:", np.linalg.norm(P_recovered - P_old))


In [None]:
#@title Inference (demo)
# Run the trained model on the first training sample and undo the min-max
# scaling applied to the targets during training.
sample = X_train[0].reshape(1, -1)  # Reshape to (1, input_dim) to keep batch dimension
predictions_normalized = model.predict(sample)
predictions = predictions_normalized * (Y_max - Y_min) + Y_min

print("Prediction", predictions)  # One batch row holding the per-word ratio tables


In [None]:
#@title The Reversed Matrix Construction (forward and backward diffusion) (demo)

# Y_train[0]   #==> model output (ratios of a sequence)
# This function takes in the ratios of a word in the sequence. (ratios = Y_train[0][i])
def estimate_Qt_inv_from_ratios(Q_t, ratios):
    """Build the reversed transition-rate matrix for one word from its probability ratios.

    Off-diagonal entries are Q_inv[y, x] = ratios[x, y] * Q_t[x, y], where
    ratios[x, y] is read as p(y) / p(x). Each diagonal entry is minus the sum
    of the off-diagonal entries in its column, so every column sums to zero.

    Args:
        Q_t (np.ndarray): Square forward rate matrix, shape (V, V).
        ratios (array-like): Ratio table for a single word, at least (V, V).

    Returns:
        np.ndarray: float64 array of shape (V, V).
    """
    vocab_size = len(Q_t)
    forward_rates = np.asarray(Q_t, dtype=np.float64)
    ratio_table = np.asarray(ratios, dtype=np.float64)[:vocab_size, :vocab_size]

    # Elementwise product gives entry [x, y]; the transpose moves it to [y, x].
    Q_inv = (ratio_table * forward_rates).T.copy()

    # Select the diagonal by index (not by comparing values) and make each
    # column sum to zero.
    diagonal = np.arange(vocab_size)
    Q_inv[diagonal, diagonal] = 0.0
    Q_inv[diagonal, diagonal] = -Q_inv.sum(axis=0)

    return Q_inv

def backward_diffusion(P_new, Q_t, delta_t, ratios):   # on the word level
    """Performs one step of backward diffusion for a single word.

    The reversed rate matrix Q_t_bar is estimated from `ratios`, then
    (I - delta_t * Q_t_bar) @ P_old = P_new is solved for P_old.

    Args:
        P_new (np.ndarray): Distribution of the word at the later time, shape (V,).
        Q_t (np.ndarray): Forward rate matrix, shape (V, V).
        delta_t (float): Step size.
        ratios (array-like): Ratio table of this word, shape (V, V).

    Returns:
        np.ndarray: Estimated distribution at the earlier time.
    """
    identity = np.eye(len(Q_t))
    Q_t_bar = estimate_Qt_inv_from_ratios(Q_t, ratios)  # Reversed transition matrix

    # Solve the linear system directly rather than forming the explicit inverse:
    # P_old = (I - delta_t * Q_t_bar)^(-1) @ P_new
    P_old = np.linalg.solve(identity - delta_t * Q_t_bar, P_new)

    return P_old


# the following function takes in p_t of a sequence and returns p_t_reconvered (at t-1)
def sentence_backward_diffusion(p_t, Y_train, Q_t, delta_t, step_index=1, verbose=False):
    """Run one backward-diffusion step per word using stored training ratios (demo).

    Args:
        p_t (np.ndarray): Distributions to step backwards, one row per word.
        Y_train (sequence): Stored ratio tensors; Y_train[step_index][i] is
            used as the ratio table of word i.
        Q_t (np.ndarray): Forward rate matrix.
        delta_t (float): Step size.
        step_index (int): Which entry of Y_train supplies the ratios. Defaults
            to 1, the index this demo previously hard-coded.
        verbose (bool): If True, print the sequence length and, for every
            word, its index and the ratio tensor being used.

    Returns:
        list of np.ndarray: Recovered distribution for each word, in order.
    """
    sequence_len = p_t.shape[0]  # each word is a row
    sen_ratios = Y_train[step_index]  # ratios of the whole sentence, looked up once

    if verbose:
        print("sequence len is", sequence_len)

    p_t_recovered = []
    for i in range(sequence_len):    # for each word
        if verbose:
            print("i is")
            print(i)
            print("input is ")
            print(sen_ratios)

        P_recovered = backward_diffusion(p_t[i], Q_t, delta_t, sen_ratios[i])
        p_t_recovered.append(P_recovered)

    return p_t_recovered

# p_t_original = np.array([[1.e-06, 1.e+00, 1.e-06],
#                           [1.e-06, 1.e-06, 1.e+00]])  # Added missing bracket

# p_t1 = np.array([[9.0000910e-02, 8.2000018e-01, 9.0000910e-02],
#                  [1.0000000e-06, 9.0000910e-02, 9.1000009e-01]])  # No extra bracket

# p_t_original = np.array([[9.0000910e-02, 8.2000018e-01, 9.0000910e-02],
#                          [1.0000000e-06, 9.0000910e-02, 9.1000009e-01]])  # Correct

# p_t1 = np.array([[0.15570084, 0.68860031, 0.15570084],
#                  [0.00810099, 0.15570084, 0.83620016]])  # Removed extra bracket

# Take two consecutive stored distributions and try to step the later one
# back towards the earlier one.
p_t_original = Z_train[1]
p_t1 = Z_train[2]

p_t_recovered = sentence_backward_diffusion(p_t1, Y_train, Q_t, delta_t)




# Compare the later distribution, the stored earlier one and the recovered one
print("p_t + 1 is", p_t1)
print("p_t original is", p_t_original)
print("p_t is", p_t_recovered)

In [None]:
#@title 2. Generate Training Data (variable noise schedule)(option 2)
import numpy as np

def create_transition_matrix(vocab_size, t, T, sigma_min=0.01, sigma_max=0.1, schedule="linear"):
    """Build a tridiagonal rate matrix whose neighbour rate follows a noise schedule.

    The rate sigma_t is computed from step t out of T according to `schedule`
    ('linear', 'exponential' or 'cosine'). Each state moves to its previous
    and next index with rate sigma_t, and the diagonal is minus the sum of the
    off-diagonal entries in the same row.

    Raises:
        ValueError: If `schedule` is not one of the three supported names.
    """
    if schedule not in ("linear", "exponential", "cosine"):
        raise ValueError("Invalid schedule type. Choose 'linear', 'exponential', or 'cosine'.")

    sigma_span = sigma_max - sigma_min
    if schedule == "cosine":
        sigma_t = sigma_min + 0.5 * sigma_span * (1 - np.cos(np.pi * t / T))
    elif schedule == "exponential":
        sigma_t = sigma_min * np.exp(2 * t / T)  # sigma_max is not used by this schedule
    else:
        sigma_t = sigma_min + t * sigma_span / T

    Q_t = np.zeros((vocab_size, vocab_size))

    # Sub- and super-diagonal: transitions to the previous / next index.
    lower = np.arange(vocab_size - 1)
    Q_t[lower + 1, lower] = sigma_t
    Q_t[lower, lower + 1] = sigma_t

    # Diagonal balances each row so that it sums to zero.
    diagonal = np.arange(vocab_size)
    Q_t[diagonal, diagonal] = -Q_t.sum(axis=1)

    return Q_t

def forward_diffusion(p_t, Q_t, delta_t):
    """Advance one word's distribution by a single explicit Euler step.

    Returns p_t + delta_t * (Q_t . p_t).
    """
    rate_of_change = np.dot(Q_t, p_t)
    return p_t + delta_t * rate_of_change


def compute_pairwise_ratios(p_t):
    """Build, for every word, the table of probability ratios between vocabulary entries.

    For word row x the result satisfies ratios[x, y, z] = p_t[x, z] / p_t[x, y].
    Entries whose denominator p_t[x, y] is not strictly positive are left at 0.
    The input shape, vocabulary size and output shape are printed along the way.

    Returns an array of shape (p_t.shape[0], vocab_size, vocab_size).
    """
    print(p_t.shape)
    vocab_size = p_t.shape[1]
    print(vocab_size)
    ratios = np.zeros((p_t.shape[0], vocab_size, vocab_size))
    print("ratios shape is")
    print(ratios.shape)

    numerators = p_t[:, np.newaxis, :]    # varies along the last axis (z)
    denominators = p_t[:, :, np.newaxis]  # varies along the middle axis (y)

    # Divide only where the denominator is positive; masked cells keep their 0.
    np.divide(numerators, denominators, out=ratios, where=denominators > 0)

    return ratios

def sample_new_indices(p_t1):
    """Draw one vocabulary index per word.

    Each row of p_t1 is rescaled by its own sum and used as the sampling
    weights for that word. Returns a plain list with one index per row.
    """
    return [
        np.random.choice(len(word_dist), p=word_dist / np.sum(word_dist))
        for word_dist in p_t1
    ]

def generate_training_data(sentences, vocab_size, delta_t, num_steps, schedule="exponential"):
    """Generates training data with a noise scheduler applied to Q_t.

    For each sentence a distribution is built per word (1 at the word's index,
    1e-2 elsewhere) and diffused for `num_steps` steps, with a fresh Q_t per
    step. At every step the function records:
      * X_train: the word indices sampled from the distribution *after* the step,
      * Y_train: the pairwise ratios of the distribution *before* the step,
      * Z_train: the distribution *before* the step.

    Args:
        sentences (list of list of int): Index-encoded sentences.
        vocab_size (int): Number of vocabulary entries.
        delta_t (float): Step size of the forward diffusion.
        num_steps (int): Diffusion steps per sentence.
        schedule (str): Noise schedule name passed to create_transition_matrix.

    Returns:
        X_train, Y_train, Z_train (lists) and the last transition matrix Q_t
        used, or None for Q_t when no diffusion step was run (empty
        `sentences` or `num_steps == 0`).
    """
    X_train = []
    Y_train = []
    Z_train = []

    # Defined up front so the final `return` is valid even when no step runs.
    Q_t = None

    for sentence in sentences:
        p_t = np.full((len(sentence), vocab_size), 1e-2)
        for i, word in enumerate(sentence):
            p_t[i, word] = 1  # Start with a (near) delta distribution

        for step in range(num_steps):
            Q_t = create_transition_matrix(vocab_size, step, num_steps, schedule=schedule)
            pairwise_ratios = compute_pairwise_ratios(p_t)
            p_t1 = np.array([forward_diffusion(p_t[i], Q_t, delta_t) for i in range(len(sentence))])
            new_sentence = sample_new_indices(p_t1)

            X_train.append(new_sentence)
            Y_train.append(pairwise_ratios)
            Z_train.append(p_t)

            sentence = new_sentence
            p_t = p_t1  # Update probabilities for next step

    return X_train, Y_train, Z_train, Q_t

# Example usage
# Build the training set with the scheduled transition matrix (10 steps per
# sentence). NOTE: this rebinds X_train, Y_train, Z_train, Q_t and delta_t.
delta_t = 0.1
sentences = indexed_sentences  # Index-encoded sentences from the preprocessing cell
X_train, Y_train, Z_train, Q_t = generate_training_data(sentences, vocab_size, delta_t, num_steps=10)

# Print extracted training data
# NOTE(review): these print the complete lists, including every ratio tensor.
print("X_train (word indices at t):", X_train)
print("Y_train (pairwise ratios from t-1):", Y_train)
print("Z_train (probabilities from t-1):", Z_train)


In [None]:
#@title 2. Generate Training Data (constant noise schedule)(option 1)
import numpy as np

def create_transition_matrix(vocab_size, rate=0.00001):
    """Build a tridiagonal rate matrix with a constant neighbour rate.

    Each state moves to its previous and next index with rate `rate`; the
    diagonal is minus the sum of the off-diagonal entries in the same row.
    """
    Q_t = np.zeros((vocab_size, vocab_size))

    # Sub- and super-diagonal: transitions to the previous / next index.
    lower = np.arange(vocab_size - 1)
    Q_t[lower + 1, lower] = rate
    Q_t[lower, lower + 1] = rate

    # Diagonal balances each row so that it sums to zero.
    diagonal = np.arange(vocab_size)
    Q_t[diagonal, diagonal] = -Q_t.sum(axis=1)

    return Q_t

def forward_diffusion(p_t, Q_t, delta_t):
    """Advance one word's distribution by a single explicit Euler step.

    Returns p_t + delta_t * (Q_t . p_t).
    """
    rate_of_change = np.dot(Q_t, p_t)
    return p_t + delta_t * rate_of_change


def compute_pairwise_ratios(p_t):
    """Build, for every word, the table of probability ratios between vocabulary entries.

    For word row x the result satisfies ratios[x, y, z] = p_t[x, z] / p_t[x, y].
    Entries whose denominator p_t[x, y] is not strictly positive are left at 0.

    Returns an array of shape (p_t.shape[0], vocab_size, vocab_size).
    """
    vocab_size = p_t.shape[1]
    ratios = np.zeros((p_t.shape[0], vocab_size, vocab_size))

    numerators = p_t[:, np.newaxis, :]    # varies along the last axis (z)
    denominators = p_t[:, :, np.newaxis]  # varies along the middle axis (y)

    # Divide only where the denominator is positive; masked cells keep their 0.
    np.divide(numerators, denominators, out=ratios, where=denominators > 0)

    return ratios

def sample_new_indices(p_t1):
    """Draw one vocabulary index per word.

    Each row of p_t1 is rescaled by its own sum and used as the sampling
    weights for that word. Returns a plain list with one index per row.
    """
    return [
        np.random.choice(len(word_dist), p=word_dist / np.sum(word_dist))
        for word_dist in p_t1
    ]

def generate_training_data(sentences, vocab_size, delta_t, num_steps):
    """Generates X_train, Y_train and Z_train with a constant transition matrix.

    A single Q_t is created (and printed) up front. For every sentence, each
    word starts from a distribution that is 1 at its own index and 1e-2
    elsewhere; the distribution is diffused `num_steps` times. Per step the
    function stores the newly sampled indices (X_train), the ratios of the
    pre-step distribution (Y_train) and the pre-step distribution (Z_train),
    and prints the sentence the step started from.

    Returns:
        X_train, Y_train, Z_train (lists) and Q_t.
    """
    Q_t = create_transition_matrix(vocab_size)
    print("Q_t is")
    print(Q_t)

    X_train, Y_train, Z_train = [], [], []

    for sentence in sentences:
        # Background mass of 1e-2 everywhere, then 1 at each word's own index.
        p_t = np.full((len(sentence), vocab_size), 1e-2)
        for position, token in enumerate(sentence):
            p_t[position, token] = 1

        for step in range(num_steps):
            pairwise_ratios = compute_pairwise_ratios(p_t)  # ratios before the step
            diffused_rows = [forward_diffusion(p_t[i], Q_t, delta_t) for i in range(len(sentence))]
            p_t1 = np.array(diffused_rows)
            new_sentence = sample_new_indices(p_t1)

            X_train.append(new_sentence)     # indices sampled after the step
            Y_train.append(pairwise_ratios)  # ratios from before the step
            Z_train.append(p_t)              # distribution from before the step

            print(sentence)

            # Carry the sampled sentence and diffused distribution into the next step.
            sentence, p_t = new_sentence, p_t1

    return X_train, Y_train, Z_train, Q_t

# Example usage
# Build the training set with the constant-rate transition matrix (10 steps
# per sentence). NOTE: this rebinds X_train, Y_train, Z_train, Q_t and delta_t.
delta_t = 0.000001
sentences = indexed_sentences  # Index-encoded sentences from the preprocessing cell
X_train, Y_train, Z_train, Q_t = generate_training_data(sentences, vocab_size, delta_t, num_steps=10)

# Print extracted training data
# print("X_train (word indices at t):", X_train)
# print("Y_train (pairwise ratios from t-1):", Y_train)
# print("Z_train (probabilities from t-1):", Z_train)
