In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example sentences
sentences = [
  "The quick brown fox jumps over the lazy dog",
  "The lazy dog is sleeping",
  "The quick brown fox is fast"
]

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Convert the sentences to sequences of n-grams
n_gram = 2
sequences = []
for sentence in sentences:
  words = sentence.split()
  for i in range(len(words)-n_gram+1):
    sequence = ' '.join(words[i:i+n_gram])
    sequences.append(sequence)

# Tokenize the n-grams
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
ngram_index = tokenizer.word_index

# Convert the n-grams to sequences of integers
ngram_sequences = tokenizer.texts_to_sequences(sequences)

# Pad the n-gram sequences
max_length = max([len(seq) for seq in ngram_sequences])
padded_sequences = pad_sequences(ngram_sequences, maxlen=max_length, padding='post')

# Print the results
print("Word Index: ", word_index)
print("N-Gram Index: ", ngram_index)
print("N-Gram Sequences: ", ngram_sequences)
print("Padded N-Gram Sequences: ", padded_sequences)


2023-02-14 19:27:54.102966: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Word Index:  {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'lazy': 5, 'dog': 6, 'is': 7, 'jumps': 8, 'over': 9, 'sleeping': 10, 'fast': 11}
N-Gram Index:  {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'lazy': 5, 'is': 6, 'dog': 7, 'jumps': 8, 'over': 9, 'sleeping': 10, 'fast': 11}
N-Gram Sequences:  [[1, 2], [2, 3], [3, 4], [4, 8], [8, 9], [9, 1], [1, 5], [5, 7], [1, 5], [5, 7], [7, 6], [6, 10], [1, 2], [2, 3], [3, 4], [4, 6], [6, 11]]
Padded N-Gram Sequences:  [[ 1  2]
 [ 2  3]
 [ 3  4]
 [ 4  8]
 [ 8  9]
 [ 9  1]
 [ 1  5]
 [ 5  7]
 [ 1  5]
 [ 5  7]
 [ 7  6]
 [ 6 10]
 [ 1  2]
 [ 2  3]
 [ 3  4]
 [ 4  6]
 [ 6 11]]
