# Problem 1: Tokenization of Text

In [17]:
#import libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds


In [18]:
#add sentences to be Tokenized
sentences = [
    'Python is commonly used to develop AI applications, such as improving human to computer interactions, identifying trends, and making predictions.',
    'One way that Python is used for human to computer interactions is through chatbots.',
    'Chatbots use artificial intelligence and natural language processing to allow us to communicate with a computer more naturally.',
    'We interact with chatbots using text or voice commands when we are trying to contact customer service or when we are asking our Alexa or Google Home to answer a question or perform a task.',
    'Today you will learn how to make your first AI in Python using some basic techniques.'
]

In [19]:
#Tokenize the words in the sentences
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'to': 1, 'or': 2, 'python': 3, 'is': 4, 'computer': 5, 'chatbots': 6, 'a': 7, 'we': 8, 'used': 9, 'ai': 10, 'human': 11, 'interactions': 12, 'and': 13, 'with': 14, 'using': 15, 'when': 16, 'are': 17, 'commonly': 18, 'develop': 19, 'applications': 20, 'such': 21, 'as': 22, 'improving': 23, 'identifying': 24, 'trends': 25, 'making': 26, 'predictions': 27, 'one': 28, 'way': 29, 'that': 30, 'for': 31, 'through': 32, 'use': 33, 'artificial': 34, 'intelligence': 35, 'natural': 36, 'language': 37, 'processing': 38, 'allow': 39, 'us': 40, 'communicate': 41, 'more': 42, 'naturally': 43, 'interact': 44, 'text': 45, 'voice': 46, 'commands': 47, 'trying': 48, 'contact': 49, 'customer': 50, 'service': 51, 'asking': 52, 'our': 53, 'alexa': 54, 'google': 55, 'home': 56, 'answer': 57, 'question': 58, 'perform': 59, 'task': 60, 'today': 61, 'you': 62, 'will': 63, 'learn': 64, 'how': 65, 'make': 66, 'your': 67, 'first': 68, 'in': 69, 'some': 70, 'basic': 71, 'techniques': 72}


In [20]:
#turning the sentences into sequences
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[3, 4, 18, 9, 1, 19, 10, 20, 21, 22, 23, 11, 1, 5, 12, 24, 25, 13, 26, 27], [28, 29, 30, 3, 4, 9, 31, 11, 1, 5, 12, 4, 32, 6], [6, 33, 34, 35, 13, 36, 37, 38, 1, 39, 40, 1, 41, 14, 7, 5, 42, 43], [8, 44, 14, 6, 15, 45, 2, 46, 47, 16, 8, 17, 48, 1, 49, 50, 51, 2, 16, 8, 17, 52, 53, 54, 2, 55, 56, 1, 57, 7, 58, 2, 59, 7, 60], [61, 62, 63, 64, 65, 1, 66, 67, 68, 10, 69, 3, 15, 70, 71, 72]]


In [21]:
#adding test data
test_data = [
    'Through this tutorial, you will get a basic understanding of how chatbots work.',
    'The chatbots you interact with everyday are pretty smart because they use additional algorithms and libraries.',
    'Click the Start Coding button on the page to sign in or create an account.'
]

In [22]:
#tokenizing the test data into sequences
test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

[[32, 62, 63, 7, 71, 65, 6], [6, 62, 44, 14, 17, 33, 13], [1, 69, 2]]


In [23]:
#adding out-of-vocabulary token and padding
tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)

print(word_index)
print(padded)

{'<OOV>': 1, 'to': 2, 'or': 3, 'python': 4, 'is': 5, 'computer': 6, 'chatbots': 7, 'a': 8, 'we': 9, 'used': 10, 'ai': 11, 'human': 12, 'interactions': 13, 'and': 14, 'with': 15, 'using': 16, 'when': 17, 'are': 18, 'commonly': 19, 'develop': 20, 'applications': 21, 'such': 22, 'as': 23, 'improving': 24, 'identifying': 25, 'trends': 26, 'making': 27, 'predictions': 28, 'one': 29, 'way': 30, 'that': 31, 'for': 32, 'through': 33, 'use': 34, 'artificial': 35, 'intelligence': 36, 'natural': 37, 'language': 38, 'processing': 39, 'allow': 40, 'us': 41, 'communicate': 42, 'more': 43, 'naturally': 44, 'interact': 45, 'text': 46, 'voice': 47, 'commands': 48, 'trying': 49, 'contact': 50, 'customer': 51, 'service': 52, 'asking': 53, 'our': 54, 'alexa': 55, 'google': 56, 'home': 57, 'answer': 58, 'question': 59, 'perform': 60, 'task': 61, 'today': 62, 'you': 63, 'will': 64, 'learn': 65, 'how': 66, 'make': 67, 'your': 68, 'first': 69, 'in': 70, 'some': 71, 'basic': 72, 'techniques': 73}
[[ 0  0  0  0

In [24]:
movie_sentences = []
train_data = tfds.as_numpy(tfds.load('movie_rationales', split="train"))
for item in train_data:
    movie_sentences.append(str(item['review']))
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(movie_sentences)
sequences = tokenizer.texts_to_sequences(movie_sentences)
print(tokenizer.word_index)
print(sequences[123])



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/movie_rationales/0.1.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/movie_rationales/incomplete.K703CW_0.1.0/movie_rationales-train.tfrecord*.…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/movie_rationales/incomplete.K703CW_0.1.0/movie_rationales-validation.tfrec…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/movie_rationales/incomplete.K703CW_0.1.0/movie_rationales-test.tfrecord*..…

Dataset movie_rationales downloaded and prepared to /root/tensorflow_datasets/movie_rationales/0.1.0. Subsequent calls will reuse this data.
[955, 2477, 100, 6, 510, 9, 138, 1078, 2477, 100, 3, 28, 127, 26, 45, 8, 87, 1, 487, 26, 23, 33, 167, 1453, 248, 18, 172, 59, 4108, 10, 68, 24, 37, 213, 2, 338, 3, 1255, 161, 23, 566, 1, 487, 14, 3, 1351, 22, 4465, 10, 34, 502, 77, 388, 4106, 638, 1681, 379, 12, 23, 175, 2, 2804, 2698, 27, 1283, 16, 296, 571, 34, 77, 57, 13, 3502, 203, 638, 1681, 8, 1124, 4, 203, 2605, 23, 33, 69, 381, 27, 31, 453, 5, 24, 250, 56, 546, 3, 46, 2575, 130, 221, 28, 1997, 12, 5, 224, 62, 31, 63, 1678, 188, 19, 29, 2143, 7, 18, 14, 22, 23, 50, 124, 9, 15, 53, 1175, 333, 638, 1681, 685, 5, 24, 428, 2, 561, 890, 1697, 574, 1, 827, 2266, 890, 209, 100, 12, 2630, 7, 16, 1241, 228, 80, 167, 7, 131, 3, 16, 104, 638, 1681, 905, 209, 100, 12, 2, 289, 147, 120, 305, 105, 825, 1997, 12, 5, 1, 2547, 15, 131, 3, 16, 1616, 573, 24, 47, 825, 23, 119, 29, 74, 124, 27, 20, 31, 453, 5,

In [25]:
from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

In [26]:
table = str.maketrans('', '', string.punctuation)

In [27]:
movie_sentences = []
train_data = tfds.as_numpy(tfds.load('movie_rationales', split="train"))
for item in train_data:
    sentence = str(item['review'].decode('UTF-8').lower())
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    movie_sentences.append(filtered_sentence)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000)
tokenizer.fit_on_texts(movie_sentences)
sequences = tokenizer.texts_to_sequences(movie_sentences)
print(tokenizer.word_index)



# Problem 2: Vibe Coding

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 5 training sentences taken from the abstract and introduction
train_sentences = [
    "Transformer models have achieved impressive results across a variety of NLP tasks.",
    "The dominant view is that the attention mechanism is the primary contributor to model quality.",
    "In this paper, we challenge this view and investigate the role of the feedforward network (FFN) sublayer.",
    "We conduct experiments that swap and ablate the FFN and attention components of pretrained models.",
    "Our results show that the FFN sublayer is more important than attention in many settings."
]

# Step 1: Initialize tokenizer
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(train_sentences)

# Step 2: Print word index
word_index = tokenizer.word_index
print("🔠 Word Index:\n", word_index)

# Step 3: Convert training sentences to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
print("\n🔢 Training Sequences:")
for i, seq in enumerate(train_sequences):
    print(f"Sentence {i+1}: {seq}")

# Step 4: Number of words in the word index
print(f"\n📊 Total tokens in word index: {len(word_index)}")


🔠 Word Index:
 {'the': 1, 'of': 2, 'is': 3, 'that': 4, 'attention': 5, 'and': 6, 'ffn': 7, 'models': 8, 'results': 9, 'view': 10, 'in': 11, 'this': 12, 'we': 13, 'sublayer': 14, 'transformer': 15, 'have': 16, 'achieved': 17, 'impressive': 18, 'across': 19, 'a': 20, 'variety': 21, 'nlp': 22, 'tasks': 23, 'dominant': 24, 'mechanism': 25, 'primary': 26, 'contributor': 27, 'to': 28, 'model': 29, 'quality': 30, 'paper': 31, 'challenge': 32, 'investigate': 33, 'role': 34, 'feedforward': 35, 'network': 36, 'conduct': 37, 'experiments': 38, 'swap': 39, 'ablate': 40, 'components': 41, 'pretrained': 42, 'our': 43, 'show': 44, 'more': 45, 'important': 46, 'than': 47, 'many': 48, 'settings': 49}

🔢 Training Sequences:
Sentence 1: [15, 8, 16, 17, 18, 9, 19, 20, 21, 2, 22, 23]
Sentence 2: [1, 24, 10, 3, 4, 1, 5, 25, 3, 1, 26, 27, 28, 29, 30]
Sentence 3: [11, 12, 31, 13, 32, 12, 10, 6, 33, 1, 34, 2, 1, 35, 36, 7, 14]
Sentence 4: [13, 37, 38, 4, 39, 6, 40, 1, 7, 6, 5, 41, 2, 42, 8]
Sentence 5: [43, 9,

In [29]:
# 3 test sentences from later in the introduction
test_sentences = [
    "Feedforward networks play a crucial role in transformer performance.",
    "We find that replacing the FFN has a larger effect than removing attention.",
    "These findings question the singular importance of attention mechanisms."
]

# Step 5: Convert test sentences using original tokenizer (no refitting!)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

print("\n🧪 Test Sequences (no OOV handling):")
for i, seq in enumerate(test_sequences):
    print(f"Test Sentence {i+1}: {seq}")

# Step 6: Manual observation for missing words
print("\n🚫 Observation: Words not in the original word_index are skipped in the test sequences.")



🧪 Test Sequences (no OOV handling):
Test Sentence 1: [35, 20, 34, 11, 15]
Test Sentence 2: [13, 4, 1, 7, 20, 47, 5]
Test Sentence 3: [1, 2, 5]

🚫 Observation: Words not in the original word_index are skipped in the test sequences.
