In [0]:
%pip install PyPDF2

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Load and preprocess text
pdf_path = "/Workspace/Users/dbuser2@meteoros.ai/Simple Story.pdf"
raw_text = extract_text_from_pdf(pdf_path)

# Tokenize words: lowercase, alphanumeric
words = re.findall(r"\b\w+\b", raw_text.lower())
unique_words = sorted(list(set(words)))



In [0]:
unique_words

['a',
 'achoo',
 'adventure',
 'always',
 'amazing',
 'and',
 'animals',
 'announcer',
 'annual',
 'antelopes',
 'anthem',
 'ants',
 'anyone',
 'around',
 'as',
 'at',
 'away',
 'awesome',
 'baboon',
 'back',
 'backpack',
 'badge',
 'ball',
 'banana',
 'be',
 'beamed',
 'beep',
 'before',
 'being',
 'beside',
 'big',
 'bigger',
 'bird',
 'bongo',
 'bottom',
 'bouncy',
 'brave',
 'bridges',
 'brothers',
 'build',
 'bushes',
 'but',
 'button',
 'called',
 'came',
 'can',
 'careful',
 'case',
 'caught',
 'cheered',
 'cheetahs',
 'chuckled',
 'click',
 'clouds',
 'could',
 'cousin',
 'crowd',
 'cub',
 'curious',
 'curled',
 'cut',
 'day',
 'deeper',
 'did',
 'didn',
 'do',
 'ears',
 'enough',
 'enter',
 'even',
 'event',
 'everyone',
 'explorer',
 'exploring',
 'falls',
 'family',
 'fastest',
 'feather',
 'first',
 'fit',
 'fly',
 'for',
 'forward',
 'found',
 'free',
 'frogs',
 'from',
 'fun',
 'gasped',
 'gasps',
 'get',
 'giraffe',
 'glowing',
 'going',
 'golden',
 'gulped',
 'gumbala',

In [0]:
import numpy as np
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
import os

# Create tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(unique_words)
vocab_size = len(tokenizer.word_index) + 1

def build_embedding_model(dim,vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=dim, name=f"embedding_{dim}d"))
    model.compile('adam', 'mse')
    return model

# Build models
model_8 = build_embedding_model(8, vocab_size)
model_16 = build_embedding_model(16, vocab_size)
model_32 = build_embedding_model(32, vocab_size)


In [0]:
def get_word_embedding(model, word_index):
    word_input = np.array([word_index])
    return model.predict(word_input, verbose=0).flatten().round(4).tolist()

# Save all to 3 separate JSON files
def save_embeddings(dim, model):
    embeddings = {}
    for word, idx in tokenizer.word_index.items():
        embeddings[word] = get_word_embedding(model, idx)
    with open(f"/Workspace/Users/dbuser2@meteoros.ai/embedding_{dim}d.json", "w") as f:
        json.dump(embeddings, f)

save_embeddings(8, model_8)
save_embeddings(16, model_16)
save_embeddings(32, model_32)


In [0]:
def load_all_embeddings():
    embeddings = {}
    for dim in [8, 16, 32]:
        with open(f"/Workspace/Users/dbuser2@meteoros.ai/embedding_{dim}d.json") as f:
            embeddings[dim] = json.load(f)
    return embeddings

def update_and_retrain(word):
    global tokenizer, unique_words
    
    print(f"'{word}' not found.")
    
    if word not in unique_words:
        unique_words.append(word)
    
    tokenizer.fit_on_texts(unique_words)
    vocab_size = len(tokenizer.word_index) + 1

    # Rebuild models with correct updated vocab size
    model_8 = build_embedding_model(8, vocab_size)
    model_16 = build_embedding_model(16, vocab_size)
    model_32 = build_embedding_model(32, vocab_size)

    # Save updated embeddings
    save_embeddings(8, model_8)
    save_embeddings(16, model_16)
    save_embeddings(32, model_32)

    idx = tokenizer.word_index[word]
    
    return {
        8: get_word_embedding(model_8, idx),
        16: get_word_embedding(model_16, idx),
        32: get_word_embedding(model_32, idx)
    }

# Main interaction
user_word = input("🔍 Enter a word to search: ").strip().lower()
embeddings = load_all_embeddings()

if user_word in embeddings[8]:
    print(f"'{user_word}' found in all embeddings:")
    for dim in [8, 16, 32]:
        print(f"{dim}D: {embeddings[dim][user_word]}")
else:
    new_vectors = update_and_retrain(user_word)
    print(f"Generated embeddings for '{user_word}':")
    for dim, vector in new_vectors.items():
        print(f"{dim}D: {vector}")


🔍 Enter a word to search:  flamboyant

'flamboyant' not found.
Generated embeddings for 'flamboyant':
8D: [0.004699999932199717, -0.014600000344216824, -0.031300000846385956, -0.031199999153614044, -0.026900000870227814, -0.007400000002235174, 0.04019999876618385, 0.04490000009536743]
16D: [-0.026599999517202377, -0.0406000018119812, 0.03660000115633011, 0.00559999980032444, 0.03290000185370445, 0.013799999840557575, 0.00559999980032444, -0.04600000008940697, -0.04729999974370003, -0.0005000000237487257, 0.03150000050663948, 0.0348999984562397, 0.03139999881386757, -0.019200000911951065, 0.030300000682473183, 0.030899999663233757]
32D: [0.043800000101327896, 0.006599999964237213, -0.047600001096725464, -0.045499999076128006, 0.04749999940395355, 0.039400000125169754, 0.04749999940395355, 0.002199999988079071, -0.023600000888109207, 0.020400000736117363, -0.008200000040233135, 0.039500001817941666, -0.0032999999821186066, 0.02590000070631504, -0.002199999988079071, -0.013500000350177288, 0.0034000000450760126, -0.02380000054