In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer

# URL of the Gutenberg webpage
url = "https://www.gutenberg.org/cache/epub/36/pg36-images.html#chap03"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    paragraph_list = [p.get_text(strip=True) for p in paragraphs]
else:
    raise Exception("Failed to retrieve the webpage.")

In [2]:
# Part (a)

# Select 10 paragraphs for the large text dataset
large_text = ' '.join(paragraph_list[8:18]).lower()

# Define a smaller text with some new characters or symbols
small_text = "This is a sample small text with new characters like !, @, $, #, %.".lower()

# Combine both texts for tokenizer to capture all characters
combined_text = large_text + ' ' + small_text

# Tokenize by character level
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([combined_text])

# Convert texts to sequences of integers
large_text_seq = tokenizer.texts_to_sequences([large_text])[0]
small_text_seq = tokenizer.texts_to_sequences([small_text])[0]

# Display some samples from the smaller text
print("Processed smaller text (as integers):", small_text_seq[:50])

# Character to integer mapping
char_index = tokenizer.word_index
print("\nCharacter mapping:", char_index)


Processed smaller text (as integers): [3, 10, 5, 6, 1, 5, 6, 1, 4, 1, 6, 4, 14, 18, 11, 2, 1, 6, 14, 4, 11, 11, 1, 3, 2, 28, 3, 1, 17, 5, 3, 10, 1, 7, 2, 17, 1, 15, 10, 4, 9, 4, 15, 3, 2, 9, 6, 1, 11, 5]

Character mapping: {' ': 1, 'e': 2, 't': 3, 'a': 4, 'i': 5, 's': 6, 'n': 7, 'o': 8, 'r': 9, 'h': 10, 'l': 11, 'd': 12, 'u': 13, 'm': 14, 'c': 15, 'f': 16, 'w': 17, 'p': 18, 'g': 19, 'y': 20, '\r': 21, '\n': 22, ',': 23, 'b': 24, 'v': 25, '.': 26, 'k': 27, 'x': 28, '—': 29, '-': 30, '0': 31, 'q': 32, 'z': 33, 'j': 34, ';': 35, '!': 36, '3': 37, '5': 38, '?': 39, '1': 40, '8': 41, '9': 42, '4': 43, '2': 44, '“': 45, '”': 46, ':': 47, '’': 48, '@': 49, '$': 50, '#': 51, '%': 52}


In [3]:
# Part (b)

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Define constants
vocab_size = len(tokenizer.word_index) + 1  
embedding_dim = 8 
input_length = len(large_text_seq)  

# Create the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  

model.compile(optimizer='adam', loss='binary_crossentropy')

# Convert sequences to numpy arrays
large_text_seq = np.array(large_text_seq).reshape(1, -1)

# Train the embedding model
model.fit(large_text_seq, np.array([1]), epochs=10, verbose=1)

# Get the embedding layer
embedding_layer = model.layers[0]

# Apply the trained embedding model to the smaller text
small_text_seq = np.array(small_text_seq).reshape(1, -1)
embeddings = embedding_layer(small_text_seq).numpy()

# Display samples of the embedded representation
print("Embedded representation of the smaller text:", embeddings[0][:10])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Embedded representation of the smaller text: [[-0.02512773 -0.04524183 -0.04393537  0.03889078 -0.05544361  0.03904049
  -0.0312106  -0.02451975]
 [ 0.02603345 -0.03320745 -0.03595594  0.03306324  0.0279299   0.05295893
  -0.01967241 -0.02973782]
 [ 0.0343282   0.02563981 -0.01774368  0.02292908 -0.03180461 -0.01492068
   0.018551    0.03441832]
 [-0.05525877  0.03881495  0.03274475  0.03385846 -0.02314739 -0.04105077
  -0.00859764  0.00682296]
 [-0.01968647  0.05395486 -0.05481583  0.02182067  0.03086295  0.02956053
   0.0397648   0.02453694]
 [ 0.0343282   0.02563981 -0.01774368  0.02292908 -0.03180461 -0.01492068
   0.018551    0.03441832]
 [-0.05525877  0.03881495  0.03274475  0.03385846 -0.02314739 -0.04105077
  -0.00859764  0.00682296]
 [-0.01968647  0.05395486 -0.05481583  0.02182067  0.03086295  0.02956053
   0.0397648   0.02453694]
 [ 0.02352492 -0.03851312 -0.0212390

2024-10-25 12:48:47.996535: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [4]:
# Part (c)

# Extract the embedding weights from the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Function to get embeddings for characters
def get_char_embeddings(char_seq, char_index, embedding_weights):
    for char, idx in char_index.items():
        if idx < len(embedding_weights):  
            print(f"Character: '{char}' - Embedding: {embedding_weights[idx]}")

# Show embeddings for the common characters between large and small text
print("\nCharacter Embeddings from the Large Text for Shared Characters:")
shared_chars = set(large_text) & set(small_text)  # Find common characters
for char in shared_chars:
    idx = char_index[char]
    print(f"Character: '{char}' - Embedding: {embedding_weights[idx]}")

# Show embeddings for the small text
print("\nCharacter Embeddings for the Small Text:")
for char in small_text:
    if char in char_index:
        idx = char_index[char]
        print(f"Character: '{char}' - Embedding: {embedding_weights[idx]}")
    else:
        print(f"Character: '{char}' is out-of-vocabulary.")



Character Embeddings from the Large Text for Shared Characters:
Character: 'e' - Embedding: [-0.04226547  0.00950972 -0.03681732  0.04905974  0.00763726 -0.03691835
 -0.04609565  0.02569714]
Character: 'x' - Embedding: [-0.01579528  0.03050924 -0.02084445 -0.05173132 -0.01258191 -0.04671421
 -0.04082201  0.02790914]
Character: 'c' - Embedding: [-0.0427279  -0.03716389 -0.05652898 -0.02804159 -0.00986847 -0.00887603
 -0.0466985  -0.04726185]
Character: 'p' - Embedding: [-0.04411635  0.02783827  0.03957607  0.0565103   0.03599318  0.04962989
  0.05234342  0.04529336]
Character: 't' - Embedding: [-0.02512773 -0.04524183 -0.04393537  0.03889078 -0.05544361  0.03904049
 -0.0312106  -0.02451975]
Character: 'w' - Embedding: [-0.00717229 -0.0195091   0.00754918 -0.04615949  0.02726276  0.01619787
  0.01306046 -0.01941648]
Character: ',' - Embedding: [ 0.01524457  0.03086192 -0.01282921 -0.02874693 -0.02800214  0.05083368
  0.0360284   0.03180785]
Character: 'l' - Embedding: [-0.02413644 -0.03

In [8]:
# Part (d): Handling New Characters
new_small_text = "^&*()_+"
new_small_text_seq = tokenizer.texts_to_sequences([new_small_text])[0]

print("\nHandling New Characters in the Smaller Text:")
for char in new_small_text:
    if char in char_index:
        idx = char_index[char]
        print(f"Character: '{char}' - Embedding: {embedding_weights[idx]}")
    else:
        print(f"Character: '{char}' is out-of-vocabulary (OOV).")
        print(f"Character: '{char}' - Embedding: {embedding_weights[idx]}")


Handling New Characters in the Smaller Text:
Character: '^' is out-of-vocabulary (OOV).
Character: '^' - Embedding: [-0.0289985   0.04180746  0.00819798 -0.02336457 -0.04074388 -0.00334794
  0.02192151  0.04228214]
Character: '&' is out-of-vocabulary (OOV).
Character: '&' - Embedding: [-0.0289985   0.04180746  0.00819798 -0.02336457 -0.04074388 -0.00334794
  0.02192151  0.04228214]
Character: '*' is out-of-vocabulary (OOV).
Character: '*' - Embedding: [-0.0289985   0.04180746  0.00819798 -0.02336457 -0.04074388 -0.00334794
  0.02192151  0.04228214]
Character: '(' is out-of-vocabulary (OOV).
Character: '(' - Embedding: [-0.0289985   0.04180746  0.00819798 -0.02336457 -0.04074388 -0.00334794
  0.02192151  0.04228214]
Character: ')' is out-of-vocabulary (OOV).
Character: ')' - Embedding: [-0.0289985   0.04180746  0.00819798 -0.02336457 -0.04074388 -0.00334794
  0.02192151  0.04228214]
Character: '_' is out-of-vocabulary (OOV).
Character: '_' - Embedding: [-0.0289985   0.04180746  0.00819