In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import io
import re
import tqdm
import string

In [2]:
# Dimensionality of each learned word vector; passed as the Embedding layer's
# output_dim when the model is built.
EMBEDDING_SIZE = 256
# Number of context words taken on EACH side of the target word, so every
# training example carries 2 * WINDOW_SIZE context tokens.
WINDOW_SIZE = 3

In [3]:
# Define the corpus

# corpus = ['In a small village surrounded by mountains, people lived in harmony with nature. The seasons changed gently, bringing new colors and sounds to the valley. In spring, the flowers bloomed, and birds sang joyfully, filling the air with music. Farmers planted crops in neat rows, while children played near the streams, their laughter echoing through the hills. Summer brought warmth, and the fields turned green with life. The village buzzed with activity, as people harvested fruits and vegetables. Autumn arrived with a cool breeze, turning the leaves into shades of gold and red. The village prepared for winter, gathering wood and storing food. Snow covered the land, and the world seemed to slow down. Families gathered around fires, sharing stories of the past and dreaming of the future. Despite the challenges of each season, the villagers remained hopeful and resilient, always finding joy in the simple moments of life.']

def custom_standardization(input_data):
    """Lowercase ``input_data`` and strip every ASCII punctuation character.

    Args:
        input_data: The raw text as a ``str``.

    Returns:
        A new string that is lowercased and has every character listed in
        ``string.punctuation`` removed. Whitespace (including newlines) and
        non-ASCII characters are left untouched.
    """
    # Strings are immutable, so a bare ``input_data.lower()`` statement has no
    # effect; the lowercasing must be part of the returned expression.
    # The third argument of maketrans lists characters to delete outright.
    translator = str.maketrans('', '', string.punctuation)

    return input_data.lower().translate(translator)

# Read the whole book into memory. The context manager closes the file handle
# as soon as the read finishes (or fails) instead of leaving it open for the
# lifetime of the kernel.
with open("war_and_peace.txt", "r") as f:
    input_data = f.read()
data = custom_standardization(input_data)
# The tokenizer expects a list of documents; the book is treated as a single one.
corpus = [data]

# Convert the corpus to a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One id sequence per document, so sequences[0] holds the ids for the whole book.
sequences = tokenizer.texts_to_sequences(corpus)
print("After converting our words in the corpus into vector of integers:")
print(len(sequences[0]))

After converting our words in the corpus into vector of integers:
1113458


In [4]:
word_index = tokenizer.word_index
# Reverse lookup (id -> word) for turning ids back into text.
index_word = {v: k for k, v in word_index.items()}


# Define the parameters
# +1 because the Keras Tokenizer never assigns id 0 to a word (it is reserved
# for padding), so valid ids run from 1 to len(word_index) inclusive.
vocab_size = len(tokenizer.word_index) + 1
print("vocab size: ", vocab_size)

# Generate the context-target pairs
# For every position that has WINDOW_SIZE tokens on both sides, the context is
# the WINDOW_SIZE ids before it followed by the WINDOW_SIZE ids after it, and
# the target is the id at that position.
contexts = []
targets = []
for sequence in sequences:
    for i in range(WINDOW_SIZE, len(sequence) - WINDOW_SIZE):
        context = sequence[i - WINDOW_SIZE:i] +\
            sequence[i + 1:i + WINDOW_SIZE + 1]
        target = sequence[i]
        contexts.append(context)
        targets.append(target)

# Convert the contexts and targets to numpy arrays
X = np.array(contexts)

# Keep the targets as integer class ids rather than one-hot rows. A dense
# (len(X), vocab_size) float64 one-hot matrix for this corpus would be on the
# order of hundreds of gigabytes; the sparse loss used below consumes the ids
# directly and computes the same cross-entropy.
y = np.array(targets, dtype=np.int32)


# Define the CBOW model
model = Sequential()
# Look up one EMBEDDING_SIZE-dimensional vector per context token.
model.add(Embedding(input_dim=vocab_size,
                    output_dim=EMBEDDING_SIZE,
                    input_length=2*WINDOW_SIZE))
# Average the context vectors over the token axis to get the hidden vector.
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
# Score every vocabulary entry as the possible target word.
model.add(Dense(units=vocab_size, activation='softmax'))

# Compile the model
# sparse_categorical_crossentropy matches the integer-id targets in `y`.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

vocab size:  41611
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6, 256)            10652416  
                                                                 
 lambda (Lambda)             (None, 256)               0         
                                                                 
 dense (Dense)               (None, 41611)             10694027  
                                                                 
Total params: 21346443 (81.43 MB)
Trainable params: 21346443 (81.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
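# Fit model (training is skipped here: pretrained weights are loaded from
# 'cbow_weights.h5' in the next cell). If this is re-enabled, X and y must be
# sliced to the same number of rows.
# model.fit(X[:1000], y[:1000], epochs=5, verbose=2)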

In [5]:
# Restore the previously trained parameters from disk.
model.load_weights('cbow_weights.h5')

# Take the first two weight arrays in a single get_weights() call. Given the
# model layout (Embedding, Lambda, Dense), W1 is the embedding matrix and W2
# is the Dense layer's kernel.
W1, W2 = model.get_weights()[:2]

In [36]:
def get_hidden_vector(context_ids, W1):
    """Return the CBOW hidden vector for a set of context word ids.

    Args:
        context_ids: Row indices into ``W1``, one per context word.
        W1: Array whose rows are word embeddings.

    Returns:
        The element-wise mean of the selected rows of ``W1``.

    Side effects:
        Prints ``context_ids`` as a trace line.
    """
    print(" ----> ", context_ids)
    # Gather one embedding row per context word, then average across the rows.
    context_rows = W1[context_ids]
    return context_rows.mean(axis=0)


sentense = 'rendered the country turbulent and difficult to'
context_words = []
context_ids = []

split_sentense = sentense.split(" ")

# A valid probe is WINDOW_SIZE words, the target word, then WINDOW_SIZE more
# words, i.e. 2 * WINDOW_SIZE + 1 tokens in total.
if (len(split_sentense) - 1) != WINDOW_SIZE * 2:
    print("ERROR: sentense does not have the correct lenght for testinng")
else:
    # Drop the middle (target) word and keep the words on either side of it.
    context_words = split_sentense[:WINDOW_SIZE] + split_sentense[WINDOW_SIZE + 1:]

    # Report out-of-vocabulary words explicitly instead of failing with a bare
    # KeyError on the lookup.
    unknown_words = [w for w in context_words if w not in word_index]
    if unknown_words:
        print("ERROR: words not in the vocabulary:", unknown_words)
    else:
        context_ids = [word_index[w] for w in context_words]

        # Only computed for a valid probe: averaging an empty context would
        # produce a vector of NaNs.
        h = get_hidden_vector(context_ids, W1)

        print(h.shape)      # (EMBEDDING_SIZE,)
        print(h.tolist())   # plain list of floats

 ---->  [1019, 1, 126, 3, 900, 4]
(256,)
[0.7258069515228271, 0.5210476517677307, -0.31427672505378723, 0.010791580192744732, -0.49918320775032043, 0.7085666656494141, 1.1874264478683472, -1.226688027381897, -0.360283762216568, -0.10812386125326157, -0.5481446385383606, 0.39936044812202454, -0.828488826751709, 0.143117293715477, 0.3158906400203705, 0.32476159930229187, 1.6690653562545776, -1.3208023309707642, 1.577238917350769, -0.9376479983329773, 0.06187152862548828, 0.018156806007027626, 0.8472817540168762, 4.186216831207275, 1.465622901916504, -0.5364861488342285, 1.229336142539978, 0.06967908143997192, 0.5099479556083679, 0.10128132253885269, -3.0068235397338867, 0.906862735748291, 0.4811820089817047, 0.1265731304883957, 0.2828443944454193, 2.252056360244751, -0.4407115876674652, 0.4164792597293854, -0.5646108388900757, 0.40751567482948303, 0.5648707747459412, 1.023675799369812, 0.7552998661994934, 1.3809815645217896, -0.9224650263786316, -1.100748896598816, -0.39271703362464905, 

In [None]:
# Additional processing with RELU function
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# runs; the cell only evaluates a string literal. The disabled experiment
# binarizes a copy of model.get_weights()[0]: an entry becomes 1 when
# relu(value) exceeds `threshold` (0.8) and 0 otherwise. It then counts the 1s
# in row `index` (35) and prints that count.
"""def relu(x):
    return max(0.0, x)

weights_new = model.get_weights()[0].copy()
threshold = 0.8

for i in range(len(weights_new)):
    for j in range(len(weights_new[i])):
        if relu(weights_new[i][j]) > threshold:
            weights_new[i][j] = 1
        else:
            weights_new[i][j] = 0
            
index = 35
count = 0
for i in weights_new[index]:
    if i == 1.0:
        count += 1

print(count)"""

In [None]:
# Test predictions
def predict_target(context_words):
    """Predict the centre word for ``context_words`` with the CBOW model.

    Args:
        context_words: List of context word strings (the words surrounding
            the position to predict, without the target word itself).

    Returns:
        The vocabulary word (``str``) that the model scores highest.

    Notes:
        The id sequence is padded/truncated to ``WINDOW_SIZE * 2`` entries.
        ``pad_sequences`` fills missing positions with 0 by default, and the
        tokenizer never maps id 0 to a word, so column 0 of the model output
        is excluded from the arg-max; picking it would make the
        ``index_word`` lookup raise ``KeyError``.
    """
    context_sequence = tokenizer.texts_to_sequences([context_words])[0]
    context_sequence = pad_sequences([context_sequence], maxlen=WINDOW_SIZE * 2)
    prediction = model.predict(context_sequence)
    # Flatten the (1, vocab_size) output, skip the padding column 0, and
    # shift the result back by one to recover the real vocabulary id.
    word_scores = np.ravel(prediction)[1:]
    predicted_word_idx = int(np.argmax(word_scores)) + 1
    predicted_word = tokenizer.index_word[predicted_word_idx]
    return predicted_word

# Example prediction
sentense = 'rendered the country turbulent and difficult to'
split_sentense = sentense.split(" ")
context_words = []

# The probe must be WINDOW_SIZE words, the target word, then WINDOW_SIZE words.
if len(split_sentense) == WINDOW_SIZE * 2 + 1:
    # Keep the words on either side of the middle word; the middle word is the
    # one the model is asked to predict.
    context_words = split_sentense[:WINDOW_SIZE] + split_sentense[WINDOW_SIZE + 1:]

    predicted_word = predict_target(context_words)
    print(f"Predicted word for context {context_words}: {predicted_word}")
else:
    print("ERROR: sentense does not have the correct lenght for testinng")