In [6]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/longnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Example sentences for training
S = [
    "The quick brown fox jumps over the lazy dog.",
    "She sells seashells by the seashore.", 
    "The cat sat on the mat.",
    "A stitch in time saves nine.",
    "Rome wasn't built in a day.",
    "Where there's a will, there's a way.",
    "Actions speak louder than words.",
    "All's well that ends well.",
    "Beauty is in the eye of the beholder.",
    "Time flies when you're having fun.",
    "The early bird catches the worm.",
    "Don't count your chickens before they hatch.",
    "A penny saved is a penny earned.",
    "Every cloud has a silver lining.",
    "Haste makes waste.",
    "The sun rises in the east and sets in the west.",
    "Life is like a box of chocolates, you never know what you're gonna get.",
    "Two wrongs don't make a right, but three lefts do.",
    "Laughter is timeless, imagination has no age, and dreams are forever.",
    "The only way to do great work is to love what you do.",
    "The journey of a thousand miles begins with a single step.",
    "Success is not final, failure is not fatal: It is the courage to continue that counts.",
    "Education is the most powerful weapon which you can use to change the world.",
    "In the middle of difficulty lies opportunity.",
    "The best way to predict the future is to create it.",
    "Life is what happens when you're busy making other plans.",
    "Believe you can and you're halfway there.",
    "The future belongs to those who believe in the beauty of their dreams.", 
    "Be yourself; everyone else is already taken.",
    "It does not matter how slowly you go as long as you do not stop.",
    "Embrace your uniqueness; there's no one else quite like you.",
    "Stay true to who you are; there's nobody else quite like you."
    "The bank by the river has a beautiful view.",
    "She used a sharp knife to cut the cake.",
    "The crane lifted heavy loads at the construction site.",
    "The crane watched the flock of birds flying overhead."]

In [8]:
# Preprocess a sentence
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence.lower())  # Tokenize and lowercase
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and punctuation
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return tokens

# Preprocess all sentences in S
preprocessed_S = [preprocess_sentence(sentence) for sentence in S]

# Train Word2Vec model with CBOW
model_cbow = Word2Vec(preprocessed_S, vector_size=100, window=5, min_count=1, workers=4, sg=0)

# Train Word2Vec model with Skip-gram
model_skipgram = Word2Vec(preprocessed_S, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Function to compute average word embedding of a sentence
def average_word_embedding(sentence_tokens, model):
    embeddings = []
    for token in sentence_tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return None

# Function to find most similar sentence using a given model
def most_similar_sentence_with_model(X, S, model):
    X_tokens = preprocess_sentence(X)
    X_embedding = average_word_embedding(X_tokens, model)
    if X_embedding is None:
        return None
    max_similarity = -1
    most_similar_sentence = None
    for sentence in S:
        sentence_tokens = preprocess_sentence(sentence)
        sentence_embedding = average_word_embedding(sentence_tokens, model)
        if sentence_embedding is not None:
            similarity = cosine_similarity([X_embedding], [sentence_embedding])[0][0]
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_sentence = sentence
    return most_similar_sentence, max_similarity

# Given sentence X
X = "The crane"

# Find most similar sentence using CBOW model
most_similar_cbow, similarity_score_cbow = most_similar_sentence_with_model(X, S, model_cbow)

# Find most similar sentence using Skip-gram model
most_similar_skipgram, similarity_score_skipgram = most_similar_sentence_with_model(X, S, model_skipgram)

print("Most similar sentence with CBOW:", most_similar_cbow)
print("Similarity score with CBOW:", similarity_score_cbow)

print("\nMost similar sentence with Skip-gram:", most_similar_skipgram)
print("Similarity score with Skip-gram:", similarity_score_skipgram)

Most similar sentence with CBOW: The crane watched the flock of birds flying overhead.
Similarity score with CBOW: 0.45683396

Most similar sentence with Skip-gram: The crane watched the flock of birds flying overhead.
Similarity score with Skip-gram: 0.45727187


In [9]:
# Load pre-trained GloVe word embeddings manually
def load_glove_embeddings(file_path):
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

# Load pre-trained GloVe word embeddings
glove_file = './glove.6B.50d.txt'  # Change path to the location of your GloVe file
word_vectors = load_glove_embeddings(glove_file)

In [10]:
# Function to compute average word embedding of a sentence using GloVe embeddings
def average_word_embedding(sentence_tokens):
    embeddings = []
    for token in sentence_tokens:
        if token in word_vectors:
            embeddings.append(word_vectors[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return None

# Function to find most similar sentence using GloVe embeddings
def most_similar_sentence(X, S):
    X_tokens = preprocess_sentence(X)
    X_embedding = average_word_embedding(X_tokens)
    if X_embedding is None:
        return None
    max_similarity = -1
    most_similar_sentence = None
    for i, sentence in enumerate(S):
        sentence_tokens = preprocess_sentence(sentence)
        sentence_embedding = average_word_embedding(sentence_tokens)
        if sentence_embedding is not None:
            similarity = cosine_similarity([X_embedding], [sentence_embedding])[0][0]
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_sentence = sentence
    return most_similar_sentence, max_similarity

# Given sentence X
X = "The crane lifted"

# Find most similar sentence using GloVe embeddings
most_similar, similarity_score = most_similar_sentence(X, S)
print("Most similar sentence:", most_similar)
print("Similarity score:", similarity_score)

Most similar sentence: The crane lifted heavy loads at the construction site.
Similarity score: 0.7860202
