In [6]:
import os
import numpy as np
from gensim.models import Word2Vec
import ptb

# Load the PTB dataset
train_corpus, word_to_id, id_to_word = ptb.load_data('train')

# convert to sentences
def process_data(corpus, vocab=None, eos_token='<eos>'):
    """Split a flat stream of word ids into sentences of word strings.

    Args:
        corpus: Iterable of word ids. Each id is looked up in the id-to-word
            mapping.
        vocab: Optional id -> word mapping. When omitted, the module-level
            ``id_to_word`` is used, as before.
        eos_token: Word that marks the end of a sentence. It closes the
            current sentence and is not included in the output.

    Returns:
        A list of sentences, each a list of word strings. Two consecutive
        end-of-sentence markers produce an empty sentence. Words after the
        last marker are returned as a final sentence instead of being
        dropped.
    """
    lookup = id_to_word if vocab is None else vocab
    sentences = []
    current_sentence = []
    for word_id in corpus:
        word = lookup[word_id]
        if word == eos_token:
            sentences.append(current_sentence)
            current_sentence = []
        else:
            current_sentence.append(word)
    # A corpus that does not end with the marker still has pending words;
    # keep them rather than silently discarding the last sentence.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

# Util funcs
def find_similar_words(model, target_words, topn=5):
    """Look up the nearest neighbours of each target word.

    Args:
        model: Object exposing ``model.wv.most_similar(word, topn=...)``,
            which yields ``(word, score)`` pairs.
        target_words: Iterable of words to query.
        topn: Number of neighbours requested per word (default 5).

    Returns:
        Dict mapping each target word to the list of neighbour words in the
        order returned by the model. A word whose lookup raises ``KeyError``
        maps to ``["Word not in vocabulary"]``.
    """
    similar_words = {}
    for target in target_words:
        try:
            neighbours = model.wv.most_similar(target, topn=topn)
        except KeyError:
            # The lookup signals an unknown word with KeyError.
            similar_words[target] = ["Word not in vocabulary"]
        else:
            # Keep only the words; the similarity scores are not reported.
            similar_words[target] = [neighbour for neighbour, _score in neighbours]
    return similar_words

def solve_analogy(model, analogy_pairs):
    """Answer "a is to b as c is to ?" questions with a word-vector model.

    Args:
        model: Object exposing
            ``model.wv.most_similar(positive=..., negative=..., topn=...)``.
        analogy_pairs: Iterable of ``(a, b, c)`` triples. Each triple is also
            used as the key of the returned dict.

    Returns:
        Dict mapping every triple to the top-ranked word for ``b + c - a``,
        or to ``"Words not in vocabulary"`` when the lookup raises
        ``KeyError``.
    """
    results = {}
    for pair in analogy_pairs:
        base, derived, probe = pair
        try:
            ranked = model.wv.most_similar(
                positive=[derived, probe],
                negative=[base],
                topn=1,
            )
            # Each entry is indexed as (word, ...); keep the word of the best hit.
            answer = ranked[0][0]
        except KeyError:
            answer = "Words not in vocabulary"
        results[pair] = answer
    return results

# Turn the flat id stream into the sentence lists both models train on
sentences = process_data(train_corpus)

# Hyperparameters shared by the two architectures; only `sg` differs
_w2v_settings = dict(sentences=sentences, vector_size=100, window=5, min_count=1)

# CBOW model (sg=0)
cbow_model = Word2Vec(sg=0, **_w2v_settings)

# Skip-Gram model (sg=1)
skipgram_model = Word2Vec(sg=1, **_w2v_settings)

# Process res
# Words whose nearest neighbours are compared across the two models
target_words = ['stock', 'revenue', 'savings', 'debt']
cbow_words_res = find_similar_words(cbow_model, target_words)
skipgram_words_res = find_similar_words(skipgram_model, target_words)
# Former misspelled name, kept as an alias so existing references still work
kipgram_words_res = skipgram_words_res

# Print the CBOW and Skip-Gram neighbours side by side for each word
for word in target_words:
    print(f"Words most similar to '{word}' (CBOW): {cbow_words_res[word]}")
    print(f"Words most similar to '{word}' (Skip-Gram): {skipgram_words_res[word]}")
    print()

# Pairs problem
# Each triple (a, b, c) is read as "a is to b as c is to ?"
analogy_pairs = [
    ('take', 'took', 'go'),
    ('car', 'cars', 'child'),
    ('good', 'better', 'bad'),
]

# Solve every analogy with both models
analogy_results_cbow = solve_analogy(cbow_model, analogy_pairs)
analogy_results_skipgram = solve_analogy(skipgram_model, analogy_pairs)


def _print_analogy_header(pair):
    """Print the question line for one analogy triple."""
    print(f"Analogy: {pair[0]}:{pair[1]} = {pair[2]}:?")


# Report all CBOW answers first, then all Skip-Gram answers
for pair, result in analogy_results_cbow.items():
    _print_analogy_header(pair)
    print(f"CBOW Model Result: {result}\n")

for pair, result in analogy_results_skipgram.items():
    _print_analogy_header(pair)
    print(f"Skip-Gram Model Result: {result}\n")

Words most similar to 'stock' (CBOW): ['shares', 'converted', 'price', 'value', 'bond']
Words most similar to 'stock' (Skip-Gram): ['stocks', 'shares', 'junk-bond', 'plunge', 'depositary']

Words most similar to 'revenue' (CBOW): ['income', 'annual', '#', 'sales', 'profit']
Words most similar to 'revenue' (Skip-Gram): ['kronor', 'pretax', 'passenger', 'a$', 'second-quarter']

Words most similar to 'savings' (CBOW): ['loan', 'corp', 'ltd', 'electric', 'trust']
Words most similar to 'savings' (Skip-Gram): ['loan', 'arizona', 'merabank', 'ncnb', 'valley']

Words most similar to 'debt' (CBOW): ['cash', 'assets', 'financing', 'amount', 'reserves']
Words most similar to 'debt' (Skip-Gram): ['refinancing', 'tva', 'cash', 'repay', 'assets']

Analogy: take:took = go:?
CBOW Model Result: went

Analogy: car:cars = child:?
CBOW Model Result: folks

Analogy: good:better = bad:?
CBOW Model Result: greater

Analogy: take:took = go:?
Skip-Gram Model Result: ran

Analogy: car:cars = child:?
Skip-Gram M