In [1]:
import gensim
import numpy as np
import pandas as pd

In [2]:
def compare_results(
    word: str,
    model_ft: gensim.models.FastText,
    model_w2v: gensim.models.Word2Vec,
    topn: int = 10,
) -> pd.DataFrame:
    """
    Compare similarity search results between FastText and Word2Vec models for a given word.

    Prints the training-corpus count of ``word`` (as stored on the FastText model) and a
    progress line, then runs ``most_similar`` on both models and pairs the two result
    lists rank by rank.

    Args:
        word (str): The word to compare. Must be present in both models' vocabularies.
        model_ft (gensim.models.FastText): The FastText model.
        model_w2v (gensim.models.Word2Vec): The Word2Vec model.
        topn (int): Number of neighbours requested from each model. Defaults to 10,
            the value the library uses when ``topn`` is not passed.

    Returns:
        pd.DataFrame: One row per rank with the columns ``fasttext``, ``freq_ft``,
        ``score_ft``, ``word2vec``, ``freq_w2v`` and ``score_w2v``. Frequencies and
        scores are pre-formatted *strings* (thousands separator / two decimals), so the
        frame is meant for display rather than further numeric work.

    Raises:
        KeyError: If ``word`` is missing from the Word2Vec vocabulary (checked
            explicitly). The FastText count lookup is likewise expected to raise for
            an out-of-vocabulary word.
    """
    vectors_ft = model_ft.wv
    vectors_w2v = model_w2v.wv

    word_count_ft = vectors_ft.get_vecattr(word, 'count')
    # Fail fast, before anything is printed, when Word2Vec cannot handle the query word.
    if word not in vectors_w2v.key_to_index:
        raise KeyError(f"Key '{word}' not present in the Word2Vec vocabulary")
    print(f"Word '{word}' has {word_count_ft} samples in training text.")

    print('Running similarity searches...')
    results_ft = vectors_ft.most_similar(word, topn=topn)
    results_w2v = vectors_w2v.most_similar(word, topn=topn)

    table_rows = []

    # zip pairs neighbours by rank; if one model returns fewer hits the extra rows are dropped.
    for (word_ft, score_ft), (word_w2v, score_w2v) in zip(results_ft, results_w2v):
        count_ft = vectors_ft.get_vecattr(word_ft, 'count')
        count_w2v = vectors_w2v.get_vecattr(word_w2v, 'count')

        table_rows.append(
            (word_ft, f'{count_ft:,}', f'{score_ft:.2f}', word_w2v, f'{count_w2v:,}', f'{score_w2v:.2f}')
        )

    columns = ['fasttext', 'freq_ft', 'score_ft', 'word2vec', 'freq_w2v', 'score_w2v']
    return pd.DataFrame(table_rows, columns=columns)

In [3]:
# Load the saved FastText and Word2Vec models from disk. The paths are relative,
# so they resolve against the notebook's current working directory.
# NOTE(review): gensim's load() unpickles the file — only load models from a trusted source.
model_ft = gensim.models.FastText.load('../data/fasttext.model')
model_w2v = gensim.models.Word2Vec.load('../data/word2vec.model')

In [4]:
# Side-by-side neighbours of 'example' in both models; show the five best-ranked rows.
word_to_compare = 'example'
result_df = compare_results(
    word=word_to_compare,
    model_ft=model_ft,
    model_w2v=model_w2v,
)
print(result_df.head(5))

Word 'example' has 2912 samples in training text.
Running similarity searches...
          fasttext freq_ft score_ft      word2vec freq_w2v score_w2v
0   counterexample       3     0.86      instance      565      0.70
1         instance     565     0.85       synonym       47      0.53
2         examples     781     0.82  abbreviation       36      0.50
3  counterexamples       2     0.82      starters       46      0.50
4      consistancy       2     0.79  illustration       66      0.49


In [5]:
# Side-by-side neighbours of 'famous' in both models; show the five best-ranked rows.
word_to_compare = 'famous'
result_df = compare_results(
    word=word_to_compare,
    model_ft=model_ft,
    model_w2v=model_w2v,
)
print(result_df.head(5))

Word 'famous' has 577 samples in training text.
Running similarity searches...
    fasttext freq_ft score_ft     word2vec freq_w2v score_w2v
0  famousdog       2     0.90        poets       41      0.65
1    famouse       2     0.85  influential      119      0.65
2   famously      25     0.81         poet       69      0.62
3   infamous      67     0.77       bailey      138      0.59
4  strangler       2     0.77     greatest      308      0.59


In [6]:
# Calculate and store vector norms and similarities.
# Keep a handle on each model's keyed vectors for the lookups below.
word_vectors_w2v = model_w2v.wv
word_vectors_ft = model_ft.wv

# Vector lengths (np.linalg.norm with its default settings); both are taken
# from the Word2Vec vectors only.
norm_stupid = np.linalg.norm(word_vectors_w2v['stupid'])
norm_bwahahahaha = np.linalg.norm(word_vectors_w2v['bwahahahaha'])

# Similarity of the same word pair under each model. These values are only
# stored here; they are printed by a later cell.
similarity_stupid_dumb_ft = word_vectors_ft.similarity('stupid', 'dumb')
similarity_stupid_dumb_w2v = word_vectors_w2v.similarity('stupid', 'dumb')

# Print the results (norms only)
print(f"Norm of 'stupid': {norm_stupid:.3f}")
print(f"Norm of 'bwahahahaha': {norm_bwahahahaha:.3f}")

Norm of 'stupid': 16.118
Norm of 'bwahahahaha': 0.603


In [7]:
# Report the 'stupid'/'dumb' similarity scores computed in the previous cell, one line per model.
# The header's closing quote now sits before the colon ('dumb': rather than 'dumb:').
print("Similarity between 'stupid' and 'dumb':")
print(f"  fasttext: {similarity_stupid_dumb_ft:.2f}")
print(f"  word2vec: {similarity_stupid_dumb_w2v:.2f}")

Similarity between 'stupid' and 'dumb:'
  fasttext: 0.61
  word2vec: 0.72
