Word2Vec is a specific algorithm for learning vector representations of words, which are known as word embeddings. The algorithm uses a shallow neural network to train on large amounts of text data and produce a vector for each word in a vocabulary, where the vectors are optimized to capture the relationships between words in the data.

Word embeddings, on the other hand, refer more broadly to the idea of mapping words or phrases to vectors of numbers. There are other algorithms besides Word2Vec that can be used to learn word embeddings, and word embeddings can also be pre-trained and then fine-tuned on a specific task.

In [1]:
## Use case 1: Topic modeling and text summarization

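# gensim is a library that provides a Word2Vec implementation; the model below
# is trained from scratch on the sample sentences (it is not pretrained).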

import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

# Load in the text data that you want to use for topic recognition
text_data = [
    "machine learning is a field of computer science",
    "deep learning is a subfield of machine learning",
    "computer vision is a field related to machine learning",
    "natural language processing is another field related to machine learning",
]

# Preprocess the text data: lowercase each sentence and split it on whitespace.
# NOTE: no punctuation stripping is done here; the sample sentences contain
# none, but real text would need it before tokenizing.
processed_text = [text.lower().split() for text in text_data]

# gensim 4.0 renamed the `size` argument to `vector_size` and replaced
# `KeyedVectors.vocab` with `KeyedVectors.key_to_index`. Pick the right
# spelling from the installed major version so the cell runs on both lines.
gensim_major_version = int(gensim.__version__.split(".")[0])
uses_gensim4_api = gensim_major_version >= 4

# Train the Word2Vec model on the processed text data
w2v_params = {"window": 5, "min_count": 1, "workers": 4}
w2v_params["vector_size" if uses_gensim4_api else "size"] = 100
model = Word2Vec(processed_text, **w2v_params)

# Get the word vectors for each word in the vocabulary
word_vectors = model.wv
vocabulary = word_vectors.key_to_index if uses_gensim4_api else word_vectors.vocab

# Use the word vectors to perform topic recognition: for every vocabulary
# word, keep the words returned by `most_similar` (which yields
# (word, similarity) pairs) and drop the similarity scores.
word_topics = {}
for word in vocabulary:
    most_similar_words = word_vectors.most_similar(word)
    word_topics[word] = [neighbor for neighbor, _similarity in most_similar_words]

# Print the topics for each word
for word, topics in word_topics.items():
    print(f"{word}: {topics}")


machine: ['of', 'computer', 'to', 'subfield', 'is', 'field', 'science', 'another', 'deep', 'a']
learning: ['of', 'science', 'processing', 'language', 'is', 'vision', 'related', 'machine', 'subfield', 'field']
is: ['of', 'related', 'machine', 'science', 'another', 'field', 'computer', 'a', 'learning', 'language']
a: ['of', 'natural', 'field', 'another', 'machine', 'subfield', 'is', 'to', 'vision', 'related']
field: ['processing', 'machine', 'to', 'of', 'a', 'is', 'deep', 'another', 'natural', 'related']
of: ['a', 'subfield', 'science', 'machine', 'is', 'learning', 'another', 'field', 'computer', 'deep']
computer: ['deep', 'science', 'machine', 'subfield', 'another', 'processing', 'is', 'of', 'language', 'field']
science: ['language', 'of', 'computer', 'processing', 'another', 'machine', 'is', 'learning', 'vision', 'deep']
deep: ['computer', 'to', 'processing', 'machine', 'language', 'natural', 'related', 'field', 'of', 'science']
subfield: ['related', 'of', 'natural', 'vision', 'compute

