Code to generate embeddings of vector length 25 (run in Base environment)

In [None]:
import fasttext
import numpy as np
import pickle

# Define file paths and the single source of truth for the vector size
VOCAB_FILE = "vocab.txt"  # Path to the vocabulary file (also used as training text)
OUTPUT_EMBEDDINGS_FILE = "fasttext_embeddings_25.pkl"  # Path to save embeddings
EMBEDDING_DIM = 25  # Must agree with the "_25" suffix of the output file name

# Step 1: Train FastText Model
# The training dimension is taken from EMBEDDING_DIM so that it cannot drift
# away from the dimension that is verified in Step 5 (previously the model was
# trained with dim=15 while 25 was expected, so every word failed the check).
print("Training FastText model...")
model = fasttext.train_unsupervised(
    VOCAB_FILE, model="skipgram", dim=EMBEDDING_DIM
)
print("FastText model trained successfully.")

# Step 2: Load Vocabulary (one entry per line of the vocabulary file)
with open(VOCAB_FILE, "r", encoding="utf-8") as f:
    words = f.read().splitlines()
print(f"Loaded vocabulary with {len(words)} words.")

# Step 3: Generate FastText Embeddings
# Vectors are stored as plain Python lists so the pickle does not depend on NumPy.
print("Generating FastText embeddings...")
fasttext_embeddings = {
    word: model.get_word_vector(word).tolist() for word in words
}
print(f"Generated embeddings for {len(fasttext_embeddings)} words.")

# Step 4: Handle Missing Words (if any)
# Every word in `words` was inserted in Step 3, so this loop is only a safety
# net; the mean vector is kept as the fallback value.
embedding_dim = EMBEDDING_DIM  # Expected dimension of every embedding
all_vectors = np.array(list(fasttext_embeddings.values()))
mean_vector = np.mean(all_vectors, axis=0)  # Mean of all embeddings

for word in words:
    if word not in fasttext_embeddings:
        fasttext_embeddings[word] = mean_vector.tolist()
print("Handled missing words (if any).")

# Step 5: Verify Embedding Dimensions
# Only problems are reported per word; the success case is summarised in one
# line instead of printing one line for every vocabulary entry.
print("Verifying embedding dimensions...")
mismatched_words = [
    word
    for word, embedding in fasttext_embeddings.items()
    if len(embedding) != embedding_dim
]
for word in mismatched_words:
    print(
        f"Error: Embedding for '{word}' has incorrect dimension "
        f"{len(fasttext_embeddings[word])}."
    )
if mismatched_words:
    # Do not write wrongly sized vectors into a file whose name promises
    # EMBEDDING_DIM-dimensional embeddings.
    raise ValueError(
        f"{len(mismatched_words)} embeddings do not have the expected "
        f"dimension {embedding_dim}; nothing was saved."
    )
print(
    f"All {len(fasttext_embeddings)} embeddings have correct dimension "
    f"{embedding_dim}."
)

# Step 6: Save Embeddings
with open(OUTPUT_EMBEDDINGS_FILE, "wb") as f:
    pickle.dump(fasttext_embeddings, f)  # Save embeddings to a pickle file
print(f"FastText embeddings saved to {OUTPUT_EMBEDDINGS_FILE}.")


Training FastText model...
FastText model trained successfully.
Loaded vocabulary with 3867 words.
Generating FastText embeddings...
Generated embeddings for 3867 words.
Handled missing words (if any).
Verifying embedding dimensions...
Error: Embedding for 'apples' has incorrect dimension 15.
Error: Embedding for 'expect' has incorrect dimension 15.
Error: Embedding for 'translucent' has incorrect dimension 15.
Error: Embedding for 'appropriate' has incorrect dimension 15.
Error: Embedding for 'lentil' has incorrect dimension 15.
Error: Embedding for 'ARE' has incorrect dimension 15.
Error: Embedding for 'time' has incorrect dimension 15.
Error: Embedding for 'selected' has incorrect dimension 15.
Error: Embedding for 'intimate' has incorrect dimension 15.
Error: Embedding for 'MOSHPHERE' has incorrect dimension 15.
Error: Embedding for 'marsala' has incorrect dimension 15.
Error: Embedding for 'saturday' has incorrect dimension 15.
Error: Embedding for 'Murrays' has incorrect dimensio