In [None]:
!uv add sentence-transformers scikit-learn matplotlib numpy pandas

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import seaborn as sns

# ================================
# 1. BASIC EMBEDDING CONCEPTS
# ================================

print("=== Understanding Embeddings ===")

# Pre-trained sentence-transformer checkpoint reused by every later section;
# it maps each input text to a 384-dimensional vector.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sample inputs: two paraphrase pairs (cat/feline, dogs/canines) followed by
# three sentences on unrelated topics.
sentences = [
    "The cat sits on the mat",
    "A feline rests on the carpet",
    "Dogs are loyal companions",
    "Canines make faithful friends",
    "I love eating pizza",
    "Python is a programming language",
    "Machine learning is fascinating",
]

# Encode the whole list in one call; the result is indexed like `sentences`.
embeddings = model.encode(sentences)

# Report how many texts were embedded and the size of each vector.
print(
    f"Number of sentences: {len(sentences)}",
    f"Embedding shape: {embeddings.shape}",
    f"Each sentence becomes a vector of {embeddings.shape[1]} numbers",
    sep="\n",
)

# ================================
# 2. EXPLORING SIMILARITY
# ================================

print("\n=== Measuring Semantic Similarity ===")

# Pairwise cosine similarity of every sentence against every other sentence.
similarity_matrix = cosine_similarity(embeddings)

# Label both axes with the sentence text so the heatmap is self-describing.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=sentences,
    columns=sentences,
)

# Compare the first sentence (cat) with a paraphrase (index 1) and with an
# unrelated sentence (index 4, pizza).
for heading, other_idx in (
    ("Similarity between first two sentences (cat/feline):", 1),
    ("\nSimilarity between cat sentence and pizza sentence:", 4),
):
    print(heading)
    print(f"{similarity_matrix[0, other_idx]:.3f}")

# Heatmap of the full matrix; the diverging colormap is centered at 0.5.
plt.figure(figsize=(12, 8))
sns.heatmap(
    similarity_df,
    annot=True,
    cmap="coolwarm",
    center=0.5,
)
plt.title("Semantic Similarity Between Sentences")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# ================================
# 3. SEMANTIC SEARCH EXAMPLE
# ================================

print("\n=== Practical Example: Semantic Search ===")

# Toy product catalog that the search, PCA and clustering sections all reuse.
product_descriptions = [
    "Wireless bluetooth headphones with noise cancellation",
    "Lightweight running shoes for marathon training",
    "Organic coffee beans from Colombian highlands",
    "Waterproof smartphone case for outdoor adventures",
    "Professional camera lens for portrait photography",
    "Ergonomic office chair with lumbar support",
    "Stainless steel water bottle keeps drinks cold",
    "Gaming laptop with high refresh rate display",
]

# Embed the catalog once up front so each query only has to embed itself.
product_embeddings = model.encode(product_descriptions)

def semantic_search(query, top_k=3):
    """Return the products most semantically similar to a search query.

    The query is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the precomputed ``product_embeddings``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. ``None`` returns every
            product; zero or a negative value returns an empty list.

    Returns:
        A list of dicts ordered from most to least similar, each with the
        keys ``'product'`` (the description text) and ``'similarity'`` (its
        cosine similarity to the query).
    """
    # Without this guard a negative top_k would slice from the wrong end and
    # silently return "all but the last |top_k|" matches.
    if top_k is not None and top_k <= 0:
        return []

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, product_embeddings)[0]

    # argsort is ascending, so reverse it to rank the best matches first.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [
        {
            'product': product_descriptions[idx],
            'similarity': similarities[idx],
        }
        for idx in top_indices
    ]

# Example queries; none of them repeats a product description verbatim, so
# the ranking has to come from the embeddings rather than keyword overlap.
queries = [
    "audio equipment for music",
    "fitness gear for running",
    "photography equipment",
    "something to drink water",
]

for query in queries:
    print(f"\nSearch: '{query}'")
    results = semantic_search(query)
    # Number the hits from 1 for display.
    for rank, hit in enumerate(results, start=1):
        product, score = hit['product'], hit['similarity']
        print(f"{rank}. {product} (similarity: {score:.3f})")

# ================================
# 4. VISUALIZING EMBEDDINGS IN 2D
# ================================

print("\n=== Visualizing High-Dimensional Embeddings ===")

# Project the product embeddings onto their first two principal components
# so they can be drawn on a flat plot.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(product_embeddings)
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_

# One point per product, colored by its position in the catalog.
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    c=range(len(product_descriptions)),
    cmap="tab10",
    s=100,
)

# Annotate every point with its description, shortened to 30 characters.
for desc, point in zip(product_descriptions, embeddings_2d):
    label = desc
    if len(desc) > 30:
        label = desc[:30] + "..."
    plt.annotate(
        label,
        xy=(point[0], point[1]),
        xytext=(5, 5),
        textcoords="offset points",
        fontsize=9,
    )

plt.title("Product Embeddings Visualized in 2D Space")
plt.xlabel(f"PC1 ({pc1_ratio:.2%} variance)")
plt.ylabel(f"PC2 ({pc2_ratio:.2%} variance)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Share of the original variance that survives the 2-D projection.
print(f"PCA explains {pc1_ratio + pc2_ratio:.2%} of total variance")

# ================================
# 5. CLUSTERING WITH EMBEDDINGS
# ================================

print("\n=== Clustering Similar Content ===")

from sklearn.cluster import KMeans

# Cluster in the full embedding space; the 2-D PCA coordinates computed
# earlier are used only to draw the result.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(product_embeddings)

# Visualize clusters
plt.figure(figsize=(12, 8))
colors = ['red', 'blue', 'green', 'purple', 'orange']

for cluster_id in range(n_clusters):
    # Boolean mask selects the 2-D points assigned to this cluster.
    cluster_points = embeddings_2d[clusters == cluster_id]
    plt.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        # Cycle through the palette so raising n_clusters above len(colors)
        # reuses colors instead of raising IndexError.
        c=colors[cluster_id % len(colors)],
        label=f'Cluster {cluster_id}',
        s=100,
    )

# Label every point with its description, shortened to 25 characters.
for desc, point in zip(product_descriptions, embeddings_2d):
    label = desc[:25] + "..." if len(desc) > 25 else desc
    plt.annotate(
        label,
        (point[0], point[1]),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=9,
    )

plt.title('Product Clustering Based on Embeddings')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Show which products are in each cluster
print("Cluster assignments:")
for cluster_id in range(n_clusters):
    # Pair each description with its assigned cluster instead of indexing
    # both sequences by position.
    products_in_cluster = [
        product
        for product, assigned in zip(product_descriptions, clusters)
        if assigned == cluster_id
    ]
    print(f"\nCluster {cluster_id}:")
    for product in products_in_cluster:
        print(f"  - {product}")

# ================================
# 6. PRACTICAL ML PIPELINE
# ================================

print("\n=== Using Embeddings for Classification ===")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Tiny hand-labelled sentiment dataset; positive and negative reviews
# alternate.
texts = [
    "This movie was absolutely amazing! Great acting and plot.",
    "Terrible film, wasted my time. Poor story and bad acting.",
    "Outstanding performance by the lead actor. Highly recommend!",
    "Boring and predictable. Not worth watching.",
    "Incredible cinematography and emotional depth.",
    "Awful movie with terrible dialogue and weak characters.",
    "Masterpiece! One of the best films I've ever seen.",
    "Disappointing sequel that ruins the original.",
    "Beautiful storytelling and excellent character development.",
    "Completely overrated. Don't believe the hype."
]

# Labels: 1 = positive, 0 = negative
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# The embeddings are the feature vectors fed to the classifier.
text_embeddings = model.encode(texts)

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    text_embeddings, labels, test_size=0.3, random_state=42
)

# Train classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

print("Classification Results:")
# Pin `labels` so the two target names always line up with classes 0 and 1,
# even if the small test split happens to contain only one class (without it
# classification_report raises on the name/class count mismatch).
# zero_division=0 reports 0.0 for a never-predicted class without warning.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['Negative', 'Positive'],
        zero_division=0,
    )
)

# Test on new examples
new_reviews = [
    "This movie exceeded all my expectations!",
    "Worst film I've seen this year."
]

new_embeddings = model.encode(new_reviews)
predictions = classifier.predict(new_embeddings)
probabilities = classifier.predict_proba(new_embeddings)

# Walk the three parallel sequences together rather than indexing by position.
for review, predicted, class_probs in zip(new_reviews, predictions, probabilities):
    sentiment = "Positive" if predicted == 1 else "Negative"
    # Confidence is the probability of the more likely class.
    confidence = max(class_probs)
    print(f"\nReview: '{review}'")
    print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.3f})")

# Closing summary of what the sections above demonstrated.
print("\n=== Key Takeaways ===")
key_takeaways = (
    "1. Embeddings convert text to dense numerical vectors",
    "2. Similar meanings result in similar vectors (cosine similarity)",
    "3. Can be used for search, clustering, and classification",
    "4. Pre-trained models like sentence-transformers work well out-of-the-box",
    "5. Embeddings enable semantic understanding in ML pipelines",
)
print(*key_takeaways, sep="\n")