# Casino Game Embeddings Exploration

## Objectives
- Analyze embedding characteristics
- Identify clustering patterns
- Explore semantic relationships

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objs as go

# Load embeddings
df = pd.read_csv('game_embeddings.csv')

# Number of trailing CSV columns that hold the embedding vector.
# Adjust based on actual embedding dimensions.
EMBEDDING_DIM = 1536

# A negative slice longer than the column index silently returns *all*
# columns, which would mix non-embedding columns into the matrix. Fail early
# with a clear message instead.
if df.shape[1] < EMBEDDING_DIM:
    raise ValueError(
        f"Expected at least {EMBEDDING_DIM} embedding columns, "
        f"but the CSV only has {df.shape[1]} columns"
    )

# Extract embedding columns (the last EMBEDDING_DIM columns of the frame)
embedding_columns = df.columns[-EMBEDDING_DIM:]
embeddings = df[embedding_columns].to_numpy()

# Basic embedding statistics
print("Embedding Shape:", embeddings.shape)
print("\nEmbedding Statistics:")
print(pd.DataFrame(embeddings).describe())

In [None]:
# Clustering Analysis
def find_optimal_clusters(embeddings, max_clusters=10):
    """Score k-means clusterings with the silhouette score and plot the result.

    Fits a KMeans model for every cluster count from 2 up to ``max_clusters``
    and computes the silhouette score of each labelling, then shows a line
    chart of score versus cluster count.

    Args:
        embeddings: 2-D array-like with one row per sample.
        max_clusters: Largest cluster count to evaluate (inclusive). It is
            clamped to ``len(embeddings) - 1`` because the silhouette score
            is only defined for 2 .. n_samples - 1 distinct labels.

    Returns:
        The list of silhouette scores, one per evaluated cluster count,
        starting at 2 clusters. Empty when no cluster count is valid.
    """
    # silhouette_score rejects label counts above n_samples - 1.
    largest_k = min(max_clusters, len(embeddings) - 1)
    cluster_counts = list(range(2, largest_k + 1))

    silhouette_scores = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(embeddings)
        score = silhouette_score(embeddings, cluster_labels)
        silhouette_scores.append(score)

    # Plot silhouette scores. The file imports plotly, not matplotlib, so the
    # chart is drawn with plotly's graph objects.
    fig = go.Figure(
        go.Scatter(x=cluster_counts, y=silhouette_scores, mode='lines+markers')
    )
    fig.update_layout(
        title='Silhouette Scores for Different Cluster Counts',
        xaxis_title='Number of Clusters',
        yaxis_title='Silhouette Score',
        width=1000,
        height=500,
    )
    fig.show()

    return silhouette_scores

# Find optimal clusters: score cluster counts from 2 through 10 (the default
# max_clusters) so a cluster count can be picked from the resulting plot.
find_optimal_clusters(embeddings)

In [None]:
# Perform clustering with optimal number of clusters
from sklearn.decomposition import PCA

# Cluster count to use. 5 is only a placeholder; replace it with the value
# suggested by the silhouette score plot from the previous cell.
optimal_clusters = 5

# Fit k-means on the raw embeddings and store each row's label on the frame.
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)
df['cluster'] = cluster_labels

# Project the embeddings onto their first three principal components so they
# can be drawn in a 3D scatter plot.
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)
first_pc = embeddings_3d[:, 0]
second_pc = embeddings_3d[:, 1]
third_pc = embeddings_3d[:, 2]

# 3D scatter plot: one point per game, coloured by cluster label (cast to
# str so the labels are treated as categories), with the game text on hover.
fig = px.scatter_3d(
    x=first_pc,
    y=second_pc,
    z=third_pc,
    color=df['cluster'].astype(str),
    hover_data=[df['Text']],
    title='Game Embeddings Clustered',
)
fig.write_html('game_clusters_3d.html')

# Per-cluster overview: member count plus the first and last game text.
text_by_cluster = df.groupby('cluster')['Text']
cluster_summary = text_by_cluster.agg(['count', 'first', 'last'])
print("\nCluster Summary:")
print(cluster_summary)

In [None]:
# Semantic Similarity Analysis
def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector (1-D array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the cosine is undefined; 0.0 is returned in that case instead
        of NaN so callers can still sort and format the result.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid a 0/0 division, which would yield NaN plus a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / denominator

# Find most similar games within each cluster
def find_similar_games(df, embeddings, cluster, top_n=5):
    """Find top N most similar games within a cluster.

    Args:
        df: DataFrame with a 'cluster' column of labels and a 'Text' column
            identifying each game; its rows align with ``embeddings``.
        embeddings: 2-D numeric array-like with one row per row of ``df``.
        cluster: Cluster label whose members are compared with each other.
        top_n: Maximum number of neighbours to report per game.

    Returns:
        A list with one dict per game in the cluster, in row order. Each
        dict has 'game' (the game's 'Text' value) and 'similar_games', a list
        of up to ``top_n`` ``(similarity, text)`` tuples sorted by descending
        cosine similarity. Ties keep row order. A zero-norm embedding has
        similarity 0.0 to everything.
    """
    cluster_mask = (df['cluster'] == cluster).to_numpy()
    cluster_embeddings = np.asarray(embeddings, dtype=float)[cluster_mask]
    cluster_games = df.loc[cluster_mask, 'Text'].to_numpy()

    # Normalise every row once; cosine similarity then reduces to a dot
    # product, so each game needs a single matrix-vector product instead of
    # a Python-level loop that recomputes both norms for every pair.
    norms = np.linalg.norm(cluster_embeddings, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 so they stay all-zero rather than NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_embeddings = cluster_embeddings / safe_norms

    similarities = []
    for i, game in enumerate(cluster_games):
        row_sims = unit_embeddings @ unit_embeddings[i]
        game_sims = [
            (sim, cluster_games[j])
            for j, sim in enumerate(row_sims)
            if j != i  # a game is not its own neighbour
        ]
        # Sort on the similarity only: comparing whole tuples would fall back
        # to comparing the game texts on ties, which fails for mixed types.
        game_sims.sort(key=lambda pair: pair[0], reverse=True)
        similarities.append({
            'game': game,
            'similar_games': game_sims[:top_n]
        })

    return similarities

# Report the nearest neighbours of a few sample games from every cluster
for cluster_id in range(optimal_clusters):
    print(f"\nCluster {cluster_id} Similar Games:")
    cluster_results = find_similar_games(df, embeddings, cluster_id)
    # Only the first 3 games of each cluster are printed in detail.
    detailed_results = cluster_results[:3]
    for entry in detailed_results:
        print(f"\nGame: {entry['game']}")
        print("Similar Games:")
        for score, neighbour in entry['similar_games']:
            print(f"  - {neighbour} (Similarity: {score:.4f})")