# TTML Clustering Examples

This notebook demonstrates using the TTML model for clustering tasks. We'll cover:

1. Unsupervised Clustering
   - Feature extraction with transformer encoder
   - Clustering in latent space
   - Cluster visualization and analysis

2. Semi-supervised Learning
   - Combining labeled and unlabeled data
   - Cluster-based classification
   - Performance evaluation

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Import TTML modules
from tabular_transformer.models import TabularTransformer
from tabular_transformer.models.task_heads import ClusteringHead
from tabular_transformer.training import Trainer
from tabular_transformer.inference import predict
from tabular_transformer.explainability import global_explanations
from tabular_transformer.utils.config import TransformerConfig
from tabular_transformer.data.dataset import TabularDataset

# Import data utilities
from data_utils import download_adult_dataset, download_wine_quality_dataset

## Part 1: Unsupervised Clustering

We'll use the Wine Quality dataset to demonstrate unsupervised clustering based on chemical properties.

In [None]:
# Fetch the red-wine variant of the Wine Quality data
# (save_csv=False presumably skips writing a CSV copy — confirm in data_utils).
wine_df = download_wine_quality_dataset(variant='red', save_csv=False)

# Quick sanity check: overall shape first, then the dtype of every column.
print(f"Wine Quality dataset shape: {wine_df.shape}")
print("\nFeature types:", wine_df.dtypes, sep="\n")

In [None]:
# Name of the column holding the quality score; it must not be a model input.
quality_column = 'quality'

# Partition columns by dtype: 64-bit int/float columns are the numeric inputs,
# object-typed columns are the categorical inputs.
numeric_features = list(wine_df.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(wine_df.select_dtypes(include=['object']).columns)

# Drop the quality score from whichever group it landed in.
for column_group in (numeric_features, categorical_features):
    if quality_column in column_group:
        column_group.remove(quality_column)

# Unsupervised setting: the dataset is built without any target columns.
dataset_wine = TabularDataset(
    dataframe=wine_df,
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns=None,
)

In [None]:
# Seed PyTorch before any weights are created so a Restart & Run All
# initialises the encoder and clustering head identically every time.
SEED = 42
torch.manual_seed(SEED)

# Get feature dimensions from preprocessor
feature_dims = dataset_wine.preprocessor.get_feature_dimensions()
numeric_dim = feature_dims['numeric_dim']
categorical_dims = feature_dims['categorical_dims']
categorical_embedding_dims = feature_dims['categorical_embedding_dims']

# Single source of truth for the embedding width: the encoder config and the
# clustering head's input size must agree, so both read this constant.
WINE_EMBED_DIM = 64
# Number of clusters the head is asked to discover in the wine data.
WINE_N_CLUSTERS = 4

# Model configuration
config = TransformerConfig(
    embed_dim=WINE_EMBED_DIM,
    num_heads=4,
    num_layers=3,
    dropout=0.1,
    variational=False
)

# Initialize transformer encoder
encoder = TabularTransformer(
    numeric_dim=numeric_dim,
    categorical_dims=categorical_dims,
    categorical_embedding_dims=categorical_embedding_dims,
    config=config
)

# Initialize clustering head on top of the encoder output
clustering_head = ClusteringHead(
    input_dim=WINE_EMBED_DIM,
    n_clusters=WINE_N_CLUSTERS
)

In [None]:
# Training loader: mini-batches of 32 rows, reshuffled by the loader.
data_loader_wine = dataset_wine.create_dataloader(shuffle=True, batch_size=32)

# Pair the encoder with the clustering head. Passing None for optimizer and
# device leaves both choices to the Trainer (per the library's convention, it
# builds its own optimizer and picks CUDA when available — confirm in Trainer).
trainer = Trainer(
    task_head=clustering_head,
    encoder=encoder,
    device=None,
    optimizer=None,
)

# Training schedule: at most 20 epochs, early-stopping patience of 3.
wine_training_options = dict(
    train_loader=data_loader_wine,
    num_epochs=20,
    early_stopping_patience=3,
)
history = trainer.train(**wine_training_options)

In [None]:
# Predict with a NON-shuffled loader. The training loader was created with
# shuffle=True, so predicting through it would return cluster ids in a random
# order that no longer lines up with the rows of dataset_wine / wine_df.
eval_loader_wine = dataset_wine.create_dataloader(batch_size=32, shuffle=False)
predictions = trainer.predict(eval_loader_wine)

# Hard cluster id per row = most probable cluster.
# .detach().cpu() makes the NumPy conversion safe when the model ran on CUDA.
cluster_probs = predictions['main']['probabilities']
cluster_assignments = torch.argmax(cluster_probs, dim=1).detach().cpu().numpy()

# Calculate silhouette score on the preprocessed input features.
# silhouette_score needs between 2 and n_samples - 1 distinct labels and
# raises otherwise, so guard against a collapsed clustering.
features = np.concatenate([dataset_wine.numeric_features, dataset_wine.categorical_features], axis=1)
n_found_clusters = np.unique(cluster_assignments).size
if 1 < n_found_clusters < len(cluster_assignments):
    silhouette_avg = silhouette_score(features, cluster_assignments)
    print(f"Silhouette Score: {silhouette_avg:.4f}")
else:
    silhouette_avg = float('nan')
    print(f"Silhouette Score: undefined ({n_found_clusters} distinct cluster(s) found)")

# Get latent representations (moved to CPU for the same reason as above)
latent_repr = predictions['latent_representations'].detach().cpu().numpy()

# Reduce dimensionality for visualization (fixed random_state for repeatability)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(latent_repr)

# Plot clusters: one point per wine, coloured by its assigned cluster
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cluster_assignments, cmap='viridis')
fig.colorbar(scatter, ax=ax)
ax.set_title('Wine Clusters in Latent Space')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
plt.show()

## Cluster Analysis

Let's analyze the characteristics of each cluster.

In [None]:
# Attach cluster ids to a COPY of the data. Writing the column into wine_df
# itself would make the notebook non-idempotent: re-running the feature
# selection cell would then pick up 'Cluster' as an int64 input feature.
wine_clustered = wine_df.assign(Cluster=cluster_assignments)

# Per-cluster mean and standard deviation. Restricting to numeric columns
# keeps the aggregation valid even if the frame carries non-numeric columns.
cluster_stats = (
    wine_clustered
    .select_dtypes(include='number')
    .groupby('Cluster')
    .agg(['mean', 'std'])
    .round(2)
)

print("Cluster Statistics:")
print(cluster_stats)

# Plot feature distributions by cluster: one box plot per input feature,
# laid out on a grid that is n_cols wide.
features = numeric_features + categorical_features
n_features = len(features)
n_cols = 3
n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

fig = plt.figure(figsize=(15, 4*n_rows))
for i, feature in enumerate(features, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)
    sns.boxplot(data=wine_clustered, x='Cluster', y=feature, ax=ax)
    ax.set_title(f'{feature} by Cluster')
fig.tight_layout()
plt.show()

## Part 2: Semi-supervised Learning

Now we'll demonstrate semi-supervised learning using the Adult Income dataset.

In [None]:
# Fetch the Adult Income data
# (save_csv=False presumably skips writing a CSV copy — confirm in data_utils).
adult_df = download_adult_dataset(save_csv=False)

# Quick sanity check: overall shape first, then the dtype of every column.
print(f"Adult dataset shape: {adult_df.shape}")
print("\nFeature types:", adult_df.dtypes, sep="\n")

In [None]:
# Identify numeric and categorical columns
numeric_features = adult_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = adult_df.select_dtypes(include=['object']).columns.tolist()

# Remove target column from features
target_column = 'class'
if target_column in numeric_features:
    numeric_features.remove(target_column)
if target_column in categorical_features:
    categorical_features.remove(target_column)

# Seeded generator so both the train/test split and the labeled/unlabeled
# split are identical on every Restart & Run All.
rng = np.random.default_rng(42)

# Hold out 20% of the rows for testing. The split is done explicitly here so
# the training rows are known: the labeled/unlabeled partition below must be
# drawn from the training rows ONLY. (Previously a mask sized to the training
# set was applied to the full adult_df, which mismatches in length and would
# also mix held-out rows into the training subsets.)
test_fraction = 0.2
shuffled_positions = rng.permutation(len(adult_df))
n_test = int(round(test_fraction * len(adult_df)))
test_df = adult_df.iloc[shuffled_positions[:n_test]].reset_index(drop=True)
train_df = adult_df.iloc[shuffled_positions[n_test:]].reset_index(drop=True)

# Training dataset. No preprocessor is passed, so the dataset is expected to
# create its own, as the wine dataset does in Part 1.
# NOTE(review): assumes the preprocessor is fitted on construction — confirm.
train_dataset_adult = TabularDataset(
    dataframe=train_df,
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns={'main': [target_column]}
)

# Test dataset reuses the training preprocessor so no test statistics leak in.
test_dataset_adult = TabularDataset(
    dataframe=test_df,
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns={'main': [target_column]},
    preprocessor=train_dataset_adult.preprocessor  # Use same preprocessor
)

# Simulate partially labeled data (50% labeled): mark a random half of the
# training rows as labeled.
n_samples = len(train_df)
n_labeled = n_samples // 2
labeled_mask = np.zeros(n_samples, dtype=bool)
labeled_mask[rng.choice(n_samples, size=n_labeled, replace=False)] = True
assert labeled_mask.sum() == n_labeled

# Create labeled and unlabeled datasets from disjoint training rows
labeled_dataset = TabularDataset(
    dataframe=train_df.iloc[labeled_mask].reset_index(drop=True),
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns={'main': [target_column]},
    preprocessor=train_dataset_adult.preprocessor  # Use same preprocessor
)

unlabeled_dataset = TabularDataset(
    dataframe=train_df.iloc[~labeled_mask].reset_index(drop=True),
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns=None,  # No targets for unlabeled data
    preprocessor=train_dataset_adult.preprocessor  # Use same preprocessor
)

In [None]:
# Seed PyTorch before any weights are created so the semi-supervised model is
# initialised identically on every Restart & Run All.
torch.manual_seed(42)

# Get feature dimensions from preprocessor
feature_dims = train_dataset_adult.preprocessor.get_feature_dimensions()
numeric_dim = feature_dims['numeric_dim']
categorical_dims = feature_dims['categorical_dims']
categorical_embedding_dims = feature_dims['categorical_embedding_dims']

# Single source of truth for the embedding width: the encoder config and the
# clustering head's input size must agree, so both read this constant.
ADULT_EMBED_DIM = 128
# Number of clusters for the semi-supervised head.
ADULT_N_CLUSTERS = 4

# Model configuration
config = TransformerConfig(
    embed_dim=ADULT_EMBED_DIM,
    num_heads=8,
    num_layers=4,
    dropout=0.2,
    variational=False
)

# Initialize transformer encoder
encoder_semi = TabularTransformer(
    numeric_dim=numeric_dim,
    categorical_dims=categorical_dims,
    categorical_embedding_dims=categorical_embedding_dims,
    config=config
)

# Initialize clustering head in semi-supervised mode
clustering_head_semi = ClusteringHead(
    input_dim=ADULT_EMBED_DIM,
    n_clusters=ADULT_N_CLUSTERS,
    semi_supervised=True
)

In [None]:
# Loaders: the two training streams are shuffled, the evaluation stream keeps
# dataset order so predictions stay aligned with the stored targets.
SEMI_BATCH_SIZE = 64
labeled_loader = labeled_dataset.create_dataloader(shuffle=True, batch_size=SEMI_BATCH_SIZE)
unlabeled_loader = unlabeled_dataset.create_dataloader(shuffle=True, batch_size=SEMI_BATCH_SIZE)
test_loader = test_dataset_adult.create_dataloader(shuffle=False, batch_size=SEMI_BATCH_SIZE)

# Pair the encoder with the semi-supervised head. Passing None for optimizer
# and device leaves both choices to the Trainer (per the library's convention,
# it builds its own optimizer and picks CUDA when available — confirm in Trainer).
trainer_semi = Trainer(
    task_head=clustering_head_semi,
    encoder=encoder_semi,
    device=None,
    optimizer=None,
)

# Training schedule: at most 25 epochs, early-stopping patience of 3.
# NOTE(review): the test loader doubles as the validation loader here, so the
# later evaluation on the same data is likely optimistic — consider a
# separate validation split.
semi_training_options = dict(
    train_loader=labeled_loader,
    unlabeled_loader=unlabeled_loader,
    val_loader=test_loader,
    num_epochs=25,
    early_stopping_patience=3,
)
history_semi = trainer_semi.train(**semi_training_options)

In [None]:
# Make predictions (test_loader is not shuffled, so rows stay aligned with
# test_dataset_adult)
predictions = trainer_semi.predict(test_loader)

# Get cluster assignments and latent representations.
# .detach().cpu() makes the NumPy conversion safe when the model ran on CUDA.
cluster_probs = predictions['main']['probabilities']
cluster_assignments = torch.argmax(cluster_probs, dim=1).detach().cpu().numpy()
latent_repr = predictions['latent_representations'].detach().cpu().numpy()

# Get true labels as a flat NumPy vector. The target was declared as a
# one-column list, so it may come back shaped (N, 1) and/or as a tensor;
# both the scatter colouring and adjusted_rand_score need 1-D labels.
y_test = test_dataset_adult.targets['main']
if torch.is_tensor(y_test):
    y_test = y_test.detach().cpu().numpy()
y_test = np.asarray(y_test).ravel()

# Reduce dimensionality (fixed random_state for repeatability)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(latent_repr)

# Plot results: same embedding twice, coloured by cluster vs. by true label
fig, (ax_clusters, ax_labels) = plt.subplots(1, 2, figsize=(15, 5))

# Plot clusters
scatter = ax_clusters.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cluster_assignments, cmap='viridis')
fig.colorbar(scatter, ax=ax_clusters)
ax_clusters.set_title('Learned Clusters')
ax_clusters.set_xlabel('t-SNE Component 1')
ax_clusters.set_ylabel('t-SNE Component 2')

# Plot true labels
scatter = ax_labels.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_test, cmap='viridis')
fig.colorbar(scatter, ax=ax_labels)
ax_labels.set_title('True Labels')
ax_labels.set_xlabel('t-SNE Component 1')
ax_labels.set_ylabel('t-SNE Component 2')

fig.tight_layout()
plt.show()

# Calculate adjusted Rand index between true labels and learned clusters
ari = adjusted_rand_score(y_test, cluster_assignments)
print(f"Adjusted Rand Index: {ari:.4f}")

## Feature Importance Analysis

Let's analyze which features are most important for the clustering.

In [None]:
# Score every input feature of the semi-supervised model on the test set.
# Names are passed numeric-first, then categorical.
importance_feature_names = numeric_features + categorical_features
feature_importance = global_explanations.calculate_feature_importance(
    encoder=encoder_semi,
    task_head=clustering_head_semi,
    dataset=test_dataset_adult,
    feature_names=importance_feature_names,
)

# Horizontal bar chart, sorted ascending so the largest score ends up on top.
fig, ax = plt.subplots(figsize=(12, 6))
feature_importance.sort_values().plot(kind='barh', ax=ax)
ax.set_title('Feature Importance for Clustering')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

## Conclusion

This notebook demonstrated the clustering capabilities of the TTML model:

1. Unsupervised Clustering
   - Grouped the Wine Quality samples into clusters learned in the transformer's latent space
   - Visualized cluster distributions and characteristics
   - Measured cluster separation with the silhouette score (see the value printed in Part 1)

2. Semi-supervised Learning
   - Combined labeled and unlabeled data in a single training run
   - Measured alignment between learned clusters and true labels with the Adjusted Rand Index (see the value printed in Part 2)
   - Identified important features for clustering

The TTML model showed its versatility in handling both fully unsupervised and semi-supervised learning tasks.