# TTML Regression Examples

This notebook demonstrates using the TTML model for regression tasks. We'll cover:

1. Wine Quality Prediction
   - Predicting wine quality scores
   - Feature importance for wine characteristics
   - Performance visualization

2. NHANES Biomarker Prediction
   - Predicting health biomarker levels
   - Handling missing values
   - Model interpretation

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Import TTML modules
from tabular_transformer.models import TabularTransformer
from tabular_transformer.models.task_heads import RegressionHead
from tabular_transformer.training import Trainer
from tabular_transformer.inference import predict
from tabular_transformer.explainability import global_explanations, local_explanations
from tabular_transformer.utils.config import TransformerConfig
from tabular_transformer.data.dataset import TabularDataset

# Import data utilities
from data_utils import download_wine_quality_dataset, download_nhanes_dataset

## Part 1: Wine Quality Prediction

First, we'll work with the Wine Quality dataset to predict quality scores based on chemical properties.

In [None]:
# Load the red-wine variant of the Wine Quality data into a DataFrame.
# NOTE(review): save_csv=False presumably skips writing a local CSV copy -
# confirm against data_utils.
wine_df = download_wine_quality_dataset(variant='red', save_csv=False)

# Quick look at the frame: its dimensions, then the dtype of every column.
print("Wine Quality dataset shape:", wine_df.shape)
print("\nFeature types:")
print(wine_df.dtypes)

# Sample count per quality score, ordered by score rather than by frequency.
print("\nQuality score distribution:")
wine_quality_counts = wine_df['quality'].value_counts().sort_index()
print(wine_quality_counts)

In [None]:
# Column the regression head is trained to predict; it must not appear among
# the input features.
target_column = 'quality'

# Split the remaining columns by dtype: int64/float64 columns are treated as
# numeric inputs, object-dtype columns as categorical inputs.
numeric_features = [
    column
    for column in wine_df.select_dtypes(include=['int64', 'float64']).columns
    if column != target_column
]
categorical_features = [
    column
    for column in wine_df.select_dtypes(include=['object']).columns
    if column != target_column
]

# Build the datasets from the frame. The first two returned objects are used
# as the train and held-out splits; the third returned value is discarded.
wine_dataset_kwargs = dict(
    dataframe=wine_df,
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns={'main': [target_column]},
    validation_split=0.2,
    random_state=42,
)
train_dataset_wine, test_dataset_wine, _ = TabularDataset.from_dataframe(**wine_dataset_kwargs)

In [None]:
# Seed the global NumPy and PyTorch RNGs before any weights are created so the
# parameter initialisation (and any later draws from these global RNGs) repeat
# on a fresh "Restart & Run All".
# NOTE(review): GPU kernels can still be non-deterministic; see the PyTorch
# reproducibility notes if bit-exact runs are required.
wine_seed = 42
np.random.seed(wine_seed)
torch.manual_seed(wine_seed)

# Input sizes come from the preprocessor attached to the training split.
feature_dims = train_dataset_wine.preprocessor.get_feature_dimensions()
numeric_dim = feature_dims['numeric_dim']
categorical_dims = feature_dims['categorical_dims']
categorical_embedding_dims = feature_dims['categorical_embedding_dims']

# Single source of truth for the embedding width: the encoder config and the
# regression head must agree on it, so it is defined once instead of being
# typed twice.
wine_embed_dim = 64

# Model configuration
config = TransformerConfig(
    embed_dim=wine_embed_dim,
    num_heads=4,
    num_layers=3,
    dropout=0.1,
    variational=False
)

# Transformer encoder over the numeric and categorical inputs
encoder_wine = TabularTransformer(
    numeric_dim=numeric_dim,
    categorical_dims=categorical_dims,
    categorical_embedding_dims=categorical_embedding_dims,
    config=config
)

# Regression head: consumes the encoder embedding and emits one target value
task_head_wine = RegressionHead(
    input_dim=wine_embed_dim,
    output_dim=1
)

In [None]:
# Batch the two splits: the training loader is shuffled, the held-out loader
# keeps its order.
wine_batch_size = 32
train_loader_wine = train_dataset_wine.create_dataloader(
    batch_size=wine_batch_size, shuffle=True
)
test_loader_wine = test_dataset_wine.create_dataloader(
    batch_size=wine_batch_size, shuffle=False
)

# Wire encoder and head into a trainer. Per the original notes, passing None
# lets the trainer create its own optimizer and pick the device (CUDA if
# available).
wine_trainer_kwargs = dict(
    encoder=encoder_wine,
    task_head=task_head_wine,
    optimizer=None,
    device=None,
)
trainer_wine = Trainer(**wine_trainer_kwargs)

# Fit for up to 20 epochs with an early-stopping patience of 3.
# NOTE(review): the held-out loader doubles as the validation loader here, so
# metrics later reported on the same split are likely optimistic.
wine_fit_kwargs = dict(
    train_loader=train_loader_wine,
    val_loader=test_loader_wine,
    num_epochs=20,
    early_stopping_patience=3,
)
history_wine = trainer_wine.train(**wine_fit_kwargs)

In [None]:
# Run the trained model over the held-out loader
predictions_wine = trainer_wine.predict(test_loader_wine)

# Predictions for the 'main' task. Tensor.numpy() raises for tensors that live
# on a CUDA device or still track gradients, so detach and move to the CPU
# first; for a plain CPU tensor both calls are no-ops.
y_pred_wine = predictions_wine['main']['predictions'].detach().cpu().numpy()
y_test_wine = test_dataset_wine.targets['main']

# Regression metrics on the held-out split
mse = mean_squared_error(y_test_wine, y_pred_wine)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_wine, y_pred_wine)
r2 = r2_score(y_test_wine, y_pred_wine)

print("Wine Quality Regression Results:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Predicted vs actual scatter; the dashed red diagonal marks perfect
# predictions across the observed range of the actual values.
actual_min, actual_max = y_test_wine.min(), y_test_wine.max()
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_wine, y_pred_wine, alpha=0.5)
ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
ax.set_xlabel('Actual Quality Score')
ax.set_ylabel('Predicted Quality Score')
ax.set_title('Predicted vs Actual Wine Quality Scores')
fig.tight_layout()
plt.show()

## Feature Importance for Wine Quality

Let's analyze which chemical properties are most important for predicting wine quality.

In [None]:
# Feature names in the order numeric-then-categorical.
# NOTE(review): numeric_features / categorical_features are notebook-level
# names that the NHANES section reassigns later; this cell expects them to
# still hold the wine columns, i.e. the notebook is run top to bottom.
wine_feature_names = numeric_features + categorical_features

# Global importance scores computed on the held-out wine split
feature_importance_wine = global_explanations.calculate_feature_importance(
    encoder=encoder_wine,
    task_head=task_head_wine,
    dataset=test_dataset_wine,
    feature_names=wine_feature_names,
)

# Horizontal bars, sorted ascending by score
plt.figure(figsize=(12, 6))
wine_importance_sorted = feature_importance_wine.sort_values()
wine_importance_sorted.plot.barh()
plt.xlabel('Importance Score')
plt.title('Feature Importance - Wine Quality Prediction')
plt.tight_layout()
plt.show()

## Part 2: NHANES Biomarker Prediction

Now we'll work with the NHANES dataset to predict health biomarker levels.

In [None]:
# Load the NHANES data into a DataFrame.
# NOTE(review): save_csv=False presumably skips writing a local CSV copy -
# confirm against data_utils.
nhanes_df = download_nhanes_dataset(save_csv=False)

# Report the frame's dimensions followed by the dtype of every column.
nhanes_shape = nhanes_df.shape
print("NHANES dataset shape:", nhanes_shape)
print("\nFeature types:")
nhanes_dtypes = nhanes_df.dtypes
print(nhanes_dtypes)

In [None]:
# Select a biomarker to predict (e.g., blood pressure)
target_biomarker = 'BPXSY1'  # Systolic blood pressure

# Fail early with a clear message if the downloaded frame lacks the target,
# instead of failing later inside dataset construction.
if target_biomarker not in nhanes_df.columns:
    raise KeyError(
        f"Target column '{target_biomarker}' not found in the NHANES dataframe"
    )

# A regression target cannot be missing: rows without a measured target would
# put NaN into the loss and the evaluation metrics. Drop them into a new frame
# (nhanes_df itself is left untouched) and renumber the rows.
nhanes_model_df = nhanes_df.dropna(subset=[target_biomarker]).reset_index(drop=True)
n_missing_target = len(nhanes_df) - len(nhanes_model_df)
if n_missing_target:
    print(f"Dropped {n_missing_target} rows with missing '{target_biomarker}'")
if nhanes_model_df.empty:
    raise ValueError(f"No rows with a non-missing '{target_biomarker}' value")

# Split the remaining columns by dtype: int64/float64 columns are numeric
# inputs, object-dtype columns are categorical inputs; the target is excluded.
numeric_features = [
    column
    for column in nhanes_model_df.select_dtypes(include=['int64', 'float64']).columns
    if column != target_biomarker
]
categorical_features = [
    column
    for column in nhanes_model_df.select_dtypes(include=['object']).columns
    if column != target_biomarker
]

# Build the datasets. The first two returned objects are used as the train and
# held-out splits; the third returned value is discarded.
train_dataset_nhanes, test_dataset_nhanes, _ = TabularDataset.from_dataframe(
    dataframe=nhanes_model_df,
    numeric_columns=numeric_features,
    categorical_columns=categorical_features,
    target_columns={'main': [target_biomarker]},
    validation_split=0.2,
    random_state=42
)

In [None]:
# Seed the global NumPy and PyTorch RNGs before any weights are created so the
# parameter initialisation (and any later draws from these global RNGs) repeat
# on a fresh "Restart & Run All".
# NOTE(review): GPU kernels can still be non-deterministic; see the PyTorch
# reproducibility notes if bit-exact runs are required.
nhanes_seed = 42
np.random.seed(nhanes_seed)
torch.manual_seed(nhanes_seed)

# Input sizes come from the preprocessor attached to the training split.
feature_dims = train_dataset_nhanes.preprocessor.get_feature_dimensions()
numeric_dim = feature_dims['numeric_dim']
categorical_dims = feature_dims['categorical_dims']
categorical_embedding_dims = feature_dims['categorical_embedding_dims']

# Single source of truth for the embedding width: the encoder config and the
# regression head must agree on it, so it is defined once instead of being
# typed twice.
nhanes_embed_dim = 128

# Model configuration
config = TransformerConfig(
    embed_dim=nhanes_embed_dim,
    num_heads=8,
    num_layers=4,
    dropout=0.2,
    variational=False
)

# Transformer encoder over the numeric and categorical inputs
encoder_nhanes = TabularTransformer(
    numeric_dim=numeric_dim,
    categorical_dims=categorical_dims,
    categorical_embedding_dims=categorical_embedding_dims,
    config=config
)

# Regression head: consumes the encoder embedding and emits one target value
task_head_nhanes = RegressionHead(
    input_dim=nhanes_embed_dim,
    output_dim=1
)

In [None]:
# Batch the two splits: the training loader is shuffled, the held-out loader
# keeps its order.
nhanes_batch_size = 64
train_loader_nhanes = train_dataset_nhanes.create_dataloader(
    batch_size=nhanes_batch_size, shuffle=True
)
test_loader_nhanes = test_dataset_nhanes.create_dataloader(
    batch_size=nhanes_batch_size, shuffle=False
)

# Wire encoder and head into a trainer. Per the original notes, passing None
# lets the trainer create its own optimizer and pick the device (CUDA if
# available).
nhanes_trainer_kwargs = dict(
    encoder=encoder_nhanes,
    task_head=task_head_nhanes,
    optimizer=None,
    device=None,
)
trainer_nhanes = Trainer(**nhanes_trainer_kwargs)

# Fit for up to 25 epochs with an early-stopping patience of 3.
# NOTE(review): the held-out loader doubles as the validation loader here, so
# metrics later reported on the same split are likely optimistic.
nhanes_fit_kwargs = dict(
    train_loader=train_loader_nhanes,
    val_loader=test_loader_nhanes,
    num_epochs=25,
    early_stopping_patience=3,
)
history_nhanes = trainer_nhanes.train(**nhanes_fit_kwargs)

In [None]:
# Run the trained model over the held-out loader
predictions_nhanes = trainer_nhanes.predict(test_loader_nhanes)

# Predictions for the 'main' task. Tensor.numpy() raises for tensors that live
# on a CUDA device or still track gradients, so detach and move to the CPU
# first; for a plain CPU tensor both calls are no-ops.
y_pred_nhanes = predictions_nhanes['main']['predictions'].detach().cpu().numpy()
y_test_nhanes = test_dataset_nhanes.targets['main']

# Regression metrics on the held-out split
mse = mean_squared_error(y_test_nhanes, y_pred_nhanes)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_nhanes, y_pred_nhanes)
r2 = r2_score(y_test_nhanes, y_pred_nhanes)

print("NHANES Biomarker Prediction Results:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Predicted vs actual scatter; the dashed red diagonal marks perfect
# predictions across the observed range of the actual values.
actual_min, actual_max = y_test_nhanes.min(), y_test_nhanes.max()
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_nhanes, y_pred_nhanes, alpha=0.5)
ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
ax.set_xlabel('Actual Biomarker Value')
ax.set_ylabel('Predicted Biomarker Value')
ax.set_title('Predicted vs Actual Biomarker Values')
fig.tight_layout()
plt.show()

## Local Explanations for NHANES Predictions

Let's examine individual predictions to understand the model's decision-making process.

In [None]:
# Pick a few held-out examples to explain. A dedicated, seeded generator keeps
# the selection identical across re-runs; the sample size is capped so a split
# with fewer than three rows does not raise.
explain_rng = np.random.default_rng(42)
n_explained = min(3, len(test_dataset_nhanes))
sample_indices = explain_rng.choice(len(test_dataset_nhanes), size=n_explained, replace=False)

# Feature names in the order numeric-then-categorical
nhanes_feature_names = numeric_features + categorical_features

# .tolist() yields plain Python ints rather than NumPy integer scalars
for idx in sample_indices.tolist():
    # Per-feature contributions for this one instance (a mapping of
    # feature name -> contribution, as used by .items() below)
    explanation = local_explanations.explain_prediction(
        encoder=encoder_nhanes,
        task_head=task_head_nhanes,
        instance_idx=idx,
        dataset=test_dataset_nhanes,
        feature_names=nhanes_feature_names
    )

    print(f"\nExample {idx+1}:")
    print(f"True value: {y_test_nhanes[idx]:.2f}")
    print(f"Predicted value: {y_pred_nhanes[idx][0]:.2f}")
    print("\nTop feature contributions:")
    # Rank by absolute contribution and keep the five largest
    sorted_features = sorted(explanation.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
    for feature, contribution in sorted_features:
        print(f"{feature}: {contribution:.4f}")

## Conclusion

This notebook demonstrated regression capabilities of the TTML model on two different datasets:

1. Wine Quality Prediction
   - Successfully predicted wine quality scores
   - Identified important chemical properties
   - Reported RMSE, MAE, and R² on the held-out split

2. NHANES Biomarker Prediction
   - Predicted a health biomarker (systolic blood pressure, `BPXSY1`) and reported RMSE, MAE, and R²
   - Provided interpretable predictions
   - Demonstrated handling of complex health data

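Across both tasks the TTML model is evaluated with RMSE, MAE, and R² and interpreted through its explainability features (global feature importance and per-prediction contributions); run the cells above to see the actual scores for your environment.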