# Machine Learning Optimization with Interaction Terms
## Human-in-the-Loop Approach to Feature Engineering

This notebook demonstrates a systematic approach to improving machine learning models by:
1. Analyzing feature correlations
2. Engineering interaction terms
3. Comparing baseline vs. enhanced models
4. Providing human-guided optimization recommendations

In [None]:
# Setup
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Project modules live in ../src, so extend the import path before importing them
sys.path.append('../src')

from data_loader import DataLoader
from correlation_analyzer import CorrelationAnalyzer
from feature_engineer import FeatureEngineer
from model_optimizer import ModelOptimizer

# Plotting style: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# pandas display: no limit on columns, at most 100 rows
for option_name, option_value in {'display.max_columns': None, 'display.max_rows': 100}.items():
    pd.set_option(option_name, option_value)

print("✓ Environment setup complete")

## 1. Data Loading

Load your dataset or use a sample dataset for demonstration.

In [None]:
# Create the loader that provides both the CSV and the sample-dataset entry points
loader = DataLoader()

# Alternative source - your own CSV file. Uncomment and adapt:
# data, target = loader.load_csv('../data/your_dataset.csv', 'target_column'), 'target_column'

# Default source - a bundled sample dataset.
# Accepted names: 'diabetes', 'breast_cancer', 'california_housing'
data, target = loader.load_sample_dataset(
    'diabetes'
)

# Show the first ten rows as the cell output
print("\nDataset Preview:")
data.head(n=10)

In [None]:
# Quick look at the dataset: size, dtype mix, missing cells, target summary
print("Dataset Shape:", data.shape)

dtype_counts = data.dtypes.value_counts()
print("\nFeature Types:")
print(dtype_counts)

total_missing = data.isnull().sum().sum()
print("\nMissing Values:")
print(total_missing)

print("\nTarget Variable Statistics:")
target_column = data[target]
# Non-numeric or low-cardinality targets get a frequency table;
# anything else gets summary statistics.
looks_categorical = (
    target_column.dtype in ['object', 'category']
    or target_column.nunique() < 20
)
if looks_categorical:
    print(target_column.value_counts())
else:
    print(target_column.describe())

## 2. Correlation Analysis

Analyze feature correlations to understand relationships and identify potential interaction terms.

In [None]:
# Build the analyzer from the loaded frame and the target column name
analyzer = CorrelationAnalyzer(data, target)

# Pearson correlations; the returned matrix is kept for inspection
corr_matrix = analyzer.compute_correlations(
    method='pearson',
)

In [None]:
# Render the correlation heatmap (figure size 14 x 12)
analyzer.plot_correlation_heatmap(
    figsize=(14, 12),
)

In [None]:
# Plot feature-to-target correlations, limited to 15 features via top_n
analyzer.plot_target_correlations(
    top_n=15,
)

In [None]:
# Ask the analyzer for multicollinearity findings using a 0.85 threshold
multicollinearity = analyzer.identify_multicollinearity(
    threshold=0.85,
)

In [None]:
# Ask the analyzer for up to 15 candidate feature pairs, using
# correlation bounds of 0.15 (min) and 0.80 (max)
suggested_interactions = analyzer.suggest_interaction_terms(
    min_correlation=0.15, max_correlation=0.80, top_n=15
)

# One table row per suggested (feature, feature, score) triple, with the
# correlations looked up from the analyzer alongside the score
interaction_rows = [
    {
        'Feature 1': left,
        'Feature 2': right,
        'Score': pair_score,
        'F1 → Target': analyzer.target_correlations[left],
        'F2 → Target': analyzer.target_correlations[right],
        'F1 ↔ F2': analyzer.correlation_matrix.loc[left, right],
    }
    for left, right, pair_score in suggested_interactions
]
interactions_df = pd.DataFrame(interaction_rows)

print("\nSuggested Interaction Terms:")
interactions_df

## 3. Feature Engineering

Create interaction terms from the pairs suggested by the correlation analysis, optionally add polynomial features, and remove low-variance features.

In [None]:
# Feature engineer working on the loaded frame and target column name
engineer = FeatureEngineer(data, target)

# Keep only the feature names of the first ten suggestions (the score is dropped)
interaction_pairs = [
    (left, right)
    for left, right, _score in suggested_interactions[:10]
]

# Build the interaction columns.
# interaction_type options: 'multiply', 'add', 'divide', 'subtract', 'all'
data_with_interactions = engineer.create_interaction_terms(
    interaction_pairs=interaction_pairs, interaction_type='multiply'
)

In [None]:
# Optional step: degree-2 polynomial features for the first five entries
# of the analyzer's target-correlation series
leading_correlations = analyzer.target_correlations.head(5)
top_features = leading_correlations.index.tolist()
print(f"Creating polynomial features for: {top_features}")

# NOTE(review): the result replaces the frame returned by the interaction step;
# presumably the engineer accumulates features internally - confirm.
data_with_interactions = engineer.create_polynomial_features(
    features=top_features, degree=2
)

In [None]:
# Drop low-variance features using a 0.01 threshold
data_with_interactions = engineer.remove_low_variance_features(
    threshold=0.01,
)

In [None]:
# Summarise what the engineer produced and list at most 20 feature names
feature_summary = engineer.get_feature_summary()
created_names = engineer.created_features
hidden_count = len(created_names) - 20

print(f"\nNew Features Created: {feature_summary['created_features']}")
print("Feature List:")
for created_name in created_names[:20]:
    print(f"  • {created_name}")
# Mention how many names were left out when the list is longer than 20
if hidden_count > 0:
    print(f"  ... and {hidden_count} more")

## 4. Model Training: Baseline vs. Enhanced

Train two models and compare performance:
- **Baseline**: Original features only
- **Enhanced**: Original features plus the engineered features from Section 3 (interaction and polynomial terms)

In [None]:
# Determine task type.
# A non-numeric target (strings, categories) is always a classification
# problem, however many classes it has; a numeric target is treated as
# continuous only when it has more than 20 distinct values. Checking the
# dtype first keeps a high-cardinality label column from being sent down
# the regression path.
target_values = data[target]
is_continuous_target = (
    pd.api.types.is_numeric_dtype(target_values)
    and target_values.nunique() > 20
)
task_type = 'regression' if is_continuous_target else 'classification'
print(f"Task Type: {task_type.upper()}")

# Initialize optimizer (20% hold-out, fixed seed for reproducibility)
optimizer = ModelOptimizer(task_type=task_type, test_size=0.2, random_state=42)

In [None]:
# Baseline inputs: every original column except the target ...
original_features = [name for name in data.columns if name != target]

# ... plus the target itself, copied so the loaded frame is left untouched
baseline_data = data[[*original_features, target]].copy()

# Split (and scale) through the optimizer
X_train, X_test, y_train, y_test = optimizer.prepare_data(
    baseline_data, target, scale_features=True
)

In [None]:
# Fit the baseline model.
# model_type options: 'auto', 'logistic', 'random_forest', 'gradient_boosting'
baseline_model = optimizer.train_baseline_model(model_type='auto')

In [None]:
# Feature importance of the baseline model, limited to 15 entries via top_n
baseline_feature_names = X_train.columns.tolist()
baseline_importance = optimizer.get_feature_importance(
    baseline_model, baseline_feature_names, top_n=15
)

# Horizontal bar chart, one bar per row of the importance table
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(len(baseline_importance))
ax.barh(bar_positions, baseline_importance['importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(baseline_importance['feature'])
ax.set_xlabel('Importance')
ax.set_title('Baseline Model - Feature Importance')
ax.invert_yaxis()  # first table row at the top
plt.tight_layout()
plt.show()

In [None]:
# Enhanced inputs: a copy of the engineered frame, plus the engineer's list
# of created feature names (the same list object, not a copy)
interaction_feature_names = engineer.created_features
enhanced_data = data_with_interactions.copy()

# Split (and scale) through the same optimizer used for the baseline
X_train_enh, X_test_enh, y_train_enh, y_test_enh = optimizer.prepare_data(
    enhanced_data, target, scale_features=True
)

In [None]:
# Fit the enhanced model, telling the optimizer which columns are original
# and which were engineered
enhanced_model = optimizer.train_enhanced_model(
    baseline_features=original_features,
    interaction_features=interaction_feature_names,
    model_type='auto',
)

In [None]:
# Get enhanced feature importance.
# Only names that survived into the prepared training frame are passed on
# (engineered columns may have been removed before the split).
# NOTE(review): this assumes the order of `all_features` matches the column
# order the enhanced model was trained on - confirm against ModelOptimizer.
all_features = [f for f in original_features + interaction_feature_names if f in X_train_enh.columns]
enhanced_importance = optimizer.get_feature_importance(
    enhanced_model,
    all_features,
    top_n=15
)

# Visualize.
# A bar is highlighted when the feature is one the engineer created. The
# name-substring checks ('_X_', '_POW') are kept as a fallback so anything
# highlighted before is still highlighted; the membership test additionally
# covers engineered features whose names do not contain those markers.
engineered_names = set(interaction_feature_names)
plt.figure(figsize=(10, 6))
colors = [
    'orangered' if (f in engineered_names or '_X_' in f or '_POW' in f) else 'steelblue'
    for f in enhanced_importance['feature']
]
plt.barh(range(len(enhanced_importance)), enhanced_importance['importance'], color=colors)
plt.yticks(range(len(enhanced_importance)), enhanced_importance['feature'])
plt.xlabel('Importance')
plt.title('Enhanced Model - Feature Importance (Orange = Interaction Terms)')
plt.gca().invert_yaxis()  # first table row at the top
plt.tight_layout()
plt.show()

## 5. Model Comparison & Analysis

In [None]:
# Side-by-side comparison of the baseline and enhanced runs;
# the bare name on the last line renders the result as the cell output
comparison = optimizer.compare_models()

comparison

In [None]:
# Calculate improvement.
# Headline metric: test accuracy for classification, test R² otherwise.
if task_type == 'classification':
    score_key, metric_name = 'test_accuracy', 'Accuracy'
else:
    score_key, metric_name = 'test_r2', 'R² Score'

baseline_score = optimizer.baseline_results[score_key]
enhanced_score = optimizer.enhanced_results[score_key]

# Relative change as a percentage of the baseline's magnitude.
# Dividing by abs(baseline) keeps "positive = enhanced is better" even when
# the baseline is negative (R² can be below zero). A baseline of exactly
# zero has no meaningful relative change, so report 0 for "no change" and a
# signed infinity otherwise instead of dividing by zero.
score_delta = enhanced_score - baseline_score
if baseline_score != 0:
    improvement = score_delta / abs(baseline_score) * 100
elif score_delta == 0:
    improvement = 0.0
else:
    improvement = float('inf') if score_delta > 0 else float('-inf')

# Visualize comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Performance comparison (enhanced bar is grey when it did not improve)
models = ['Baseline', 'Enhanced']
scores = [baseline_score, enhanced_score]
colors = ['steelblue', 'orangered' if improvement > 0 else 'gray']

ax1.bar(models, scores, color=colors, alpha=0.7)
ax1.set_ylabel(metric_name)
ax1.set_title(f'Model Performance Comparison\n(Improvement: {improvement:+.2f}%)')
# Pad the axis by 5% of each bound's magnitude. Using abs() widens the range
# for negative scores too; plain multiplication would shrink it and clip bars.
lowest_score, highest_score = min(scores), max(scores)
ax1.set_ylim([
    lowest_score - 0.05 * abs(lowest_score),
    highest_score + 0.05 * abs(highest_score),
])
for i, v in enumerate(scores):
    ax1.text(i, v, f'{v:.4f}', ha='center', va='bottom')

# Feature count comparison
n_features = [optimizer.baseline_results['n_features'], optimizer.enhanced_results['n_features']]
ax2.bar(models, n_features, color=['steelblue', 'orangered'], alpha=0.7)
ax2.set_ylabel('Number of Features')
ax2.set_title('Feature Count Comparison')
for i, v in enumerate(n_features):
    ax2.text(i, v, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

print(f"\n{'='*60}")
print(f"PERFORMANCE IMPROVEMENT: {improvement:+.2f}%")
print(f"{'='*60}")

## 6. Human-in-the-Loop Recommendations

Based on the analysis, here are actionable recommendations:

In [None]:
# Print recommendations chosen by the size of the relative improvement
# (in percent): > 5 strong, > 1 moderate, > -1 minimal, otherwise degradation.
print("="*80)
print(" "*25 + "RECOMMENDATIONS")
print("="*80)

if improvement > 5:
    print("\n✓ STRONG IMPROVEMENT detected!")
    print("  → Interaction terms are highly beneficial for this problem")
    print("  → Consider using the enhanced model in production")
    print("  → Explore additional domain-specific interactions")
    print("  → Investigate top interaction features for insights")
    
elif improvement > 1:
    print("\n✓ MODERATE IMPROVEMENT detected")
    print("  → Interaction terms provide some benefit")
    print("  → Consider feature selection to reduce model complexity")
    print("  → Monitor for overfitting using cross-validation")
    print("  → Try different model algorithms (XGBoost, LightGBM)")
    
elif improvement > -1:
    print("\n→ MINIMAL CHANGE")
    print("  → Interactions do not add significant value")
    print("  → Consider using baseline model for simplicity")
    print("  → Explore alternative feature engineering approaches:")
    print("     - Domain-specific transformations")
    print("     - Temporal features (if time-series data)")
    print("     - Feature selection methods")
    
else:
    print("\n⚠ PERFORMANCE DEGRADATION")
    print("  → Interaction terms may cause overfitting")
    print("  → Recommendations:")
    print("     1. Use baseline model")
    print("     2. Apply feature selection (remove low-importance interactions)")
    print("     3. Use regularization (L1/L2)")
    print("     4. Increase training data if available")
    print("     5. Try ensemble methods with regularization")

# Identify valuable interaction terms
print("\n" + "-"*80)
print("VALUABLE INTERACTION TERMS:")
print("-"*80)

# Rows of the importance table that belong to engineered features, ordered
# by importance (highest first). Sorting here is what makes the "top N by
# importance" heading true; iterating the engineer's list would instead
# give the order in which the features were created.
ranked_interactions = (
    enhanced_importance[enhanced_importance['feature'].isin(interaction_feature_names)]
    .sort_values('importance', ascending=False)
)
interaction_features_in_model = ranked_interactions['feature'].tolist()

if interaction_features_in_model:
    print(f"\nTop {min(5, len(interaction_features_in_model))} interaction features by importance:")
    top_rows = ranked_interactions.head(5)
    for i, (feat, imp) in enumerate(zip(top_rows['feature'], top_rows['importance']), 1):
        print(f"  {i}. {feat}: {imp:.4f}")
else:
    print("\nNo interaction terms in top features - interactions may need refinement")

print("\n" + "="*80)

## 7. Save Results

In [None]:
# Persist the comparison table, both models and both importance tables.
# Make sure the output folders exist first: on a fresh checkout the writes
# below would otherwise fail. Creating '../results/models' with
# parents=True also creates '../results'.
Path('../results/models').mkdir(parents=True, exist_ok=True)

# Save comparison results
comparison.to_csv('../results/model_comparison.csv', index=False)
print("✓ Comparison results saved")

# Save models
optimizer.save_models(
    baseline_path='../results/models/baseline_model.joblib',
    enhanced_path='../results/models/enhanced_model.joblib'
)

# Save feature importance
baseline_importance.to_csv('../results/baseline_feature_importance.csv', index=False)
enhanced_importance.to_csv('../results/enhanced_feature_importance.csv', index=False)
print("✓ Feature importance saved")

print("\n✓ All results saved successfully!")

## Next Steps

1. **Experiment with different interaction types** (add, divide, subtract)
2. **Try alternative model algorithms** (XGBoost, LightGBM, Neural Networks)
3. **Perform cross-validation** for more robust evaluation
4. **Feature selection** to reduce model complexity
5. **Hyperparameter tuning** for both models
6. **Domain-specific feature engineering** based on problem context
7. **Deploy best model** to production environment