# pymars Demo Notebook

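This notebook demonstrates the core capabilities of pymars, a pure-Python implementation of the Multivariate Adaptive Regression Splines (MARS) algorithm. Throughout the notebook the package is imported under the alias `earth`, so calls such as `earth.Earth(...)` refer to pymars.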

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pymars as earth

# Seed NumPy's global RNG once so every run draws the same random numbers
SEED = 42
np.random.seed(SEED)

## Basic Regression Example

In [None]:
# Build a synthetic regression problem: four uniform features and a target
# made of a linear effect, a sine non-linearity, a pairwise interaction and
# Gaussian noise (std 0.1).
n_samples = 200
X = np.random.rand(n_samples, 4)

linear_part = 2 * X[:, 0]
sine_part = np.sin(X[:, 1] * np.pi)
interaction_part = X[:, 2] * X[:, 3]  # Interaction term
noise = np.random.normal(0, 0.1, n_samples)

# Summed left to right, in the same order as the terms are listed above
y = linear_part + sine_part + interaction_part + noise

print(f"Dataset shape: {X.shape}")
print(f"Target range: [{y.min():.3f}, {y.max():.3f}]")

In [None]:
# Fit an Earth model on the full synthetic dataset, then report its diagnostics
earth_params = {"max_degree": 2, "penalty": 3.0}
model = earth.Earth(**earth_params)
model.fit(X, y)

# One print call, one diagnostic per output line
print(
    f"Model fitted: {model.fitted_}",
    f"Number of basis functions: {len(model.basis_)}",
    f"GCV score: {model.gcv_:.6f}",
    f"R² score: {model.score(X, y):.6f}",
    sep="\n",
)

In [None]:
# Predict the first ten rows, then show the first five next to the true targets
predictions = model.predict(X[:10])
print("Sample predictions:")
for i, pred in enumerate(predictions[:5]):
    actual = y[i]
    print(f"  Sample {i}: Predicted={pred:.3f}, Actual={actual:.3f}")

## Feature Importance Analysis

In [None]:
# Select the 'nb_subsets' importance type, then read the importances off the model
model.feature_importance_type = 'nb_subsets'
importances = model.feature_importances_

# Header followed by one line per feature, emitted as a single block of text
importance_lines = ["Feature Importances (nb_subsets):"]
importance_lines.extend(
    f"  Feature {i}: {imp:.4f}" for i, imp in enumerate(importances)
)
print("\n".join(importance_lines))

In [None]:
# Bar chart of the feature importances, drawn through the explicit Axes API
fig, ax = plt.subplots(figsize=(10, 6))
positions = range(len(importances))

ax.bar(positions, importances)
ax.set_xlabel('Feature Index')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances from Earth Model')

# One tick per bar, labelled with the feature's index
ax.set_xticks(positions)
ax.set_xticklabels([f'Feature {i}' for i in positions])

ax.grid(True, alpha=0.3)
plt.show()

## Model Interpretability

In [None]:
# Ask pymars for an explanation of the fitted model, one label per column of X
feature_names = [f'Feature {i}' for i in range(X.shape[1])]
explanation = earth.get_model_explanation(model, X, feature_names=feature_names)
print(explanation)

In [None]:
# Plot the model's basis functions; the helper hands back a (figure, axes) pair
basis_plot = earth.plot_basis_functions(model, X)
fig, ax = basis_plot
plt.show()

In [None]:
# Plot the model's residuals; the helper hands back a (figure, axes) pair
residual_plot = earth.plot_residuals(model, X, y)
fig, ax = residual_plot
plt.show()

## Scikit-learn Integration

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Two-step pipeline: standardise the features, then fit the Earth regressor
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('earth', earth.EarthRegressor(max_degree=2)),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(
    f"Pipeline Train R²: {train_score:.6f}",
    f"Pipeline Test R²: {test_score:.6f}",
    sep="\n",
)

In [None]:
# Grid-search the pipeline's 'earth' step over interaction degree and penalty.
# The 'earth__' prefix routes each parameter to the pipeline step named 'earth'.
param_grid = dict(
    earth__max_degree=[1, 2],
    earth__penalty=[2.0, 3.0, 4.0],
)

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation score: {grid_search.best_score_:.6f}",
    sep="\n",
)

## Generalized Linear Models

In [None]:
# Turn the regression target into a binary label: 1 where y is above its median
median_target = np.median(y)
y_binary = (y > median_target).astype(int)

# Fit a GLMEarth model with the 'logistic' family on the binary labels
glm_model = earth.GLMEarth(family='logistic', max_degree=2)
glm_model.fit(X, y_binary)

print(
    f"GLM Model fitted: {glm_model.fitted_}",
    f"Number of basis functions: {len(glm_model.basis_)}",
    f"GCV score: {glm_model.gcv_:.6f}",
    sep="\n",
)

In [None]:
# Class predictions and class probabilities for the first ten rows
glm_predictions = glm_model.predict(X[:10])
glm_probabilities = glm_model.predict_proba(X[:10])

print("GLM Sample predictions and probabilities:")
# Line up the first five predictions, probability rows and true labels
sample_rows = zip(glm_predictions[:5], glm_probabilities[:5], y_binary[:5])
for i, sample in enumerate(sample_rows):
    pred, prob, actual = sample
    # prob[1] is the second entry of this sample's probability row
    print(f"  Sample {i}: Predicted={pred}, Probability={prob[1]:.3f}, Actual={actual}")

## Cross-Validation Helper

In [None]:
# Use EarthCV to search penalty and max_terms with 5-fold cross-validation
search_space = {'penalty': [2.0, 3.0, 4.0], 'max_terms': [10, 15, 20]}
base_estimator = earth.EarthRegressor(max_degree=2)

cv_model = earth.EarthCV(base_estimator, cv=5, param_grid=search_space)
cv_model.fit(X, y)

print(
    f"Best parameters: {cv_model.best_params_}",
    f"Best cross-validation score: {cv_model.best_score_:.6f}",
    f"Number of basis functions in best model: {len(cv_model.best_estimator_.basis_)}",
    sep="\n",
)

## Categorical Features and Missing Values

In [None]:
# Example with categorical features and missing values.
#
# X is a float array, and assigning a string such as 'Category_A' into a float
# array raises "ValueError: could not convert string to float". Converting to
# an object-dtype array (astype returns a new array, so X itself is untouched)
# lets NaN markers and string labels live side by side.
X_cat = X.astype(object)
X_cat[:20, 0] = np.nan  # Add some missing values to column 0

# Replace column 1 with category labels. Every row gets a label so the column
# is purely categorical instead of a mix of leftover floats and strings.
X_cat[:40, 1] = 'Category_A'
X_cat[40:60, 1] = 'Category_B'
X_cat[60:, 1] = 'Category_C'

# Fit model with missing value and categorical feature support.
# NOTE(review): assumes Earth accepts object-dtype input when
# categorical_features is given — confirm against the pymars documentation.
cat_model = earth.Earth(allow_missing=True, categorical_features=[1])
cat_model.fit(X_cat, y)

print(f"Categorical Model fitted: {cat_model.fitted_}")
print(f"Number of basis functions: {len(cat_model.basis_)}")

## Conclusion

This notebook demonstrated the core capabilities of pymars:

- Basic regression with Earth models
- Feature importance analysis
- Model interpretability tools
- Scikit-learn integration with pipelines and hyperparameter tuning
- Generalized Linear Models for classification
- Cross-validation helper for model selection
- Support for categorical features and missing values

pymars provides a complete, production-ready implementation of the MARS algorithm with full scikit-learn compatibility.