# Math and the numerical algorithms

- overview, advantages and disadvantages
- Representation, Loss, Optimizer
- equations to explain math
- pseudo-code to explain how numerical algorithms work
- 

# references and citations / previous work

- Find at least one previous work where your ML algorithm is applied on a public dataset
- Provide citations and references in two places: one in the Overview and one in the Check Model section
- Use the Harvard citation format
- 

# Model

In [1]:
# Add docstrings to each method and function and explain what they do and what the inputs and outputs are
import numpy as np

class AdaBoost:
    """Binary AdaBoost classifier built from decision stumps.

    Labels are expected to be encoded as -1 / +1. The ensemble prediction is
    the sign of the alpha-weighted sum of the stump predictions.
    """

    def __init__(self, n_estimators=50):
        """
        Initialize AdaBoost parameters.

        Args:
            n_estimators: Number of boosting rounds (one stump per round).
        """
        self.n_estimators = n_estimators
        self.models = []  # Weak classifiers, in the order they were fitted
        self.alphas = []  # Classifier weights, parallel to ``self.models``

    def preprocess(self, X, method='minmax'):
        """
        Scale the columns of ``X`` with min-max or standard scaling.

        Constant columns (zero range / zero standard deviation) are mapped to
        0 instead of producing NaN or inf from a division by zero.

        Args:
            X: Array of samples; statistics are computed along axis 0.
            method: ``'minmax'`` to scale each column to [0, 1], or
                ``'standard'`` to give each column mean 0 and std 1.

        Returns:
            A new scaled array; ``X`` itself is not modified.

        Raises:
            ValueError: If ``method`` is neither ``'minmax'`` nor ``'standard'``.
        """
        if method == 'minmax':
            X_min = np.min(X, axis=0)
            value_range = np.max(X, axis=0) - X_min
            # A constant column has range 0; divide by 1 so it becomes 0.
            value_range = np.where(value_range == 0, 1, value_range)
            return (X - X_min) / value_range
        elif method == 'standard':
            mean = np.mean(X, axis=0)
            std = np.std(X, axis=0)
            # Same guard for constant columns (std == 0).
            std = np.where(std == 0, 1, std)
            return (X - mean) / std
        else:
            raise ValueError("Invalid preprocessing method. Choose 'minmax' or 'standard'.")

    def split_data(self, X, y, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
        """
        Split dataset into training, validation, and test sets.

        The samples are shuffled with a fixed seed (42), so the split is the
        same on every call. The first ``int(train_ratio * n)`` shuffled samples
        form the training set, the next ``int(val_ratio * n)`` the validation
        set, and everything that remains the test set.

        Args:
            X: Feature array, indexed along axis 0.
            y: Label array with one entry per sample.
            train_ratio: Fraction of samples used for training.
            val_ratio: Fraction of samples used for validation.
            test_ratio: Not used in the computation; the test set is whatever
                is left after the training and validation sets. Kept for
                interface compatibility.

        Returns:
            Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
        """
        # A private RandomState yields the same permutation as seeding the
        # global generator with 42, without clobbering the caller's RNG state.
        rng = np.random.RandomState(42)
        indices = np.arange(len(y))
        rng.shuffle(indices)

        train_end = int(train_ratio * len(y))
        val_end = train_end + int(val_ratio * len(y))

        train_idx, val_idx, test_idx = indices[:train_end], indices[train_end:val_end], indices[val_end:]
        return X[train_idx], y[train_idx], X[val_idx], y[val_idx], X[test_idx], y[test_idx]

    def train(self, X, y):
        """
        Train AdaBoost using decision stumps.

        Any previously fitted stumps are discarded, so calling ``train`` twice
        refits the ensemble instead of stacking a second one on top.

        Args:
            X: 2-D feature array of shape (n_samples, n_features).
            y: Labels in {-1, 1}, one per sample.
        """
        n_samples = X.shape[0]
        weights = np.ones(n_samples) / n_samples
        eps = 1e-10

        # Start from a clean ensemble on every call.
        self.models = []
        self.alphas = []

        for _ in range(self.n_estimators):
            # Train a weak classifier (decision stump) on the current weights
            stump = self._train_stump(X, y, weights)
            pred = stump.predict(X)

            # Weighted error, clamped away from 0 and 1 so that the log below
            # stays finite in both directions.
            error = np.sum(weights * (pred != y))
            error = min(max(error, eps), 1 - eps)
            alpha = 0.5 * np.log((1 - error) / error)

            # Up-weight misclassified samples (y * pred == -1), down-weight
            # correct ones, then renormalise to a distribution.
            weights = weights * np.exp(-alpha * y * pred)
            weights /= np.sum(weights)

            # Store model and alpha
            self.models.append(stump)
            self.alphas.append(alpha)

    def predict(self, X):
        """
        Predict using the trained AdaBoost model.

        Args:
            X: 2-D feature array of shape (n_samples, n_features).

        Returns:
            Float array of -1.0 / 1.0 labels. A weighted vote of exactly zero
            (a tie, or a model with no stumps) is resolved to +1 so the output
            never contains 0.
        """
        final_pred = np.zeros(X.shape[0])
        for alpha, model in zip(self.alphas, self.models):
            final_pred += alpha * model.predict(X)
        # np.sign would return 0 for a zero vote, which is not a valid label.
        return np.where(final_pred >= 0, 1.0, -1.0)

    def loss(self, y_true, y_pred):
        """
        Compute misclassification error.

        Args:
            y_true: True labels (array-like).
            y_pred: Predicted labels (array-like of the same length).

        Returns:
            Fraction of positions where the two label sequences differ.
        """
        # asarray makes plain lists compare element-wise rather than as a
        # single Python bool.
        return np.mean(np.asarray(y_true) != np.asarray(y_pred))

    def _train_stump(self, X, y, weights):
        """
        Train a decision stump (weak classifier).

        Exhaustively tries every feature, every distinct value of that feature
        as a threshold, and both polarities, keeping the stump with the lowest
        weighted misclassification error.

        Args:
            X: 2-D feature array of shape (n_samples, n_features).
            y: Labels, one per sample.
            weights: Current sample weights.

        Returns:
            The ``DecisionStump`` with the smallest weighted error.

        Raises:
            ValueError: If no candidate stump could be evaluated (no samples,
                no features, or non-finite weights).
        """
        n_features = X.shape[1]
        best_stump = None
        best_error = float('inf')

        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                for polarity in [-1, 1]:
                    stump = DecisionStump(feature_idx, threshold, polarity)
                    pred = stump.predict(X)
                    error = np.sum(weights * (pred != y))

                    if error < best_error:
                        best_error = error
                        best_stump = stump

        if best_stump is None:
            raise ValueError(
                "Could not fit a decision stump: X has no samples or features, "
                "or the sample weights are not finite."
            )
        return best_stump

class DecisionStump:
    """One-feature threshold classifier used as AdaBoost's weak learner."""

    def __init__(self, feature_idx=None, threshold=None, polarity=1):
        """
        Store the stump parameters.

        Args:
            feature_idx: Column of ``X`` that the stump looks at.
            threshold: Cut-off value compared against that column.
            polarity: ``1`` labels values below the threshold as -1; any other
                value labels values at or above the threshold as -1.
        """
        self.feature_idx, self.threshold, self.polarity = feature_idx, threshold, polarity

    def predict(self, X):
        """
        Classify every row of ``X`` as -1.0 or 1.0.

        Args:
            X: 2-D feature array; only column ``feature_idx`` is read.

        Returns:
            Float array with one label per row.
        """
        column = X[:, self.feature_idx]
        if self.polarity == 1:
            is_negative = column < self.threshold
        else:
            is_negative = column >= self.threshold
        # Everything not flagged as negative keeps the default label +1.
        return np.where(is_negative, -1.0, 1.0)


# Main

In [2]:
# Main execution
if __name__ == "__main__":
    # End-to-end run: load the spambase data, scale it, split it, train a
    # 50-stump AdaBoost model and print validation and test error.

    # Load dataset
    # NOTE(review): the path is absolute and machine-specific; the script only
    # runs where this file exists.
    data = np.loadtxt('/Users/emmasun/Desktop/2060/project/spambase/spambase.data', delimiter=',')
    # Last column is the label, all preceding columns are features.
    X, y = data[:, :-1], data[:, -1]
    y = np.where(y == 0, -1, 1)  # Convert to {-1, 1}

    # Initialize AdaBoost
    model = AdaBoost(n_estimators=50)

    # Preprocess and split data
    # NOTE(review): scaling is applied to the full matrix before the split, so
    # the scaling statistics include validation and test rows.
    X_scaled = model.preprocess(X, method='standard')
    X_train, y_train, X_val, y_val, X_test, y_test = model.split_data(X_scaled, y)

    # Train the model
    model.train(X_train, y_train)

    # Evaluate on validation set (loss is the misclassification rate)
    y_pred_val = model.predict(X_val)
    print(f"Validation Loss: {model.loss(y_val, y_pred_val)}")

    # Test the model
    y_pred_test = model.predict(X_test)
    print(f"Test Loss: {model.loss(y_test, y_pred_test)}")


Validation Loss: 0.06739130434782609
Test Loss: 0.05754614549402823


# Unit Test

# Explain either as comments or in a markdown cell what the goal of each test is and/or what edge case it tests for

- what the goal of each test is
- what edge case it tests for

In [3]:
import numpy as np
import pytest

@pytest.fixture
def synthetic_data():
    """Provide a four-sample, two-feature dataset with alternating +1/-1 labels."""
    # Rows are [1, 2], [2, 3], [3, 4], [4, 5].
    features = np.column_stack((np.arange(1, 5), np.arange(2, 6)))
    labels = np.tile([1, -1], 2)
    return features, labels

@pytest.fixture
def ada_model():
    """Provide a fresh AdaBoost instance configured with five estimators."""
    import adaboost  # Module holding the implementation under test

    return adaboost.AdaBoost(n_estimators=5)

def test_preprocess_minmax(synthetic_data, ada_model):
    """
    Check min-max preprocessing.
    Goal: after scaling, every column's minimum is 0 and its maximum is 1.
    """
    features = synthetic_data[0]
    scaled = ada_model.preprocess(features, method='minmax')
    column_min = scaled.min(axis=0)
    column_max = scaled.max(axis=0)
    assert np.allclose(column_min, 0), "Min value should be 0."
    assert np.allclose(column_max, 1), "Max value should be 1."

def test_preprocess_standard(synthetic_data, ada_model):
    """
    Check standard-score preprocessing.
    Goal: after scaling, every column has mean 0 and standard deviation 1.
    """
    features = synthetic_data[0]
    scaled = ada_model.preprocess(features, method='standard')
    column_mean = scaled.mean(axis=0)
    column_std = scaled.std(axis=0)
    assert np.allclose(column_mean, 0, atol=1e-6), "Mean should be 0."
    assert np.allclose(column_std, 1, atol=1e-6), "Standard deviation should be 1."

def test_split_data(synthetic_data, ada_model):
    """
    Test the dataset splitting functionality.
    Goal: Ensure the partitions have the requested sizes, keep features and
    labels paired up, and do not share samples (no data leakage).
    """
    X, y = synthetic_data
    X_train, y_train, X_val, y_val, X_test, y_test = ada_model.split_data(X, y, 0.5, 0.25, 0.25)
    assert len(X_train) == 2, "Training data size mismatch."
    assert len(X_val) == 1, "Validation data size mismatch."
    assert len(X_test) == 1, "Test data size mismatch."
    assert len(X_train) + len(X_val) + len(X_test) == len(X), "Data split does not match total size."

    # Each partition must carry exactly one label per feature row.
    assert len(y_train) == len(X_train), "Training labels do not match features."
    assert len(y_val) == len(X_val), "Validation labels do not match features."
    assert len(y_test) == len(X_test), "Test labels do not match features."

    # The fixture rows are all distinct, so the recombined partitions contain
    # len(X) unique rows only if no sample appears in two partitions.
    recombined = np.vstack([X_train, X_val, X_test])
    assert len(np.unique(recombined, axis=0)) == len(X), "Splits overlap or drop samples."

def test_train_basic(synthetic_data, ada_model):
    """
    Smoke-test the train method.
    Goal: training completes and stores one stump and one alpha per estimator.
    """
    features, labels = synthetic_data
    ada_model.train(features, labels)
    expected_count = ada_model.n_estimators
    assert len(ada_model.models) == expected_count, "Incorrect number of weak classifiers."
    assert len(ada_model.alphas) == expected_count, "Incorrect number of alphas."

def test_predict_basic(synthetic_data, ada_model):
    """
    Smoke-test the predict method.
    Goal: a trained model returns one prediction per sample, each being -1 or 1.
    """
    features, labels = synthetic_data
    ada_model.train(features, labels)
    predicted = ada_model.predict(features)
    valid_label_mask = np.isin(predicted, [-1, 1])
    assert len(predicted) == len(labels), "Number of predictions does not match input size."
    assert np.all(valid_label_mask), "Predictions should be in {-1, 1}."

def test_loss_basic(synthetic_data, ada_model):
    """
    Smoke-test the loss method.
    Goal: the misclassification rate of a trained model lies in [0, 1].
    """
    features, labels = synthetic_data
    ada_model.train(features, labels)
    error_rate = ada_model.loss(labels, ada_model.predict(features))
    assert 0 <= error_rate <= 1, "Loss should be in [0, 1]."

def test_edge_case_uniform_data(ada_model):
    """
    Edge case: every sample has identical feature values.
    Goal: training and prediction still work and only emit labels -1 or 1,
    even though no threshold can separate the classes.
    """
    features = np.ones((4, 2), dtype=int)
    labels = np.array([1, -1, 1, -1])
    ada_model.train(features, labels)
    predicted = ada_model.predict(features)
    valid_label_mask = np.isin(predicted, [-1, 1])
    assert np.all(valid_label_mask), "Predictions should be in {-1, 1}."
    
    
# Goal: show that this implementation can correctly reproduce results
# obtained from sklearn, textbooks, or peer-reviewed journal articles.
# The previous work needs to be referenced. Reference loss to reproduce: about 7%.
def test_reproduce_results(ada_model):
    """
    Test reproducibility with the spambase dataset.
    Goal: Ensure AdaBoost reaches a test loss below 20% on spambase.

    The dataset location defaults to the original absolute path and can be
    overridden with the ``SPAMBASE_PATH`` environment variable. The test is
    skipped, rather than erroring, when the file is not available.
    """
    import os  # Local import: only this test needs it

    data_path = os.environ.get(
        'SPAMBASE_PATH',
        '/Users/emmasun/Desktop/2060/project/spambase/spambase.data',
    )
    if not os.path.isfile(data_path):
        pytest.skip(f"spambase dataset not found at {data_path}")

    data = np.loadtxt(data_path, delimiter=',')
    # Last column is the label; recode 0 as -1 so labels are in {-1, 1}.
    X, y = data[:, :-1], data[:, -1]
    y = np.where(y == 0, -1, 1)

    X_scaled = ada_model.preprocess(X, method='standard')
    X_train, y_train, X_val, y_val, X_test, y_test = ada_model.split_data(X_scaled, y)

    ada_model.train(X_train, y_train)
    predictions = ada_model.predict(X_test)
    loss = ada_model.loss(y_test, predictions)
    assert loss < 0.2, "Test loss should be less than 20%."