In [1]:
import numpy as np
import matplotlib
 
import matplotlib.pyplot as plt
from libsvm.svmutil import svm_read_problem
from tqdm import tqdm  # For progress bar

In [2]:
def _polarity_errors(side, y, weights):
    """
    Yield ``(polarity, weighted_error)`` for both orientations of one split.

    Parameters:
    - side: array of sign(x - threshold) for every sample
    - y: array of labels compared element-wise against the predictions
    - weights: per-sample weights used to weight each misclassification
    """
    for polarity in (1, -1):
        predictions = polarity * side
        # A sample sitting exactly on the threshold is predicted +1 for
        # either polarity (this also catches the -0.0 produced by -1 * 0.0).
        predictions[predictions == 0] = 1
        yield polarity, np.sum(weights * (predictions != y))


def decision_stump(X, y, weights):
    """
    Find the decision stump with the smallest weighted 0/1 error.

    A stump predicts ``polarity * sign(X[:, feature] - threshold)``, with a
    zero sign treated as +1. Candidate thresholds are the midpoints between
    consecutive distinct values of each feature. Candidates are scanned in
    feature order, ascending threshold, polarity +1 before -1, and a later
    candidate only replaces the current best when its error is strictly lower.

    If no feature has two distinct values (so there is no midpoint at all),
    the search falls back to a constant stump on feature 0 with threshold
    ``-inf``: polarity +1 predicts +1 everywhere and polarity -1 predicts -1
    everywhere. The fallback is only used in that case, so results for inputs
    that do have a midpoint split are unaffected.

    Parameters:
    - X: numpy array of shape (n_samples, n_features)
    - y: numpy array of shape (n_samples,) holding the labels
    - weights: numpy array of shape (n_samples,) holding the sample weights

    Returns:
    - best_stump: dict with keys 'feature', 'threshold' and 'polarity'
      (an empty dict only when X has no feature columns)
    - min_error: weighted error of best_stump (``inf`` when best_stump is empty)
    """
    n_samples, n_features = X.shape
    min_error = float('inf')
    best_stump = {}

    for feature in range(n_features):
        feature_values = X[:, feature]
        unique_values = np.unique(feature_values)
        thresholds = (unique_values[:-1] + unique_values[1:]) / 2  # Midpoints

        for threshold in thresholds:
            # The side of the threshold each sample falls on does not depend
            # on the polarity, so compute it once per threshold.
            side = np.sign(feature_values - threshold)
            for polarity, weighted_error in _polarity_errors(side, y, weights):
                if weighted_error < min_error:
                    min_error = weighted_error
                    best_stump = {
                        'feature': feature,
                        'threshold': threshold,
                        'polarity': polarity
                    }

    if not best_stump and n_features > 0:
        # No midpoint exists anywhere: return the better of the two constant
        # classifiers instead of an empty dict that callers cannot evaluate.
        threshold = -np.inf
        side = np.sign(X[:, 0] - threshold)
        for polarity, weighted_error in _polarity_errors(side, y, weights):
            if weighted_error < min_error:
                min_error = weighted_error
                best_stump = {
                    'feature': 0,
                    'threshold': threshold,
                    'polarity': polarity
                }

    return best_stump, min_error


In [3]:
def _stump_predict(X, stump):
    """
    Evaluate one decision stump on X.

    Computes ``polarity * sign(X[:, feature] - threshold)`` and maps a zero
    result (sample exactly on the threshold) to +1, which is the same rule the
    stump search applies when it scores candidates.
    """
    predictions = stump['polarity'] * np.sign(X[:, stump['feature']] - stump['threshold'])
    predictions[predictions == 0] = 1  # Handle zero as positive class
    return predictions


def _vote_to_label(scores):
    """Turn weighted vote totals into labels in {-1, +1}; a tied vote counts as +1."""
    labels = np.sign(scores)
    labels[labels == 0] = 1
    return labels


def adaboost_stump(X_train, y_train, X_test, y_test, T=500):
    """
    Implement AdaBoost with decision stumps.

    Each round fits the best stump under the current sample weights, gives it
    the vote weight ``alpha = 0.5 * ln((1 - eps) / eps)``, re-weights the
    training samples by ``exp(-alpha * y * prediction)`` and renormalises.
    The reported Ein/Eout values are the 0/1 errors of the aggregated
    classifier, i.e. the sign of the alpha-weighted sum of all stumps fitted
    so far (a zero sum counts as +1).

    The aggregated vote is kept as a running sum, so every round evaluates only
    the newly fitted stump. Stump votes in the aggregate use the same
    "zero means +1" rule as the stump itself, so a sample lying exactly on a
    threshold is counted consistently on both the training and the test set.

    Parameters:
    - X_train: numpy array of shape (n_samples_train, n_features)
    - y_train: numpy array of shape (n_samples_train,)
    - X_test: numpy array of shape (n_samples_test, n_features)
    - y_test: numpy array of shape (n_samples_test,)
    - T: number of iterations

    Returns:
    - Ein: list of in-sample errors of the aggregated classifier at each iteration
    - Eout: list of out-of-sample errors of the aggregated classifier at each iteration
    - epsilons: list of weighted errors of the chosen stump at each iteration
      (floored at 1e-10)
    """
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]

    # Initialize weights
    weights = np.ones(n_train) / n_train

    # Running alpha-weighted vote totals of all stumps fitted so far
    train_scores = np.zeros(n_train)
    test_scores = np.zeros(n_test)

    # To store errors
    Ein = []
    Eout = []
    epsilons = []

    for _ in tqdm(range(1, T + 1), desc="AdaBoost Iterations"):
        # Find the best decision stump
        stump, error = decision_stump(X_train, y_train, weights)

        # Avoid division by zero
        error = max(error, 1e-10)
        epsilons.append(error)

        # Compute alpha
        alpha = 0.5 * np.log((1 - error) / error)

        # Evaluate the new stump once on each data set
        train_predictions = _stump_predict(X_train, stump)
        test_predictions = _stump_predict(X_test, stump)

        # Update weights
        weights *= np.exp(-alpha * y_train * train_predictions)
        weights /= np.sum(weights)  # Normalize

        # Fold the new stump into the aggregated vote
        train_scores += alpha * train_predictions
        test_scores += alpha * test_predictions

        # Errors of the aggregated classifier after this round
        Ein.append(np.mean(_vote_to_label(train_scores) != y_train))
        Eout.append(np.mean(_vote_to_label(test_scores) != y_test))

    return Ein, Eout, epsilons


In [4]:
def preprocess_data(x, n_features=None):
    """
    Convert list of dictionaries to numpy array.

    Every dictionary maps a 1-based feature index to its value; indices that
    are absent from a dictionary are left at 0 in the output row.

    Parameters:
    - x: list of dictionaries
    - n_features: optional number of columns of the result. When None
      (default) the largest index found in x is used. Pass the same value for
      several data sets to guarantee that their matrices have equal width.

    Returns:
    - X: numpy array of shape (n_samples, n_features); an empty x gives an
      array with zero rows

    Raises:
    - ValueError: if a feature index is smaller than 1, or larger than an
      explicitly given n_features
    """
    n_samples = len(x)
    # default=0 keeps an empty data set (or one made only of empty samples)
    # from failing inside max().
    max_index = max((max(sample) for sample in x if sample), default=0)

    if n_features is None:
        n_features = max_index
    elif max_index > n_features:
        raise ValueError(
            f"feature index {max_index} exceeds n_features={n_features}"
        )

    X = np.zeros((n_samples, n_features))
    for i, sample in enumerate(x):
        for key, value in sample.items():
            if key < 1:
                # key - 1 would be negative and silently wrap around to the
                # last columns instead of failing.
                raise ValueError(
                    f"feature indices must start at 1, got {key} in sample {i}"
                )
            X[i, key - 1] = value  # libsvm format indices start at 1
    return X


In [5]:
def main():
    """
    Run AdaBoost-Stump on the Madelon files and save its error curves.

    Reads 'madelon_train.txt' and 'madelon_test.txt' in libsvm format, maps
    the labels to {-1, +1}, runs the boosting routine, writes the plot
    'Ein_Epsilon_t_plot.png' and stores the three error sequences as
    'Ein.npy', 'Eout.npy' and 'Epsilons.npy' in the working directory.
    """
    T = 500  # number of boosting rounds

    # Load data using libsvm format
    print("Loading training data...")
    y_train, x_train = svm_read_problem('madelon_train.txt')
    print("Loading testing data...")
    y_test, x_test = svm_read_problem('madelon_test.txt')

    # Convert labels from {0,1} to {-1,+1} if necessary
    y_train = np.array([1 if label > 0 else -1 for label in y_train])
    y_test = np.array([1 if label > 0 else -1 for label in y_test])

    # Preprocess data to numpy arrays
    print("Preprocessing training data...")
    X_train = preprocess_data(x_train)
    print("Preprocessing testing data...")
    X_test = preprocess_data(x_test)

    # Each matrix is sized from its own largest feature index, so the two can
    # differ in width; zero-pad the narrower one so that a feature index
    # learned on the training set is always valid on the test set.
    n_features = max(X_train.shape[1], X_test.shape[1])
    X_train = np.pad(X_train, ((0, 0), (0, n_features - X_train.shape[1])))
    X_test = np.pad(X_test, ((0, 0), (0, n_features - X_test.shape[1])))

    # Run AdaBoost-Stump
    print("Running AdaBoost-Stump algorithm...")
    Ein, Eout, epsilons = adaboost_stump(X_train, y_train, X_test, y_test, T=T)

    # Plotting Ein and epsilon_t
    # The x-axis and title follow the number of recorded rounds instead of a
    # second hard-coded copy of T.
    iterations = range(1, len(Ein) + 1)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(iterations, Ein, label='Ein(gt)', color='blue')
    ax.plot(iterations, epsilons, label='ϵ_t', color='red')
    ax.set_xlabel('Iteration t')
    ax.set_ylabel('Error')
    ax.set_title(f'Ein(gt) and ϵ_t over {len(Ein)} Iterations')
    ax.legend()
    ax.grid(True)
    fig.savefig('Ein_Epsilon_t_plot.png')
    plt.close(fig)

    # Print final average errors
    print(f"Final Average Ein(gt): {Ein[-1]:.4f}")
    print(f"Final Average Eout(gt): {Eout[-1]:.4f}")

    # Optionally, save Ein and Eout for further analysis
    np.save('Ein.npy', Ein)
    np.save('Eout.npy', Eout)
    np.save('Epsilons.npy', epsilons)

if __name__ == "__main__":
    main()


Loading training data...
Loading testing data...
Preprocessing training data...
Preprocessing testing data...
Running AdaBoost-Stump algorithm...


AdaBoost Iterations: 100%|██████████| 500/500 [27:37<00:00,  3.32s/it]


Final Average Ein(gt): 0.0410
Final Average Eout(gt): 0.3933
