# Automated Machine Learning from Scratch

Group 18 Members:

- Clara Pichler, 11917694
- Hannah Knapp, 11901857 
- Sibel Toprakkiran, 09426341

### Overview

1. Our Implementation

2. Data Sets

3. Evaluation
- Iris Dataset
- Congressional Voting Dataset
- gym session tracking Dataset
- Abalone Data set


The comparison with TPOT and auto-sklearn will be done in the files `tpot.ipynb` and `auto_sklearn.ipynb`.

In [2]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
import time
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, Lasso

In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Our Implementation

In [4]:
class AutoML_18:
    def __init__(self, initial_temp=100, cooling_rate=0.99, max_iterations=100, min_training_time=3600, classifier = True):
        self.initial_temp = initial_temp
        self.cooling_rate = cooling_rate
        self.max_iterations = max_iterations
        self.min_training_time = min_training_time
        self.classifier = classifier

        self.algorithms_classifier = {
            "MLPClassifier": {
                "class": MLPClassifier,
                "parameters": ["max_iter", "activation", "solver", "alpha"],
                "values": [[1000, 2000, 3000], ['relu', 'tanh', 'logistic'], ['adam', 'sgd'], [0.0001, 0.001, 0.01]]
            },
            "RandomForestClassifier": {
                "class": RandomForestClassifier,
                "parameters": ["n_estimators", "max_depth", "min_samples_split", "max_features", "criterion"],
                "values": [[10, 25, 50, 100, 150], [5, 10, 15], [2, 3, 3, 4], ['sqrt', 'log2', None], ['gini', 'log_loss', 'entropy']]
            },
            "KNClassifier": {
                "class": KNeighborsClassifier,
                "parameters": ["n_neighbors", "weights", "leaf_size"],
                "values": [[3, 5, 7, 9, 11], ['uniform', 'distance'], [10, 20, 30, 40, 50]]
            },
            "SVM": {
                "class": SVC,
                "parameters": ["C", "kernel", "gamma"],
                "values": [[1, 10, 100, 1000], ['linear', 'poly', 'rbf', 'sigmoid'], ['scale', 'auto']]
            },
            "AdaBoostClassifier": {
                "class": AdaBoostClassifier,
                "parameters": ["n_estimators", "learning_rate"],
                "values": [[10, 25, 50, 100, 150], [0.1, 0.5, 1, 1.5, 2]]
            },
        }

        self.algorithms_regressor = {
            'RandomForestRegressor': {
                'class': RandomForestRegressor,
                'parameters': ["n_estimators", "max_depth", "min_samples_split", "max_features", "criterion"],
                'values': [[10, 25, 50, 100, 150], [5, 10, 15], [2, 3, 3, 4], ['sqrt', 'log2', None], ['squared_error', 'absolute_error']]
            },
            'GradientBoostingRegressor': {
                'class': GradientBoostingRegressor,
                'parameters': ["n_estimators", "learning_rate", "loss"],
                'values': [[10, 25, 50, 100, 150], [0.1, 0.5, 1, 1.5, 2], ['squared_error', 'absolute_error', 'huber']] 
            },
            'LinearRegression': {
                'class': LinearRegression,
                'parameters': ['n_jobs'],
                'values': [[3, 5, 7, 9]]
            },
            'LassoRegression': {
                'class': Lasso,
                'parameters': ["alpha", "max_iter"],
                'values': [[0.1, 0.5, 1, 1.5, 2], [1000, 2000, 3000, 4000, 5000]]
            },
            'KNRegressor': {
                'class': KNeighborsRegressor,
                'parameters': ["n_neighbors", "weights", "algorithm", "leaf_size"],
                'values': [[3, 5, 7, 9, 11], ['uniform', 'distance'], ['auto', 'ball_tree', 'kd_tree', 'brute'], [10, 20, 30, 40, 50]]
            },
        }
        
        self.best_solution = None
        self.best_score = 0
        self.model = None
        
    def eval(self, model, X_train, y_train, X_val, y_val):
        model.fit(X_train, y_train) 
        predictions = model.predict(X_val)  
        if self.classifier:
            score = accuracy_score(y_val, predictions) 
        else:
            score = -mean_squared_error(y_val, predictions)
      
        return score

    """ def generate_neighborhood(self, current_solution):
        algorithm_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor
        new_solution = current_solution[:]
        if np.random.rand() < 0.1:
            new_algorithm_name = np.random.choice(list(algorithm_dict.keys()))
            algorithm_info = algorithm_dict[new_algorithm_name]
            new_solution = [new_algorithm_name] + [
                np.random.choice(values) for values in algorithm_info["values"]
            ]
        else:
            algorithm_name = new_solution[0]
            algorithm_info = algorithm_dict[algorithm_name]

            if algorithm_info["parameters"]:
                param_idx = np.random.randint(1, len(new_solution))  
                param_values = algorithm_info["values"][param_idx - 1]
                
                if isinstance(param_values[0], (int, float)):
                    current_value = new_solution[param_idx]
                    new_value = current_value + np.random.uniform(-0.1, 0.1) * current_value
                    new_solution[param_idx] = np.clip(new_value, min(param_values), max(param_values))
                else:
                    new_solution[param_idx] = np.random.choice(param_values)

        return new_solution """
    
    def generate_neighborhood(self, current_solution):
        algorithm_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor
        algorithm_name = current_solution[0]
        algorithm_info = algorithm_dict[algorithm_name]
        
        new_solution = current_solution[:]
        
        if not algorithm_info['parameters']:
            new_solution[0] = np.random.choice(list(algorithm_dict.keys()))
            return new_solution
        
        while len(new_solution) < len(algorithm_info['parameters']) + 1:
            param_index = len(new_solution) - 1
            new_solution.append(np.random.choice(algorithm_info['values'][param_index]))

        param_idx = np.random.randint(1, len(new_solution)) 
        param_values = algorithm_info['values'][param_idx - 1]  

        new_solution[param_idx] = np.random.choice(param_values)

        # 10% probability that a new algoeithm is chosen
        if np.random.rand() < 0.1:
            new_solution[0] = np.random.choice(list(algorithm_dict.keys()))
            new_algorithm_info = algorithm_dict[new_solution[0]]
            new_solution = [new_solution[0]] + [
                np.random.choice(values) for values in new_algorithm_info["values"]
            ]

        return new_solution




    def create_model(self, solution):
        algorithm_name = solution[0]
        hyperparameters = solution[1:]
        algorithm_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor
        algorithm_class = algorithm_dict[algorithm_name]['class']

        if algorithm_name not in algorithm_dict:
            print(f"Algorithm {algorithm_name} not found in dictionary!")
            return None 
        
        elif algorithm_name == 'MLPClassifier':
            return algorithm_class(
                max_iter=hyperparameters[0],
                activation=hyperparameters[1],
                solver=hyperparameters[2],
                alpha=hyperparameters[3]
            )
        elif algorithm_name == 'RandomForestClassifier':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                max_depth=hyperparameters[1],
                min_samples_split=hyperparameters[2],
                max_features=hyperparameters[3],
                criterion=hyperparameters[4]
            )
        elif algorithm_name == 'KNClassifier':
            return algorithm_class(
                n_neighbors=hyperparameters[0],
                weights=hyperparameters[1],
                leaf_size=hyperparameters[2]
            )
        elif algorithm_name == 'SVM':
            return algorithm_class(
                C=hyperparameters[0],
                kernel=hyperparameters[1],
                gamma=hyperparameters[2]
            )
        elif algorithm_name == 'AdaBoostClassifier':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                learning_rate=hyperparameters[1],
            )
        elif algorithm_name == 'RandomForestRegressor':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                max_depth=hyperparameters[1],
                min_samples_split=hyperparameters[2],
                max_features=hyperparameters[3],
                criterion=hyperparameters[4]
            )
        elif algorithm_name == 'GradientBoostingRegressor':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                learning_rate=hyperparameters[1],
                loss=hyperparameters[2]
            )
        elif algorithm_name == 'Polynomial Regression':
            return algorithm_class(
                degree=hyperparameters[0],
                order=hyperparameters[1]
            )
        elif algorithm_name == 'LassoRegression':
            return algorithm_class(
                alpha=hyperparameters[0],
                max_iter=hyperparameters[1]
            )
        elif algorithm_name == 'KNRegressor':
            return algorithm_class(
                n_neighbors=hyperparameters[0],
                weights=hyperparameters[1],
                algorithm=hyperparameters[2],
                leaf_size=hyperparameters[3]
            )
        elif algorithm_name == 'DummyClassifier':
            return DummyClassifier(strategy='most_frequent')
        elif algorithm_name == 'DummyRegressor':
            return DummyRegressor(strategy='mean')
        elif algorithm_name == 'LinearRegression':
            return algorithm_class()
        

    def fit(self, X_train, y_train, X_val, y_val):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.simulated_annealing()

    def predict(self, X):
        if self.model is None:
            raise ValueError("The model has not been fit yet. Please call the fit method first.")
        return self.model.predict(X)
    
    def simulated_annealing(self):
        start_time = time.time()  
        
        zero_r_model = DummyClassifier(strategy='most_frequent') if self.classifier else DummyRegressor(strategy='mean')
        current_solution = ['DummyClassifier'] if self.classifier else ['DummyRegressor']
        algorithms_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor

        current_score = self.eval(zero_r_model, self.X_train, self.y_train, self.X_val, self.y_val)
        best_solution = current_solution
        best_score = current_score
    
        temperature = self.initial_temp
    
        while time.time() - start_time < self.min_training_time:
            for i in range(1, self.max_iterations):
                if i % 50 == 0:
                    print(f"Iteration {i}, Temperature {temperature:.3f}, Best Evaluation {best_score:.5f}")
                    
                if current_solution[0] in ['DummyClassifier', 'DummyRegressor']:
                    new_solution = self.generate_neighborhood(['KNClassifier' if self.classifier else 'KNRegressor'])
                else:
                    new_solution = self.generate_neighborhood(current_solution)

                new_model = self.create_model(new_solution)
                new_score = self.eval(new_model, self.X_train, self.y_train, self.X_val, self.y_val)

                if new_score > current_score or np.random.rand() < np.exp((new_score - current_score) / max(temperature, 1e-3)):
                    current_solution = new_solution
                    current_score = new_score
                    if new_score > best_score:
                        best_solution = new_solution
                        best_score = new_score
    
            temperature *= self.cooling_rate
    
        self.best_solution = best_solution
        self.best_score = best_score
        self.model = self.create_model(best_solution)
        self.model.fit(self.X_train, self.y_train)
        
        algorithm_name = best_solution[0]
        hyperparameters = best_solution[1:]
        if hyperparameters:
            param_str = ', '.join(
                f"{param}={round(value, 4) if isinstance(value, float) else value}"
                for param, value in zip(algorithms_dict[algorithm_name]['parameters'], hyperparameters)
            )
            formatted_solution = f"{algorithm_name}({param_str})"
        else:
            formatted_solution = algorithm_name  

        print(f"The best model is {formatted_solution} with a score of {round(best_score, 4)}")
        

## Data sets

In [5]:
iris = datasets.load_iris()
iris_data = pd.DataFrame(data= np.c_[iris['data'], iris['target']], columns= iris['feature_names'] + ['target'])
iris_data['target'] = iris_data['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

df_voting = pd.read_csv('data/CongressionalVotingID.shuf.lrn.csv')

df_airfoil = pd.read_csv("data/airfoil_noise_data.csv")

url='./data/abalone.csv'
column_names = ["Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"]
df_abalone = pd.read_csv(url, header=0, names=column_names)
df_abalone = df_abalone[df_abalone.Height != 0]
df_abalone = pd.get_dummies(df_abalone, columns=['Sex'], drop_first=False)


### Pre-processing

In [6]:
pd.set_option('future.no_silent_downcasting', True)
df_voting = df_voting.replace({"democrat": 0,"republican": 1,"n": 0,"y": 1,"unknown": np.nan})
df_voting = df_voting.drop(columns=['ID'])

imp = IterativeImputer(max_iter=10, random_state=0)
df_voting = pd.DataFrame(imp.fit_transform(df_voting), columns=df_voting.columns)



In [7]:
df_abalone = df_abalone[df_abalone.Height != 0]

### test-validation-train split

In [8]:
X_iris = iris_data.drop(['target'], axis=1)
y_iris = iris_data['target']

X_train_iris, X_temp, y_train_iris, y_temp = train_test_split(X_iris, y_iris, test_size=0.6, random_state=42)
X_val_iris, X_test_iris, y_val_iris, y_test_iris = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [9]:
X_voting = df_voting.drop(['class'], axis=1)
y_voting = df_voting['class']

X_train_voting, X_temp, y_train_voting, y_temp = train_test_split(X_voting, y_voting, test_size=0.6, random_state=42)
X_val_voting, X_test_voting, y_val_voting, y_test_voting = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
X_airfoil = df_airfoil.drop(['y'], axis=1)
y_airfoil = df_airfoil['y']

X_train_airfoil, X_temp, y_train_airfoil, y_temp = train_test_split(X_airfoil, y_airfoil, test_size=0.6, random_state=42)
X_val_airfoil, X_test_airfoil, y_val_airfoil, y_test_airfoil = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
X_abalone_reg = df_abalone.drop(['Rings'], axis=1)
y_abalone_reg = df_abalone['Rings']

X_train_abalone_reg, X_temp_reg, y_train_abalone_reg, y_temp_reg = train_test_split(X_abalone_reg, y_abalone_reg, test_size=0.6, random_state=42)
X_val_abalone_reg, X_test_abalone_reg, y_val_abalone_reg, y_test_abalone_reg = train_test_split(X_temp_reg, y_temp_reg, test_size=0.5, random_state=42)

## Evaluation

### Iris

In [13]:
automl = AutoML_18(min_training_time=3600, max_iterations=50)

print("Fitting the AutoML algorithm")
automl.fit(X_train_iris, y_train_iris, X_val_iris, y_val_iris)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_iris)

test_accuracy = accuracy_score(y_test_iris, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_iris, predictions))

Fitting the AutoML algorithm




The best model is KNClassifier(n_neighbors=9, weights=distance, leaf_size=20) with a score of 0.9778

Evaluating on the test data
Test Accuracy: 0.9778

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        18
  versicolor       1.00      0.91      0.95        11
   virginica       0.94      1.00      0.97        16

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45





### Congressional Voting

In [None]:
print("Fitting the AutoML algorithm")
automl.fit(X_train_voting, y_train_voting, X_val_voting, y_val_voting)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_voting)

test_accuracy = accuracy_score(y_test_voting, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_voting, predictions))

Fitting the AutoML algorithm




### Airfoil

In [13]:
automl = AutoML_18(min_training_time=3600, max_iterations=10, classifier=False)

print("Fitting the AutoML algorithm")
automl.fit(X_train_airfoil, y_train_airfoil, X_val_airfoil, y_val_airfoil)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_airfoil)

test_mse = mean_squared_error(y_test_airfoil, predictions)
test_rmse = np.sqrt(test_mse)  
test_mae = mean_absolute_error(y_test_airfoil, predictions)
test_r2 = r2_score(y_test_airfoil, predictions)

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test R^2: {test_r2:.4f}")

Fitting the AutoML algorithm
The best model is RandomForestRegressor(n_estimators=25, max_depth=15, min_samples_split=2, max_features=None, criterion=absolute_error) with a score of -4.8266

Evaluating on the test data
Test MSE: 5.1866
Test RMSE: 2.2774
Test MAE: 1.6701
Test R^2: 0.8964


### Abalone (Regression)

In [None]:
print("Fitting the AutoML algorithm")
automl.fit(X_train_abalone_reg, y_train_abalone_reg, X_val_abalone_reg, y_val_abalone_reg)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_abalone_reg)

test_mse = mean_squared_error(y_test_abalone_reg, predictions)
test_rmse = np.sqrt(test_mse)  
test_mae = mean_absolute_error(y_test_abalone_reg, predictions)
test_r2 = r2_score(y_test_abalone_reg, predictions)

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test R^2: {test_r2:.4f}")

### Abalone (Classification)

In [14]:
automl = AutoML_18(min_training_time=60, max_iterations=50)

print("Fitting the AutoML algorithm")
automl.fit(X_train_abalone_class, y_train_abalone_class, X_val_abalone_class, y_val_abalone_class)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_abalone_class)

test_accuracy = accuracy_score(y_test_abalone_class, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_abalone_class, predictions))

Fitting the AutoML algorithm
The best model is MLPClassifier(max_iter=3000, activation=tanh, solver=adam, alpha=0.001) with a score of 0.588

Evaluating on the test data
Test Accuracy: 0.5502

Classification Report:
              precision    recall  f1-score   support

           F       0.52      0.35      0.42       278
           I       0.64      0.85      0.73       267
           M       0.46      0.47      0.47       291

    accuracy                           0.55       836
   macro avg       0.54      0.56      0.54       836
weighted avg       0.54      0.55      0.53       836

