# Automated Machine Learning from Scratch

Group 18 Members:

- Clara Pichler, 11917694
- Hannah Knapp, 11901857 
- Sibel Toprakkiran, 09426341

### Overview

1. Data Set Splitting and Preprocessing

2. generate neighborhood
- `generate_neighborhood(self, current_solution)`

3. create model
- `create_model(self, solution)`

4. simulated annealing
- `simulated_annealing(self)`

5. Comparison with two state of the art AutoML systems
- auto-sklearn 
- TPOT

6. Evaluation
- Iris Dataset
- Congressional Voting Dataset
- gym session tracking Dataset
- Abalone Data set

In [42]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
import time
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, Lasso

## Data sets

- Iris Dataset
- Congressional Voting Dataset
- gym session tracking Dataset
- Abalone Data set

In [43]:
iris = datasets.load_iris()
iris_data = pd.DataFrame(data= np.c_[iris['data'], iris['target']], columns= iris['feature_names'] + ['target'])
iris_data['target'] = iris_data['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

df_voting = pd.read_csv('data/CongressionalVotingID.shuf.lrn.csv')

df_airfoil = pd.read_csv("data/airfoil_noise_data.csv")

url='./data/abalone.csv'
column_names = ["Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"]
abalone_df = pd.read_csv(url, header=0, names=column_names)

### Pre-processing

In [44]:
pd.set_option('future.no_silent_downcasting', True)
df_voting = df_voting.replace({"democrat": 0,"republican": 1,"n": 0,"y": 1,"unknown": np.nan})
df_voting = df_voting.drop(columns=['ID'])

imp = IterativeImputer(max_iter=10, random_state=0)
df_voting = pd.DataFrame(imp.fit_transform(df_voting), columns=df_voting.columns)



### test-validation-train split

In [45]:
X_iris = iris_data.drop(['target'], axis=1)
y_iris = iris_data['target']

X_train_iris, X_temp, y_train_iris, y_temp = train_test_split(X_iris, y_iris, test_size=0.4, random_state=42)
X_val_iris, X_test_iris, y_val_iris, y_test_iris = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [46]:
X_voting = df_voting.drop(['class'], axis=1)
y_voting = df_voting['class']

X_train_voting, X_temp, y_train_voting, y_temp = train_test_split(X_voting, y_voting, test_size=0.4, random_state=42)
X_val_voting, X_test_voting, y_val_voting, y_test_voting = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [47]:
X_airfoil = df_airfoil.drop(['y'], axis=1)
y_airfoil = df_airfoil['y']

X_train_airfoil, X_temp, y_train_airfoil, y_temp = train_test_split(X_airfoil, y_airfoil, test_size=0.4, random_state=42)
X_val_airfoil, X_test_airfoil, y_val_airfoil, y_test_airfoil = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [48]:
X_abalone = abalone_df.drop(['Sex'], axis=1)
y_abalone = abalone_df['Sex']

X_train_abalone, X_temp, y_train_abalone, y_temp = train_test_split(X_abalone, y_abalone, test_size=0.4, random_state=42)
X_val_abalone, X_test_abalone, y_val_abalone, y_test_abalone = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Models
- MLP
- RF
- KNN 
- SVM
- AdaBoost

## AutoML algorithm

In [51]:
class AutoML_18:
    def __init__(self, initial_temp=100, cooling_rate=0.99, max_iterations=100, min_training_time=3600, classifier = True):
        self.initial_temp = initial_temp
        self.cooling_rate = cooling_rate
        self.max_iterations = max_iterations
        self.min_training_time = min_training_time
        self.classifier = classifier
        self.algorithms_classifier = {
            "MLPClassifier": {
                "class": MLPClassifier,
                "parameters": ["max_iter", "activation", "solver", "alpha"],
                "values": [[1000, 2000, 3000], ['relu', 'tanh', 'logistic'], ['adam', 'sgd'], [0.0001, 0.001, 0.01]]
            },
            "RandomForestClassifier": {
                "class": RandomForestClassifier,
                "parameters": ["n_estimators", "max_depth", "min_samples_split", "max_features", "criterion"],
                "values": [[10, 25, 50, 100, 150], [5, 10, 15], [2, 3, 3, 4], ['sqrt', 'log2', None], ['gini', 'log_loss', 'entropy']]
            },
            "KNClassifier": {
                "class": KNeighborsClassifier,
                "parameters": ["n_neighbors", "weights", "leaf_size"],
                "values": [[3, 5, 7, 9, 11], ['uniform', 'distance'], [10, 20, 30, 40, 50]]
            },
            "SVM": {
                "class": SVC,
                "parameters": ["C", "kernel", "gamma"],
                "values": [[1, 10, 100, 1000], ['linear', 'poly', 'rbf', 'sigmoid'], ['scale', 'auto']]
            },
            "AdaBoostClassifier": {
                "class": AdaBoostClassifier,
                "parameters": ["n_estimators", "learning_rate"],
                "values": [[10, 25, 50, 100, 150], [0.1, 0.5, 1, 1.5, 2]]
            },
        }
        self.algorithms_regressor = {
            
            'RandomForestRegressor': {
                'class': RandomForestRegressor,
                'parameters': ["n_estimators", "max_depth", "min_samples_split", "max_features", "criterion"],
                'values': [[10, 25, 50, 100, 150], [5, 10, 15], [2, 3, 3, 4], ['sqrt', 'log2', None], ['squared_error', 'absolute_error']]
            },
            'GradientBoostingRegressor': {
                'class': GradientBoostingRegressor,
                'parameters': ["n_estimators", "learning_rate", "loss"],
                'values': [[10, 25, 50, 100, 150], [0.1, 0.5, 1, 1.5, 2], ['squared_error', 'absolute_error', 'huber']] 
            },
            'LinearRegression': {
                'class': LinearRegression,
                'parameters': ['n_jobs'],
                'values': [[3, 5, 7, 9]]
            },
            'LassoRegression': {
                'class': Lasso,
                'parameters': ["alpha", "max_iter"],
                'values': [[0.1, 0.5, 1, 1.5, 2], [1000, 2000, 3000, 4000, 5000]]
            },
            'KNRegressor': {
                'class': KNeighborsRegressor,
                'parameters': ["n_neighbors", "weights", "algorithm", "leaf_size"],
                'values': [[3, 5, 7, 9, 11], ['uniform', 'distance'], ['auto', 'ball_tree', 'kd_tree', 'brute'], [10, 20, 30, 40, 50]]
            },
        }
        self.best_solution = None
        self.best_score = 0
        self.model = None
        
    def eval(self, model, X_train, y_train, X_val, y_val):
        model.fit(X_train, y_train) 
        predictions = model.predict(X_val)  
        if self.classifier:
            score = accuracy_score(y_val, predictions) 
            #print(f'Accuracy: {score:.4f}')
        else:
            score = -mean_squared_error(y_val, predictions)
            #print(f'MSE: {-score:.4f}')
      
        return score

    def generate_neighborhood(self, current_solution):
        algorithm_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor
        algorithm_name = np.random.choice(list(algorithm_dict.keys()))
        algorithm_info = algorithm_dict[algorithm_name]
        
        new_solution = [algorithm_name]
        
        if not algorithm_info['parameters']:
            return new_solution
        
        while len(new_solution) < len(algorithm_info['parameters']) + 1:
            param_index = len(new_solution) - 1
            new_solution.append(np.random.choice(algorithm_info['values'][param_index]))
        
        param_idx = np.random.randint(1, len(new_solution))  # Avoid choosing the model name
        new_solution[param_idx] = np.random.choice(algorithm_info['values'][param_idx - 1])
        
        if np.random.rand() < 0.1:
            new_solution[0] = np.random.choice(list(algorithm_dict.keys()))
            algorithm_info = algorithm_dict[new_solution[0]]
            new_solution = [new_solution[0]] + [
                np.random.choice(values) for values in algorithm_info["values"]
            ]
        
        #print(f"Generated neighborhood for algorithm: {new_solution[0]}, parameters: {new_solution[1:]}")
        return new_solution


    def create_model(self, solution):
        algorithm_name = solution[0]
        hyperparameters = solution[1:]
        algorithm_dict = self.algorithms_classifier if self.classifier else self.algorithms_regressor
        algorithm_class = algorithm_dict[algorithm_name]['class']
            
        if algorithm_name == 'MLPClassifier':
            return algorithm_class(
                max_iter=hyperparameters[0],
                activation=hyperparameters[1],
                solver=hyperparameters[2],
                alpha=hyperparameters[3]
            )
        elif algorithm_name == 'RandomForestClassifier':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                max_depth=hyperparameters[1],
                min_samples_split=hyperparameters[2],
                max_features=hyperparameters[3],
                criterion=hyperparameters[4]
            )
        elif algorithm_name == 'KNClassifier':
            return algorithm_class(
                n_neighbors=hyperparameters[0],
                weights=hyperparameters[1],
                leaf_size=hyperparameters[2]
            )
        elif algorithm_name == 'SVM':
            return algorithm_class(
                C=hyperparameters[0],
                kernel=hyperparameters[1],
                gamma=hyperparameters[2]
            )
        elif algorithm_name == 'AdaBoostClassifier':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                learning_rate=hyperparameters[1],
            )
        elif algorithm_name == 'RandomForestRegressor':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                max_depth=hyperparameters[1],
                min_samples_split=hyperparameters[2],
                max_features=hyperparameters[3],
                criterion=hyperparameters[4]
            )
        elif algorithm_name == 'GradientBoostingRegressor':
            return algorithm_class(
                n_estimators=hyperparameters[0],
                learning_rate=hyperparameters[1],
                loss=hyperparameters[2]
            )
        elif algorithm_name == 'Polynomial Regression':
            return algorithm_class(
                degree=hyperparameters[0],
                order=hyperparameters[1]
            )
        elif algorithm_name == 'LassoRegression':
            return algorithm_class(
                alpha=hyperparameters[0],
                max_iter=hyperparameters[1]
            )
        elif algorithm_name == 'KNRegressor':
            return algorithm_class(
                n_neighbors=hyperparameters[0],
                weights=hyperparameters[1],
                algorithm=hyperparameters[2],
                leaf_size=hyperparameters[3]
            )

    def fit(self, X_train, y_train, X_val, y_val):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.simulated_annealing()

    def predict(self, X):
        if self.model is None:
            raise ValueError("The model has not been fit yet. Please call the fit method first.")
        return self.model.predict(X)
    
    def simulated_annealing(self):
        start_time = time.time()  
        # 0 rule model as initial model as base model
        if self.classifier:
            zero_r_model = DummyClassifier(strategy='most_frequent')
            current_solution = ['DummyClassifier']
        
        else:
            zero_r_model = DummyRegressor(strategy='mean')
            current_solution = ['DummyRegressor']

        current_score = self.eval(zero_r_model, self.X_train, self.y_train, self.X_val, self.y_val)
        best_solution = current_solution
        best_score = current_score
    
        temperature = self.initial_temp
    
        while time.time() - start_time < self.min_training_time:
            for i in range(1, self.max_iterations):
                if i % 50 == 0:
                    print(f"Iteration {i}, Temperature {temperature:.3f}, Best Evaluation {best_score:.5f}")
                    
                if current_solution[0] == 'DummyClassifier':
                    new_solution = self.generate_neighborhood(['KNClassifier'])
                    new_score = self.eval(self.create_model(new_solution), self.X_train, self.y_train, self.X_val, self.y_val)
                elif current_solution[0] == 'DummyRegressor':
                    new_solution = self.generate_neighborhood(['KNRegressor'])
                    new_score = self.eval(self.create_model(new_solution), self.X_train, self.y_train, self.X_val, self.y_val)
                else:
                    new_solution = self.generate_neighborhood(current_solution)
                    new_score = self.eval(self.create_model(new_solution), self.X_train, self.y_train, self.X_val, self.y_val)
    
                if new_score > current_score or np.random.rand() < np.exp((new_score - current_score) / max(temperature, 1e-3)):
                    current_solution = new_solution
                    current_score = new_score
                    if new_score > best_score:
                        best_solution = new_solution
                        best_score = new_score
    
            temperature *= self.cooling_rate
    
        self.best_solution = best_solution
        self.best_score = best_score
        self.model = self.create_model(best_solution)
        self.model.fit(self.X_train, self.y_train)
        
        algorithm_name = best_solution[0]
        hyperparameters = best_solution[1:]
        if hyperparameters:
            param_str = ', '.join(
                f"{param}={round(value, 4) if isinstance(value, float) else value}"
                for param, value in zip(self.algorithms_classifier[algorithm_name]['parameters'], hyperparameters)
            )
            formatted_solution = f"{algorithm_name}({param_str})"
        else:
            formatted_solution = algorithm_name  

        print(f"The best model is {formatted_solution} with a score of {round(best_score, 4)}")
        

## Sklearn Automated Machine Learning Algorithm

## TPOT Automated Machine Learning Algorithm

## Evaluation

### Our Auto ML Iris

In [52]:
automl = AutoML_18(min_training_time=60, max_iterations=50)

print("Fitting the AutoML algorithm")
automl.fit(X_train_iris, y_train_iris, X_val_iris, y_val_iris)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_iris)

test_accuracy = accuracy_score(y_test_iris, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_iris, predictions))

Fitting the AutoML algorithm




KeyboardInterrupt: 

### Our AutoML Congressional Voting

In [None]:
print("Fitting the AutoML algorithm")
automl.fit(X_train_voting, y_train_voting, X_val_voting, y_val_voting)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_voting)

test_accuracy = accuracy_score(y_test_voting, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_voting, predictions))

Fitting the AutoML algorithm




The best model is MLPClassifier(max_iter=2000, activation=logistic, solver=adam, alpha=0.0001) with a score of 1.0

Evaluating on the test data
Test Accuracy: 0.9773

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        24
         1.0       1.00      0.95      0.97        20

    accuracy                           0.98        44
   macro avg       0.98      0.97      0.98        44
weighted avg       0.98      0.98      0.98        44



### Our AutoML airfoil

In [53]:
automl = AutoML_18(min_training_time=60, max_iterations=10, classifier=False)

print("Fitting the AutoML algorithm")
automl.fit(X_train_airfoil, y_train_airfoil, X_val_airfoil, y_val_airfoil)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_airfoil)

test_accuracy = accuracy_score(y_test_airfoil, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_airfoil, predictions))

Fitting the AutoML algorithm


AttributeError: 'NoneType' object has no attribute 'fit'

### Our AutoML abalone

In [28]:
print("Fitting the AutoML algorithm")
automl.fit(X_train_abalone, y_train_abalone, X_val_abalone, y_val_abalone)

print("\nEvaluating on the test data")
predictions = automl.predict(X_test_abalone)

test_accuracy = accuracy_score(y_test_abalone, predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_abalone, predictions))

Fitting the AutoML algorithm
The best model is MLPClassifier(max_iter=2000, activation=relu, solver=sgd, alpha=0.001) with a score of 0.588

Evaluating on the test data
Test Accuracy: 0.5215

Classification Report:
              precision    recall  f1-score   support

           F       0.51      0.18      0.27       278
           I       0.66      0.76      0.71       267
           M       0.42      0.63      0.51       291

    accuracy                           0.52       836
   macro avg       0.53      0.52      0.49       836
weighted avg       0.53      0.52      0.49       836

