In [None]:
import os

import mlrose_hiive as mlrose
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

#### Data

In [None]:
# Directory (relative to the notebook's working directory) that holds the census files
# read further down: 'adult.names' for the column names and 'adult.data' for the records.
census_data_path = '../Project 1/census/'

In [None]:
# Recover the column names from 'adult.names': skip lines starting with '|', keep the text
# before the first ':' on every remaining line, and drop lines that come out empty.
with open(os.path.join(census_data_path, 'adult.names')) as f:
    names = f.readlines()
cols = []
for raw_line in names:
    if raw_line[0] == '|':
        continue
    field_name = raw_line.replace('\n', '').split(':')[0]
    if field_name:
        cols.append(field_name)
# The first surviving entry is moved to the end so it lines up with the last CSV column.
cols = cols[1:] + [cols[0]]
# 'adult.data' is read with these names supplied explicitly as the column labels.
df = pd.read_csv(os.path.join(census_data_path, 'adult.data'), names=cols)

In [None]:
# Columns of df used as model inputs.
X_cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]
# Subset of X_cols that is one-hot encoded; the first level of each is dropped.
categorical_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
X = pd.get_dummies(df[X_cols], columns=categorical_cols, drop_first=True)

# Binary target: labels are stripped of surrounding whitespace before lookup; any label
# not present in the mapping yields None.
label_to_int = {'>50K': 1, '<=50K': 0}
y = df['>50K, <=50K.'].map(lambda raw_label: label_to_int.get(raw_label.strip()))

In [None]:
# Shuffled, label-stratified split with 80% of the rows going to the training partition;
# the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Keep the feature names around before X_train / X_test are overwritten with the scaler's output.
x_cols = X.columns
# The scaler is fitted on the training partition only; that same fit is applied to both partitions.
ss = StandardScaler()
ss.fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

#### Hill Climbing Tuning

In [None]:
# HILL CLIMBING TUNING (restarts and max attempts)
# Sweeps max_attempts for random hill climbing with 'restarts' fixed at 25. Every setting is
# repeated three times with a different seed, and the rows of stats[0] logged at
# Iteration == max_iter are written to CSV after every run.
max_iter = 1000
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'restarts': [25]
}
result_cols = ['iter', 'max_attempts', 'restart', 'fitness', 'time']
hc_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for max_attempts in [10, 25, 50, 100]:
    grid_search_parameters['max_attempts'] = [max_attempts]
    for i in range(3):
        print(f'max attempts: {max_attempts}\titeration: {i}')
        hc_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.random_hill_climb,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = hc_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'restart': end_stats['current_restart'].values,
            # Running minimum of Fitness over the logged rows, in their logged order.
            'fitness': end_stats['Fitness'].cummin().values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        hc_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        hc_tuning_results.to_csv('./results/hc_tuning_nn.csv')

In [None]:
# HILL CLIMBING TUNING (learning rate)
# Sweeps the learning rate for random hill climbing with max_attempts and restarts fixed.
# Every setting is repeated three times with a different seed; each run contributes one
# summary row (minimum Fitness and maximum Time over the rows logged at Iteration == max_iter).
max_iter = 1000
max_attempts = 10
restarts = 10
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'restarts': [restarts],
    'max_attempts': [max_attempts]
}
cols = ['iter', 'max_attempts', 'restarts', 'learning_rate', 'fitness', 'time']
hc_tuning_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for alpha in [.001, .01, .1, .5]:
    # The same value is supplied under both parameter names.
    grid_search_parameters['learning_rate'] = [alpha]
    grid_search_parameters['learning_rate_init'] = [alpha]
    for i in range(3):
        print(f'alpha: {alpha}\titeration: {i}')
        hc_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.random_hill_climb,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = hc_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame([{
            'iter': i + 1,
            'max_attempts': max_attempts,
            'restarts': restarts,
            'learning_rate': alpha,
            'fitness': end_stats['Fitness'].min(),
            'time': end_stats['Time'].max(),
        }], columns=cols)
        run_frames.append(results)
        hc_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        hc_tuning_results.to_csv('./results/hc_tuning_nn_2.csv')

#### Simulated Annealing Tuning

In [None]:
# SIMULATED ANNEALING TUNING (max attempts)
# Sweeps max_attempts for simulated annealing with a fixed geometric decay schedule.
# Every setting is repeated three times with a different seed, and the rows of stats[0]
# logged at Iteration == max_iter are written to CSV after every run.
max_iter = 10000
init_temp = 1
decay = .99
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'schedule': [mlrose.GeomDecay(init_temp=init_temp, decay=decay)],
}
result_cols = ['iter', 'max_attempts', 'init_temp', 'decay', 'fitness', 'time']
sa_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for max_attempts in [10, 25, 50, 100]:
    grid_search_parameters['max_attempts'] = [max_attempts]
    for i in range(3):
        print(f'max attempts: {max_attempts}\titeration: {i}')
        sa_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.simulated_annealing,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = sa_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'init_temp': init_temp,
            'decay': decay,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        sa_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        sa_tuning_results.to_csv('./results/sa_tuning_nn_1.csv')

In [None]:
# SIMULATED ANNEALING TUNING (decay)
# Sweeps the decay of the geometric schedule for simulated annealing with max_attempts and
# init_temp fixed. Every setting is repeated three times with a different seed, and the rows
# of stats[0] logged at Iteration == max_iter are written to CSV after every run.
max_iter = 10000
max_attempts = 10
init_temp = 1
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
}
result_cols = ['iter', 'max_attempts', 'init_temp', 'decay', 'fitness', 'time']
sa_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for decay in [.99, .995, .999, .9995]:
    # A fresh schedule object is built for every decay value.
    grid_search_parameters['schedule'] = [mlrose.GeomDecay(init_temp=init_temp, decay=decay)]
    for i in range(3):
        print(f'decay: {decay}\titeration: {i}')
        sa_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.simulated_annealing,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = sa_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'init_temp': init_temp,
            'decay': decay,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        sa_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        sa_tuning_results.to_csv('./results/sa_tuning_nn_2.csv')

In [None]:
# SIMULATED ANNEALING TUNING (init temp)
# Sweeps the initial temperature of the geometric schedule for simulated annealing with
# max_attempts and decay fixed. Every setting is repeated three times with a different seed,
# and the rows of stats[0] logged at Iteration == max_iter are written to CSV after every run.
max_iter = 10000
max_attempts = 10
decay = .99
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
}
result_cols = ['iter', 'max_attempts', 'init_temp', 'decay', 'fitness', 'time']
sa_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for init_temp in [.1, .5, 1, 5, 10]:
    # A fresh schedule object is built for every initial temperature.
    grid_search_parameters['schedule'] = [mlrose.GeomDecay(init_temp=init_temp, decay=decay)]
    for i in range(3):
        print(f'temp: {init_temp}\titeration: {i}')
        sa_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.simulated_annealing,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = sa_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'init_temp': init_temp,
            'decay': decay,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        sa_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        sa_tuning_results.to_csv('./results/sa_tuning_nn_3.csv')

In [None]:
# SIMULATED ANNEALING TUNING (learning rate)
# Sweeps the learning rate for simulated annealing with max_attempts and the geometric decay
# schedule fixed. Every setting is repeated three times with a different seed, and the rows
# of stats[0] logged at Iteration == max_iter are written to CSV after every run.
max_iter = 10000
max_attempts = 10
decay = .99
init_temp = .1
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
    'schedule': [mlrose.GeomDecay(init_temp=init_temp, decay=decay)]
}
cols = ['iter', 'max_attempts', 'init_temp', 'decay', 'learning_rate', 'fitness', 'time']
sa_tuning_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for alpha in [.001, .01, .1, .5]:
    # The same value is supplied under both parameter names.
    grid_search_parameters['learning_rate'] = [alpha]
    grid_search_parameters['learning_rate_init'] = [alpha]
    for i in range(3):
        print(f'alpha: {alpha}\titeration: {i}')
        sa_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.simulated_annealing,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = sa_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'init_temp': init_temp,
            'decay': decay,
            'learning_rate': alpha,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=cols)
        run_frames.append(results)
        sa_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        sa_tuning_results.to_csv('./results/sa_tuning_nn_4.csv')

#### Genetic Algorithm Tuning

In [None]:
# GENETIC ALGORITHM TUNING (max attempts)
# Sweeps max_attempts for the genetic algorithm with pop_size and mutation rate fixed.
# Every setting is repeated three times with a different seed, and the rows of stats[0]
# logged at Iteration == max_iter are written to CSV after every run.
max_iter = 1000
pop_size = 1000
mutation_rate = .05
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'pop_size': [pop_size],
    'mutation_prob': [mutation_rate]
}
result_cols = ['iter', 'max_attempts', 'pop_size', 'mutation_rate', 'fitness', 'time']
ga_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for max_attempts in [1, 5, 10, 25]:
    grid_search_parameters['max_attempts'] = [max_attempts]
    for i in range(3):
        print(f'max attempts: {max_attempts}\titeration: {i}')
        ga_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.genetic_alg,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = ga_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'pop_size': pop_size,
            'mutation_rate': mutation_rate,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        ga_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        ga_tuning_results.to_csv('./results/ga_tuning_nn_1.csv')

In [None]:
# GENETIC ALGORITHM TUNING (pop size)
# Sweeps pop_size for the genetic algorithm with max_attempts and mutation rate fixed.
# Every setting is repeated three times with a different seed, and the rows of stats[0]
# logged at Iteration == max_iter are written to CSV after every run.
max_iter = 1000
mutation_rate = .05
max_attempts = 3
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
    'mutation_prob': [mutation_rate]
}
result_cols = ['iter', 'max_attempts', 'pop_size', 'mutation_rate', 'fitness', 'time']
ga_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for pop_size in [200, 500, 1000, 2000, 5000]:
    grid_search_parameters['pop_size'] = [pop_size]
    for i in range(3):
        print(f'pop size: {pop_size}\titeration: {i}')
        ga_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.genetic_alg,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = ga_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'pop_size': pop_size,
            'mutation_rate': mutation_rate,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        ga_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        ga_tuning_results.to_csv('./results/ga_tuning_nn_2.csv')

In [None]:
# GENETIC ALGORITHM TUNING (mutation rate)
# Sweeps the mutation rate for the genetic algorithm with pop_size and max_attempts fixed.
# Every setting is repeated three times with a different seed, and the rows of stats[0]
# logged at Iteration == max_iter are written to CSV after every run.
max_iter = 1000
pop_size = 500
max_attempts = 3
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'pop_size': [pop_size],
    'max_attempts': [max_attempts]
}
result_cols = ['iter', 'max_attempts', 'pop_size', 'mutation_rate', 'fitness', 'time']
ga_tuning_results = pd.DataFrame(columns=result_cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for mutation_rate in [.0001, .001, .01, .05, .1]:
    grid_search_parameters['mutation_prob'] = [mutation_rate]
    for i in range(3):
        print(f'mutation rate: {mutation_rate}\titeration: {i}')
        ga_runner = mlrose.NNGSRunner(
            x_train=X_train,
            y_train=y_train,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=mlrose.genetic_alg,
            grid_search_parameters=grid_search_parameters,
            iteration_list=[max_iter],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=max_attempts,
            n_jobs=5,
            seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
            output_directory=None
        )
        stats = ga_runner.run()
        end_stats = stats[0].query('Iteration==@max_iter').reset_index(drop=True)
        results = pd.DataFrame({
            'iter': i + 1,
            'max_attempts': max_attempts,
            'pop_size': pop_size,
            'mutation_rate': mutation_rate,
            'fitness': end_stats['Fitness'].values,
            'time': end_stats['Time'].values,
        }, columns=result_cols)
        run_frames.append(results)
        ga_tuning_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interrupted sweep.
        ga_tuning_results.to_csv('./results/ga_tuning_nn_3.csv')

#### Final Results

In [None]:
# HILL CLIMBING FINAL
# Runs random hill climbing three times (one seed each) with the fixed settings below and
# records every row of stats[0] (Iteration, Fitness, Time, FEvals), requesting a log entry
# every `step` iterations up to max_iter. Results are written to CSV after every run.
max_iter = 20000
step = 10
iteration_list = np.arange(step, max_iter+1, step)
max_attempts = 10
restarts = 10
alpha = .5
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
    'learning_rate_init': [alpha],
    'learning_rate': [alpha],
    'restarts': [restarts]
}
cols = ['iter', 'iteration', 'max_attempts', 'restarts', 'learning_rate', 'fitness', 'time', 'f_evals']
hc_final_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for i in range(3):
    print(f'\titeration: {i}')
    hc_runner = mlrose.NNGSRunner(
        x_train=X_train,
        y_train=y_train,
        x_test=X_test,
        y_test=y_test,
        experiment_name='',
        algorithm=mlrose.random_hill_climb,
        grid_search_parameters=grid_search_parameters,
        iteration_list=iteration_list,
        hidden_layer_sizes=[[50]],
        bias=True,
        early_stopping=True,
        max_attempts=max_attempts,
        n_jobs=5,
        seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
        output_directory=None
    )
    # Only the first element of the runner's return value is used here.
    stats = hc_runner.run()[0]
    results = pd.DataFrame({
        'iter': i + 1,
        'iteration': stats['Iteration'].values,
        'max_attempts': max_attempts,
        'restarts': restarts,
        'learning_rate': alpha,
        'fitness': stats['Fitness'].values,
        'time': stats['Time'].values,
        'f_evals': stats['FEvals'].values,
    }, columns=cols)
    run_frames.append(results)
    hc_final_results = pd.concat(run_frames, ignore_index=True)
    # Checkpoint after every run so partial results survive an interruption.
    hc_final_results.to_csv('./results/hc_final_nn.csv')

In [None]:
# SIMULATED ANNEALING FINAL
# Runs simulated annealing three times (one seed each) with the fixed settings below and
# records every row of stats[0] (Iteration, Fitness, Time, FEvals), requesting a log entry
# every `step` iterations up to max_iter. Results are written to CSV after every run.
max_iter = 20000
step = 10
iteration_list = np.arange(step, max_iter+1, step)
max_attempts = 10
decay = .99
init_temp = .1
alpha = .5
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'schedule': [mlrose.GeomDecay(init_temp=init_temp, decay=decay)],
    'max_attempts': [max_attempts],
    'learning_rate_init': [alpha],
    'learning_rate': [alpha]
}
cols = ['iter', 'iteration', 'max_attempts', 'init_temp', 'decay', 'learning_rate', 'fitness', 'time', 'f_evals']
sa_final_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for i in range(3):
    print(f'\titeration: {i}')
    sa_runner = mlrose.NNGSRunner(
        x_train=X_train,
        y_train=y_train,
        x_test=X_test,
        y_test=y_test,
        experiment_name='',
        algorithm=mlrose.simulated_annealing,
        grid_search_parameters=grid_search_parameters,
        iteration_list=iteration_list,
        hidden_layer_sizes=[[50]],
        bias=True,
        early_stopping=True,
        max_attempts=max_attempts,
        n_jobs=5,
        seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
        output_directory=None
    )
    # Only the first element of the runner's return value is used here.
    stats = sa_runner.run()[0]
    results = pd.DataFrame({
        'iter': i + 1,
        'iteration': stats['Iteration'].values,
        'max_attempts': max_attempts,
        'init_temp': init_temp,
        'decay': decay,
        'learning_rate': alpha,
        'fitness': stats['Fitness'].values,
        'time': stats['Time'].values,
        'f_evals': stats['FEvals'].values,
    }, columns=cols)
    run_frames.append(results)
    sa_final_results = pd.concat(run_frames, ignore_index=True)
    # Checkpoint after every run so partial results survive an interruption.
    sa_final_results.to_csv('./results/sa_final_nn.csv')

In [None]:
# GENETIC ALGORITHM FINAL
# Runs the genetic algorithm three times (one seed each) with the fixed settings below and
# records every row of stats[0] (Iteration, Fitness, Time, FEvals), requesting a log entry
# every `step` iterations up to max_iter. Results are written to CSV after every run.
max_iter = 20000
step = 10
iteration_list = np.arange(step, max_iter+1, step)
max_attempts = 25
mutation_rate = .001
pop_size = 500
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'pop_size': [pop_size],
    'max_attempts': [max_attempts],
    'mutation_prob': [mutation_rate]
}
cols = ['iter', 'iteration', 'max_attempts', 'pop_size', 'mutation_rate', 'fitness', 'time', 'f_evals']
ga_final_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for i in range(3):
    print(f'\titeration: {i}')
    ga_runner = mlrose.NNGSRunner(
        x_train=X_train,
        y_train=y_train,
        x_test=X_test,
        y_test=y_test,
        experiment_name='',
        algorithm=mlrose.genetic_alg,
        grid_search_parameters=grid_search_parameters,
        iteration_list=iteration_list,
        hidden_layer_sizes=[[50]],
        bias=True,
        early_stopping=True,
        max_attempts=max_attempts,
        n_jobs=5,
        seed=42*(i+1),  # distinct seed per repetition: 42, 84, 126
        output_directory=None
    )
    # Only the first element of the runner's return value is used here.
    stats = ga_runner.run()[0]
    results = pd.DataFrame({
        'iter': i + 1,
        'iteration': stats['Iteration'].values,
        'max_attempts': max_attempts,
        'pop_size': pop_size,
        'mutation_rate': mutation_rate,
        'fitness': stats['Fitness'].values,
        'time': stats['Time'].values,
        'f_evals': stats['FEvals'].values,
    }, columns=cols)
    run_frames.append(results)
    ga_final_results = pd.concat(run_frames, ignore_index=True)
    # Checkpoint after every run so partial results survive an interruption.
    ga_final_results.to_csv('./results/ga_final_nn.csv')

In [None]:
# GRADIENT DESCENT FINAL
# Runs gradient descent once (range(1)) with the fixed settings below and cv=3, recording
# every row of stats[0] (Iteration, Fitness, Time, FEvals), requesting a log entry every
# `step` iterations up to max_iter. Results are written to CSV after the run.
max_iter = 20000
step = 10
iteration_list = np.arange(step, max_iter+1, step)
max_attempts = 500
alpha = .00001
grid_search_parameters = {
    'max_iters': [max_iter],
    'activation': [mlrose.identity],
    'max_attempts': [max_attempts],
    'learning_rate_init': [alpha],
    'learning_rate': [alpha]
}
cols = ['iter', 'iteration', 'max_attempts', 'learning_rate', 'fitness', 'time', 'f_evals']
gd_final_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before the (expensive) run tries to write into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate, and keeps numeric dtypes intact.
run_frames = []
for i in range(1):
    print(f'\titeration: {i}')
    gd_runner = mlrose.NNGSRunner(
        x_train=X_train,
        y_train=y_train,
        x_test=X_test,
        y_test=y_test,
        experiment_name='',
        algorithm=mlrose.gradient_descent,
        grid_search_parameters=grid_search_parameters,
        iteration_list=iteration_list,
        hidden_layer_sizes=[[50]],
        bias=True,
        early_stopping=True,
        max_attempts=max_attempts,
        n_jobs=5,
        seed=42*(i+1),
        cv=3,
        output_directory=None
    )
    # Only the first element of the runner's return value is used here.
    stats = gd_runner.run()[0]
    results = pd.DataFrame({
        'iter': i + 1,
        'iteration': stats['Iteration'].values,
        'max_attempts': max_attempts,
        'learning_rate': alpha,
        'fitness': stats['Fitness'].values,
        'time': stats['Time'].values,
        'f_evals': stats['FEvals'].values,
    }, columns=cols)
    run_frames.append(results)
    gd_final_results = pd.concat(run_frames, ignore_index=True)
    gd_final_results.to_csv('./results/gd_final_nn.csv')

In [None]:
# SIZE CURVES
# For each training-set fraction in `pcts`, runs all four algorithms (with the fixed
# per-algorithm settings below) on a stratified subsample of the training data and records
# one summary row per (fraction, algorithm). Results are written to CSV after every run.
alg_params = {
    'random_hill_climb': {
        'max_iters': [3000],
        'activation': [mlrose.identity],
        'restarts': [10],
        'max_attempts': [10],
        'learning_rate': [.5],
        'learning_rate_init': [.5]
    },
    'simulated_annealing': {
        'max_iters': [10000],
        'activation': [mlrose.identity],
        'max_attempts': [10],
        'schedule': [mlrose.GeomDecay(init_temp=.1, decay=.99)],
        'learning_rate': [.5],
        'learning_rate_init': [.5]
    },
    'genetic_alg': {
        'max_iters': [1000],
        'activation': [mlrose.identity],
        'max_attempts': [25],
        'pop_size': [500],
        'mutation_prob': [.001]
    },
    'gradient_descent': {
        'max_iters': [1000],
        'activation': [mlrose.identity],
        'max_attempts': [500],
        'learning_rate_init': [.0001],
        'learning_rate': [.0001]
    }
}
# Display names written to the 'alg' column.
alg_names = {
    'random_hill_climb': 'Random Hill Climbing',
    'simulated_annealing': 'Simulated Annealing',
    'genetic_alg': 'Genetic Algorithm',
    'gradient_descent': 'Gradient Descent'
}
# Algorithm callables handed to the runner.
alg_map = {
    'random_hill_climb': mlrose.random_hill_climb,
    'simulated_annealing': mlrose.simulated_annealing,
    'genetic_alg': mlrose.genetic_alg,
    'gradient_descent': mlrose.gradient_descent
}
pcts = [.01, .05, .2]
# 'pct' is declared here (last, matching the column order of the saved CSV) because every
# result row sets it.
cols = ['alg', 'time', 'loss', 'f_evals', 'train_accuracy', 'val_accuracy', 'pct']
size_final_results = pd.DataFrame(columns=cols)
# Make sure the output folder exists before any (expensive) run tries to checkpoint into it.
os.makedirs('./results', exist_ok=True)
# One frame per finished run. Combining a list avoids concatenating onto an empty frame,
# which newer pandas versions deprecate.
run_frames = []
for pct in pcts:
    # The subsample depends only on pct (fixed random_state), so it is drawn once per
    # fraction and shared by all algorithms.
    if pct < 1:
        X_sub, _, y_sub, _ = train_test_split(
            X_train, y_train.values, stratify=y_train.values, train_size=pct, shuffle=True, random_state=42
        )
    elif pct == 1:
        X_sub = X_train.copy()
        y_sub = y_train.copy()
    else:
        # Fail loudly instead of silently reusing the subsample from a previous fraction.
        raise ValueError(f'pct must be at most 1, got {pct}')
    for alg, params in alg_params.items():
        alg_name = alg_names[alg]
        print(f'Size: {pct}\tAlg: {alg_name}')
        print(alg, alg_name, alg_map[alg])
        runner = mlrose.NNGSRunner(
            x_train=X_sub,
            y_train=y_sub,
            x_test=X_test,
            y_test=y_test,
            experiment_name='',
            algorithm=alg_map[alg],
            grid_search_parameters=params,
            iteration_list=params['max_iters'],
            hidden_layer_sizes=[[50]],
            bias=True,
            early_stopping=True,
            max_attempts=params['max_attempts'][0],
            n_jobs=-1,
            seed=42000,
            cv=3,
            output_directory=None
        )
        stats = runner.run()
        # One-element list; in DataFrame.query, `==` against a list acts as a membership test.
        max_iters = params['max_iters']
        results = pd.DataFrame(index=range(1), columns=cols)
        results['alg'] = alg_name
        results['pct'] = pct
        results['time'] = stats[0]['Time'].max()
        results['loss'] = stats[0].query('Iteration==@max_iters')['Fitness'].min()
        results['f_evals'] = stats[0]['FEvals'].max()
        # NOTE(review): stats[2] presumably holds the cross-validation summary with a single
        # entry per score column — confirm against the runner's return value.
        results['train_accuracy'] = stats[2]['mean_train_score']
        results['val_accuracy'] = stats[2]['mean_test_score']
        run_frames.append(results)
        size_final_results = pd.concat(run_frames, ignore_index=True)
        # Checkpoint after every run so partial results survive an interruption.
        size_final_results.to_csv('./results/size_final_nn.csv')