In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

import load_data

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
class GridSearch:
    
    def __init__(self, estimators, estimator_params):
        self.estimators = estimators
        self.estimator_params = estimator_params
        self.results = {}
        
    def search_single(self, dataset):
        """ Perform a grid search using all estimators on a single dataset
        
        Args:
            dataset (tuple): a 4-tuple of X_train, X_test, y_train, y_test
        Returns:
            results dictionary with the best score, training time, and best 
            parameters for each estimator
        """
        
        X_train, X_test, y_train, y_test = dataset
        results = {}
        
        for estimator_name, estimator in self.estimators.items():
            print(estimator_name)
            params = self.estimator_params[estimator_name]
            grid_search = GridSearchCV(estimator, params)
            
            start = time.time()
            grid_search.fit(X_train, y_train)
            end = time.time()
            
            test_score = grid_search.score(X_test, y_test)
            
            result_data = {'best_score': grid_search.best_score_,
                           'best_params': grid_search.best_params_,
                           'total_time': end-start,
                           'test_score': test_score}
            
            results[estimator] = result_data
            
        return results
    
    def search(self, datasets):
        for dataset_name, dataset in datasets.items():
            print(f'Searching {dataset_name}')
            dataset_results = self.search_single(dataset)
            self.results[dataset_name] = dataset_results
            print()
            
        return self.results
    
    def print_results(self):
        complete_search_time = 0
        
        # Find the estimator that performed best for each dataset
        for dataset_name, dataset_results in self.results.items():
            total_time = [estimator_results['total_time'] for estimator_results in dataset_results.values()]
            time_dict = {estimator_name: estimator_results['total_time'] for estimator_name, estimator_results in dataset_results.items()}
            dataset_time = sum(total_time)
            complete_search_time += dataset_time
            
            # Get a mapping from the best score to the estimator
            scores = [estimator_results['best_score'] for estimator_results in dataset_results.values()]
            score_to_estimator = {dataset_results[estimator]['best_score']: estimator for estimator in dataset_results.keys()}
            
            best_score = max(scores)
            best_estimator = score_to_estimator[best_score]
            best_params = dataset_results[best_estimator]['best_params']
            best_test_score = dataset_results[best_estimator]['test_score']
            
            print(f'Dataset {dataset_name}: \nBest Estimator: {best_estimator} \n Params: {best_params} \nScore: {best_score} \nTotal time:{dataset_time} \nTesting score:{best_test_score} \nTime Dict:{time_dict}\n\n')
        
        results_df = pd.DataFrame(self.results)
        print('Grid Search total time:', complete_search_time)
        return results_df

In [13]:
classification_estimators = {'Logistic Regression': LogisticRegression(),
                             'KNN': KNeighborsClassifier(),
                             'Decision Tree': DecisionTreeClassifier(),
                             'AdaBoost': AdaBoostClassifier(),}
#                             'MLP': MLPClassifier()}
classification_parameters = {'Logistic Regression': {'C': np.arange(0.1, 1.9, 0.2), 'max_iter': np.arange(25, 325, 25), 'fit_intercept': [False, True]},
                             'KNN': {'n_neighbors': np.arange(2, 13, 1), 'leaf_size': np.arange(10, 80, 5)},
                             'Decision Tree': {'min_samples_split': np.arange(2, 6, 1), 'max_depth': np.arange(25, 275, 25)},
                             'AdaBoost': {'n_estimators': np.arange(10, 65, 5), 'learning_rate': np.arange(0.25, 1.75, 0.25)},}
#                             'MLP': {'activation': ['logistic', 'tanh', 'relu'], 'max_iter': np.arange(100, 300, 50)}}

regression_estimators = {'Linear Regression': LinearRegression(),
                         'KNN': KNeighborsRegressor(),
                         'Decision Tree': DecisionTreeRegressor(),
                         'AdaBoost': AdaBoostRegressor(),}
#                         'MLP': MLPRegressor()}
regression_parameters = {'Linear Regression': {'fit_intercept': [False, True]},
                         'KNN': {'n_neighbors': np.arange(2, 13, 1), 'leaf_size': np.arange(10, 80, 5)},
                         'Decision Tree': {'min_samples_split': np.arange(2, 6, 1), 'max_depth': np.arange(25, 275, 25)},
                         'AdaBoost': {'n_estimators': np.arange(10, 65, 5), 'learning_rate': np.arange(0.25, 1.75, 0.25)},}
#                         'MLP': {'activation': ['logistic', 'tanh', 'relu'], 'max_iter': np.arange(100, 300, 50)}}

In [16]:
binary_data = load_data.load_binary()
binary_grid_search = GridSearch(classification_estimators, classification_parameters)

binary_grid_search.search(binary_data)
binary_results = binary_grid_search.print_results()

Searching heart_attack
Logistic Regression
KNN
Decision Tree
AdaBoost

Searching stroke
Logistic Regression
KNN
Decision Tree
AdaBoost

Searching telecom
Logistic Regression
KNN
Decision Tree
AdaBoost

Dataset heart_attack: 
Best Estimator: LogisticRegression() 
 Params: {'C': 1.1000000000000003, 'fit_intercept': True, 'max_iter': 75} 
Score: 0.8306122448979592 
Total time:122.6443600654602 
Testing score:0.8852459016393442 
Time Dict:{LogisticRegression(): 75.49915957450867, KNeighborsClassifier(): 10.380993843078613, DecisionTreeClassifier(): 2.145153045654297, AdaBoostClassifier(): 34.61905360221863}


Dataset stroke: 
Best Estimator: LogisticRegression() 
 Params: {'C': 1.3000000000000003, 'fit_intercept': True, 'max_iter': 100} 
Score: 0.9547455806172621 
Total time:234.19476532936096 
Testing score:0.9393346379647749 
Time Dict:{LogisticRegression(): 135.09029006958008, KNeighborsClassifier(): 43.89105558395386, DecisionTreeClassifier(): 3.1580138206481934, AdaBoostClassifier(): 

In [17]:
regression_data = load_data.load_regression()
regression_grid_search = GridSearch(regression_estimators, regression_parameters)

regression_grid_search.search(regression_data)
regression_results = regression_grid_search.print_results()

Searching california_housing
Linear Regression
KNN
Decision Tree
AdaBoost

Searching melbourne_housing
Linear Regression
KNN
Decision Tree
AdaBoost

Searching world_happiness
Linear Regression
KNN
Decision Tree
AdaBoost

Dataset california_housing: 
Best Estimator: DecisionTreeRegressor() 
 Params: {'max_depth': 225, 'min_samples_split': 5} 
Score: 0.6282821205770821 
Total time:396.8780517578125 
Testing score:0.6285227023873552 
Time Dict:{LinearRegression(): 0.08472180366516113, KNeighborsRegressor(): 35.20700478553772, DecisionTreeRegressor(): 33.241832971572876, AdaBoostRegressor(): 328.34449219703674}


Dataset melbourne_housing: 
Best Estimator: DecisionTreeRegressor() 
 Params: {'max_depth': 25, 'min_samples_split': 5} 
Score: 0.6281231108838001 
Total time:833.4771201610565 
Testing score:0.6966406496123827 
Time Dict:{LinearRegression(): 0.16603541374206543, KNeighborsRegressor(): 442.3555247783661, DecisionTreeRegressor(): 22.866878509521484, AdaBoostRegressor(): 368.0886814

In [18]:
multiclass_data = load_data.load_multiclass()
multiclass_grid_search = GridSearch(classification_estimators, classification_parameters)

multiclass_grid_search.search(multiclass_data)
multiclass_results = multiclass_grid_search.print_results()

Searching mnist
Logistic Regression
KNN
Decision Tree
AdaBoost

Searching forest_covertypes
Logistic Regression
KNN
Decision Tree
AdaBoost

Searching kepler_exoplanets
Logistic Regression
KNN
Decision Tree
AdaBoost

Dataset mnist: 
Best Estimator: KNeighborsClassifier() 
 Params: {'leaf_size': 10, 'n_neighbors': 3} 
Score: 0.9846883468834691 
Total time:632.2646894454956 
Testing score:0.9833333333333333 
Time Dict:{LogisticRegression(): 515.1806199550629, KNeighborsClassifier(): 31.34954833984375, DecisionTreeClassifier(): 6.331221580505371, AdaBoostClassifier(): 79.40329957008362}


Dataset forest_covertypes: 
Best Estimator: KNeighborsClassifier() 
 Params: {'leaf_size': 10, 'n_neighbors': 2} 
Score: 0.80325 
Total time:3408.58008146286 
Testing score:0.817 
Time Dict:{LogisticRegression(): 2446.368030309677, KNeighborsClassifier(): 721.2202088832855, DecisionTreeClassifier(): 27.567227125167847, AdaBoostClassifier(): 213.42461514472961}


Dataset kepler_exoplanets: 
Best Estimator: