## This notebook contains code to tune individual models

In [None]:
# Expect a long run: finding the optimal SGDRegressor parameters takes about 4,000 to 10,000 seconds.
def find_optimal_parameters_SGDRegressor(X_input_scaled, y):
    """Run a randomised hyper-parameter search for ``SGDRegressor`` and print the winner.

    Args:
        X_input_scaled: Features passed straight to ``RandomizedSearchCV.fit``
            (the name suggests they are already scaled; nothing here checks that).
        y: Targets passed straight to ``RandomizedSearchCV.fit``.

    Returns:
        None. The start/end timestamps, the elapsed seconds and the search's
        ``best_params_`` are printed.
    """
    started = time.time()
    print(f"Searching optimal parameters at {datetime.now()}")

    # Every list below is a discrete set of candidate values for that parameter.
    search_space = {
        'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        'eta0': [0.1, 1, 10, 100],
    }

    from sklearn.model_selection import RandomizedSearchCV

    search = RandomizedSearchCV(
        estimator=SGDRegressor(),
        param_distributions=search_space,
        verbose=1,
        n_jobs=-1,  # use all available cores
        n_iter=1000,
    )
    fitted_search = search.fit(X_input_scaled, y)

    finished = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {finished - started}")

    print(fitted_search.best_params_)

find_optimal_parameters_SGDRegressor(X_input_scaled=X_train_age_scaled, y=y_train_age)

# Best parameters noted from earlier runs (labels as originally recorded):
#   weight {'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 10, 'alpha': 0.01}
#   age    {'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 100, 'alpha': 0.001}


In [None]:
# SVR is not accurate, even after tuning
def find_optimal_parameters_SVR(X_input=None, y=None):
    """Run a randomised hyper-parameter search for ``SVR`` and print the best parameters.

    Args:
        X_input: Training features passed to ``RandomizedSearchCV.fit``. When
            omitted, the notebook-level ``X_train_from_file`` is used, so the
            original zero-argument call keeps working.
        y: Training targets passed to ``RandomizedSearchCV.fit``. When omitted,
            the notebook-level ``y_train_weight`` is used.

    Returns:
        None. The start timestamp, ``best_params_``, the end timestamp and the
        elapsed seconds are printed.

    Notes:
        No ``n_iter`` is passed, so the search uses the library default number
        of sampled candidates.
        Result recorded from an earlier run:
        {'kernel': 'linear', 'gamma': 1e-07, 'epsilon': 0.1, 'degree': 2, 'coef0': 1, 'C': 100}
    """
    # Fall back to the notebook globals only when the caller supplied nothing;
    # `is None` keeps this safe for arrays/DataFrames whose truth value is ambiguous.
    if X_input is None:
        X_input = X_train_from_file
    if y is None:
        y = y_train_weight

    start_time = time.time()
    print(f"Searching optimal parameters at {datetime.now()}")

    parameters = {'kernel': ('linear', 'rbf','poly'), 'C':[1.5, 100],'gamma': [1e-7, 1e-4],
                  'degree': [2, 5], 'epsilon': [0.1,0.3,0.5], 'coef0': [0, 1]}
    svr = SVR()
    clf = RandomizedSearchCV(svr, parameters, verbose=2)
    clf.fit(X_input, y)
    print(clf.best_params_)

    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")


In [1]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
def find_optimal_random_forest_parameters(X_input_scaled, y):
    """Run a randomised hyper-parameter search for ``RandomForestRegressor`` and print the winner.

    Args:
        X_input_scaled: Features passed straight to ``RandomizedSearchCV.fit``
            (the name suggests they are already scaled; nothing here checks that).
        y: Targets passed straight to ``RandomizedSearchCV.fit``.

    Returns:
        None. The start timestamp, the search grid, the end timestamp, the
        elapsed seconds and the search's ``best_params_`` are printed.
    """
    start_time = time.time()
    print(f"Searching optimal parameters at {datetime.now()}")

    # Candidate values per hyper-parameter. `max_features` and `bootstrap` are
    # not searched (the source notes that max_features='auto' is deprecated).
    random_grid = {
        # Number of trees in random forest
        'n_estimators': [100, 400, 800],
        # Maximum number of levels in tree
        'max_depth': [10, 50, 100],
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
    }
    print(random_grid)

    # The grid is finite (3 * 3 * 3 * 3 = 81 combinations). Asking for more
    # iterations than combinations only triggers a scikit-learn warning before
    # it falls back to the grid size, so cap the request explicitly.
    n_candidates = 1
    for candidate_values in random_grid.values():
        n_candidates *= len(candidate_values)
    n_iter = min(100, n_candidates)

    # Base model to tune
    rf = RandomForestRegressor()
    # Random search with 3-fold cross validation over at most 100 combinations
    # (never more than the grid holds), using all available cores.
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=n_iter,
                                   cv=3, verbose=2, random_state=42, n_jobs=-1)
    # Fit the random search model
    rf_random.fit(X_input_scaled, y)

    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")

    print(rf_random.best_params_)


find_optimal_random_forest_parameters(X_train_age_scaled, y_train_age)

# Result recorded from an earlier run, labelled "age". It is kept as `#` comments
# because a bare string literal as the last expression of a cell is echoed as the
# cell's output.
# NOTE(review): the recorded result contains 'max_features', which the current
# grid does not search -- presumably it came from an older version of the grid.
# {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 50}
