In [3]:
import os
import warnings

import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt import space_eval
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR

# Silence every warning so the hyperopt progress bars stay readable.
# NOTE(review): this is a blanket filter; it also hides any warnings raised
# while the models are being fitted. Consider narrowing it to one category.
warnings.filterwarnings("ignore")

# Seed hyperopt's search so Restart & Run All replays the same trials.
# fmin() reads this variable when it is called without an explicit `rstate`;
# setdefault() keeps a seed that was already exported in the environment.
os.environ.setdefault("HYPEROPT_FMIN_SEED", "1")

# Dataset location. Set AUTO_MPG_PATH to run the notebook on another machine;
# the fallback is the path the notebook was originally written against.
DATA_PATH = os.environ.get(
    "AUTO_MPG_PATH",
    "/Users/efang/Desktop/coding/Intro-to-ML/CSDS340/data/auto-mpg-missing-data-removed.txt",
)
TEST_SIZE = 0.5   # fraction of rows held out as the test split
SPLIT_SEED = 1    # random_state of the train/test split

# '"' is declared as the comment character, so np.loadtxt ignores everything
# from the first double quote to the end of each line.
data = np.loadtxt(DATA_PATH, comments='"')
X = data[:, 1:]  # every column except the first is a feature
y = data[:, 0]   # the first column is the regression target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

def evaluate_model(model, cv=None):
    """Score ``model`` and wrap the result as a hyperopt objective value.

    Parameters
    ----------
    model
        Estimator or pipeline exposing ``fit`` and ``predict``.
    cv : optional
        ``None`` (default) keeps the original behaviour: fit on the
        module-level ``X_train``/``y_train`` and score R^2 on
        ``X_test``/``y_test``. Any other value is forwarded to
        ``cross_val_score`` as its ``cv`` argument, and the loss becomes the
        mean cross-validated R^2 computed on the training split only. In that
        mode ``model`` itself is not fitted on the full training split here.

    Returns
    -------
    dict
        ``{'loss': -R^2, 'status': STATUS_OK}``. The sign is flipped because
        hyperopt minimises the loss.

    NOTE(review): with ``cv=None`` the loss is measured on the test split, so
    hyperparameters chosen by minimising it have already seen that split, and
    any score later reported on it is optimistic. Pass ``cv`` (for example
    ``cv=5``) from the objectives for a selection that leaves the test split
    untouched.
    """
    if cv is None:
        model.fit(X_train, y_train)
        score = r2_score(y_test, model.predict(X_test))
    else:
        score = cross_val_score(model, X_train, y_train, cv=cv, scoring="r2").mean()
    return {'loss': -score, 'status': STATUS_OK}
def elastic_net_objective(params):
    """Hyperopt objective for a polynomial-features + ElasticNet pipeline.

    Reads ``params['degree']`` (cast to int), ``params['alpha']`` and
    ``params['l1_ratio']``, builds the pipeline and returns whatever
    ``evaluate_model`` returns for it.
    """
    poly_degree = int(params['degree'])
    steps = (
        PolynomialFeatures(degree=poly_degree),
        StandardScaler(),
        ElasticNet(alpha=params['alpha'], l1_ratio=params['l1_ratio']),
    )
    return evaluate_model(make_pipeline(*steps))
# Search space for the ElasticNet pipeline.
# NOTE: 'degree' is an hp.choice, so the dict returned by fmin holds the
# *index* into [1, 2, 3] rather than the degree itself.
elastic_net_space = dict(
    degree=hp.choice('degree', [1, 2, 3]),
    alpha=hp.uniform('alpha', 0.01, 1.0),
    l1_ratio=hp.uniform('l1_ratio', 0.1, 0.9),
)

# Run 50 TPE-guided evaluations; every trial is recorded in elastic_net_trials.
elastic_net_trials = Trials()
best_elastic_net = fmin(
    elastic_net_objective,
    elastic_net_space,
    trials=elastic_net_trials,
    max_evals=50,
    algo=tpe.suggest,
)
print("Best Elastic Net parameters:", best_elastic_net)

def svr_linear_objective(params):
    """Hyperopt objective for a standardised linear-kernel SVR.

    Reads ``params['C']`` and returns the ``evaluate_model`` result for the
    resulting scaler + SVR pipeline.
    """
    regressor = SVR(kernel='linear', C=params['C'])
    pipeline = make_pipeline(StandardScaler(), regressor) if False else None
    # Build the steps in the original order: scaler first, then the regressor.
    pipeline = make_pipeline(StandardScaler(), regressor)
    return evaluate_model(pipeline)

# Log-uniform prior over the SVR penalty C, spanning 0.01 to 100.
# The hyperopt label is 'svr_linear_C', which is the key fmin reports back.
svr_linear_space = dict(
    C=hp.loguniform('svr_linear_C', np.log(0.01), np.log(100)),
)

# Run 50 TPE-guided evaluations; every trial is recorded in svr_linear_trials.
svr_linear_trials = Trials()
best_svr_linear = fmin(
    svr_linear_objective,
    svr_linear_space,
    trials=svr_linear_trials,
    max_evals=50,
    algo=tpe.suggest,
)
print("Best SVR (Linear) parameters:", best_svr_linear)

def svr_rbf_objective(params):
    """Hyperopt objective for a standardised RBF-kernel SVR.

    Reads ``params['C']`` and ``params['gamma']`` and returns the
    ``evaluate_model`` result for the resulting scaler + SVR pipeline.
    """
    scaler = StandardScaler()
    regressor = SVR(kernel='rbf', C=params['C'], gamma=params['gamma'])
    return evaluate_model(make_pipeline(scaler, regressor))

# Log-uniform priors: C spans 0.01 to 100 and gamma spans 0.001 to 1.
# fmin reports them under the hyperopt labels 'svr_rbf_C' and 'gamma'.
svr_rbf_space = dict(
    C=hp.loguniform('svr_rbf_C', np.log(0.01), np.log(100)),
    gamma=hp.loguniform('gamma', np.log(0.001), np.log(1)),
)

# Run 50 TPE-guided evaluations; every trial is recorded in svr_rbf_trials.
svr_rbf_trials = Trials()
best_svr_rbf = fmin(
    svr_rbf_objective,
    svr_rbf_space,
    trials=svr_rbf_trials,
    max_evals=50,
    algo=tpe.suggest,
)
print("Best SVR (RBF) parameters:", best_svr_rbf)

def random_forest_objective(params):
    """Hyperopt objective for a random forest regressor.

    Reads ``params['n_estimators']`` and ``params['max_depth']`` (both cast
    to int), fixes ``random_state=1`` and returns the ``evaluate_model``
    result for the forest.
    """
    forest_kwargs = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'random_state': 1,  # fixed so every trial uses the same forest seed
    }
    return evaluate_model(RandomForestRegressor(**forest_kwargs))

# Quantised uniform priors wrapped in scope.int so the objective receives
# integers: n_estimators in steps of 10 over 50..300, max_depth in steps of 1
# over 5..30.
random_forest_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 300, 10)),
    max_depth=scope.int(hp.quniform('max_depth', 5, 30, 1)),
)

# Run 50 TPE-guided evaluations; every trial is recorded in random_forest_trials.
random_forest_trials = Trials()
best_random_forest = fmin(
    random_forest_objective,
    random_forest_space,
    trials=random_forest_trials,
    max_evals=50,
    algo=tpe.suggest,
)
print("Best Random Forest parameters:", best_random_forest)

def knn_objective(params):
    """Hyperopt objective for a standardised k-nearest-neighbours regressor.

    Reads ``params['n_neighbors']`` (cast to int) and returns the
    ``evaluate_model`` result for the resulting scaler + KNN pipeline.
    """
    neighbour_count = int(params['n_neighbors'])
    scaler = StandardScaler()
    regressor = KNeighborsRegressor(n_neighbors=neighbour_count)
    return evaluate_model(make_pipeline(scaler, regressor))

# Quantised uniform prior over the neighbour count (steps of 1 over 1..20),
# wrapped in scope.int so the objective receives an integer.
knn_space = dict(
    n_neighbors=scope.int(hp.quniform('n_neighbors', 1, 20, 1)),
)

# Run 50 TPE-guided evaluations; every trial is recorded in knn_trials.
knn_trials = Trials()
best_knn = fmin(
    knn_objective,
    knn_space,
    trials=knn_trials,
    max_evals=50,
    algo=tpe.suggest,
)
print("Best K-Nearest Neighbors parameters:", best_knn)


100%|██████████| 50/50 [00:01<00:00, 26.15trial/s, best loss: -0.8601345771552982]
Best Elastic Net parameters: {'alpha': 0.015100364958033616, 'degree': 2, 'l1_ratio': 0.6229427836002613}
100%|██████████| 50/50 [00:00<00:00, 125.97trial/s, best loss: -0.7906627995500516]
Best SVR (Linear) parameters: {'svr_linear_C': 12.714366611491277}
100%|██████████| 50/50 [00:00<00:00, 262.36trial/s, best loss: -0.8642187909145536]
Best SVR (RBF) parameters: {'gamma': 0.11848037385730707, 'svr_rbf_C': 48.06641678679899}
100%|██████████| 50/50 [00:04<00:00, 11.53trial/s, best loss: -0.8495557504512157]
Best Random Forest parameters: {'max_depth': 24.0, 'n_estimators': 80.0}
100%|██████████| 50/50 [00:00<00:00, 581.17trial/s, best loss: -0.8371175069143985]
Best K-Nearest Neighbors parameters: {'n_neighbors': 2.0}


In [6]:


def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and return its R^2 on the test data.

    Parameters
    ----------
    model
        Estimator or pipeline exposing ``fit`` and ``predict``.
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Data the fitted model is scored on.

    Returns
    -------
    The value of ``r2_score(y_test, model.predict(X_test))``.

    NOTE(review): this re-uses the name of the earlier one-argument
    ``evaluate_model`` that the hyperopt objectives call. Once this cell has
    run, re-running a search without first re-running the earlier definition
    raises ``TypeError`` because the objectives pass only ``model``.
    """
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))
    
# Rebuild every tuned model from the values fmin returned, then fit it on the
# training split and report its R^2 on the test split.

# fmin reports an hp.choice parameter as an index into its option list.
# space_eval maps the raw result back through the search space, so the real
# degree is recovered even if the list of choices is edited later.
elastic_net_params = space_eval(elastic_net_space, best_elastic_net)
best_elastic_net_model = make_pipeline(
    PolynomialFeatures(degree=int(elastic_net_params['degree'])),
    StandardScaler(),
    ElasticNet(alpha=elastic_net_params['alpha'], l1_ratio=elastic_net_params['l1_ratio'])
)

best_svr_linear_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='linear', C=best_svr_linear['svr_linear_C'])
)

best_svr_rbf_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=best_svr_rbf['svr_rbf_C'], gamma=best_svr_rbf['gamma'])
)

# fmin returns the quantised values as floats, hence the int() casts.
best_random_forest_model = RandomForestRegressor(
    n_estimators=int(best_random_forest['n_estimators']),
    max_depth=int(best_random_forest['max_depth']),
    random_state=1
)

best_knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=int(best_knn['n_neighbors']))
)

# Display label -> tuned model, in reporting order.
final_models = {
    "Elastic Net": best_elastic_net_model,
    "SVR (Linear)": best_svr_linear_model,
    "SVR (RBF)": best_svr_rbf_model,
    "Random Forest": best_random_forest_model,
    "K-Nearest Neighbors": best_knn_model,
}

final_scores = {}
for label, tuned_model in final_models.items():
    final_scores[label] = evaluate_model(tuned_model, X_train, y_train, X_test, y_test)
    print(f"{label} Best R^2 Score: {final_scores[label]:.4f}")

# Keep the individual score names defined for any code that reads them.
elastic_net_score = final_scores["Elastic Net"]
svr_linear_score = final_scores["SVR (Linear)"]
svr_rbf_score = final_scores["SVR (RBF)"]
random_forest_score = final_scores["Random Forest"]
knn_score = final_scores["K-Nearest Neighbors"]

best_score = max(final_scores.values())
print(f"\nBest Model R^2 Score: {best_score:.4f}")


Elastic Net Best R^2 Score: 0.8601
SVR (Linear) Best R^2 Score: 0.7907
SVR (RBF) Best R^2 Score: 0.8642
Random Forest Best R^2 Score: 0.8496
K-Nearest Neighbors Best R^2 Score: 0.8371

Best Model R^2 Score: 0.8642
