# MODELS

### Baseline Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Restrict both feature splits to their numeric columns
X_train, X_test = (
    split.select_dtypes(include=[np.number]) for split in (X_train, X_test)
)

# Fit the baseline model; fit() hands back the fitted estimator
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the test split
y_pred_lr = lr_model.predict(X_test)

# Score the predictions against the test targets
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)

print(
    f'Linear Regression Mean Squared Error: {mse_lr}',
    f'Linear Regression R^2 Score: {r2_lr}',
    sep='\n',
)


Linear Regression Mean Squared Error: 0.6305729013637178
Linear Regression R^2 Score: 0.36891234524741373


### Preliminary Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and fit it on the training split
# (fit() returns the fitted estimator)
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the test split
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the test targets
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print(
    f'Random Forest Mean Squared Error: {mse_rf}',
    f'Random Forest R^2 Score: {r2_rf}',
    sep='\n',
)


Random Forest Mean Squared Error: 0.6428914496705865
Random Forest R^2 Score: 0.35658374098274354


### Preliminary Neural Network model

In [None]:
from sklearn.neural_network import MLPRegressor

# One hidden layer of 100 units, up to 1000 training iterations, fixed seed;
# fit() returns the fitted estimator
nn_model = MLPRegressor(
    random_state=42,
    max_iter=1000,
    hidden_layer_sizes=(100,),
).fit(X_train, y_train)

# Predict on the test split
y_pred_nn = nn_model.predict(X_test)

# Score the predictions against the test targets
r2_nn = r2_score(y_test, y_pred_nn)
mse_nn = mean_squared_error(y_test, y_pred_nn)

print(
    f'Neural Network Mean Squared Error: {mse_nn}',
    f'Neural Network R^2 Score: {r2_nn}',
    sep='\n',
)

Neural Network Mean Squared Error: 0.9253635095972481
Neural Network R^2 Score: 0.07388109161940704


### Random Forest (Bayesian)

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameter ranges explored by the Bayesian search
rf_param_space = dict(
    n_estimators=Integer(100, 1000),
    max_features=Categorical(['sqrt', 'log2']),
    max_depth=Integer(10, 50),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 10),
    bootstrap=Categorical([True, False]),
)

# Seeded base estimator handed to the search
rf_model = RandomForestRegressor(random_state=42)

# 50 search iterations, 3-fold cross-validation, all available cores
rf_bayes_search = BayesSearchCV(
    rf_model,
    search_spaces=rf_param_space,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search on the training split
rf_bayes_search.fit(X_train, y_train)

# Report the winning configuration
best_rf_params = rf_bayes_search.best_params_
print("Best Parameters for Random Forest:", best_rf_params)

# Predict on the test split with the best estimator found
best_rf_model = rf_bayes_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Score the predictions against the test targets
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print(
    f"Optimized Random Forest Mean Squared Error: {mse_rf}",
    f"Optimized Random Forest R^2 Score: {r2_rf}",
    sep='\n',
)



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters for Random Forest: OrderedDict([('bootstrap', False), ('max_depth', 40), ('max_features', 'sqrt'), ('min_samples_leaf', 3), ('min_samples_split', 9), ('n_estimators', 1000)])
Optimized Random Forest Mean Squared Error: 0.6393886305734517
Optimized Random Forest R^2 Score: 0.36008941952403933


### Neural Network (Hyperparameter optimization)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

# Ensure y_train and y_test are one-dimensional
# Objects exposing a .values attribute (e.g. pandas containers) are replaced
# by that attribute; anything else is left as it is.
y_train = getattr(y_train, 'values', y_train)
y_test = getattr(y_test, 'values', y_test)

# A target with exactly two columns keeps only its first column
if y_train.ndim >= 2 and y_train.shape[1] == 2:
    y_train = y_train[:, 0]
if y_test.ndim >= 2 and y_test.shape[1] == 2:
    y_test = y_test[:, 0]

# Define the function to create the Keras model
def create_model(params):
    """Assemble and compile a feed-forward Keras regressor from a parameter dict.

    Reads 'units1', 'units2', 'activation' and 'optimizer' from ``params``.
    The input width is taken from ``X_train.shape[1]``. A second hidden layer
    is added only when ``params['units2']`` is greater than zero. The network
    ends in a single-unit Dense layer and is compiled with mean-squared-error
    loss.

    Returns the compiled (untrained) model.
    """
    hidden_activation = params['activation']

    layers = [
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        Dense(int(params['units1']), activation=hidden_activation),
    ]
    # units2 == 0 means "no second hidden layer"
    if params['units2'] > 0:
        layers.append(Dense(int(params['units2']), activation=hidden_activation))
    layers.append(Dense(1))

    model = Sequential(layers)
    model.compile(loss='mean_squared_error', optimizer=params['optimizer'])
    return model

# Define the objective function
def objective(params):
    """Hyperopt objective: train one candidate network and score it on held-out data.

    Parameters
    ----------
    params : dict
        Candidate hyperparameters, passed straight to ``create_model``.

    Returns
    -------
    dict
        ``'loss'`` (validation mean squared error, the value hyperopt
        minimises), ``'status'`` (``STATUS_OK``) and ``'r2'`` (validation R^2).
    """
    # Score candidates on a validation fold carved out of the training data.
    # Scoring them on X_test would leak the test set into model selection and
    # make the final test metrics optimistic. The fixed seed gives every trial
    # the same split, so their losses are comparable.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model = create_model(params)
    model.fit(X_fit, y_fit, epochs=50, batch_size=32, verbose=0)

    # verbose=0 keeps the per-trial progress bars out of the notebook output
    pred = model.predict(X_val, verbose=0)

    # The single-unit output layer yields shape (n_samples, 1); flatten it so
    # it lines up with a 1-D target.
    if pred.shape != y_val.shape:
        pred = pred.ravel()

    mse = mean_squared_error(y_val, pred)
    r2 = r2_score(y_val, pred)
    return {'loss': mse, 'status': STATUS_OK, 'r2': r2}

# Define the parameter space
space = {
    # Width of the first hidden layer: multiples of 32 between 32 and 512
    'units1': hp.quniform('units1', 32, 512, 32),
    # Width of the second hidden layer; 0 makes create_model skip that layer
    'units2': hp.quniform('units2', 0, 512, 32),
    'activation': hp.choice('activation', ['relu', 'tanh']),
    'optimizer': hp.choice('optimizer', ['adam', 'sgd'])
}

# Optimize the hyperparameters
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            # Seed hyperopt's sampler, in line with the random_state=42 used by
            # the other models in this file. Keras training itself is still
            # unseeded, so trial losses can vary between runs.
            # NOTE(review): recent hyperopt releases expect a numpy Generator
            # here; older ones expect np.random.RandomState -- confirm against
            # the installed version.
            rstate=np.random.default_rng(42))

print('Best parameters:', best)

# Create the model with the best hyperparameters.
# The hp.choice entries in `best` are used as indices into their option
# lists, so they are mapped back to the actual values here; the lists must
# stay in the same order as in the search space.
best_params = {
    'units1': int(best['units1']),
    'units2': int(best['units2']),
    'activation': ['relu', 'tanh'][best['activation']],
    'optimizer': ['adam', 'sgd'][best['optimizer']]
}

# Retrain the winning configuration on the training data
best_model = create_model(best_params)
best_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Evaluate the best model on the test set
y_pred_nn = best_model.predict(X_test)

# The single-unit output layer yields shape (n_samples, 1). Flatten it so the
# predictions line up with a 1-D y_test; reshape(-1, 1) would leave an
# (n_samples, 1) array unchanged.
if y_pred_nn.shape != y_test.shape:
    y_pred_nn = y_pred_nn.ravel()

mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Optimized Neural Network Mean Squared Error: {mse_nn}")
print(f"Optimized Neural Network R^2 Score: {r2_nn}")



[1m 1/39[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 160ms/step
[1m38/39[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step  
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Shape mismatch: y_test (1226,), pred (1226, 1)
[1m 1/39[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 164ms/step
[1m35/39[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 1ms/step  
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Shape mismatch: y_test (1226,), pred (1226, 1)
[1m 1/39[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 163ms/step
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Shape mismatch: y_test (1226,), pred (1226, 1)
[1m 1/39[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 246ms/step
[