In [31]:
import pandas as pd
import numpy as np
import Functions as f
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [32]:
data = pd.read_csv('data/GPUbenchmark.csv').to_numpy()
random_state = np.random.seed(1)
x_train, x_val, y_train, y_val = train_test_split(data[:,1:], data[:,0], test_size=0.2, random_state=random_state)

In [33]:
num_of_features = x_train.shape[1]

models = f.feature_selection(x_train, y_train, num_of_features, LinearRegression)

mse_values = []
features_in_order = []
val_models = []
x_val_base = np.ones((x_val.shape[0], 1))
for i in range(len(models) - 1):
    # I am skipping the first model since it's a model with no features
    model = models[i + 1]
    numpy_model = np.array(model[0])
    feature_indexes = model[1]
    feature_index = feature_indexes[-1]
    x_val_base_with_new_col = np.hstack((x_val_base, x_val[:, feature_index].reshape((-1, 1))))
    x_val_base = x_val_base_with_new_col
    X_val_subset = x_val[:, feature_indexes]
    linreg = LinearRegression()
    linreg.fit(numpy_model, y_train)
    y_pred_val = linreg.predict(x_val_base)
    val_mse = ((y_pred_val - y_val) ** 2).mean()
    mse_values.append(val_mse)
    features_in_order.append(feature_index)
    val_models.append(x_val_base)

print("The smallest MSE value calculated from the validation set is: ", min(mse_values))
print("which corresponds to the model: Model", mse_values.index(min(mse_values)), "with features", features_in_order[:mse_values.index(min(mse_values))])
print("The features in order of importance are: ", features_in_order, "where the first feature is the most important one.")

The smallest MSE value calculated from the validation set is:  24400.669044911807
which corresponds to the model: Model 4 with features [4, 1, 5, 0]
The features in order of importance are:  [4, 1, 5, 0, 3, 2] where the first feature is the most important one.
