In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Evaluate a random forest regressor with repeated k-fold cross-validation,
# report per-fold and aggregate RMSE/MAE, then refit on ALL training data and
# save the predictions for the test set to 'yPred.npy'.

# load the training and test data
X_train = np.load('xTrain.npy') # training features
X_test = np.load('xTest.npy') # test features
y_train = np.load('yTrain.npy') # target variable for training data

# fail early if features and targets are misaligned: the fold indices below are
# derived from X_train and are also used to index y_train
if X_train.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"xTrain.npy has {X_train.shape[0]} rows but yTrain.npy has {y_train.shape[0]}"
    )

# define the number of folds for cross validation
n_splits = 10 # folds per repeat
n_repeats = 10 # repeats of cross-validation procedure

# define the model
model = RandomForestRegressor(n_estimators=100, random_state=42) # random forest regression model with 100 trees and fixed random state

# initialize the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

# initialize lists to store the RMSE and MAE for each fold of each repeat
rmse_list = [] # root mean squared error
mae_list = [] # mean absolute error

# initialize an array to store the out-of-fold predictions.
# The buffer is explicitly float: np.zeros_like(y_train) would inherit the dtype
# of y_train, so integer-typed targets would truncate the float predictions.
# Entries are overwritten on every repeat, so after the loop this holds the
# out-of-fold predictions written during the final repeat.
y_pred = np.zeros(y_train.shape, dtype=float)

# iterate over the folds (n_splits * n_repeats iterations in total)
for i, (train_index, val_index) in enumerate(rkf.split(X_train)):
    # get the training and validation data for this fold
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index] # split the feature matrix
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index] # split the target variable

    # fit the model on the training data for this fold
    model.fit(X_train_fold, y_train_fold)

    # make predictions on the validation data for this fold
    y_val_pred = model.predict(X_val_fold)

    # store the predictions for this fold in the overall predictions array
    y_pred[val_index] = y_val_pred

    # calculate the RMSE and MAE for this fold
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
    mae = mean_absolute_error(y_val_fold, y_val_pred)

    # add the RMSE and MAE to the lists
    rmse_list.append(rmse)
    mae_list.append(mae)

    # print the RMSE and MAE for this fold
    print(f"Repeat {i//n_splits+1}, Fold {i%n_splits+1} - RMSE: {rmse:.2f}, MAE: {mae:.2f}")

# calculate the mean and standard deviation of the RMSE and MAE across all folds and repeats
mean_rmse = np.mean(rmse_list)
std_rmse = np.std(rmse_list)
mean_mae = np.mean(mae_list)
std_mae = np.std(mae_list)

# print the mean and standard deviation of the RMSE and MAE
print(f"\nMean RMSE: {mean_rmse:.2f} (std: {std_rmse:.2f})")
print(f"Mean MAE: {mean_mae:.2f} (std: {std_mae:.2f})")

# At this point `model` only holds the fit from the last cross-validation
# iteration, i.e. it was trained on the training part of one fold rather than
# on every training row. Refit on the full training set so the saved test
# predictions come from a model that has seen all available training data.
model.fit(X_train, y_train)

# make predictions on the test data using the model trained on all training data
y_test_pred = model.predict(X_test)
print(y_test_pred)
np.save('yPred.npy', y_test_pred)

Repeat 1, Fold 1 - RMSE: 15.22, MAE: 10.91
Repeat 1, Fold 2 - RMSE: 16.56, MAE: 14.12
Repeat 1, Fold 3 - RMSE: 18.31, MAE: 14.10
Repeat 1, Fold 4 - RMSE: 16.08, MAE: 12.51
Repeat 1, Fold 5 - RMSE: 15.20, MAE: 12.47
Repeat 1, Fold 6 - RMSE: 24.13, MAE: 18.32
Repeat 1, Fold 7 - RMSE: 19.59, MAE: 16.17
Repeat 1, Fold 8 - RMSE: 18.93, MAE: 15.66
Repeat 1, Fold 9 - RMSE: 19.43, MAE: 16.15
Repeat 1, Fold 10 - RMSE: 15.80, MAE: 12.27
Repeat 2, Fold 1 - RMSE: 19.54, MAE: 15.38
Repeat 2, Fold 2 - RMSE: 17.95, MAE: 13.98
Repeat 2, Fold 3 - RMSE: 17.52, MAE: 14.58
Repeat 2, Fold 4 - RMSE: 20.55, MAE: 17.62
Repeat 2, Fold 5 - RMSE: 17.51, MAE: 13.75
Repeat 2, Fold 6 - RMSE: 15.64, MAE: 10.95
Repeat 2, Fold 7 - RMSE: 22.18, MAE: 15.87
Repeat 2, Fold 8 - RMSE: 16.30, MAE: 13.42
Repeat 2, Fold 9 - RMSE: 18.58, MAE: 15.09
Repeat 2, Fold 10 - RMSE: 16.82, MAE: 13.20
Repeat 3, Fold 1 - RMSE: 15.82, MAE: 13.42
Repeat 3, Fold 2 - RMSE: 21.90, MAE: 17.85
Repeat 3, Fold 3 - RMSE: 15.62, MAE: 12.56
Repeat 3,

NameError: name 'save' is not defined

In [2]:
# write the test-set predictions (y_test_pred, produced by the previous cell) to disk
np.save(file='yPred.npy', arr=y_test_pred)