# Import necessary Libraries

In [34]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the dataset and checking for missing data

In [35]:
dataset = pd.read_csv("/content/train.csv")
dataset.isna().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

# Separating features (x) and target variable (y)

In [36]:
x = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values
feature_names = dataset.columns[1:-1].tolist()

# Apply One-Hot Encoding to categorical features

In [37]:
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])], remainder="passthrough")
x = np.array(ct.fit_transform(x))

# Split the dataset into training and testing sets

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Define the parameter grid for GridSearchCV

In [39]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

# Creating the Random Forest Regressor

In [40]:
regressor = RandomForestRegressor()

# Perform Grid Search to find the best hyperparameters (tunning)

In [41]:
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring="r2", cv=5)
grid_search.fit(x_train, y_train)

# Get the best parameters and estimator from the Grid Search

In [42]:
best_params = grid_search.best_params_
best_regressor = grid_search.best_estimator_

print("Best hyperparameters:", best_params)

Best hyperparameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}


# Fit the best regressor on the training data

In [43]:
best_regressor.fit(x_train, y_train)

# Make predictions on the test set and print performance metrics

In [44]:
y_pred = best_regressor.predict(x_test)
r2_test = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R-squared on test dataset:", r2_test)
print("Mean Absolute Error (MAE) on test dataset:", mae)
print("Mean Squared Error (MSE) on test dataset:", mse)

R-squared on test dataset: 0.5780298841104891
Mean Absolute Error (MAE) on test dataset: 1.4039968047882272
Mean Squared Error (MSE) on test dataset: 4.254375052529738


# Feature Selection using the backward elimination method. The selected features are displayed at the end

In [45]:
selected_features = list(range(x_train.shape[1]))
initial_r2 = r2_test

for i in selected_features:
    features_to_use = [feature for feature in selected_features if feature != i]

    if len(features_to_use) > 0:
        x_subset = x_train[:, features_to_use]
        best_regressor.fit(x_subset, y_train)
        r2_subset = r2_score(y_train, best_regressor.predict(x_subset))

        if r2_subset > initial_r2:
            print(f"Removing feature in position {i} - R-squared improved to {r2_subset:.4f}")
            initial_r2 = r2_subset
            selected_features = features_to_use
        else:
            print(f"Keeping feature in position {i} - R-squared: {r2_subset:.4f}")
    else:
        print(f"All features removed - Terminating Process")
        break

print("Selected Features:")
for feature_index in selected_features:
    if feature_index < len(feature_names):
        print(feature_names[feature_index])
    else:
        print("Invalid Index")

Removing feature in position 0 - R-squared improved to 0.6439
Keeping feature in position 1 - R-squared: 0.6393
Keeping feature in position 2 - R-squared: 0.6424
Keeping feature in position 3 - R-squared: 0.6424
Keeping feature in position 4 - R-squared: 0.6425
Keeping feature in position 5 - R-squared: 0.6401
Keeping feature in position 6 - R-squared: 0.6268
Keeping feature in position 7 - R-squared: 0.5881
Keeping feature in position 8 - R-squared: 0.6402
Keeping feature in position 9 - R-squared: 0.6227
Selected Features:
Length
Diameter
Height
Weight
Shucked Weight
Viscera Weight
Shell Weight
Invalid Index
Invalid Index


# Fit the regressor using selected features

In [46]:
x_train_selected = x_train[:, selected_features]
best_regressor.fit(x_train_selected, y_train)

# Make predictions on the test set using selected features and evaluate performance on test set after backward elimination

In [47]:
x_test_selected = x_test[:, selected_features]
y_pred_after_backward_elimination = best_regressor.predict(x_test_selected)
r2_test_after_backward_elimination = r2_score(y_test, y_pred_after_backward_elimination)
mae_after_backward_elimination = mean_absolute_error(y_test, y_pred_after_backward_elimination)
mse_after_backward_elimination = mean_squared_error(y_test, y_pred_after_backward_elimination)
print("R-squared after backward elimination:", r2_test_after_backward_elimination)
print("Mean Absolute Error (MAE) after backward elimination:", mae_after_backward_elimination)
print("Mean Squared Error (MSE) after backward elimination:", mse_after_backward_elimination)

R-squared after backward elimination: 0.5774849765355581
Mean Absolute Error (MAE) after backward elimination: 1.4051440823312362
Mean Squared Error (MSE) after backward elimination: 4.259868904121181


# Make predictions on real data, combine real data with predictions and save to a CSV file, and read the final data with predictions from the saved CSV file

In [49]:
realdata = pd.read_csv("/content/test.csv")
x_realdata = np.array(ct.transform(realdata.iloc[:, 1:]))

x_realdata_selected = x_realdata[:, selected_features]
y_pred_realdata = best_regressor.predict(x_realdata_selected)

realdata_with_predictions = pd.DataFrame(x_realdata, columns=ct.get_feature_names_out(input_features=realdata.columns[1:]))
realdata_with_predictions["Predictions"] = y_pred_realdata

realdata_with_predictions.to_csv("realdata_with_predictions.csv", index=False)

final = pd.read_csv("/content/realdata_with_predictions.csv")
final



Unnamed: 0,encoder__Sex_F,encoder__Sex_I,encoder__Sex_M,remainder__Length,remainder__Diameter,remainder__Height,remainder__Weight,remainder__Shucked Weight,remainder__Viscera Weight,remainder__Shell Weight,Predictions
0,0.0,1.0,0.0,1.0500,0.7625,0.2750,8.618248,3.657085,1.729319,2.721552,7.554529
1,0.0,1.0,0.0,1.1625,0.8875,0.2750,15.507176,7.030676,3.246018,3.968930,7.794604
2,1.0,0.0,0.0,1.2875,0.9875,0.3250,14.571643,5.556502,3.883882,4.819415,10.863830
3,1.0,0.0,0.0,1.5500,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,9.848372
4,0.0,1.0,0.0,1.1125,0.8500,0.2625,11.765042,5.528153,2.466407,3.331066,7.460038
...,...,...,...,...,...,...,...,...,...,...,...
49363,1.0,0.0,0.0,1.3000,1.0375,0.3250,16.315137,6.690482,5.173784,3.756309,9.012051
49364,0.0,1.0,0.0,1.0375,0.7625,0.2625,10.276694,4.436697,1.998640,3.543687,8.437057
49365,1.0,0.0,0.0,1.4875,1.1625,0.3625,31.382897,11.396499,6.846404,8.788345,13.391495
49366,1.0,0.0,0.0,1.2375,0.9500,0.2875,15.663099,6.095142,3.727959,4.961163,10.007768
