I have another notebook where I examine data and explore various options.  
However, that takes a while to run and is full of extraneous information.
Here I distill everything down to a working example that I would like to submit...

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the competition training set from the Kaggle input directory.
file_path = '/kaggle/input/abalone/train.csv'
train_df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# One-hot encode 'Sex' (drop_first=True drops the first level, leaving Sex_I / Sex_M).
# The result is assigned back to train_df, so guard on the column's presence:
# re-running this cell after 'Sex' has already been replaced would otherwise
# raise a KeyError from get_dummies.
if 'Sex' in train_df.columns:
    train_df = pd.get_dummies(train_df, columns=['Sex'], drop_first=True)

# View the updated dataset
print(train_df.head())

   id  Length  Diameter  Height  Whole weight  Whole weight.1  Whole weight.2  \
0   0   0.550     0.430   0.150        0.7715          0.3285          0.1465   
1   1   0.630     0.490   0.145        1.1300          0.4580          0.2765   
2   2   0.160     0.110   0.025        0.0210          0.0055          0.0030   
3   3   0.595     0.475   0.150        0.9145          0.3755          0.2055   
4   4   0.555     0.425   0.130        0.7820          0.3695          0.1600   

   Shell weight  Rings  Sex_I  Sex_M  
0        0.2400     11  False  False  
1        0.3200     11  False  False  
2        0.0050      6   True  False  
3        0.2500     10  False   True  
4        0.1975      9   True  False  


In [4]:
# Keep only the modelling columns: four numeric predictors, the 'Rings'
# column, and the two one-hot 'Sex' indicators.
selected_features = [
    'Shell weight', 'Height', 'Diameter', 'Whole weight',
    'Rings',
    'Sex_I', 'Sex_M',
]
train_df_selected = train_df.loc[:, selected_features]

# Preview the reduced frame
print(train_df_selected.head())

   Shell weight  Height  Diameter  Whole weight  Rings  Sex_I  Sex_M
0        0.2400   0.150     0.430        0.7715     11  False  False
1        0.3200   0.145     0.490        1.1300     11  False  False
2        0.0050   0.025     0.110        0.0210      6   True  False
3        0.2500   0.150     0.475        0.9145     10  False   True
4        0.1975   0.130     0.425        0.7820      9   True  False


In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column ('Rings') from the predictors
y = train_df_selected['Rings']
X = train_df_selected.drop('Rings', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each partition
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (72492, 6)
Testing set size: (18123, 6)


Now prep the final test data in the same way

Note that because this solution uses random forests (and not linear regression), there is no value in using StandardScaler to scale the data: tree splits depend only on the ordering of feature values, not on their scale.

In [6]:
# Load the competition test set
submission = pd.read_csv(filepath_or_buffer='/kaggle/input/abalone/test.csv')

# Apply the same one-hot encoding of 'Sex' that was used for the training data
submission_df = submission.pipe(pd.get_dummies, columns=['Sex'], drop_first=True)

submission_df.head()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Sex_I,Sex_M
0,90615,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005,False,True
1,90616,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275,False,True
2,90617,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405,False,True
3,90618,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235,False,True
4,90619,0.415,0.325,0.11,0.358,0.1575,0.067,0.105,True,False


In [7]:
# Take the feature list straight from the training matrix so the test frame
# always has exactly the columns (and column order) the model is fitted on,
# instead of maintaining a second hand-written copy of the list.
test_selected_features = X_train.columns.tolist()

submission_selected = submission_df[test_selected_features]

submission_selected.head()

Unnamed: 0,Shell weight,Height,Diameter,Whole weight,Sex_I,Sex_M
0,0.3005,0.155,0.475,1.238,False,True
1,0.275,0.16,0.46,0.983,False,True
2,0.2405,0.14,0.42,0.8395,False,True
3,0.235,0.145,0.49,0.874,False,True
4,0.105,0.11,0.325,0.358,True,False


Build the random forest on the training data
The max depth and number of estimators are taken from a RandomizedSearchCV, which is not repeated here because it takes a while to run

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the model.
# random_state pins the forest's internal randomness (e.g. bootstrap sampling)
# so the metrics below and the resulting predictions are reproducible across
# runs; it uses the same seed as the train/test split.
rf_model = RandomForestRegressor(max_depth=16, n_estimators=297, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)  # No scaling needed for tree-based models

# Make predictions on the held-out split
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
print("Random Forest Metrics:")
print(f"R^2 Score: {r2_score(y_test, y_pred_rf):.3f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_rf):.3f}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mean_squared_error(y_test, y_pred_rf)):.3f}")

Random Forest Metrics:
R^2 Score: 0.590
Mean Absolute Error (MAE): 1.394
Root Mean Squared Error (RMSE): 2.053


Use the model created to predict final results

In [9]:
# Score the prepared competition test features with the fitted forest
submission_pred = rf_model.predict(X=submission_selected)

Combine output with ids in the proper format

In [10]:
# Pair each prediction with its id. The predictions follow the row order of
# submission_df, so give them submission_df's index explicitly:
# pd.concat(axis=1) aligns on index labels and would otherwise introduce NaN
# rows if that index were ever not the default 0..n-1 range.
id_df = submission_df['id']
pred_df = pd.DataFrame(submission_pred, columns=['Rings'], index=id_df.index)

final_df = pd.concat([id_df, pred_df], axis=1)

# Sanity checks: exactly one row per test record, and every row has a prediction
assert len(final_df) == len(submission_df)
assert final_df['Rings'].notna().all()

final_df.head()

Unnamed: 0,id,Rings
0,90615,10.021296
1,90616,10.552638
2,90617,10.08676
3,90618,10.943734
4,90619,7.690498


In [11]:
# Write the submission file, leaving out the DataFrame index column
final_df.to_csv(path_or_buf='abalone-submission-01.csv', index=False)