In [1]:
# Environment setup: upgrade pip itself first, then install the project's
# dependencies listed in requirements.txt. Both commands pull from the public
# PyPI index and run quietly (-q). pip notes that a kernel restart may be
# needed before newly installed versions are picked up.
%pip install --upgrade pip --index-url https://pypi.org/simple -q
%pip install -r requirements.txt --index-url https://pypi.org/simple -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Log the versions of the core libraries so results can be tied to an environment.
for label, module in (
    ("NumPy version:", np),
    ("Pandas version:", pd),
    ("Scikit-learn version:", sklearn),
):
    print(label, module.__version__)

NumPy version: 1.24.3
Pandas version: 1.4.4
Scikit-learn version: 1.4.0


In [3]:
# Lift the column display limit so wide frames print every column, then load
# the curated input CSV and preview its first rows.
pd.options.display.max_columns = None
file_path = '../resources/curated_input.csv'
df = pd.read_csv(file_path)
print(df.head())

          CarName fueltype aspiration doornumber    carbody drivewheel  \
0  toyota corolla   diesel        std       four  hatchback        fwd   
1   toyota carina      gas        std       four      wagon        4wd   
2  toyota corolla      gas        std        two  hatchback        rwd   
3   toyota corona      gas        std        two  hatchback        rwd   
4     nissan otti      gas        std       four      sedan        fwd   

  enginelocation  wheelbase    color  carlength  carwidth  carheight  \
0          front       95.7   yellow      166.3      64.4       52.8   
1          front       95.7   purple      169.7      63.6       59.1   
2          front       98.4     navy      176.2      65.6       52.0   
3          front      102.9  fuchsia      183.5      67.7       52.0   
4          front      100.4   yellow      184.6      66.5       55.1   

   curbweight  cylindernumber  enginesize  compressionratio  horsepower  \
0        2275             4.0         110      

In [4]:
# Preprocessing: one-hot encode the categorical columns (dropping the first
# level of each), then separate the 'Price' target from the features and hold
# out 20% of the rows for testing.
df_encoded = pd.get_dummies(df, drop_first=True)
y = df_encoded['Price']
X = df_encoded.drop(columns='Price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space for the random forest: 3 * 4 * 3 * 3 = 108 candidate settings.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator handed to the grid search.
rf = RandomForestRegressor(random_state=42)

# Exhaustive 3-fold cross-validated search on all available cores, scored by
# negative mean squared error.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and keep the corresponding estimator.
best_rf = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

# Score the selected model on the held-out test split.
y_pred = best_rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE:", rmse)
print("R^2:", r2)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
RMSE: 1947.9731057682432
R^2: 0.9599802288317854


In [5]:
# Persist the tuned model to a timestamped pickle file.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# pickle.dump writes a binary pickle stream, not JSON, so the file carries a
# '.pkl' extension instead of the misleading '.json'.
file_path = f'../models/best_random_forest_model_{timestamp}.pkl'
# Create the output directory when it is missing; otherwise open() would raise
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as file:
    pickle.dump(best_rf, file)