# Random Forest Regressor Pipeline

This notebook demonstrates the use of a Random Forest Regressor on a dataset. We will preprocess the data, train the model, tune hyperparameters, and evaluate the results.

In [14]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Suppress warnings
import warnings 
warnings.filterwarnings("ignore")    

### 1. Load the Dataset

In [15]:
# Load the dataset
df = pd.read_excel('sample_data.xlsx')
df.head(2)
# Assume the target variable in the Excel file is named "Target"
# Other variable names are not important here.    

FileNotFoundError: [Errno 2] No such file or directory: 'sample_data.xlsx'

### 2. Separate Features and Target

In [3]:
# Separate features and target variable from the dataset
X = df.drop(columns=["Target"])
y = df["Target"]

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns    

### 3. Create Transformers for Data Preprocessing

In [4]:
# Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)) ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])    

### 4. Create a Pipeline with a RandomForest Regressor

In [5]:
# Create a pipeline with a RandomForest Regressor
base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])    

### 5. Perform a Train-Test Split

In [6]:
# Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)    

### 6. Train and Evaluate the Base Model

In [7]:
# Train and evaluate the Base Model
base_pipeline.fit(X_train, y_train)

# Predictions and evaluation metrics
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)

def evaluate_model(y_true, y_pred, data_type="Train"):
    print(f"Evaluation metrics for {data_type} data:")
    print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred)}")
    print(f"R2 Score: {r2_score(y_true, y_pred)}")
    print("\n")

evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")    

Evaluation metrics for Train data:
Mean Squared Error: 0.492054196165078
R2 Score: 0.9813877089534043


Evaluation metrics for Test data:
Mean Squared Error: 1.7697670953627183
R2 Score: 0.9231724684996003




### 7. Hyperparameter Tuning

In [8]:
# Hyperparameter Tuning
param_grid = {
    'regressor__n_estimators': [50, 64, 100, 128, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")    

Best Hyperparameters: {'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 128}


### 8. Build and Evaluate the Best Model

In [9]:

# Build and evaluate the Best Model
best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred, y_train, "Train (Best Model)")
evaluate_model(y_test_best_pred, y_test, "Test (Best Model)")    

Evaluation metrics for Train (Best Model) data:
Mean Squared Error: 0.7144567621407447
R2 Score: 0.9679005255194238


Evaluation metrics for Test (Best Model) data:
Mean Squared Error: 2.4235434189003717
R2 Score: 0.8653131960603034




### 9. Train the Final Model on the Entire Dataset

In [10]:
# Train the Final Model on the entire dataset
final_model = best_pipeline.fit(X, y)    

### 10. Save the Final Model

In [11]:
# Save the Final Model
joblib.dump(final_model, 'final_model.pkl')

print("Final model saved as 'final_model.pkl'")    

Final model saved as 'final_model.pkl'


In [12]:
# Done