In [12]:
# ✅ Import Libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ✅ Load Dataset
# Read the raw dataset from disk (adjust the path if the CSV lives elsewhere)
file_path = '../data/raw_data.csv'
df = pd.read_csv(file_path)

# ✅ Data Cleaning
# Trim surrounding whitespace from the 'Traffic' labels so that values differing
# only by padding collapse into a single category.
traffic_labels = df['Traffic'].str.strip()

# Replace each distinct 'Traffic' label with an integer code; the fitted encoder
# is kept in `encoder` so the label <-> code mapping stays available.
encoder = LabelEncoder()
df['Traffic'] = encoder.fit_transform(traffic_labels)

# ✅ Prepare Data
# Predictor columns (independent variables) fed to the model
feature_columns = ['Agent_Age', 'Agent_Rating', 'Traffic']
X = df[feature_columns]
# Response column (dependent variable) the model learns to predict
y = df['Delivery_Time']

# ✅ Split Data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ✅ Train Model
# Build a random forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so the fitted model is bound directly)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# ✅ Predict
# Score the held-out rows with the fitted forest
y_pred = model.predict(X_test)

# ✅ Evaluate Model
# Mean Squared Error (MSE): average squared gap between actual and predicted values
mse = mean_squared_error(y_test, y_pred)
# R-Squared: share of the target's variance explained by the predictions
r2 = r2_score(y_test, y_pred)

# ✅ Display Evaluation Metrics
print(f'Mean Squared Error: {mse:.2f}')
print(f'R2 Score: {r2:.2f}')

# ✅ Final Predictions
# The forest fitted earlier in this script already uses random_state=42, so
# building and fitting a second estimator with the same seed on the same
# training data would reproduce the very same model at double the training
# cost. Reuse the fitted `model` instead of re-training it.
predictions = model.predict(X_test)

# ✅ Calculate and Display Final Metrics
# RMSE is the square root of the MSE, which puts the error back into the same
# units as the target column.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
final_r2 = r2_score(y_test, predictions)

print('Root Mean Squared Error (RMSE):', round(rmse, 2))
print('Final R2 Score:', round(final_r2, 2))

# ✅ Save Model and Processed Data
model_path = '../models/rf_model.pkl'
processed_path = '../output/processed_data.csv'

# Create the destination folders up front: neither joblib.dump nor
# DataFrame.to_csv creates missing parent directories, and failing here would
# throw away the model that was just trained.
for target_path in (model_path, processed_path):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

# Save the trained model using joblib
# NOTE(review): the fitted `encoder` for the 'Traffic' column is not persisted
# anywhere in this script; anything that loads rf_model.pkl needs the same
# label -> integer mapping to build its inputs. Consider saving it as well.
joblib.dump(model, model_path)

# Save the processed dataset ('Traffic' already integer-encoded) to a CSV file
df.to_csv(processed_path, index=False)

print("✅ Model saved as 'rf_model.pkl'")
print("✅ Processed data saved as 'processed_data.csv'")


Mean Squared Error: 1856.26
R2 Score: 0.30
Root Mean Squared Error (RMSE): 43.08
Final R2 Score: 0.3
✅ Model saved as 'rf_model.pkl'
✅ Processed data saved as 'processed_data.csv'
