In [2]:
# 03_model_building.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os

# Ensure plots appear inline
%matplotlib inline


# Load dataset

data_path = "../data/superstore_cleaned.csv"

if not os.path.exists(data_path):
    print(f" File not found at {data_path}")
else:
    print(f" File found: {data_path}")

df = pd.read_csv(data_path)
print("\n Dataset loaded successfully!")
print(f"Shape: {df.shape}")
display(df.head())

# Verify column names
print("\n Columns in dataset:")
print(df.columns)


# Encode categorical columns

cat_cols = df.select_dtypes(include=['object']).columns
print("\n Encoding categorical columns:", list(cat_cols))

encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

print("\n Encoding complete.")
display(df.head())


# Define features and target

X = df.drop(columns=["profit"])   # features
y = df["profit"]                  # target

print("\n Feature Matrix Shape:", X.shape)
print(" Target Shape:", y.shape)


# Split data into train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\n Data split complete.")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


# Feature scaling

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\n Feature scaling complete.")


# Model 1: Linear Regression

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print("\n Linear Regression Performance:")
print(f"MAE: {mae_lr:.2f}")
print(f"RMSE: {rmse_lr:.2f}")
print(f"R² Score: {r2_lr:.3f}")


# Model 2: Random Forest Regressor

rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("\n Random Forest Performance:")
print(f"MAE: {mae_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")
print(f"R² Score: {r2_rf:.3f}")


# Compare model performance

results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MAE": [mae_lr, mae_rf],
    "RMSE": [rmse_lr, rmse_rf],
    "R² Score": [r2_lr, r2_rf]
})

print("\n Model Comparison:")
display(results)


 File found: ../data/superstore_cleaned.csv

 Dataset loaded successfully!
Shape: (9994, 25)


Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,sub-category,product_name,sales,quantity,discount,profit,profit_margin,cost,year,month
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,0.16,220.0464,2016,November
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,0.3,512.358,2016,November
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,0.47,7.7486,2016,June
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,-0.4,1340.6085,2015,October
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,0.1125,19.8516,2015,October



 Columns in dataset:
Index(['row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode',
       'customer_id', 'customer_name', 'segment', 'country', 'city', 'state',
       'postal_code', 'region', 'product_id', 'category', 'sub-category',
       'product_name', 'sales', 'quantity', 'discount', 'profit',
       'profit_margin', 'cost', 'year', 'month'],
      dtype='object')

 Encoding categorical columns: ['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id', 'customer_name', 'segment', 'country', 'city', 'state', 'region', 'product_id', 'category', 'sub-category', 'product_name', 'month']

 Encoding complete.


Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,sub-category,product_name,sales,quantity,discount,profit,profit_margin,cost,year,month
0,1,2500,864,929,2,143,166,0,0,194,...,4,386,261.96,2,0.0,41.9136,0.16,220.0464,2016,9
1,2,2500,864,929,2,143,166,0,0,194,...,5,839,731.94,3,0.0,219.582,0.3,512.358,2016,9
2,3,2296,732,787,2,237,201,1,0,266,...,10,1433,14.62,2,0.0,6.8714,0.47,7.7486,2016,6
3,4,4372,519,568,3,705,687,0,0,153,...,16,366,957.5775,5,0.45,-383.031,-0.4,1340.6085,2015,10
4,5,4372,519,568,3,705,687,0,0,153,...,14,573,22.368,2,0.2,2.5164,0.1125,19.8516,2015,10



 Feature Matrix Shape: (9994, 24)
 Target Shape: (9994,)

 Data split complete.
Train shape: (7995, 24)
Test shape: (1999, 24)

 Feature scaling complete.

 Linear Regression Performance:
MAE: 0.00
RMSE: 0.00
R² Score: 1.000

 Random Forest Performance:
MAE: 9.78
RMSE: 174.05
R² Score: 0.375

 Model Comparison:


Unnamed: 0,Model,MAE,RMSE,R² Score
0,Linear Regression,5.161234e-13,8.206906e-13,1.0
1,Random Forest,9.78086,174.0547,0.375168
