# 1. Import Necessary Libraries

In [5]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data_preprocessing import preprocess_data
from src.model_training import train_model, evaluate_model, save_model
from src.model_inference import load_preprocessing_pipeline, load_model, preprocess_new_entry, predict_units

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2. Load the Data

In [6]:
# Product whose sales history is analysed; edit picked_id to load a different product's file.
picked_id = 'CHO-MIL-31000'

# The CSV is looked up as ./datasets/<picked_id>-Sales.csv, relative to the working directory.
sales_csv_path = f'./datasets/{picked_id}-Sales.csv'
df = pd.read_csv(sales_csv_path)

# Preview the first rows of the raw frame
print("Raw Data:")
display(df.head())

Raw Data:


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Country/Region,City,State/Province,Postal Code,Division,Region,Product ID,Product Name,Sales,Units,Gross Profit,Cost
0,1133,US-2021-138100-CHO-MIL-31000,2021-09-15,2027-03-13,Standard Class,138100,United States,New York City,New York,10011,Chocolate,Atlantic,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,3,6.33,3.42
1,3396,US-2022-121391-CHO-MIL-31000,2022-10-04,2028-03-29,First Class,121391,United States,San Francisco,California,94109,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,6.5,2,4.22,2.28
2,4377,US-2023-103982-CHO-MIL-31000,2023-03-03,2028-08-28,Standard Class,103982,United States,Round Rock,Texas,78664,Chocolate,Interior,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,3,6.33,3.42
3,387,US-2021-158064-CHO-MIL-31000,2021-04-21,2026-10-16,Standard Class,158064,United States,Los Angeles,California,90008,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,16.25,5,10.55,5.7
4,1397,US-2021-130729-CHO-MIL-31000,2021-10-24,2027-04-21,Standard Class,130729,United States,Rancho Cucamonga,California,91730,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,3,6.33,3.42


# 3. Preprocess the Data

In [7]:
# Turn the raw sales frame into model inputs. The call returns the feature
# frame X, the target y, and the preprocessing pipeline object, which is
# pickled later in this notebook and reused when scoring new data.
# pipeline=None with fit_pipeline=True presumably builds and fits a fresh
# pipeline here -- confirm in src.data_preprocessing.
# NOTE(review): the pipeline is fitted on the full `df`, before the
# train/CV/test split, so anything it learns from the data also sees the
# held-out rows. Consider fitting on the training rows only.
X, y, preprocessing_pipeline = preprocess_data(df, pipeline=None, fit_pipeline=True)

# Preview the first rows of the preprocessed feature frame
print("\nPreprocessed Features:")
display(X.head())


Preprocessed Features:


Unnamed: 0,Ship Mode_First Class,Ship Mode_Same Day,Ship Mode_Second Class,Ship Mode_Standard Class,Country/Region_Canada,Country/Region_United States,Region_Atlantic,Region_Gulf,Region_Interior,Region_Pacific,...,State/Province_Freq,Sales,Gross Profit,Cost,Year,Month,Day,DayOfWeek,WeekOfYear,IsWeekend
0,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,1.531107,-0.429639,-0.551942,-0.672869,...,-0.377156,-0.377156,-1.499708,0.388299,-0.068705,-0.409438,0.370672,-0.679482,2.079488,0.551867
1,2.414017,-0.230712,-0.48209,-1.262318,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,-0.811415,-0.811415,-0.615848,0.687863,-1.334409,-0.878088,0.577457,-0.679482,0.678501,1.775479
2,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,1.811785,-0.672869,...,-0.377156,-0.377156,0.268012,-1.40909,-1.449473,0.527861,-1.559324,-0.679482,-0.833927,0.352675
3,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,0.49136,0.49136,-1.499708,-1.109525,0.62168,-0.409438,-1.076825,-0.679482,1.777002,1.775479
4,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,-0.377156,-0.377156,-1.499708,0.687863,0.966872,1.465161,0.715314,1.471709,-0.833927,1.775479


# 4. Split the Data into Training, Cross-Validation, and Testing Sets

In [8]:
# Stage 1: keep 60% of the rows for training and set the remaining 40% aside.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Stage 2: halve the held-out 40% into a CV partition and a test partition
# (roughly 20% of the full data each). The fixed random_state keeps both
# splits reproducible across runs.
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report the size of each partition, one per line.
print(
    f"\nTraining set shape: {X_train.shape}",
    f"CV set shape: {X_cv.shape}",
    f"Testing set shape: {X_test.shape}",
    sep="\n",
)


Training set shape: (1282, 21)
CV set shape: (427, 21)
Testing set shape: (428, 21)


# 5. Train the ML Model

In [9]:
# Fit the model on the training partition only; the CV and test partitions
# are not passed to training, so they remain available for evaluation.
model = train_model(X_train, y_train)

# 6. Evaluate the Model

In [10]:
# Score the model on the CV partition first (the partition reserved for
# model selection / hyperparameter tuning), then on the untouched test set.
# NOTE(review): the preprocessed features still include Sales, Gross Profit
# and Cost, which in the displayed sample rows scale linearly with Units
# (e.g. Sales of 9.75 for 3 units, 6.5 for 2). Near-perfect scores may
# reflect that rather than real predictive power -- confirm this is intended.
cv_metrics = evaluate_model(model, X_cv, y_cv)
print("\nCV Evaluation Metrics:")
for metric in cv_metrics:
    value = cv_metrics[metric]
    print(f"{metric}: {value:.4f}")

# Final, held-out estimate on the test partition
metrics = evaluate_model(model, X_test, y_test)

print("\nModel Evaluation Metrics:")
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value:.4f}")


CV Evaluation Metrics:
Mean Absolute Error (MAE): 0.0031
Root Mean Squared Error (RMSE): 0.0327
R² Score: 0.9998

Model Evaluation Metrics:
Mean Absolute Error (MAE): 0.0022
Root Mean Squared Error (RMSE): 0.0206
R² Score: 0.9999




# 7. Save the Preprocessing Pipeline and the Model

In [11]:
# Persist the fitted preprocessing pipeline and the trained model so that the
# prediction step can reload both from disk.
preprocessing_pipeline_path = './models/preprocessing_pipeline.pkl'
model_path = './models/model.pkl'

# Create the target directories up front: open(..., 'wb') raises
# FileNotFoundError when the parent directory is missing (for example on a
# fresh checkout where ./models has not been created yet).
for artifact_path in (preprocessing_pipeline_path, model_path):
    artifact_dir = os.path.dirname(artifact_path)
    if artifact_dir:  # a bare filename has no directory to create
        os.makedirs(artifact_dir, exist_ok=True)

# Save the preprocessing pipeline
with open(preprocessing_pipeline_path, 'wb') as f:
    pickle.dump(preprocessing_pipeline, f)

# Save the trained model
save_model(model, model_path)

print("\nPreprocessing pipeline and trained model have been saved.")


Preprocessing pipeline and trained model have been saved.


# 8. Make Predictions on New Data

In [12]:
# Example: score a batch of new orders.
# Point new_data_path at your own CSV file to predict on different data.
new_data_path = './datasets/new_data.csv'
new_df = pd.read_csv(new_data_path)

print("\nNew Data:")
display(new_df.head())

# Reload the preprocessing pipeline from disk and apply it to the new rows,
# so they go through the same pipeline object that was saved after fitting.
# NOTE(review): the pipeline file was written with pickle; only load pickle
# files from sources you trust, since unpickling can execute arbitrary code.
preprocessing_pipeline_loaded = load_preprocessing_pipeline(preprocessing_pipeline_path)
X_new_preprocessed = preprocess_new_entry(new_df, preprocessing_pipeline_loaded)

print("\nPreprocessed New Data:")
display(X_new_preprocessed.head())

# Reload the persisted model and predict 'Units' for every new row
model_loaded = load_model(model_path)
predictions = predict_units(model_loaded, X_new_preprocessed)

# Attach the predictions to the new data as an extra column
new_df['Predicted Units'] = predictions

print("\nNew Data with Predictions:")
prediction_preview = new_df[['Order ID', 'Predicted Units']].head()
display(prediction_preview)

# Optionally, write the new data together with its predictions to CSV
output_path = 'models/predictions.csv'
new_df.to_csv(output_path, index=False)
print(f"\nPredictions have been saved to '{output_path}'.")


New Data:


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Country/Region,City,State/Province,Postal Code,Division,Region,Product ID,Product Name,Sales,Gross Profit,Cost
0,1133,US-2021-138100-CHO-MIL-31000,2021-09-15,2027-03-13,Standard Class,138100,United States,New York City,New York,10011,Chocolate,Atlantic,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,6.33,3.42
1,3396,US-2022-121391-CHO-MIL-31000,2022-10-04,2028-03-29,First Class,121391,United States,San Francisco,California,94109,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,6.5,4.22,2.28
2,4377,US-2023-103982-CHO-MIL-31000,2023-03-03,2028-08-28,Standard Class,103982,United States,Round Rock,Texas,78664,Chocolate,Interior,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,6.33,3.42
3,387,US-2021-158064-CHO-MIL-31000,2021-04-21,2026-10-16,Standard Class,158064,United States,Los Angeles,California,90008,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,16.25,10.55,5.7
4,1397,US-2021-130729-CHO-MIL-31000,2021-10-24,2027-04-21,Standard Class,130729,United States,Rancho Cucamonga,California,91730,Chocolate,Pacific,CHO-MIL-31000,Wonka Bar - Milk Chocolate,9.75,6.33,3.42



Preprocessed New Data:


Unnamed: 0,Ship Mode_First Class,Ship Mode_Same Day,Ship Mode_Second Class,Ship Mode_Standard Class,Country/Region_Canada,Country/Region_United States,Region_Atlantic,Region_Gulf,Region_Interior,Region_Pacific,...,State/Province_Freq,Sales,Gross Profit,Cost,Year,Month,Day,DayOfWeek,WeekOfYear,IsWeekend
0,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,1.531107,-0.429639,-0.551942,-0.672869,...,-0.377156,-0.377156,-1.499708,0.388299,-0.068705,-0.409438,0.370672,-0.679482,2.079488,0.551867
1,2.414017,-0.230712,-0.48209,-1.262318,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,-0.811415,-0.811415,-0.615848,0.687863,-1.334409,-0.878088,0.577457,-0.679482,0.678501,1.775479
2,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,1.811785,-0.672869,...,-0.377156,-0.377156,0.268012,-1.40909,-1.449473,0.527861,-1.559324,-0.679482,-0.833927,0.352675
3,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,0.49136,0.49136,-1.499708,-1.109525,0.62168,-0.409438,-1.076825,-0.679482,1.777002,1.775479
4,-0.414247,-0.230712,-0.48209,0.792193,-0.178518,0.178518,-0.653122,-0.429639,-0.551942,1.486173,...,-0.377156,-0.377156,-1.499708,0.687863,0.966872,1.465161,0.715314,1.471709,-0.833927,1.775479



New Data with Predictions:


Unnamed: 0,Order ID,Predicted Units
0,US-2021-138100-CHO-MIL-31000,3.0
1,US-2022-121391-CHO-MIL-31000,2.0
2,US-2023-103982-CHO-MIL-31000,3.0
3,US-2021-158064-CHO-MIL-31000,5.0
4,US-2021-130729-CHO-MIL-31000,3.0



Predictions have been saved to 'models/predictions.csv'.
