In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasales/processed_sales_dataa.csv


## -- Libraries --

In [2]:
# Libraries for data handling
import pandas as pd
import numpy as np

# Sklearn libraries for preprocessing, modeling, and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
data = pd.read_csv('/kaggle/input/datasales/processed_sales_dataa.csv')
data.head()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,...,Total Revenue,Total Cost,Total Profit,Order Processing Time,Order Year,Order Month,Order Day,Ship Year,Ship Month,Ship Day
0,Middle East and North Africa,Algeria,Clothes,Online,L,2019-09-06,446019446,2019-09-21,5921,109.28,...,2.665851,212208.64,12.982732,15,2019,9,6,2019,9,21
1,Europe,Bosnia and Herzegovina,Cosmetics,Online,H,2019-09-12,619706857,2019-10-13,3885,437.2,...,2.730807,1023037.05,13.423188,31,2019,9,12,2019,10,13
2,Asia,Maldives,Meat,Offline,M,2014-08-14,519134318,2014-08-17,2236,421.89,...,2.69173,815446.84,11.759006,3,2014,8,14,2014,8,17
3,Central America and the Caribbean,The Bahamas,Office Supplies,Online,H,2020-03-20,211538159,2020-05-05,7697,651.21,...,2.798952,4040617.12,13.786851,46,2020,3,20,2020,5,5
4,Middle East and North Africa,Somalia,Snacks,Online,H,2016-05-05,734467208,2016-05-14,4023,152.58,...,2.662179,392001.12,12.309663,9,2016,5,5,2016,5,14


### - Separate Features and Target - 

In [4]:
target = 'Unit Price'
X = data.drop(columns=[target])
y = data[target]

In [5]:
X = X.drop(columns=['Order Date', 'Ship Date'])


In [6]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()  # Categorical features
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()  # Numerical features


In [7]:
# Print the number of categorical and numerical columns
print(f'Number of Categorical Columns: {len(categorical_cols)}')
print(f'Number of Numerical Columns: {len(numerical_cols)}')

Number of Categorical Columns: 5
Number of Numerical Columns: 13


## - Preprocessing (Encoding Categorical Features) - 

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Step 3: Preprocess the data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categorical columns
    ],
    remainder='passthrough'  # Keep numerical columns unchanged
)

# Step 4: Apply preprocessing to the features
X_processed = preprocessor.fit_transform(X)

# Step 5: Get feature names after transformation
categorical_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist()
feature_names = categorical_features + numerical_cols



In [9]:
print(f'Feature Names: {feature_names}')

Feature Names: ['Region_Asia', 'Region_Australia and Oceania', 'Region_Central America and the Caribbean', 'Region_Europe', 'Region_Middle East and North Africa', 'Region_North America', 'Region_Sub-Saharan Africa', 'Country_Afghanistan', 'Country_Albania', 'Country_Algeria', 'Country_Andorra', 'Country_Angola', 'Country_Antigua and Barbuda ', 'Country_Armenia', 'Country_Australia', 'Country_Austria', 'Country_Azerbaijan', 'Country_Bahrain', 'Country_Bangladesh', 'Country_Barbados', 'Country_Belarus', 'Country_Belgium', 'Country_Belize', 'Country_Benin', 'Country_Bhutan', 'Country_Bosnia and Herzegovina', 'Country_Botswana', 'Country_Brunei', 'Country_Bulgaria', 'Country_Burkina Faso', 'Country_Burundi', 'Country_Cambodia', 'Country_Cameroon', 'Country_Canada', 'Country_Cape Verde', 'Country_Central African Republic', 'Country_Chad', 'Country_China', 'Country_Comoros', 'Country_Costa Rica', "Country_Cote d'Ivoire", 'Country_Croatia', 'Country_Cuba', 'Country_Cyprus', 'Country_Czech Rep

## - Split the Data into Train and Test Sets - 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

## - Model Training - 

In [11]:
# Initialize the models
linear_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42)

In [12]:
# Train Linear Regression model
linear_model.fit(X_train, y_train)

In [13]:
# Train Random Forest model
rf_model.fit(X_train, y_train)

In [14]:
# Train XGBoost model
xgb_model.fit(X_train, y_train)


## - Predictions on Test Data -

In [18]:
# Predictions
y_pred_linear = linear_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

## - Evaluate the Models - 

#### **Linear Regression Evaluation**

In [16]:
# Linear Regression Evaluation
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)

print("Linear Regression - RMSE:", rmse_linear)
print("Linear Regression - R²:", r2_linear)
print("Linear Regression - MAE:", mae_linear)

Linear Regression - RMSE: 112.16196906328986
Linear Regression - R²: 0.7313504836489897
Linear Regression - MAE: 83.4783752716673


#### **Random Forest Evaluation**

In [19]:
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest - RMSE:", rmse_rf)
print("Random Forest - R²:", r2_rf)
print("Random Forest - MAE:", mae_rf)

Random Forest - RMSE: 1.1011663362390657e-11
Random Forest - R²: 1.0
Random Forest - MAE: 8.225909819969957e-12


#### **XGBoost Evaluation**

In [20]:
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print("XGBoost - RMSE:", rmse_xgb)
print("XGBoost - R²:", r2_xgb)
print("XGBoost - MAE:", mae_xgb)

XGBoost - RMSE: 1.9702124714234767e-05
XGBoost - R²: 0.9999999999999917
XGBoost - MAE: 1.5353431700287778e-05


## - Testing the Model on a New Sample - 

In [21]:
# Sample input for prediction
sample_input_modified = {
    "Order ID": 67890,
    "Units Sold": 1000,  # Increased Units Sold
    "Unit Cost": 50.0,   # Increased Unit Cost
    "Total Revenue": 50000.0,
    "Total Cost": 40000.0,
    "Total Profit": 10000.0,
    "Order Processing Time": 20,
    "Order Year": 2023,
    "Order Month": 8,
    "Order Day": 15,
    "Ship Year": 2023,
    "Ship Month": 8,
    "Ship Day": 18,
    "Region": "North America",  # Changed region
    "Country": "United States of America",  # Changed country
    "Item Type": "Meat",  # Changed item type
    "Sales Channel": "Online",  # Changed sales channel
    "Order Priority": "H"  # Changed order priority
}
# Convert the sample input to a DataFrame
sample_df = pd.DataFrame([sample_input_modified])

# Preprocess the sample input using the existing preprocessor from your pipeline
sample_processed = preprocessor.transform(sample_df)

# Predict using Linear Regression model
linear_pred = linear_model.predict(sample_processed)

# Predict using Random Forest model
rf_pred = rf_model.predict(sample_processed)

# Predict using XGBoost model
xgb_pred = xgb_model.predict(sample_processed)

# Print the results
print("Linear Regression - Prediction:", linear_pred[0])
print("Random Forest - Prediction:", rf_pred[0])
print("XGBoost - Prediction:", xgb_pred[0])

Linear Regression - Prediction: 239.13201041385196
Random Forest - Prediction: 91.64800000000282
XGBoost - Prediction: 81.73


In [22]:

sample_input_modified_2 = {
    "Order ID": 99999,  
    "Units Sold": 5000,  
    "Unit Cost": 100.0, 
    "Total Revenue": 500000.0,  
    "Total Cost": 300000.0,  
    "Total Profit": 200000.0,  # Increased Total Profit
    "Order Processing Time": 40,  
    "Order Year": 2024,  # Changed Order Year to future value
    "Order Month": 12,  # Changed Order Month
    "Order Day": 31,  # Changed Order Day to the end of the year
    "Ship Year": 2025,  # Changed Ship Year to future value
    "Ship Month": 1,  # Changed Ship Month to beginning of the next year
    "Ship Day": 10,  # Changed Ship Day
    "Region": "Asia",  # Changed region to Asia
    "Country": "China",  # Changed country to China
    "Item Type": "Office Supplies",  # Changed item type to Office Supplies
    "Sales Channel": "Offline",  # Changed sales channel to Offline
    "Order Priority": "C"  # Changed order priority to Critical
}


sample_df_modified_2 = pd.DataFrame([sample_input_modified_2])


sample_processed_modified_2 = preprocessor.transform(sample_df_modified_2)


linear_pred_modified_2 = linear_model.predict(sample_processed_modified_2)
rf_pred_modified_2 = rf_model.predict(sample_processed_modified_2)
xgb_pred_modified_2 = xgb_model.predict(sample_processed_modified_2)


print("Linear Regression - Significantly Modified Prediction:", linear_pred_modified_2[0])


Linear Regression - Significantly Modified Prediction: 149.84388148763907


In [23]:
# Sample input for prediction with another set of significantly modified values
sample_input_modified_3 = {
    "Order ID": 12345,  # Changed Order ID
    "Units Sold": 50,  # Decreased Units Sold to a smaller number
    "Unit Cost": 5.0,  # Decreased Unit Cost significantly
    "Total Revenue": 500.0,  # Reduced Total Revenue
    "Total Cost": 300.0,  # Reduced Total Cost
    "Total Profit": 200.0,  # Reduced Total Profit
    "Order Processing Time": 5,  # Reduced Order Processing Time to a smaller value
    "Order Year": 2021,  # Changed Order Year to a past value
    "Order Month": 1,  # Changed Order Month to the beginning of the year
    "Order Day": 5,  # Changed Order Day
    "Ship Year": 2021,  # Changed Ship Year to match the order year
    "Ship Month": 1,  # Changed Ship Month
    "Ship Day": 10,  # Changed Ship Day
    "Region": "Sub-Saharan Africa",  # Changed region to Sub-Saharan Africa
    "Country": "Nigeria",  # Changed country to Nigeria
    "Item Type": "Fruits",  # Changed item type to Fruits
    "Sales Channel": "Online",  # Changed sales channel to Online
    "Order Priority": "L"  # Changed order priority to Low
}


sample_df_modified_3 = pd.DataFrame([sample_input_modified_3])


sample_processed_modified_3 = preprocessor.transform(sample_df_modified_3)


linear_pred_modified_3 = linear_model.predict(sample_processed_modified_3)
rf_pred_modified_3 = rf_model.predict(sample_processed_modified_3)
xgb_pred_modified_3 = xgb_model.predict(sample_processed_modified_3)


print("Linear Regression - New Significantly Modified Prediction:", linear_pred_modified_3[0])


Linear Regression - New Significantly Modified Prediction: 264.4229037517383


In [24]:

encoded_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist() + numerical_cols
print("Encoded Feature Names:", encoded_feature_names)


Encoded Feature Names: ['Region_Asia', 'Region_Australia and Oceania', 'Region_Central America and the Caribbean', 'Region_Europe', 'Region_Middle East and North Africa', 'Region_North America', 'Region_Sub-Saharan Africa', 'Country_Afghanistan', 'Country_Albania', 'Country_Algeria', 'Country_Andorra', 'Country_Angola', 'Country_Antigua and Barbuda ', 'Country_Armenia', 'Country_Australia', 'Country_Austria', 'Country_Azerbaijan', 'Country_Bahrain', 'Country_Bangladesh', 'Country_Barbados', 'Country_Belarus', 'Country_Belgium', 'Country_Belize', 'Country_Benin', 'Country_Bhutan', 'Country_Bosnia and Herzegovina', 'Country_Botswana', 'Country_Brunei', 'Country_Bulgaria', 'Country_Burkina Faso', 'Country_Burundi', 'Country_Cambodia', 'Country_Cameroon', 'Country_Canada', 'Country_Cape Verde', 'Country_Central African Republic', 'Country_Chad', 'Country_China', 'Country_Comoros', 'Country_Costa Rica', "Country_Cote d'Ivoire", 'Country_Croatia', 'Country_Cuba', 'Country_Cyprus', 'Country_C

In [29]:
import joblib


joblib.dump(preprocessor, '/kaggle/working/preprocessor.pkl')
joblib.dump(linear_model, '/kaggle/working/linear_model.pkl')



['/kaggle/working/linear_model.pkl']