In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r'D:\Data Science\ML\Supplement_Sales_Weekly_Expanded.csv')

In [3]:
df.head()

Unnamed: 0,Date,Product Name,Category,Units Sold,Price,Revenue,Discount,Units Returned,Location,Platform
0,2020-01-06,Whey Protein,Protein,143,31.98,4573.14,0.03,2,Canada,Walmart
1,2020-01-06,Vitamin C,Vitamin,139,42.51,5908.89,0.04,0,UK,Amazon
2,2020-01-06,Fish Oil,Omega,161,12.91,2078.51,0.25,0,Canada,Amazon
3,2020-01-06,Multivitamin,Vitamin,140,16.07,2249.8,0.08,0,Canada,Walmart
4,2020-01-06,Pre-Workout,Performance,157,35.47,5568.79,0.25,3,Canada,iHerb


In [4]:
df.tail()

Unnamed: 0,Date,Product Name,Category,Units Sold,Price,Revenue,Discount,Units Returned,Location,Platform
4379,2025-03-31,Melatonin,Sleep Aid,160,47.79,7646.4,0.21,1,USA,iHerb
4380,2025-03-31,Biotin,Vitamin,154,38.12,5870.48,0.22,1,UK,Walmart
4381,2025-03-31,Green Tea Extract,Fat Burner,139,20.4,2835.6,0.12,3,USA,iHerb
4382,2025-03-31,Iron Supplement,Mineral,154,18.31,2819.74,0.23,2,Canada,Amazon
4383,2025-03-31,Electrolyte Powder,Hydration,178,39.12,6963.36,0.23,0,UK,iHerb


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4384 entries, 0 to 4383
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            4384 non-null   object 
 1   Product Name    4384 non-null   object 
 2   Category        4384 non-null   object 
 3   Units Sold      4384 non-null   int64  
 4   Price           4384 non-null   float64
 5   Revenue         4384 non-null   float64
 6   Discount        4384 non-null   float64
 7   Units Returned  4384 non-null   int64  
 8   Location        4384 non-null   object 
 9   Platform        4384 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 342.6+ KB


In [6]:
df.isnull().sum()

Date              0
Product Name      0
Category          0
Units Sold        0
Price             0
Revenue           0
Discount          0
Units Returned    0
Location          0
Platform          0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df.shape

(4384, 10)

In [9]:
df.columns

Index(['Date', 'Product Name', 'Category', 'Units Sold', 'Price', 'Revenue',
       'Discount', 'Units Returned', 'Location', 'Platform'],
      dtype='object')

In [10]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [39]:
le = LabelEncoder()
for i in ['Category', 'Location', 'Platform']:
    df[i] = le.fit_transform(df[i])

df.drop(['Date', 'Product Name'], axis=1, inplace=True, errors='ignore')


In [40]:
# feature and target 
X = df.drop(columns='Revenue')
y = df['Revenue']

# Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [41]:
# Scaling 
scaler = StandardScaler()
scaler.fit(X_train)

In [42]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
#Models
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'DecisionTree':DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor()
}

# Results 
res = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test_scaled)
    score = r2_score(y_test, y_preds)
    mse = mean_squared_error(y_test, y_preds)

    print(f'\n Performance: {name}')
    print(f'R square Score: {score:.4f}')
    print(f'Accuracy: {score * 100:.2f}%')
    print(f'MSE: {mse:.2f}')


 Performance: LinearRegression
R square Score: -23.3356
Accuracy: -2333.56%
MSE: 112732452.95

 Performance: Ridge
R square Score: -23.3382
Accuracy: -2333.82%
MSE: 112744550.45

 Performance: DecisionTree
R square Score: -3.1763
Accuracy: -317.63%
MSE: 19346387.68

 Performance: KNN
R square Score: -1.8113
Accuracy: -181.13%
MSE: 13022993.79
