In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
retail_data = pd.read_csv("C:/Users/HOU_User/OneDrive - Artra Technologies Pvt Ltd/Python_ML_Datasets/mock_kaggle.csv")

In [3]:
retail_data.head()

Unnamed: 0,data,venda,estoque,preco
0,2014-01-01,0,4972,1.29
1,2014-01-02,70,4902,1.29
2,2014-01-03,59,4843,1.29
3,2014-01-04,93,4750,1.29
4,2014-01-05,96,4654,1.29


In [5]:
retail_data = retail_data.rename(columns = {
    'data':'Date',
    'venda':'Sales',
    'estoque':'Stock',
    'preco':'Price',
})

In [6]:
retail_data

Unnamed: 0,Date,Sales,Stock,Price
0,2014-01-01,0,4972,1.29
1,2014-01-02,70,4902,1.29
2,2014-01-03,59,4843,1.29
3,2014-01-04,93,4750,1.29
4,2014-01-05,96,4654,1.29
...,...,...,...,...
932,2016-07-27,98,3179,2.39
933,2016-07-28,108,3071,2.39
934,2016-07-29,128,4095,2.39
935,2016-07-30,270,3825,2.39


In [7]:
retail_data.Date = pd.to_datetime(retail_data.Date)

In [11]:
retail_data.Date.dtype

dtype('<M8[ns]')

In [18]:
retail_data['Is_WeekEnd'] = retail_data.Date.dt.dayofweek >=5

In [21]:
retail_data[retail_data.Is_WeekEnd == True]

Unnamed: 0,Date,Sales,Stock,Price,Is_WeekEnd
3,2014-01-04,93,4750,1.29,True
4,2014-01-05,96,4654,1.29,True
10,2014-01-11,188,5239,1.09,True
11,2014-01-12,121,5118,1.09,True
16,2014-01-18,159,4464,1.19,True
...,...,...,...,...,...
922,2016-07-17,209,813,1.89,True
928,2016-07-23,133,1550,2.39,True
929,2016-07-24,130,1420,2.39,True
935,2016-07-30,270,3825,2.39,True


In [None]:
retail_data['Day_of_Week'] = retail_data.Date.dt.dayofweek # Monday=0, Sunday=6 So, Monday to Sunday = 0 to 6

In [None]:
retail_data 

Unnamed: 0,Date,Sales,Stock,Price,Is_WeekEnd,Day_of_Week
0,2014-01-01,0,4972,1.29,False,2
1,2014-01-02,70,4902,1.29,False,3
2,2014-01-03,59,4843,1.29,False,4
3,2014-01-04,93,4750,1.29,True,5
4,2014-01-05,96,4654,1.29,True,6
...,...,...,...,...,...,...
932,2016-07-27,98,3179,2.39,False,2
933,2016-07-28,108,3071,2.39,False,3
934,2016-07-29,128,4095,2.39,False,4
935,2016-07-30,270,3825,2.39,True,5


In [24]:
retail_data['Price_per_stock'] = retail_data.Price * retail_data.Stock

In [25]:
retail_data

Unnamed: 0,Date,Sales,Stock,Price,Is_WeekEnd,Day_of_Week,Price_per_stock
0,2014-01-01,0,4972,1.29,False,2,6413.88
1,2014-01-02,70,4902,1.29,False,3,6323.58
2,2014-01-03,59,4843,1.29,False,4,6247.47
3,2014-01-04,93,4750,1.29,True,5,6127.50
4,2014-01-05,96,4654,1.29,True,6,6003.66
...,...,...,...,...,...,...,...
932,2016-07-27,98,3179,2.39,False,2,7597.81
933,2016-07-28,108,3071,2.39,False,3,7339.69
934,2016-07-29,128,4095,2.39,False,4,9787.05
935,2016-07-30,270,3825,2.39,True,5,9141.75


In [26]:
retail_data['Sales_Per_Stock'] = retail_data.Sales / retail_data.Stock

In [27]:
retail_data

Unnamed: 0,Date,Sales,Stock,Price,Is_WeekEnd,Day_of_Week,Price_per_stock,Sales_Per_Stock
0,2014-01-01,0,4972,1.29,False,2,6413.88,0.000000
1,2014-01-02,70,4902,1.29,False,3,6323.58,0.014280
2,2014-01-03,59,4843,1.29,False,4,6247.47,0.012183
3,2014-01-04,93,4750,1.29,True,5,6127.50,0.019579
4,2014-01-05,96,4654,1.29,True,6,6003.66,0.020627
...,...,...,...,...,...,...,...,...
932,2016-07-27,98,3179,2.39,False,2,7597.81,0.030827
933,2016-07-28,108,3071,2.39,False,3,7339.69,0.035168
934,2016-07-29,128,4095,2.39,False,4,9787.05,0.031258
935,2016-07-30,270,3825,2.39,True,5,9141.75,0.070588


In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    retail_data.drop(columns=['Date','Sales']),
    retail_data['Sales'],
    test_size=0.2,
    random_state=42
)

In [29]:
X_train

Unnamed: 0,Stock,Price,Is_WeekEnd,Day_of_Week,Price_per_stock,Sales_Per_Stock
5,4509,1.29,False,0,5816.61,0.032158
54,6412,1.09,False,3,6989.08,0.005770
275,1934,1.29,False,3,2494.86,0.050155
884,2778,2.59,False,3,7195.02,0.072354
344,578,1.29,False,2,745.62,0.271626
...,...,...,...,...,...,...
106,1525,1.29,False,1,1967.25,0.000000
270,1730,1.29,True,5,2231.70,0.060694
860,80,2.59,False,0,207.20,0.000000
435,1263,1.29,False,2,1629.27,0.165479


In [41]:
retail_data.isna().sum()

Date               0
Sales              0
Stock              0
Price              0
Is_WeekEnd         0
Day_of_Week        0
Price_per_stock    0
Sales_Per_Stock    0
dtype: int64

In [40]:
(retail_data == np.inf).sum()

Date               0
Sales              0
Stock              0
Price              0
Is_WeekEnd         0
Day_of_Week        0
Price_per_stock    0
Sales_Per_Stock    0
dtype: int64

In [36]:
retail_data['Sales_Per_Stock'].replace([np.inf, -np.inf], np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  retail_data['Sales_Per_Stock'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [39]:
retail_data['Sales_Per_Stock'].fillna(retail_data['Sales_Per_Stock'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  retail_data['Sales_Per_Stock'].fillna(retail_data['Sales_Per_Stock'].mean(), inplace=True)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    retail_data.drop(columns=['Date','Sales']),
    retail_data['Sales'],
    test_size=0.2,
    random_state=42
)

In [45]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
y_pred = model.predict(X_test)

In [47]:
y_pred

array([0.0000e+00, 5.1130e+01, 4.4630e+01, 2.0441e+02, 9.9690e+01,
       1.2069e+02, 1.3404e+02, 1.0581e+02, 1.9850e+01, 0.0000e+00,
       1.2679e+02, 9.5840e+01, 1.9600e+00, 0.0000e+00, 6.2500e+01,
       0.0000e+00, 0.0000e+00, 3.0312e+02, 1.0331e+02, 6.5450e+01,
       2.1655e+02, 2.0631e+02, 2.1134e+02, 2.8548e+02, 0.0000e+00,
       1.0485e+02, 1.1921e+02, 2.8810e+02, 5.5690e+01, 8.6260e+01,
       2.3940e+01, 8.4740e+01, 1.8480e+01, 7.8890e+01, 6.2680e+01,
       2.5320e+01, 9.5760e+01, 9.0680e+01, 1.7078e+02, 4.5240e+01,
       8.3980e+01, 0.0000e+00, 1.5939e+02, 1.0733e+02, 1.8800e+00,
       2.5870e+01, 8.9220e+01, 4.1100e+01, 2.0330e+02, 1.4130e+01,
       8.1000e-01, 0.0000e+00, 4.8150e+01, 1.3988e+02, 1.8145e+02,
       6.1800e+01, 4.0090e+01, 0.0000e+00, 1.1145e+02, 1.0375e+02,
       7.4520e+01, 2.2282e+02, 1.6541e+02, 3.7660e+01, 3.9900e+01,
       1.6547e+02, 5.7310e+01, 6.2540e+01, 2.7419e+02, 0.0000e+00,
       0.0000e+00, 1.3424e+02, 0.0000e+00, 6.2700e+00, 8.0870e

In [48]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [49]:
mse

1423.3927819148935

In [50]:
mae

11.979148936170212

In [51]:
Comparison = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred
    }
)

In [52]:
print(Comparison)

     Actual  Predicted
321       0       0.00
70       48      51.13
209      45      44.63
656     257     204.41
685      76      99.69
..      ...        ...
847     125     130.28
2        59      57.35
456      30      30.80
331       0       0.00
408      31      29.76

[188 rows x 2 columns]
