In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [2]:
choco_data=pd.read_csv(r"D:\RAZIQUE\ml\intern\Chocolate Sales (2).csv")
choco_data

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04/01/2022,"$5,320.00",180
1,Van Tuxwell,India,85% Dark Bars,01/08/2022,"$7,896.00",94
2,Gigi Bohling,India,Peanut Butter Cubes,07/07/2022,"$4,501.00",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27/04/2022,"$12,726.00",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24/02/2022,"$13,685.00",184
...,...,...,...,...,...,...
3277,Karlen McCaffrey,Australia,Spicy Special Slims,17/05/2024,"$5,303.58",354
3278,Jehu Rudeforth,USA,White Choc,07/06/2024,"$7,339.32",121
3279,Ches Bonnell,Canada,Organic Choco Syrup,26/07/2024,$616.09,238
3280,Dotty Strutley,India,Eclairs,28/07/2024,"$2,504.62",397


In [3]:
choco_data['Amount'] = choco_data['Amount'].str.replace('$', '', regex=False)
choco_data['Amount'] = choco_data['Amount'].str.replace(',', '', regex=False)

In [4]:
choco_data['Date'] = pd.to_datetime(choco_data['Date'], dayfirst=True)
choco_data['Day'] = choco_data['Date'].dt.day
choco_data['Month'] = choco_data['Date'].dt.month
choco_data['Year'] = choco_data['Date'].dt.year

In [5]:
choco_data = choco_data.drop(columns=['Date'])

In [6]:
le = LabelEncoder()
categorical_cols = ['Sales Person', 'Product', 'Country']
for col in categorical_cols:
    choco_data[col] = le.fit_transform(choco_data[col])

In [7]:
choco_data

Unnamed: 0,Sales Person,Country,Product,Amount,Boxes Shipped,Day,Month,Year
0,13,4,14,5320.00,180,4,1,2022
1,23,2,2,7896.00,94,1,8,2022
2,9,2,17,4501.00,91,7,7,2022
3,12,0,17,12726.00,342,27,4,2022
4,13,4,17,13685.00,184,24,2,2022
...,...,...,...,...,...,...,...,...
3277,15,0,20,5303.58,354,17,5,2024
3278,13,5,21,7339.32,121,7,6,2024
3279,5,1,16,616.09,238,26,7,2024
3280,8,2,10,2504.62,397,28,7,2024


In [8]:
x = choco_data.drop('Boxes Shipped',axis=1)
y = choco_data['Boxes Shipped'].astype(int)
print(x)
print(y)

      Sales Person  Country  Product    Amount  Day  Month  Year
0               13        4       14   5320.00    4      1  2022
1               23        2        2   7896.00    1      8  2022
2                9        2       17   4501.00    7      7  2022
3               12        0       17  12726.00   27      4  2022
4               13        4       17  13685.00   24      2  2022
...            ...      ...      ...       ...  ...    ...   ...
3277            15        0       20   5303.58   17      5  2024
3278            13        5       21   7339.32    7      6  2024
3279             5        1       16    616.09   26      7  2024
3280             8        2       10   2504.62   28      7  2024
3281            15        2        1   5915.87   23      5  2024

[3282 rows x 7 columns]
0       180
1        94
2        91
3       342
4       184
       ... 
3277    354
3278    121
3279    238
3280    397
3281    355
Name: Boxes Shipped, Length: 3282, dtype: int64


In [9]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=1,test_size=0.1)
print(xtrain.shape)
print(xtest.shape)

(2953, 7)
(329, 7)


In [38]:
model = RandomForestRegressor()
model.fit(xtrain, ytrain)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [39]:
model.score(xtest,ytest)

0.8549744419163454

In [22]:
pipeline=Pipeline([('clf',RandomForestRegressor(criterion='squared_error'))])
parameters={
    'clf__max_depth':(10,20,None),
    'clf__min_samples_leaf':(1,2,5,10),
    'clf__min_samples_split':(2,5,10),
    'clf__n_estimators':(100,200,500),
}

In [23]:
grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=parameters, 
    n_jobs=-1, 
    verbose=1, 
    scoring='r2' 
)
grid_search.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'clf__max_depth': (10, ...), 'clf__min_samples_leaf': (1, ...), 'clf__min_samples_split': (2, ...), 'clf__n_estimators': (100, ...)}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
print('score',grid_search.best_score_)

score 0.7387437393743668


In [25]:
best_parameter=grid_search.best_estimator_.get_params()
best_parameter

{'memory': None,
 'steps': [('clf', RandomForestRegressor(n_estimators=500))],
 'transform_input': None,
 'verbose': False,
 'clf': RandomForestRegressor(n_estimators=500),
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__criterion': 'squared_error',
 'clf__max_depth': None,
 'clf__max_features': 1.0,
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__monotonic_cst': None,
 'clf__n_estimators': 500,
 'clf__n_jobs': None,
 'clf__oob_score': False,
 'clf__random_state': None,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [26]:
best_max_depth=best_parameter['clf__max_depth']
best_sample_leaf=best_parameter['clf__min_samples_leaf']
best_n_estimator=best_parameter['clf__n_estimators']
best_sample_split=best_parameter['clf__min_samples_split']

In [27]:
print('depth',best_max_depth)
print('min sample leaf',best_sample_leaf)
print('n estimators',best_n_estimator)
print('min sample split',best_sample_split)

depth None
min sample leaf 1
n estimators 500
min sample split 2


In [33]:
model = RandomForestRegressor(n_estimators=500,max_depth=None,min_samples_leaf=1,min_samples_split=2)
model.fit(xtrain, ytrain)

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [34]:
y_prediction = model.predict(xtest)
y_prediction

array([179.974, 349.694, 187.784, 132.212, 176.61 ,  66.558,  56.536,
       249.1  , 161.146, 244.386, 151.112, 252.808, 133.168, 267.738,
       142.61 ,  86.414, 239.36 , 345.146, 207.758, 182.352, 144.434,
        96.148, 151.35 , 147.338,  85.362, 139.484, 196.336, 274.68 ,
       117.776, 266.942, 137.638,  69.96 ,  76.45 , 109.32 ,  85.482,
       197.136,  88.884, 288.386, 141.458, 271.894, 182.828, 323.702,
       140.552, 127.708, 292.592, 109.034, 149.764, 196.97 , 110.686,
       274.174, 186.284, 215.928, 257.348,  56.466, 208.498, 237.258,
       300.236,  78.906, 171.768, 150.672, 100.856,  97.608, 327.676,
       334.252, 158.754,  39.16 , 104.946, 171.99 ,  77.234, 186.486,
       258.044,  84.734, 341.268, 172.412, 104.854, 220.662, 354.318,
        77.21 , 145.764,  72.186, 132.498,  61.446, 132.192, 110.09 ,
       152.238,  81.6  , 201.062,  98.582, 374.68 , 213.66 , 109.988,
       108.186,  58.414, 215.474,  96.238, 179.756, 229.486,  72.424,
       315.932,  51.

In [36]:
model.score(xtest,ytest)

0.8682978100631946

In [37]:
error = mean_absolute_error(ytest, y_prediction)
print(f"On average, the model is off by: ${error:.2f}")

On average, the model is off by: $32.85
