In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [2]:
choco_data=pd.read_csv(r"D:\RAZIQUE\ml\intern\Chocolate Sales (2).csv")
choco_data

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04/01/2022,"$5,320.00",180
1,Van Tuxwell,India,85% Dark Bars,01/08/2022,"$7,896.00",94
2,Gigi Bohling,India,Peanut Butter Cubes,07/07/2022,"$4,501.00",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27/04/2022,"$12,726.00",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24/02/2022,"$13,685.00",184
...,...,...,...,...,...,...
3277,Karlen McCaffrey,Australia,Spicy Special Slims,17/05/2024,"$5,303.58",354
3278,Jehu Rudeforth,USA,White Choc,07/06/2024,"$7,339.32",121
3279,Ches Bonnell,Canada,Organic Choco Syrup,26/07/2024,$616.09,238
3280,Dotty Strutley,India,Eclairs,28/07/2024,"$2,504.62",397


In [3]:
choco_data['Amount'] = choco_data['Amount'].str.replace('$', '', regex=False)
choco_data['Amount'] = choco_data['Amount'].str.replace(',', '', regex=False)

In [4]:
choco_data['Date'] = pd.to_datetime(choco_data['Date'], dayfirst=True)
choco_data['Day'] = choco_data['Date'].dt.day
choco_data['Month'] = choco_data['Date'].dt.month
choco_data['Year'] = choco_data['Date'].dt.year

In [5]:
choco_data = choco_data.drop(columns=['Date'])

In [6]:
le = LabelEncoder()
categorical_cols = ['Sales Person', 'Product', 'Country']
for col in categorical_cols:
    choco_data[col] = le.fit_transform(choco_data[col])

In [7]:
choco_data

Unnamed: 0,Sales Person,Country,Product,Amount,Boxes Shipped,Day,Month,Year
0,13,4,14,5320.00,180,4,1,2022
1,23,2,2,7896.00,94,1,8,2022
2,9,2,17,4501.00,91,7,7,2022
3,12,0,17,12726.00,342,27,4,2022
4,13,4,17,13685.00,184,24,2,2022
...,...,...,...,...,...,...,...,...
3277,15,0,20,5303.58,354,17,5,2024
3278,13,5,21,7339.32,121,7,6,2024
3279,5,1,16,616.09,238,26,7,2024
3280,8,2,10,2504.62,397,28,7,2024


In [8]:
x = choco_data.drop('Boxes Shipped',axis=1)
y = choco_data['Boxes Shipped'].astype(int)
print(x)
print(y)

      Sales Person  Country  Product    Amount  Day  Month  Year
0               13        4       14   5320.00    4      1  2022
1               23        2        2   7896.00    1      8  2022
2                9        2       17   4501.00    7      7  2022
3               12        0       17  12726.00   27      4  2022
4               13        4       17  13685.00   24      2  2022
...            ...      ...      ...       ...  ...    ...   ...
3277            15        0       20   5303.58   17      5  2024
3278            13        5       21   7339.32    7      6  2024
3279             5        1       16    616.09   26      7  2024
3280             8        2       10   2504.62   28      7  2024
3281            15        2        1   5915.87   23      5  2024

[3282 rows x 7 columns]
0       180
1        94
2        91
3       342
4       184
       ... 
3277    354
3278    121
3279    238
3280    397
3281    355
Name: Boxes Shipped, Length: 3282, dtype: int64


In [9]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=1,test_size=0.1)
print(xtrain.shape)
print(xtest.shape)

(2953, 7)
(329, 7)


In [10]:
model = RandomForestRegressor()
model.fit(xtrain, ytrain)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
model.score(xtest,ytest)

0.859417538970662

In [12]:
pipeline=Pipeline([('clf',RandomForestRegressor(criterion='squared_error'))])
parameters={
    'clf__max_depth':(10,20,None),
    'clf__min_samples_leaf':(1,2,5,10),
    'clf__min_samples_split':(2,5,10),
    'clf__n_estimators':(100,200,500),
}

In [13]:
grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=parameters, 
    n_jobs=-1, 
    verbose=1, 
    scoring='r2' 
)
grid_search.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'clf__max_depth': (10, ...), 'clf__min_samples_leaf': (1, ...), 'clf__min_samples_split': (2, ...), 'clf__n_estimators': (100, ...)}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
print('score',grid_search.best_score_)

score 0.739495242592153


In [15]:
best_parameter=grid_search.best_estimator_.get_params()
best_parameter

{'memory': None,
 'steps': [('clf', RandomForestRegressor(max_depth=20, n_estimators=500))],
 'transform_input': None,
 'verbose': False,
 'clf': RandomForestRegressor(max_depth=20, n_estimators=500),
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__criterion': 'squared_error',
 'clf__max_depth': 20,
 'clf__max_features': 1.0,
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__monotonic_cst': None,
 'clf__n_estimators': 500,
 'clf__n_jobs': None,
 'clf__oob_score': False,
 'clf__random_state': None,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [16]:
best_max_depth=best_parameter['clf__max_depth']
best_sample_leaf=best_parameter['clf__min_samples_leaf']
best_n_estimator=best_parameter['clf__n_estimators']
best_sample_split=best_parameter['clf__min_samples_split']

In [17]:
print('depth',best_max_depth)
print('min sample leaf',best_sample_leaf)
print('n estimators',best_n_estimator)
print('min sample split',best_sample_split)

depth 20
min sample leaf 1
n estimators 500
min sample split 2


In [18]:
model = RandomForestRegressor(n_estimators=500,max_depth=None,min_samples_leaf=1,min_samples_split=2)
model.fit(xtrain, ytrain)

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
y_prediction = model.predict(xtest)
y_prediction

array([179.276, 347.948, 187.638, 137.26 , 173.028,  66.852,  48.606,
       247.478, 159.58 , 237.224, 153.712, 256.352, 128.904, 270.7  ,
       145.248,  85.686, 243.62 , 326.734, 209.14 , 186.164, 151.5  ,
        93.98 , 143.978, 149.624,  83.47 , 135.08 , 202.562, 279.942,
       109.312, 273.938, 137.016,  70.02 ,  74.336, 128.882,  88.362,
       194.612,  84.92 , 287.392, 137.96 , 265.124, 183.896, 323.024,
       136.428, 111.562, 293.208, 109.678, 153.366, 201.458, 115.318,
       272.318, 194.09 , 225.008, 255.302,  58.652, 208.626, 243.25 ,
       303.902,  79.758, 162.4  , 153.62 , 102.4  ,  93.954, 323.282,
       341.51 , 160.804,  40.068, 106.732, 174.15 ,  75.528, 186.688,
       254.608,  84.812, 323.802, 171.796,  97.012, 209.934, 362.856,
        73.042, 134.434,  72.48 , 129.182,  67.628, 142.142, 113.88 ,
       149.368,  72.302, 202.234, 103.4  , 381.852, 219.948, 108.82 ,
       100.736,  57.95 , 222.614, 100.806, 182.498, 230.626,  73.334,
       313.552,  51.

In [20]:
model.score(xtest,ytest)

0.8674919035352384

In [21]:
error = mean_absolute_error(ytest, y_prediction)
print(f"On average, the model is off by: ${error:.2f}")

On average, the model is off by: $32.58
